diff --git a/.hgtags b/.hgtags
index 0bca1f09abecabefff85f44fa43bf0f0fc795645..952d2c2da2cdbca2edd9377566420b98966edfd9 100644
--- a/.hgtags
+++ b/.hgtags
@@ -328,5 +328,6 @@ af5f3e43e6e4424b1da19d9e16f6b853a7b822ed DRTVWR-169
 4b3c68199a86cabaa5d9466d7b0f7e141e901d7a 3.3.3-beta3
 6428242e124b523813bfaf4c45b3d422f0298c81 3.3.3-release
 09ef7fd1b0781f33b8a3a9af6236b7bcb4831910 DRTVWR-170
+005dfe5c4c377207d065fb27858d2eb0b53b143a DRTVWR-167
 f87bfbe0b62d26f451d02a47c80ebef6b9168fc2 3.3.4-beta1
 f87bfbe0b62d26f451d02a47c80ebef6b9168fc2 DRTVWR-158
diff --git a/indra/cmake/Copy3rdPartyLibs.cmake b/indra/cmake/Copy3rdPartyLibs.cmake
index 224e0a8b5163a32b4efbfd941113dcc0402ef12b..9f05c4cff21c4b6cdffa66232bc2982f8a59c675 100644
--- a/indra/cmake/Copy3rdPartyLibs.cmake
+++ b/indra/cmake/Copy3rdPartyLibs.cmake
@@ -57,10 +57,10 @@ if(WINDOWS)
         libhunspell.dll
         )
 
-    if(USE_GOOGLE_PERFTOOLS)
+    if(USE_TCMALLOC)
       set(debug_files ${debug_files} libtcmalloc_minimal-debug.dll)
       set(release_files ${release_files} libtcmalloc_minimal.dll)
-    endif(USE_GOOGLE_PERFTOOLS)
+    endif(USE_TCMALLOC)
 
     if (FMOD)
       set(debug_files ${debug_files} fmod.dll)
@@ -272,13 +272,16 @@ elseif(LINUX)
         libopenal.so
         libopenjpeg.so
         libssl.so
-        libtcmalloc_minimal.so
         libuuid.so.16
         libuuid.so.16.0.22
         libssl.so.1.0.0
         libfontconfig.so.1.4.4
        )
 
+    if (USE_TCMALLOC)
+      set(release_files ${release_files} "libtcmalloc_minimal.so")
+    endif (USE_TCMALLOC)
+
     if (FMOD)
       set(release_files ${release_files} "libfmod-3.75.so")
     endif (FMOD)
diff --git a/indra/cmake/FindGooglePerfTools.cmake b/indra/cmake/FindGooglePerfTools.cmake
old mode 100644
new mode 100755
diff --git a/indra/cmake/GooglePerfTools.cmake b/indra/cmake/GooglePerfTools.cmake
old mode 100644
new mode 100755
index d9f91193be10762c23e7926b214890aecd66a99f..09501e0406e637ceea1e1c92d10291504e9fe2da
--- a/indra/cmake/GooglePerfTools.cmake
+++ b/indra/cmake/GooglePerfTools.cmake
@@ -1,20 +1,34 @@
 # -*- cmake -*-
 include(Prebuilt)
 
+# If you want to enable or disable TCMALLOC in viewer builds, this is the place.
+# set ON or OFF as desired.
+set (USE_TCMALLOC ON)
+
 if (STANDALONE)
   include(FindGooglePerfTools)
 else (STANDALONE)
   if (WINDOWS)
-    use_prebuilt_binary(tcmalloc)
-    set(TCMALLOC_LIBRARIES 
-        debug libtcmalloc_minimal-debug
-        optimized libtcmalloc_minimal)
+    if (USE_TCMALLOC)
+       use_prebuilt_binary(tcmalloc)
+       set(TCMALLOC_LIBRARIES 
+         debug libtcmalloc_minimal-debug
+         optimized libtcmalloc_minimal)
+       set(TCMALLOC_LINK_FLAGS  "/INCLUDE:__tcmalloc")
+    else (USE_TCMALLOC)
+      set(TCMALLOC_LIBRARIES)
+      set(TCMALLOC_LINK_FLAGS)
+    endif (USE_TCMALLOC)
     set(GOOGLE_PERFTOOLS_FOUND "YES")
   endif (WINDOWS)
   if (LINUX)
-    use_prebuilt_binary(tcmalloc)
-    set(TCMALLOC_LIBRARIES 
-    tcmalloc)
+    if (USE_TCMALLOC)
+      use_prebuilt_binary(tcmalloc)
+      set(TCMALLOC_LIBRARIES 
+        tcmalloc)
+    else (USE_TCMALLOC)
+      set(TCMALLOC_LIBRARIES)
+    endif (USE_TCMALLOC)
     set(PROFILER_LIBRARIES profiler)
     set(GOOGLE_PERFTOOLS_INCLUDE_DIR
         ${LIBS_PREBUILT_DIR}/include)
@@ -29,13 +43,19 @@ if (GOOGLE_PERFTOOLS_FOUND)
 endif (GOOGLE_PERFTOOLS_FOUND)
 
 if (WINDOWS)
-    set(USE_GOOGLE_PERFTOOLS ON)
+   set(USE_GOOGLE_PERFTOOLS ON)
 endif (WINDOWS)
 
 if (USE_GOOGLE_PERFTOOLS)
-  set(TCMALLOC_FLAG -ULL_USE_TCMALLOC=1)
+  if (USE_TCMALLOC)
+    set(TCMALLOC_FLAG -DLL_USE_TCMALLOC=1)
+  else (USE_TCMALLOC)
+    set(TCMALLOC_FLAG -ULL_USE_TCMALLOC)
+  endif (USE_TCMALLOC)
+endif (USE_GOOGLE_PERFTOOLS)
+
+if (USE_GOOGLE_PERFTOOLS)
   include_directories(${GOOGLE_PERFTOOLS_INCLUDE_DIR})
   set(GOOGLE_PERFTOOLS_LIBRARIES ${TCMALLOC_LIBRARIES} ${STACKTRACE_LIBRARIES} ${PROFILER_LIBRARIES})
 else (USE_GOOGLE_PERFTOOLS)
-  set(TCMALLOC_FLAG -ULL_USE_TCMALLOC)
 endif (USE_GOOGLE_PERFTOOLS)
diff --git a/indra/cmake/LLAddBuildTest.cmake b/indra/cmake/LLAddBuildTest.cmake
old mode 100644
new mode 100755
index 08feab6e36998c559cadb4ce01006787759c4537..a6f69a09e9300daeecca1065f5166dcf8a87d826
--- a/indra/cmake/LLAddBuildTest.cmake
+++ b/indra/cmake/LLAddBuildTest.cmake
@@ -205,6 +205,15 @@ FUNCTION(LL_ADD_INTEGRATION_TEST
     SET_TARGET_PROPERTIES(INTEGRATION_TEST_${testname} PROPERTIES COMPILE_FLAGS -I"${TUT_INCLUDE_DIR}")
   endif(STANDALONE)
 
+  if (WINDOWS)
+    SET_TARGET_PROPERTIES(INTEGRATION_TEST_${testname}
+        PROPERTIES
+        LINK_FLAGS "/debug /NODEFAULTLIB:LIBCMT /SUBSYSTEM:WINDOWS ${TCMALLOC_LINK_FLAGS}"
+        LINK_FLAGS_DEBUG "/NODEFAULTLIB:\"LIBCMT;LIBCMTD;MSVCRT\" /INCREMENTAL:NO"
+        LINK_FLAGS_RELEASE ""
+        )
+  endif (WINDOWS)
+
   # Add link deps to the executable
   if(TEST_DEBUG)
     message(STATUS "TARGET_LINK_LIBRARIES(INTEGRATION_TEST_${testname} ${libraries})")
diff --git a/indra/llcommon/llallocator.cpp b/indra/llcommon/llallocator.cpp
old mode 100644
new mode 100755
index 6f6abefc67cc1ff74c04d4e05a8ed920dc13a4cf..87654b5b97dc7b6bbab4c65aa25355f19cb41d40
--- a/indra/llcommon/llallocator.cpp
+++ b/indra/llcommon/llallocator.cpp
@@ -27,7 +27,7 @@
 #include "linden_common.h"
 #include "llallocator.h"
 
-#if LL_USE_TCMALLOC
+#if (LL_USE_TCMALLOC && LL_USE_HEAP_PROFILER)
 
 #include "google/heap-profiler.h"
 #include "google/commandlineflags_public.h"
diff --git a/indra/llcommon/llmemory.cpp b/indra/llcommon/llmemory.cpp
old mode 100644
new mode 100755
index 3b9758f996a9f07e71b34f3e2f038782dc7d7554..afaf36666876aac84354637155364afd3f355e4d
--- a/indra/llcommon/llmemory.cpp
+++ b/indra/llcommon/llmemory.cpp
@@ -61,6 +61,18 @@ BOOL LLMemory::sEnableMemoryFailurePrevention = FALSE;
 LLPrivateMemoryPoolManager::mem_allocation_info_t LLPrivateMemoryPoolManager::sMemAllocationTracker;
 #endif
 
+void ll_assert_aligned_func(uintptr_t ptr,U32 alignment)
+{
+#ifdef SHOW_ASSERT
+	// Redundant, place to set breakpoints.
+	if (ptr%alignment!=0)
+	{
+		llwarns << "alignment check failed" << llendl;
+	}
+	llassert(ptr%alignment==0);
+#endif
+}
+
 //static
 void LLMemory::initClass()
 {
diff --git a/indra/llcommon/llmemory.h b/indra/llcommon/llmemory.h
old mode 100644
new mode 100755
index bbbdaa6497fc951d41cb3b2596f0258d5ad8a9df..9dd776ff57922271068e38ebd2ac82cafd537f23
--- a/indra/llcommon/llmemory.h
+++ b/indra/llcommon/llmemory.h
@@ -27,7 +27,6 @@
 #define LLMEMORY_H
 
 #include "llmemtype.h"
-#if LL_DEBUG
 inline void* ll_aligned_malloc( size_t size, int align )
 {
 	void* mem = malloc( size + (align - 1) + sizeof(void*) );
@@ -43,10 +42,11 @@ inline void ll_aligned_free( void* ptr )
 	free( ((void**)ptr)[-1] );
 }
 
+#if !LL_USE_TCMALLOC
 inline void* ll_aligned_malloc_16(size_t size) // returned hunk MUST be freed with ll_aligned_free_16().
 {
 #if defined(LL_WINDOWS)
-	return _mm_malloc(size, 16);
+	return _aligned_malloc(size, 16);
 #elif defined(LL_DARWIN)
 	return malloc(size); // default osx malloc is 16 byte aligned.
 #else
@@ -58,21 +58,38 @@ inline void* ll_aligned_malloc_16(size_t size) // returned hunk MUST be freed wi
 #endif
 }
 
+inline void* ll_aligned_realloc_16(void* ptr, size_t size) // returned hunk MUST be freed with ll_aligned_free_16().
+{
+#if defined(LL_WINDOWS)
+	return _aligned_realloc(ptr, size, 16);
+#elif defined(LL_DARWIN)
+	return realloc(ptr,size); // default osx malloc is 16 byte aligned.
+#else
+	return realloc(ptr,size); // FIXME not guaranteed to be aligned.
+#endif
+}
+
 inline void ll_aligned_free_16(void *p)
 {
 #if defined(LL_WINDOWS)
-	_mm_free(p);
+	_aligned_free(p);
 #elif defined(LL_DARWIN)
 	return free(p);
 #else
 	free(p); // posix_memalign() is compatible with heap deallocator
 #endif
 }
+#else // USE_TCMALLOC
+// ll_aligned_foo_16 are not needed with tcmalloc
+#define ll_aligned_malloc_16 malloc
+#define ll_aligned_realloc_16 realloc
+#define ll_aligned_free_16 free
+#endif // USE_TCMALLOC
 
 inline void* ll_aligned_malloc_32(size_t size) // returned hunk MUST be freed with ll_aligned_free_32().
 {
 #if defined(LL_WINDOWS)
-	return _mm_malloc(size, 32);
+	return _aligned_malloc(size, 32);
 #elif defined(LL_DARWIN)
 	return ll_aligned_malloc( size, 32 );
 #else
@@ -87,22 +104,13 @@ inline void* ll_aligned_malloc_32(size_t size) // returned hunk MUST be freed wi
 inline void ll_aligned_free_32(void *p)
 {
 #if defined(LL_WINDOWS)
-	_mm_free(p);
+	_aligned_free(p);
 #elif defined(LL_DARWIN)
 	ll_aligned_free( p );
 #else
 	free(p); // posix_memalign() is compatible with heap deallocator
 #endif
 }
-#else // LL_DEBUG
-// ll_aligned_foo are noops now that we use tcmalloc everywhere (tcmalloc aligns automatically at appropriate intervals)
-#define ll_aligned_malloc( size, align ) malloc(size)
-#define ll_aligned_free( ptr ) free(ptr)
-#define ll_aligned_malloc_16 malloc
-#define ll_aligned_free_16 free
-#define ll_aligned_malloc_32 malloc
-#define ll_aligned_free_32 free
-#endif // LL_DEBUG
 
 #ifndef __DEBUG_PRIVATE_MEM__
 #define __DEBUG_PRIVATE_MEM__  0
@@ -512,4 +520,13 @@ void  LLPrivateMemoryPoolTester::operator delete[](void* addr)
 
 // LLSingleton moved to llsingleton.h
 
+LL_COMMON_API void ll_assert_aligned_func(uintptr_t ptr,U32 alignment);
+
+#ifdef SHOW_ASSERT
+#define ll_assert_aligned(ptr,alignment) ll_assert_aligned_func(reinterpret_cast<uintptr_t>(ptr),((U32)alignment))
+#else
+#define ll_assert_aligned(ptr,alignment)
+#endif
+
+
 #endif
diff --git a/indra/llmath/CMakeLists.txt b/indra/llmath/CMakeLists.txt
old mode 100644
new mode 100755
index b5e59c1ca350285bf9394089c41a2767820c0d8c..5865ae030c2cac900207fc5e221036488e3ee351
--- a/indra/llmath/CMakeLists.txt
+++ b/indra/llmath/CMakeLists.txt
@@ -117,6 +117,7 @@ if (LL_TESTS)
   # INTEGRATION TESTS
   set(test_libs llmath llcommon ${LLCOMMON_LIBRARIES} ${WINDOWS_LIBRARIES})
   # TODO: Some of these need refactoring to be proper Unit tests rather than Integration tests.
+  LL_ADD_INTEGRATION_TEST(alignment "" "${test_libs}")
   LL_ADD_INTEGRATION_TEST(llbbox llbbox.cpp "${test_libs}")
   LL_ADD_INTEGRATION_TEST(llquaternion llquaternion.cpp "${test_libs}")
   LL_ADD_INTEGRATION_TEST(mathmisc "" "${test_libs}")
diff --git a/indra/llmath/llcamera.h b/indra/llmath/llcamera.h
old mode 100644
new mode 100755
index ec67b91d05f0c14f7f5ad7f3c062a29e542e0143..0b591be622e453915809b3e557caeb04310a937c
--- a/indra/llmath/llcamera.h
+++ b/indra/llmath/llcamera.h
@@ -60,7 +60,7 @@ static const F32 MAX_FIELD_OF_VIEW = 175.f * DEG_TO_RAD;
 // roll(), pitch(), yaw()
 // etc...
 
-
+LL_ALIGN_PREFIX(16)
 class LLCamera
 : 	public LLCoordFrame
 {
@@ -108,7 +108,7 @@ class LLCamera
 	};
 
 private:
-	LLPlane mAgentPlanes[7];  //frustum planes in agent space a la gluUnproject (I'm a bastard, I know) - DaveP
+	LL_ALIGN_16(LLPlane mAgentPlanes[7]);  //frustum planes in agent space a la gluUnproject (I'm a bastard, I know) - DaveP
 	U8 mPlaneMask[8];         // 8 for alignment	
 	
 	F32 mView;					// angle between top and bottom frustum planes in radians.
@@ -116,13 +116,13 @@ class LLCamera
 	S32 mViewHeightInPixels;	// for ViewHeightInPixels() only
 	F32 mNearPlane;
 	F32 mFarPlane;
-	LLPlane mLocalPlanes[4];
+	LL_ALIGN_16(LLPlane mLocalPlanes[4]);
 	F32 mFixedDistance;			// Always return this distance, unless < 0
 	LLVector3 mFrustCenter;		// center of frustum and radius squared for ultra-quick exclusion test
 	F32 mFrustRadiusSquared;
 	
-	LLPlane mWorldPlanes[PLANE_NUM];
-	LLPlane mHorizPlanes[HORIZ_PLANE_NUM];
+	LL_ALIGN_16(LLPlane mWorldPlanes[PLANE_NUM]);
+	LL_ALIGN_16(LLPlane mHorizPlanes[HORIZ_PLANE_NUM]);
 
 	U32 mPlaneCount;  //defaults to 6, if setUserClipPlane is called, uses user supplied clip plane in
 
@@ -208,7 +208,7 @@ class LLCamera
 	void calculateFrustumPlanes(F32 left, F32 right, F32 top, F32 bottom);
 	void calculateFrustumPlanesFromWindow(F32 x1, F32 y1, F32 x2, F32 y2);
 	void calculateWorldFrustumPlanes();
-};
+} LL_ALIGN_POSTFIX(16);
 
 
 #endif
diff --git a/indra/llmath/llmatrix3a.cpp b/indra/llmath/llmatrix3a.cpp
old mode 100644
new mode 100755
diff --git a/indra/llmath/llmatrix3a.h b/indra/llmath/llmatrix3a.h
index adb7e3389d5e54e1fa7695ae6700fadad83dcfe2..9916cfd2daba816a0bef6b0123e1df3f4276bd27 100644
--- a/indra/llmath/llmatrix3a.h
+++ b/indra/llmath/llmatrix3a.h
@@ -111,7 +111,7 @@ class LLMatrix3a
 
 protected:
 
-	LLVector4a mColumns[3];
+	LL_ALIGN_16(LLVector4a mColumns[3]);
 
 };
 
diff --git a/indra/llmath/llmatrix4a.h b/indra/llmath/llmatrix4a.h
old mode 100644
new mode 100755
index 27cf5b79f648939809fb0f2ab35b473748138698..c4cefdb4fa74ad58203b7b6cb903aafb50d9c1e4
--- a/indra/llmath/llmatrix4a.h
+++ b/indra/llmath/llmatrix4a.h
@@ -34,7 +34,7 @@
 class LLMatrix4a
 {
 public:
-	LLVector4a mMatrix[4];
+	LL_ALIGN_16(LLVector4a mMatrix[4]);
 
 	inline void clear()
 	{
diff --git a/indra/llmath/lloctree.h b/indra/llmath/lloctree.h
index 1b11e83b4af369cd4119d19e81d79f486f90014d..6c7744cdf11cf3ba5ddf6858b612166b0db6236e 100644
--- a/indra/llmath/lloctree.h
+++ b/indra/llmath/lloctree.h
@@ -88,7 +88,7 @@ class LLOctreeNode : public LLTreeNode<T>
 	typedef LLOctreeNode<T>		oct_node;
 	typedef LLOctreeListener<T>	oct_listener;
 
-	/*void* operator new(size_t size)
+	void* operator new(size_t size)
 	{
 		return ll_aligned_malloc_16(size);
 	}
@@ -96,7 +96,7 @@ class LLOctreeNode : public LLTreeNode<T>
 	void operator delete(void* ptr)
 	{
 		ll_aligned_free_16(ptr);
-	}*/
+	}
 
 	LLOctreeNode(	const LLVector4a& center, 
 					const LLVector4a& size, 
diff --git a/indra/llmath/llplane.h b/indra/llmath/llplane.h
old mode 100644
new mode 100755
index a611894721c9207d829fdea615feb0a13c6616ec..3c32441b11875f9503c7de564ccb7f85e346a081
--- a/indra/llmath/llplane.h
+++ b/indra/llmath/llplane.h
@@ -36,6 +36,8 @@
 // The plane normal = [A, B, C]
 // The closest approach = D / sqrt(A*A + B*B + C*C)
 
+
+LL_ALIGN_PREFIX(16)
 class LLPlane
 {
 public:
@@ -94,7 +96,7 @@ class LLPlane
 		
 private:
 	LLVector4a mV;
-};
+} LL_ALIGN_POSTFIX(16);
 
 
 
diff --git a/indra/llmath/llsimdmath.h b/indra/llmath/llsimdmath.h
old mode 100644
new mode 100755
index c7cdf7b32cb5d8d729fb42f7ef383424e9923268..01458521ec357cf6ef2cebbef0a13ab3a3d65774
--- a/indra/llmath/llsimdmath.h
+++ b/indra/llmath/llsimdmath.h
@@ -67,11 +67,10 @@ template <typename T> T* LL_NEXT_ALIGNED_ADDRESS_64(T* address)
 
 #define LL_ALIGN_16(var) LL_ALIGN_PREFIX(16) var LL_ALIGN_POSTFIX(16)
 
-
-
 #include <xmmintrin.h>
 #include <emmintrin.h>
 
+#include "llmemory.h"
 #include "llsimdtypes.h"
 #include "llsimdtypes.inl"
 
diff --git a/indra/llmath/llsimdtypes.inl b/indra/llmath/llsimdtypes.inl
index 712239e4258b6bb4ed2ecb41ac45db5e2908a7ab..e905c84954b2be420c29d0c792b0dbcc814e1413 100644
--- a/indra/llmath/llsimdtypes.inl
+++ b/indra/llmath/llsimdtypes.inl
@@ -62,6 +62,7 @@ inline LLSimdScalar operator/(const LLSimdScalar& a, const LLSimdScalar& b)
 inline LLSimdScalar operator-(const LLSimdScalar& a)
 {
 	static LL_ALIGN_16(const U32 signMask[4]) = {0x80000000, 0x80000000, 0x80000000, 0x80000000 };
+	ll_assert_aligned(signMask,16);
 	return _mm_xor_ps(*reinterpret_cast<const LLQuad*>(signMask), a);
 }
 
@@ -146,6 +147,7 @@ inline LLSimdScalar& LLSimdScalar::operator/=(const LLSimdScalar& rhs)
 inline LLSimdScalar LLSimdScalar::getAbs() const
 {
 	static const LL_ALIGN_16(U32 F_ABS_MASK_4A[4]) = { 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF };
+	ll_assert_aligned(F_ABS_MASK_4A,16);
 	return _mm_and_ps( mQ, *reinterpret_cast<const LLQuad*>(F_ABS_MASK_4A));
 }
 
diff --git a/indra/llmath/llvector4a.cpp b/indra/llmath/llvector4a.cpp
old mode 100644
new mode 100755
index b66b7a70764f414480d4ab5c5c6cf7ce2ba848ca..6edeb0fefe5e37998514e1c83cc9eb00115256d2
--- a/indra/llmath/llvector4a.cpp
+++ b/indra/llmath/llvector4a.cpp
@@ -24,6 +24,7 @@
  * $/LicenseInfo$
  */
 
+#include "llmemory.h"
 #include "llmath.h"
 #include "llquantize.h"
 
@@ -44,7 +45,10 @@ extern const LLVector4a LL_V4A_EPSILON = reinterpret_cast<const LLVector4a&> ( F
 	assert(dst != NULL);
 	assert(bytes > 0);
 	assert((bytes % sizeof(F32))== 0); 
-	
+	ll_assert_aligned(src,16);
+	ll_assert_aligned(dst,16);
+	assert(bytes%16==0);
+
 	F32* end = dst + (bytes / sizeof(F32) );
 
 	if (bytes > 64)
@@ -189,6 +193,8 @@ void LLVector4a::quantize16( const LLVector4a& low, const LLVector4a& high )
 		LLVector4a oneOverDelta;
 		{
 			static LL_ALIGN_16( const F32 F_TWO_4A[4] ) = { 2.f, 2.f, 2.f, 2.f };
+			ll_assert_aligned(F_TWO_4A,16);
+			
 			LLVector4a two; two.load4a( F_TWO_4A );
 
 			// Here we use _mm_rcp_ps plus one round of newton-raphson
diff --git a/indra/llmath/llvector4a.h b/indra/llmath/llvector4a.h
old mode 100644
new mode 100755
index 596082509df104fd09651ac3366975885f6f3d2e..0526793d3a6130cd23857815c58ec0435ea629b8
--- a/indra/llmath/llvector4a.h
+++ b/indra/llmath/llvector4a.h
@@ -32,6 +32,7 @@ class LLRotation;
 
 #include <assert.h>
 #include "llpreprocessor.h"
+#include "llmemory.h"
 
 ///////////////////////////////////
 // FIRST TIME USERS PLEASE READ
@@ -46,6 +47,7 @@ class LLRotation;
 // LLVector3/LLVector4. 
 /////////////////////////////////
 
+LL_ALIGN_PREFIX(16)
 class LLVector4a
 {
 public:
@@ -82,6 +84,7 @@ class LLVector4a
 	}
 
 	// Copy words 16-byte blocks from src to dst. Source and destination must not overlap. 
+	// Source and dest must be 16-byte aligned and size must be multiple of 16.
 	static void memcpyNonAliased16(F32* __restrict dst, const F32* __restrict src, size_t bytes);
 
 	////////////////////////////////////
@@ -90,6 +93,7 @@ class LLVector4a
 	
 	LLVector4a()
 	{ //DO NOT INITIALIZE -- The overhead is completely unnecessary
+		ll_assert_aligned(this,16);
 	}
 	
 	LLVector4a(F32 x, F32 y, F32 z, F32 w = 0.f)
@@ -313,7 +317,7 @@ class LLVector4a
 
 private:
 	LLQuad mQ;
-};
+} LL_ALIGN_POSTFIX(16);
 
 inline void update_min_max(LLVector4a& min, LLVector4a& max, const LLVector4a& p)
 {
diff --git a/indra/llmath/llvector4a.inl b/indra/llmath/llvector4a.inl
index 7ad22a563156dcbd1845816d2d97aff9ead85471..7c52ffef2171c6caecd8c16af7793def0d99f74b 100644
--- a/indra/llmath/llvector4a.inl
+++ b/indra/llmath/llvector4a.inl
@@ -475,6 +475,7 @@ inline void LLVector4a::setLerp(const LLVector4a& lhs, const LLVector4a& rhs, F3
 inline LLBool32 LLVector4a::isFinite3() const
 {
 	static LL_ALIGN_16(const U32 nanOrInfMask[4]) = { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
+	ll_assert_aligned(nanOrInfMask,16);
 	const __m128i nanOrInfMaskV = *reinterpret_cast<const __m128i*> (nanOrInfMask);
 	const __m128i maskResult = _mm_and_si128( _mm_castps_si128(mQ), nanOrInfMaskV );
 	const LLVector4Logical equalityCheck = _mm_castsi128_ps(_mm_cmpeq_epi32( maskResult, nanOrInfMaskV ));
diff --git a/indra/llmath/llvector4logical.h b/indra/llmath/llvector4logical.h
index dd66b09d4391608eda6e4affa0674b4825cc288b..c5698f7cea62b1d56084906cf18a0801792070b7 100644
--- a/indra/llmath/llvector4logical.h
+++ b/indra/llmath/llvector4logical.h
@@ -27,6 +27,7 @@
 #ifndef	LL_VECTOR4LOGICAL_H
 #define	LL_VECTOR4LOGICAL_H
 
+#include "llmemory.h"
 
 ////////////////////////////
 // LLVector4Logical
@@ -77,6 +78,7 @@ class LLVector4Logical
 	inline LLVector4Logical& invert()
 	{
 		static const LL_ALIGN_16(U32 allOnes[4]) = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
+		ll_assert_aligned(allOnes,16);
 		mQ = _mm_andnot_ps( mQ, *(LLQuad*)(allOnes) );
 		return *this;
 	}
diff --git a/indra/llmath/llvolume.cpp b/indra/llmath/llvolume.cpp
old mode 100644
new mode 100755
index cc9744756fd4d4ba19ffb11c83229f9b3f4a0d1a..11fa7080ced9dd02ed4c0165bfdb9d3b54a17a36
--- a/indra/llmath/llvolume.cpp
+++ b/indra/llmath/llvolume.cpp
@@ -95,17 +95,6 @@ const S32 SCULPT_MIN_AREA_DETAIL = 1;
 
 extern BOOL gDebugGL;
 
-void assert_aligned(void* ptr, uintptr_t alignment)
-{
-#if 0
-	uintptr_t t = (uintptr_t) ptr;
-	if (t%alignment != 0)
-	{
-		llerrs << "Alignment check failed." << llendl;
-	}
-#endif
-}
-
 BOOL check_same_clock_dir( const LLVector3& pt1, const LLVector3& pt2, const LLVector3& pt3, const LLVector3& norm)
 {    
 	LLVector3 test = (pt2-pt1)%(pt3-pt2);
@@ -6962,14 +6951,14 @@ void LLVolumeFace::resizeVertices(S32 num_verts)
 	if (num_verts)
 	{
 		mPositions = (LLVector4a*) ll_aligned_malloc_16(sizeof(LLVector4a)*num_verts);
-		assert_aligned(mPositions, 16);
+		ll_assert_aligned(mPositions, 16);
 		mNormals = (LLVector4a*) ll_aligned_malloc_16(sizeof(LLVector4a)*num_verts);
-		assert_aligned(mNormals, 16);
+		ll_assert_aligned(mNormals, 16);
 
 		//pad texture coordinate block end to allow for QWORD reads
 		S32 size = ((num_verts*sizeof(LLVector2)) + 0xF) & ~0xF;
 		mTexCoords = (LLVector2*) ll_aligned_malloc_16(size);
-		assert_aligned(mTexCoords, 16);
+		ll_assert_aligned(mTexCoords, 16);
 	}
 	else
 	{
@@ -6993,14 +6982,17 @@ void LLVolumeFace::pushVertex(const LLVector4a& pos, const LLVector4a& norm, con
 //	S32 old_size = mNumVertices*16;
 
 	//positions
-	mPositions = (LLVector4a*) realloc(mPositions, new_size);
+	mPositions = (LLVector4a*) ll_aligned_realloc_16(mPositions, new_size);
+	ll_assert_aligned(mPositions,16);
 	
 	//normals
-	mNormals = (LLVector4a*) realloc(mNormals, new_size);
-	
+	mNormals = (LLVector4a*) ll_aligned_realloc_16(mNormals, new_size);
+	ll_assert_aligned(mNormals,16);
+
 	//tex coords
 	new_size = ((new_verts*8)+0xF) & ~0xF;
-	mTexCoords = (LLVector2*) realloc(mTexCoords, new_size);
+	mTexCoords = (LLVector2*) ll_aligned_realloc_16(mTexCoords, new_size);
+	ll_assert_aligned(mTexCoords,16);
 	
 
 	//just clear binormals
@@ -7053,7 +7045,8 @@ void LLVolumeFace::pushIndex(const U16& idx)
 	S32 old_size = ((mNumIndices*2)+0xF) & ~0xF;
 	if (new_size != old_size)
 	{
-		mIndices = (U16*) realloc(mIndices, new_size);
+		mIndices = (U16*) ll_aligned_realloc_16(mIndices, new_size);
+		ll_assert_aligned(mIndices,16);
 	}
 	
 	mIndices[mNumIndices++] = idx;
@@ -7094,12 +7087,12 @@ void LLVolumeFace::appendFace(const LLVolumeFace& face, LLMatrix4& mat_in, LLMat
 	}
 
 	//allocate new buffer space
-	mPositions = (LLVector4a*) realloc(mPositions, new_count*sizeof(LLVector4a));
-	assert_aligned(mPositions, 16);
-	mNormals = (LLVector4a*) realloc(mNormals, new_count*sizeof(LLVector4a));
-	assert_aligned(mNormals, 16);
-	mTexCoords = (LLVector2*) realloc(mTexCoords, (new_count*sizeof(LLVector2)+0xF) & ~0xF);
-	assert_aligned(mTexCoords, 16);
+	mPositions = (LLVector4a*) ll_aligned_realloc_16(mPositions, new_count*sizeof(LLVector4a));
+	ll_assert_aligned(mPositions, 16);
+	mNormals = (LLVector4a*) ll_aligned_realloc_16(mNormals, new_count*sizeof(LLVector4a));
+	ll_assert_aligned(mNormals, 16);
+	mTexCoords = (LLVector2*) ll_aligned_realloc_16(mTexCoords, (new_count*sizeof(LLVector2)+0xF) & ~0xF);
+	ll_assert_aligned(mTexCoords, 16);
 	
 	mNumVertices = new_count;
 
@@ -7145,7 +7138,7 @@ void LLVolumeFace::appendFace(const LLVolumeFace& face, LLMatrix4& mat_in, LLMat
 	new_count = mNumIndices + face.mNumIndices;
 
 	//allocate new index buffer
-	mIndices = (U16*) realloc(mIndices, (new_count*sizeof(U16)+0xF) & ~0xF);
+	mIndices = (U16*) ll_aligned_realloc_16(mIndices, (new_count*sizeof(U16)+0xF) & ~0xF);
 	
 	//get destination address into new index buffer
 	U16* dst_idx = mIndices+mNumIndices;
diff --git a/indra/llmath/llvolumeoctree.h b/indra/llmath/llvolumeoctree.h
old mode 100644
new mode 100755
index 688d91dc40b6c0f805187d808d45e1e9644aba42..dac97b14b5f23bc97a5a1a4891d3e0f356a00f5d
--- a/indra/llmath/llvolumeoctree.h
+++ b/indra/llmath/llvolumeoctree.h
@@ -37,6 +37,16 @@
 class LLVolumeTriangle : public LLRefCount
 {
 public:
+	void* operator new(size_t size)
+	{
+		return ll_aligned_malloc_16(size);
+	}
+
+	void operator delete(void* ptr)
+	{
+		ll_aligned_free_16(ptr);
+	}
+
 	LLVolumeTriangle()
 	{
 		
@@ -58,7 +68,7 @@ class LLVolumeTriangle : public LLRefCount
 	
 	}
 
-	LLVector4a mPositionGroup;
+	LL_ALIGN_16(LLVector4a mPositionGroup);
 
 	const LLVector4a* mV[3];
 	U16 mIndex[3];
@@ -73,6 +83,16 @@ class LLVolumeOctreeListener : public LLOctreeListener<LLVolumeTriangle>
 {
 public:
 	
+	void* operator new(size_t size)
+	{
+		return ll_aligned_malloc_16(size);
+	}
+
+	void operator delete(void* ptr)
+	{
+		ll_aligned_free_16(ptr);
+	}
+
 	LLVolumeOctreeListener(LLOctreeNode<LLVolumeTriangle>* node);
 	~LLVolumeOctreeListener();
 	
@@ -99,8 +119,8 @@ class LLVolumeOctreeListener : public LLOctreeListener<LLVolumeTriangle>
 	
 
 public:
-	LLVector4a mBounds[2]; // bounding box (center, size) of this node and all its children (tight fit to objects)
-	LLVector4a mExtents[2]; // extents (min, max) of this node and all its children
+	LL_ALIGN_16(LLVector4a mBounds[2]); // bounding box (center, size) of this node and all its children (tight fit to objects)
+	LL_ALIGN_16(LLVector4a mExtents[2]); // extents (min, max) of this node and all its children
 };
 
 class LLOctreeTriangleRayIntersect : public LLOctreeTraveler<LLVolumeTriangle>
diff --git a/indra/llmath/tests/alignment_test.cpp b/indra/llmath/tests/alignment_test.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..dbb42c4a3d96cf07f203db45caaa3d17ed93b49a
--- /dev/null
+++ b/indra/llmath/tests/alignment_test.cpp
@@ -0,0 +1,120 @@
+/**
+ * @file v3dmath_test.cpp
+ * @author Vir
+ * @date 2011-12
+ * @brief v3dmath test cases.
+ *
+ * $LicenseInfo:firstyear=2011&license=viewerlgpl$
+ * Second Life Viewer Source Code
+ * Copyright (C) 2011, Linden Research, Inc.
+ * 
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License only.
+ * 
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ * 
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ * 
+ * Linden Research, Inc., 945 Battery Street, San Francisco, CA  94111  USA
+ * $/LicenseInfo$
+ */
+
+// Tests related to allocating objects with alignment constraints, particularly for SSE support.
+
+#include "linden_common.h"
+#include "../test/lltut.h"
+#include "../llmath.h"
+#include "../llsimdmath.h"
+#include "../llvector4a.h"
+
+void* operator new(size_t size)
+{
+	return ll_aligned_malloc_16(size);
+}
+
+void operator delete(void *p)
+{
+	ll_aligned_free_16(p);
+}
+
+namespace tut
+{
+
+#define is_aligned(ptr,alignment) ((reinterpret_cast<uintptr_t>(ptr))%(alignment)==0)
+#define is_aligned_relative(ptr,base_ptr,alignment) ((reinterpret_cast<uintptr_t>(ptr)-reinterpret_cast<uintptr_t>(base_ptr))%(alignment)==0)
+
+struct alignment_test {};
+
+typedef test_group<alignment_test> alignment_test_t;
+typedef alignment_test_t::object alignment_test_object_t;
+tut::alignment_test_t tut_alignment_test("LLAlignment");
+
+LL_ALIGN_PREFIX(16)
+class MyVector4a
+{
+	LLQuad mQ;
+} LL_ALIGN_POSTFIX(16);
+
+
+// Verify that aligned allocators perform as advertised.
+template<> template<>
+void alignment_test_object_t::test<1>()
+{
+	const int num_tests = 7;
+	void *align_ptr;
+	for (int i=0; i<num_tests; i++)
+	{
+		align_ptr = ll_aligned_malloc_16(sizeof(MyVector4a));
+		ensure("ll_aligned_malloc_16 failed", is_aligned(align_ptr,16));
+
+		align_ptr = ll_aligned_realloc_16(align_ptr,2*sizeof(MyVector4a));
+		ensure("ll_aligned_realloc_16 failed", is_aligned(align_ptr,16));
+
+		ll_aligned_free_16(align_ptr);
+
+		align_ptr = ll_aligned_malloc_32(sizeof(MyVector4a));
+		ensure("ll_aligned_malloc_32 failed", is_aligned(align_ptr,32));
+		ll_aligned_free_32(align_ptr);
+	}
+}
+
+// In-place allocation of objects and arrays.
+template<> template<>
+void alignment_test_object_t::test<2>()
+{
+	MyVector4a vec1;
+	ensure("LLAlignment vec1 unaligned", is_aligned(&vec1,16));
+	
+	MyVector4a veca[12];
+	ensure("LLAlignment veca unaligned", is_aligned(veca,16));
+}
+
+// Heap allocation of objects and arrays.
+template<> template<>
+void alignment_test_object_t::test<3>()
+{
+	const int ARR_SIZE = 7;
+	for(int i=0; i<ARR_SIZE; i++)
+	{
+		MyVector4a *vecp = new MyVector4a;
+		ensure("LLAlignment vecp unaligned", is_aligned(vecp,16));
+		delete vecp;
+	}
+
+	MyVector4a *veca = new MyVector4a[ARR_SIZE];
+	ensure("LLAligment veca base", is_aligned(veca,16));
+	for(int i=0; i<ARR_SIZE; i++)
+	{
+		std::cout << "veca[" << i << "]" << std::endl;
+		ensure("LLAlignment veca member unaligned", is_aligned(&veca[i],16));
+	}
+}
+
+}
diff --git a/indra/llprimitive/llmodel.cpp b/indra/llprimitive/llmodel.cpp
index cb32a510b80b62585cd3cb1863b83e178a4270e8..28ed051c5537a606873ecd8db38b48f0efd4f4ae 100644
--- a/indra/llprimitive/llmodel.cpp
+++ b/indra/llprimitive/llmodel.cpp
@@ -1026,7 +1026,8 @@ void LLModel::setVolumeFaceData(
 
 	if (tc.get())
 	{
-		LLVector4a::memcpyNonAliased16((F32*) face.mTexCoords, (F32*) tc.get(), num_verts*2*sizeof(F32));
+		U32 tex_size = (num_verts*2*sizeof(F32)+0xF)&~0xF;
+		LLVector4a::memcpyNonAliased16((F32*) face.mTexCoords, (F32*) tc.get(), tex_size);
 	}
 	else
 	{
diff --git a/indra/newview/CMakeLists.txt b/indra/newview/CMakeLists.txt
old mode 100644
new mode 100755
index 65170a214099dc8511d1b4e2900649a463e533a0..9783601696f90787240c58c2a44d3cdd0305c82d
--- a/indra/newview/CMakeLists.txt
+++ b/indra/newview/CMakeLists.txt
@@ -1521,9 +1521,7 @@ set(PACKAGE ON CACHE BOOL
 if (WINDOWS)
     set_target_properties(${VIEWER_BINARY_NAME}
         PROPERTIES
-        # *TODO -reenable this once we get server usage sorted out
-        #LINK_FLAGS "/debug /NODEFAULTLIB:LIBCMT /SUBSYSTEM:WINDOWS /INCLUDE:\"__tcmalloc\""
-        LINK_FLAGS "/debug /NODEFAULTLIB:LIBCMT /SUBSYSTEM:WINDOWS /INCLUDE:__tcmalloc"
+        LINK_FLAGS "/debug /NODEFAULTLIB:LIBCMT /SUBSYSTEM:WINDOWS ${TCMALLOC_LINK_FLAGS}"
         LINK_FLAGS_DEBUG "/NODEFAULTLIB:\"LIBCMT;LIBCMTD;MSVCRT\" /INCREMENTAL:NO"
         LINK_FLAGS_RELEASE ""
         )
@@ -1542,7 +1540,7 @@ if (WINDOWS)
     # In the meantime, if you have any ideas on how to easily maintain one list, either here or in viewer_manifest.py
     # and have the build deps get tracked *please* tell me about it.
 
-    if(USE_GOOGLE_PERFTOOLS)
+    if(USE_TCMALLOC)
       # Configure a var for tcmalloc location, if used.
       # Note the need to specify multiple names explicitly.
       set(GOOGLE_PERF_TOOLS_SOURCE
@@ -1550,7 +1548,7 @@ if (WINDOWS)
         ${SHARED_LIB_STAGING_DIR}/RelWithDebInfo/libtcmalloc_minimal.dll
         ${SHARED_LIB_STAGING_DIR}/Debug/libtcmalloc_minimal-debug.dll
         )
-     endif(USE_GOOGLE_PERFTOOLS)
+     endif(USE_TCMALLOC)
 
 
     set(COPY_INPUT_DEPENDENCIES
diff --git a/indra/newview/llappviewerwin32.cpp b/indra/newview/llappviewerwin32.cpp
old mode 100644
new mode 100755
index bad60a9757af3e116170e330d8d1891f136b4d80..53c77fa22e52e09d13448a06c980c3903a78f488
--- a/indra/newview/llappviewerwin32.cpp
+++ b/indra/newview/llappviewerwin32.cpp
@@ -130,6 +130,8 @@ int APIENTRY WINMAIN(HINSTANCE hInstance,
 	// This results in a 2-3x improvement in opening a new Inventory window (which uses a large numebr of allocations)
 	// Note: This won't work when running from the debugger unless the _NO_DEBUG_HEAP environment variable is set to 1
 
+	// Enable to get mem debugging within visual studio.
+	//_CrtSetDbgFlag(_CRTDBG_ALLOC_MEM_DF | _CRTDBG_LEAK_CHECK_DF);
 	_CrtSetDbgFlag(0); // default, just making explicit
 	
 	ULONG ulEnableLFH = 2;
diff --git a/indra/newview/lldrawable.h b/indra/newview/lldrawable.h
old mode 100644
new mode 100755
index e2064b79f875037881b9b3ffdfa5611f7e74d3e8..8c7db61502136a70980d17474695eb219cf74c27
--- a/indra/newview/lldrawable.h
+++ b/indra/newview/lldrawable.h
@@ -59,6 +59,7 @@ class LLViewerTexture;
 const U32 SILHOUETTE_HIGHLIGHT = 0;
 
 // All data for new renderer goes into this class.
+LL_ALIGN_PREFIX(16)
 class LLDrawable : public LLRefCount
 {
 public:
@@ -75,6 +76,16 @@ class LLDrawable : public LLRefCount
 
 	static void initClass();
 
+	void* operator new(size_t size)
+	{
+		return ll_aligned_malloc_16(size);
+	}
+
+	void operator delete(void* ptr)
+	{
+		ll_aligned_free_16(ptr);
+	}
+
 	LLDrawable()				{ init(); }
 	MEM_TYPE_NEW(LLMemType::MTYPE_DRAWABLE);
 	
@@ -281,8 +292,8 @@ class LLDrawable : public LLRefCount
 	} EDrawableFlags;
 
 private: //aligned members
-	LLVector4a		mExtents[2];
-	LLVector4a		mPositionGroup;
+	LL_ALIGN_16(LLVector4a		mExtents[2]);
+	LL_ALIGN_16(LLVector4a		mPositionGroup);
 	
 public:
 	LLXformMatrix       mXform;
@@ -323,7 +334,7 @@ class LLDrawable : public LLRefCount
 
 	static U32 sNumZombieDrawables;
 	static LLDynamicArrayPtr<LLPointer<LLDrawable> > sDeadList;
-};
+} LL_ALIGN_POSTFIX(16);
 
 
 inline LLFace* LLDrawable::getFace(const S32 i) const
diff --git a/indra/newview/lldynamictexture.h b/indra/newview/lldynamictexture.h
old mode 100644
new mode 100755
index e18090545d5b95a91ffdc61544102ce51cacb264..c51e7d1e1a76ff64171cdd483ffc49c5d47c1bb1
--- a/indra/newview/lldynamictexture.h
+++ b/indra/newview/lldynamictexture.h
@@ -36,6 +36,16 @@
 class LLViewerDynamicTexture : public LLViewerTexture
 {
 public:
+	void* operator new(size_t size)
+	{
+		return ll_aligned_malloc_16(size);
+	}
+
+	void operator delete(void* ptr)
+	{
+		ll_aligned_free_16(ptr);
+	}
+
 	enum
 	{
 		LL_VIEWER_DYNAMIC_TEXTURE = LLViewerTexture::DYNAMIC_TEXTURE,
@@ -85,7 +95,7 @@ class LLViewerDynamicTexture : public LLViewerTexture
 protected:
 	BOOL mClamp;
 	LLCoordGL mOrigin;
-	LLCamera mCamera;
+	LL_ALIGN_16(LLCamera mCamera);
 	
 	typedef std::set<LLViewerDynamicTexture*> instance_list_t;
 	static instance_list_t sInstances[ LLViewerDynamicTexture::ORDER_COUNT ];
diff --git a/indra/newview/llface.cpp b/indra/newview/llface.cpp
old mode 100644
new mode 100755
index 49a20d5ef99f854f76357798063a689da07f07c4..d32994cd31d6d8554fb39d39f50e5cdb95510611
--- a/indra/newview/llface.cpp
+++ b/indra/newview/llface.cpp
@@ -1638,7 +1638,8 @@ BOOL LLFace::getGeometryVolume(const LLVolume& volume,
 						if (!do_xform)
 						{
 							LLFastTimer t(FTM_FACE_TEX_QUICK_NO_XFORM);
-							LLVector4a::memcpyNonAliased16((F32*) tex_coords.get(), (F32*) vf.mTexCoords, num_vertices*2*sizeof(F32));
+							S32 tc_size = (num_vertices*2*sizeof(F32)+0xF) & ~0xF;
+							LLVector4a::memcpyNonAliased16((F32*) tex_coords.get(), (F32*) vf.mTexCoords, tc_size);
 						}
 						else
 						{
diff --git a/indra/newview/llface.h b/indra/newview/llface.h
old mode 100644
new mode 100755
index 76ea5c853ac9bb389ae7206bca94cd3052cb981a..5dca27487f32a000703c21826d741061d75b71e8
--- a/indra/newview/llface.h
+++ b/indra/newview/llface.h
@@ -59,6 +59,17 @@ class LLFace
 {
 public:
 
+	void* operator new(size_t size)
+	{
+		return ll_aligned_malloc_16(size);
+	}
+
+	void operator delete(void* ptr)
+	{
+		ll_aligned_free_16(ptr);
+	}
+
+
 	LLFace(const LLFace& rhs)
 	{
 		*this = rhs;
diff --git a/indra/newview/llfloatermodelpreview.cpp b/indra/newview/llfloatermodelpreview.cpp
index 0a5171245ae1a08dfacce8ec60f1943258382780..35341ef0eab6b13581e8469c12ca9df5eda6d45c 100755
--- a/indra/newview/llfloatermodelpreview.cpp
+++ b/indra/newview/llfloatermodelpreview.cpp
@@ -4774,7 +4774,8 @@ void LLModelPreview::genBuffers(S32 lod, bool include_skin_weights)
 			if (vf.mTexCoords)
 			{
 				vb->getTexCoord0Strider(tc_strider);
-				LLVector4a::memcpyNonAliased16((F32*) tc_strider.get(), (F32*) vf.mTexCoords, num_vertices*2*sizeof(F32));
+				S32 tex_size = (num_vertices*2*sizeof(F32)+0xF) & ~0xF;
+				LLVector4a::memcpyNonAliased16((F32*) tc_strider.get(), (F32*) vf.mTexCoords, tex_size);
 			}
 			
 			if (vf.mNormals)
diff --git a/indra/newview/llpolymesh.cpp b/indra/newview/llpolymesh.cpp
old mode 100644
new mode 100755
index 450f9b2be7238a61f594dee75d7a1a3416f3a46e..0860506086153bc8482c345c5946d666781bde09
--- a/indra/newview/llpolymesh.cpp
+++ b/indra/newview/llpolymesh.cpp
@@ -129,22 +129,22 @@ void LLPolyMeshSharedData::freeMeshData()
         {
                 mNumVertices = 0;
 
-                delete [] mBaseCoords;
+                ll_aligned_free_16(mBaseCoords);
                 mBaseCoords = NULL;
 
-                delete [] mBaseNormals;
+                ll_aligned_free_16(mBaseNormals);
                 mBaseNormals = NULL;
 
-                delete [] mBaseBinormals;
+                ll_aligned_free_16(mBaseBinormals);
                 mBaseBinormals = NULL;
 
-                delete [] mTexCoords;
+                ll_aligned_free_16(mTexCoords);
                 mTexCoords = NULL;
 
-                delete [] mDetailTexCoords;
+                ll_aligned_free_16(mDetailTexCoords);
                 mDetailTexCoords = NULL;
 
-                delete [] mWeights;
+                ll_aligned_free_16(mWeights);
                 mWeights = NULL;
         }
 
@@ -229,12 +229,12 @@ U32 LLPolyMeshSharedData::getNumKB()
 BOOL LLPolyMeshSharedData::allocateVertexData( U32 numVertices )
 {
         U32 i;
-        mBaseCoords = new LLVector3[ numVertices ];
-        mBaseNormals = new LLVector3[ numVertices ];
-        mBaseBinormals = new LLVector3[ numVertices ];
-        mTexCoords = new LLVector2[ numVertices ];
-        mDetailTexCoords = new LLVector2[ numVertices ];
-        mWeights = new F32[ numVertices ];
+        mBaseCoords = (LLVector3*) ll_aligned_malloc_16(numVertices*sizeof(LLVector3));
+        mBaseNormals = (LLVector3*) ll_aligned_malloc_16(numVertices*sizeof(LLVector3));
+        mBaseBinormals = (LLVector3*) ll_aligned_malloc_16(numVertices*sizeof(LLVector3));
+        mTexCoords = (LLVector2*) ll_aligned_malloc_16(numVertices*sizeof(LLVector2));
+        mDetailTexCoords = (LLVector2*) ll_aligned_malloc_16(numVertices*sizeof(LLVector2));
+        mWeights = (F32*) ll_aligned_malloc_16(numVertices*sizeof(F32));
         for (i = 0; i < numVertices; i++)
         {
                 mWeights[i] = 0.f;
diff --git a/indra/newview/llspatialpartition.cpp b/indra/newview/llspatialpartition.cpp
old mode 100644
new mode 100755
index 325a2d300453a0dce2dd4e22e2c5470576aa20ae..45ef8f1a6dc9e9c42c04a489fe216b4618f83c6b
--- a/indra/newview/llspatialpartition.cpp
+++ b/indra/newview/llspatialpartition.cpp
@@ -529,6 +529,7 @@ void LLSpatialGroup::setVisible()
 
 void LLSpatialGroup::validate()
 {
+	ll_assert_aligned(this,64);
 #if LL_OCTREE_PARANOIA_CHECK
 
 	sg_assert(!isState(DIRTY));
@@ -1195,6 +1196,8 @@ LLSpatialGroup::LLSpatialGroup(OctreeNode* node, LLSpatialPartition* part) :
 	mCurUpdatingSlotp(NULL),
 	mCurUpdatingTexture (NULL)
 {
+	ll_assert_aligned(this,16);
+	
 	sNodeCount++;
 	LLMemType mt(LLMemType::MTYPE_SPACE_PARTITION);
 
diff --git a/indra/newview/llspatialpartition.h b/indra/newview/llspatialpartition.h
old mode 100644
new mode 100755
index f0e4f15a83b579c855acc68c0c706905653965cf..7968c28900d487425e406e39597e1845b61052e4
--- a/indra/newview/llspatialpartition.h
+++ b/indra/newview/llspatialpartition.h
@@ -68,6 +68,16 @@ class LLDrawInfo : public LLRefCount
 	~LLDrawInfo();	
 	
 public:
+	void* operator new(size_t size)
+	{
+		return ll_aligned_malloc_16(size);
+	}
+
+	void operator delete(void* ptr)
+	{
+		ll_aligned_free_16(ptr);
+	}
+
 
 	LLDrawInfo(const LLDrawInfo& rhs)
 	{
@@ -106,7 +116,7 @@ class LLDrawInfo : public LLRefCount
 	F32 mPartSize;
 	F32 mVSize;
 	LLSpatialGroup* mGroup;
-	LLFace* mFace; //associated face
+	LL_ALIGN_16(LLFace* mFace); //associated face
 	F32 mDistance;
 	U32 mDrawMode;
 
@@ -181,7 +191,7 @@ class LLDrawInfo : public LLRefCount
 	};
 };
 
-LL_ALIGN_PREFIX(64)
+LL_ALIGN_PREFIX(16)
 class LLSpatialGroup : public LLOctreeListener<LLDrawable>
 {
 	friend class LLSpatialPartition;
@@ -193,6 +203,16 @@ class LLSpatialGroup : public LLOctreeListener<LLDrawable>
 		*this = rhs;
 	}
 
+	void* operator new(size_t size)
+	{
+		return ll_aligned_malloc_16(size);
+	}
+
+	void operator delete(void* ptr)
+	{
+		ll_aligned_free_16(ptr);
+	}
+
 	const LLSpatialGroup& operator=(const LLSpatialGroup& rhs)
 	{
 		llerrs << "Illegal operation!" << llendl;
@@ -370,12 +390,12 @@ class LLSpatialGroup : public LLOctreeListener<LLDrawable>
 		V4_COUNT = 10
 	} eV4Index;
 
-	LLVector4a mBounds[2]; // bounding box (center, size) of this node and all its children (tight fit to objects)
-	LLVector4a mExtents[2]; // extents (min, max) of this node and all its children
-	LLVector4a mObjectExtents[2]; // extents (min, max) of objects in this node
-	LLVector4a mObjectBounds[2]; // bounding box (center, size) of objects in this node
-	LLVector4a mViewAngle;
-	LLVector4a mLastUpdateViewAngle;
+	LL_ALIGN_16(LLVector4a mBounds[2]); // bounding box (center, size) of this node and all its children (tight fit to objects)
+	LL_ALIGN_16(LLVector4a mExtents[2]); // extents (min, max) of this node and all its children
+	LL_ALIGN_16(LLVector4a mObjectExtents[2]); // extents (min, max) of objects in this node
+	LL_ALIGN_16(LLVector4a mObjectBounds[2]); // bounding box (center, size) of objects in this node
+	LL_ALIGN_16(LLVector4a mViewAngle);
+	LL_ALIGN_16(LLVector4a mLastUpdateViewAngle);
 
 	F32 mObjectBoxSize; //cached mObjectBounds[1].getLength3()
 		
diff --git a/indra/newview/llviewercamera.h b/indra/newview/llviewercamera.h
old mode 100644
new mode 100755
index 184033de424bc6b1ea9d21536ba816971dc4985d..b857c7fe89fb108c0c2abf7730b8739531022602
--- a/indra/newview/llviewercamera.h
+++ b/indra/newview/llviewercamera.h
@@ -51,9 +51,19 @@ const BOOL NOT_FOR_SELECTION = FALSE;
 extern template class LLViewerCamera* LLSingleton<class LLViewerCamera>::getInstance();
 #endif
 
+LL_ALIGN_PREFIX(16)
 class LLViewerCamera : public LLCamera, public LLSingleton<LLViewerCamera>
 {
 public:
+	void* operator new(size_t size)
+	{
+		return ll_aligned_malloc_16(size);
+	}
+
+	void operator delete(void* ptr)
+	{
+		ll_aligned_free_16(ptr);
+	}
 
 	typedef enum
 	{
@@ -137,6 +147,7 @@ class LLViewerCamera : public LLCamera, public LLSingleton<LLViewerCamera>
 	S16					mZoomSubregion;
 
 public:
-};
+} LL_ALIGN_POSTFIX(16);
+
 
 #endif // LL_LLVIEWERCAMERA_H
diff --git a/indra/newview/llviewerjointmesh.cpp b/indra/newview/llviewerjointmesh.cpp
old mode 100644
new mode 100755
index f029ae5302f040785f4390eb5a8c723b6e4f90f9..5d1aa870a3165ee455c8d2dc4e90825cec78658a
--- a/indra/newview/llviewerjointmesh.cpp
+++ b/indra/newview/llviewerjointmesh.cpp
@@ -729,8 +729,10 @@ void LLViewerJointMesh::updateFaceData(LLFace *face, F32 pixel_area, BOOL damp_w
 				F32* vw = (F32*) vertex_weightsp.get();
 				F32* cw = (F32*) clothing_weightsp.get();	
 
-				LLVector4a::memcpyNonAliased16(tc, (F32*) mMesh->getTexCoords(), num_verts*2*sizeof(F32));
-				LLVector4a::memcpyNonAliased16(vw, (F32*) mMesh->getWeights(), num_verts*sizeof(F32));	
+				S32 tc_size = (num_verts*2*sizeof(F32)+0xF) & ~0xF;
+				LLVector4a::memcpyNonAliased16(tc, (F32*) mMesh->getTexCoords(), tc_size);
+				S32 vw_size = (num_verts*sizeof(F32)+0xF) & ~0xF;	
+				LLVector4a::memcpyNonAliased16(vw, (F32*) mMesh->getWeights(), vw_size);	
 				LLVector4a::memcpyNonAliased16(cw, (F32*) mMesh->getClothingWeights(), num_verts*4*sizeof(F32));	
 			}
 
diff --git a/indra/newview/llvoavatar.cpp b/indra/newview/llvoavatar.cpp
old mode 100644
new mode 100755
index 33dc12c473a64062c2180dde1d8696119daf3155..1b7009a5c2b0b33423e2f72101b17ea4c85c253d
--- a/indra/newview/llvoavatar.cpp
+++ b/indra/newview/llvoavatar.cpp
@@ -2692,7 +2692,7 @@ void LLVOAvatar::idleUpdateMisc(bool detailed_update)
 
 	if (isImpostor() && !mNeedsImpostorUpdate)
 	{
-		LLVector4a ext[2];
+		LL_ALIGN_16(LLVector4a ext[2]);
 		F32 distance;
 		LLVector3 angle;
 
diff --git a/indra/newview/llvoavatar.h b/indra/newview/llvoavatar.h
old mode 100644
new mode 100755
index 6fb56a4c0bb2bf70a0134746170612c59e665079..4081a1408d2a186619769948d4d105e7ddefedf6
--- a/indra/newview/llvoavatar.h
+++ b/indra/newview/llvoavatar.h
@@ -93,6 +93,16 @@ class LLVOAvatar :
  **/
 
 public:
+	void* operator new(size_t size)
+	{
+		return ll_aligned_malloc_16(size);
+	}
+
+	void operator delete(void* ptr)
+	{
+		ll_aligned_free_16(ptr);
+	}
+
 	LLVOAvatar(const LLUUID &id, const LLPCode pcode, LLViewerRegion *regionp);
 	virtual void		markDead();
 	static void			initClass(); // Initialize data that's only init'd once per class.
@@ -215,7 +225,7 @@ class LLVOAvatar :
 	bool isBuilt() const { return mIsBuilt; }
 
 private: //aligned members
-	LLVector4a	mImpostorExtents[2];
+	LL_ALIGN_16(LLVector4a	mImpostorExtents[2]);
 
 private:
 	BOOL			mSupportsAlphaLayers; // For backwards compatibility, TRUE for 1.23+ clients
diff --git a/indra/newview/llvoavatarself.h b/indra/newview/llvoavatarself.h
old mode 100644
new mode 100755
index 543891ca632366a759fdcaa83b12e60449c330b7..2b273e616c963a4fc926c7a353726613f83e10fa
--- a/indra/newview/llvoavatarself.h
+++ b/indra/newview/llvoavatarself.h
@@ -49,6 +49,16 @@ class LLVOAvatarSelf :
  **/
 
 public:
+	void* operator new(size_t size)
+	{
+		return ll_aligned_malloc_16(size);
+	}
+
+	void operator delete(void* ptr)
+	{
+		ll_aligned_free_16(ptr);
+	}
+
 	LLVOAvatarSelf(const LLUUID &id, const LLPCode pcode, LLViewerRegion *regionp);
 	virtual 				~LLVOAvatarSelf();
 	virtual void			markDead();
diff --git a/indra/newview/viewer_manifest.py b/indra/newview/viewer_manifest.py
old mode 100644
new mode 100755
index 7c6b5403e1df44391369b6ebd40a95e2c477abc7..d1c952ac3bd275fdc410bffe2996327e46a4f902
--- a/indra/newview/viewer_manifest.py
+++ b/indra/newview/viewer_manifest.py
@@ -1062,6 +1062,7 @@ def construct(self):
             self.path("libalut.so")
             self.path("libopenal.so", "libopenal.so.1")
             self.path("libopenal.so", "libvivoxoal.so.1") # vivox's sdk expects this soname
+
             # KLUDGE: As of 2012-04-11, the 'fontconfig' package installs
             # libfontconfig.so.1.4.4, along with symlinks libfontconfig.so.1
             # and libfontconfig.so. Before we added support for library-file