diff --git a/indra/newview/llpolymesh.cpp b/indra/newview/llpolymesh.cpp
index d5a2d66bcfc6444273b7e49d2d9644d50e04ef05..b8bdbfb2f81b6999c5fbbfbdcae3126ab5875529 100644
--- a/indra/newview/llpolymesh.cpp
+++ b/indra/newview/llpolymesh.cpp
@@ -708,15 +708,17 @@ LLPolyMesh::LLPolyMesh(LLPolyMeshSharedData *shared_data, LLPolyMesh *reference_
 		mClothingWeights = reference_mesh->mClothingWeights;
 	}
 	else
-	{
+	{ 	 
 #if 1	// Allocate memory without initializing every vector
 		// NOTE: This makes asusmptions about the size of LLVector[234]
 		int nverts = mSharedData->mNumVertices;
-		int nfloats = nverts * (3*5 + 2 + 4);
-		mVertexData = new F32[nfloats];
+		int nfloats = nverts * (2*4 + 3*3 + 2 + 4);
+
+		//use 16-byte aligned vertex data (released with _mm_free in the destructor) to make LLPolyMesh SSE friendly
+		mVertexData = (F32*) _mm_malloc(nfloats * sizeof(F32), 16);
 		int offset = 0;
-		mCoords = 				(LLVector3*)(mVertexData + offset); offset += 3*nverts;
-		mNormals = 				(LLVector3*)(mVertexData + offset); offset += 3*nverts;
+		mCoords = 				(LLVector4*)(mVertexData + offset); offset += 4*nverts;
+		mNormals = 				(LLVector4*)(mVertexData + offset); offset += 4*nverts;
 		mScaledNormals = 		(LLVector3*)(mVertexData + offset); offset += 3*nverts;
 		mBinormals = 			(LLVector3*)(mVertexData + offset); offset += 3*nverts;
 		mScaledBinormals = 		(LLVector3*)(mVertexData + offset); offset += 3*nverts;
@@ -757,7 +759,7 @@ LLPolyMesh::~LLPolyMesh()
 	delete [] mClothingWeights;
 	delete [] mTexCoords;
 #else
-	delete [] mVertexData;
+	_mm_free(mVertexData);
 #endif
 }
 
@@ -864,7 +866,7 @@ void LLPolyMesh::dumpDiagInfo()
 //-----------------------------------------------------------------------------
 // getWritableCoords()
 //-----------------------------------------------------------------------------
-LLVector3 *LLPolyMesh::getWritableCoords()
+LLVector4 *LLPolyMesh::getWritableCoords()
 {
 	return mCoords;
 }
@@ -872,7 +874,7 @@ LLVector3 *LLPolyMesh::getWritableCoords()
 //-----------------------------------------------------------------------------
 // getWritableNormals()
 //-----------------------------------------------------------------------------
-LLVector3 *LLPolyMesh::getWritableNormals()
+LLVector4 *LLPolyMesh::getWritableNormals()
 {
 	return mNormals;
 }
@@ -927,8 +929,12 @@ void LLPolyMesh::initializeForMorph()
 	if (!mSharedData)
 		return;
 
-	memcpy(mCoords, mSharedData->mBaseCoords, sizeof(LLVector3) * mSharedData->mNumVertices);	/*Flawfinder: ignore*/
-	memcpy(mNormals, mSharedData->mBaseNormals, sizeof(LLVector3) * mSharedData->mNumVertices);	/*Flawfinder: ignore*/
+	for (U32 i = 0; i < mSharedData->mNumVertices; ++i)	// mCoords/mNormals are LLVector4 now, so widen per element instead of memcpy'ing LLVector3 data
+	{
+		mCoords[i] = LLVector4(mSharedData->mBaseCoords[i]);
+		mNormals[i] = LLVector4(mSharedData->mBaseNormals[i]);
+	}
+
 	memcpy(mScaledNormals, mSharedData->mBaseNormals, sizeof(LLVector3) * mSharedData->mNumVertices);	/*Flawfinder: ignore*/
 	memcpy(mBinormals, mSharedData->mBaseBinormals, sizeof(LLVector3) * mSharedData->mNumVertices);	/*Flawfinder: ignore*/
 	memcpy(mScaledBinormals, mSharedData->mBaseBinormals, sizeof(LLVector3) * mSharedData->mNumVertices);		/*Flawfinder: ignore*/
diff --git a/indra/newview/llpolymesh.h b/indra/newview/llpolymesh.h
index c2e5451dfe3f085ea21ef1ad7dfffe3ad2381ace..d86568a1ba76eb9e8f1711540c5010805cfce1d0 100644
--- a/indra/newview/llpolymesh.h
+++ b/indra/newview/llpolymesh.h
@@ -223,15 +223,15 @@ class LLPolyMesh
 	}
 
 	// Get coords
-	const LLVector3	*getCoords() const{
+	const LLVector4	*getCoords() const{
 		return mCoords;
 	}
 
 	// non const version
-	LLVector3 *getWritableCoords();
+	LLVector4 *getWritableCoords();
 
 	// Get normals
-	const LLVector3	*getNormals() const{ 
+	const LLVector4	*getNormals() const{ 
 		return mNormals; 
 	}
 
@@ -253,7 +253,7 @@ class LLPolyMesh
 	}
 
 	// intermediate morphed normals and output normals
-	LLVector3 *getWritableNormals();
+	LLVector4 *getWritableNormals();
 	LLVector3 *getScaledNormals();
 
 	LLVector3 *getWritableBinormals();
@@ -347,11 +347,11 @@ class LLPolyMesh
 	// Single array of floats for allocation / deletion
 	F32						*mVertexData;
 	// deformed vertices (resulting from application of morph targets)
-	LLVector3				*mCoords;
+	LLVector4				*mCoords;
 	// deformed normals (resulting from application of morph targets)
 	LLVector3				*mScaledNormals;
 	// output normals (after normalization)
-	LLVector3				*mNormals;
+	LLVector4				*mNormals;
 	// deformed binormals (resulting from application of morph targets)
 	LLVector3				*mScaledBinormals;
 	// output binormals (after normalization)
diff --git a/indra/newview/llpolymorph.cpp b/indra/newview/llpolymorph.cpp
index 80983cad2434d0be8002ce9834a1973fc8f8d5cb..2058c351c4ea8fefdcb4cbd84a5cdba8233a1079 100644
--- a/indra/newview/llpolymorph.cpp
+++ b/indra/newview/llpolymorph.cpp
@@ -461,10 +461,10 @@ void LLPolyMorphTarget::apply( ESex avatar_sex )
 	if (delta_weight != 0.f)
 	{
 		llassert(!mMesh->isLOD());
-		LLVector3 *coords = mMesh->getWritableCoords();
+		LLVector4 *coords = mMesh->getWritableCoords();
 
 		LLVector3 *scaled_normals = mMesh->getScaledNormals();
-		LLVector3 *normals = mMesh->getWritableNormals();
+		LLVector4 *normals = mMesh->getWritableNormals();
 
 		LLVector3 *scaled_binormals = mMesh->getScaledBinormals();
 		LLVector3 *binormals = mMesh->getWritableBinormals();
@@ -484,7 +484,8 @@ void LLPolyMorphTarget::apply( ESex avatar_sex )
 				maskWeight = maskWeightArray[vert_index_morph];
 			}
 
-			coords[vert_index_mesh] += mMorphData->mCoords[vert_index_morph] * delta_weight * maskWeight;
+			coords[vert_index_mesh] += LLVector4(mMorphData->mCoords[vert_index_morph] * delta_weight * maskWeight);
+
 			if (getInfo()->mIsClothingMorph && clothing_weights)
 			{
 				LLVector3 clothing_offset = mMorphData->mCoords[vert_index_morph] * delta_weight * maskWeight;
@@ -499,7 +500,7 @@ void LLPolyMorphTarget::apply( ESex avatar_sex )
 			scaled_normals[vert_index_mesh] += mMorphData->mNormals[vert_index_morph] * delta_weight * maskWeight * NORMAL_SOFTEN_FACTOR;
 			LLVector3 normalized_normal = scaled_normals[vert_index_mesh];
 			normalized_normal.normVec();
-			normals[vert_index_mesh] = normalized_normal;
+			normals[vert_index_mesh] = LLVector4(normalized_normal);
 
 			// calculate new binormals
 			scaled_binormals[vert_index_mesh] += mMorphData->mBinormals[vert_index_morph] * delta_weight * maskWeight * NORMAL_SOFTEN_FACTOR;
@@ -548,7 +549,7 @@ void	LLPolyMorphTarget::applyMask(U8 *maskTextureData, S32 width, S32 height, S3
 
 		if (maskWeights)
 		{
-			LLVector3 *coords = mMesh->getWritableCoords();
+			LLVector4 *coords = mMesh->getWritableCoords();
 			LLVector3 *scaled_normals = mMesh->getScaledNormals();
 			LLVector3 *scaled_binormals = mMesh->getScaledBinormals();
 			LLVector2 *tex_coords = mMesh->getWritableTexCoords();
@@ -559,7 +560,7 @@ void	LLPolyMorphTarget::applyMask(U8 *maskTextureData, S32 width, S32 height, S3
 				S32 out_vert = mMorphData->mVertexIndices[vert];
 
 				// remove effect of existing masked morph
-				coords[out_vert] -= mMorphData->mCoords[vert] * lastMaskWeight;
+				coords[out_vert] -= LLVector4(mMorphData->mCoords[vert]) * lastMaskWeight;
 				scaled_normals[out_vert] -= mMorphData->mNormals[vert] * lastMaskWeight * NORMAL_SOFTEN_FACTOR;
 				scaled_binormals[out_vert] -= mMorphData->mBinormals[vert] * lastMaskWeight * NORMAL_SOFTEN_FACTOR;
 				tex_coords[out_vert] -= mMorphData->mTexCoords[vert] * lastMaskWeight;
diff --git a/indra/newview/llviewerjointmesh.cpp b/indra/newview/llviewerjointmesh.cpp
index deb3d8fd97efd5f3280fb5cc64782877f320c9f8..294dfdcb55512dec92ea96a07acfad50777ba314 100644
--- a/indra/newview/llviewerjointmesh.cpp
+++ b/indra/newview/llviewerjointmesh.cpp
@@ -61,6 +61,7 @@
 #include "v4math.h"
 #include "m3math.h"
 #include "m4math.h"
+#include "llmatrix4a.h"
 
 #if !LL_DARWIN && !LL_LINUX && !LL_SOLARIS
 extern PFNGLWEIGHTPOINTERARBPROC glWeightPointerARB;
@@ -382,6 +383,7 @@ const S32 NUM_AXES = 3;
 // pivot parent 0-n -- child = n+1
 
 static LLMatrix4	gJointMatUnaligned[32];
+static LLMatrix4a	gJointMatAligned[32];
 static LLMatrix3	gJointRotUnaligned[32];
 static LLVector4	gJointPivot[32];
 
@@ -467,6 +469,14 @@ void LLViewerJointMesh::uploadJointMatrices()
 		glUniform4fvARB(gAvatarMatrixParam, 45, mat);
 		stop_glerror();
 	}
+	else
+	{
+		//copy gJointMatUnaligned into gJointMatAligned (read by updateGeometryOriginal); NOTE(review): assumes count() <= 32, the array size -- confirm
+		for (joint_num = 0; joint_num < reference_mesh->mJointRenderData.count(); ++joint_num)
+		{
+			gJointMatAligned[joint_num].loadu(gJointMatUnaligned[joint_num]);
+		}
+	}
 }
 
 //--------------------------------------------------------------------
@@ -723,7 +733,7 @@ void LLViewerJointMesh::updateFaceData(LLFace *face, F32 pixel_area, BOOL damp_w
 					v[0] = coords[0]; 
 					v[1] = coords[1]; 
 					v[2] = coords[2];		
-					coords += 3;
+					coords += 4;
 					v += skip;
 				}
 
@@ -732,12 +742,12 @@ void LLViewerJointMesh::updateFaceData(LLFace *face, F32 pixel_area, BOOL damp_w
 					n[0] = normals[0]; 
 					n[1] = normals[1];
 					n[2] = normals[2];
-					normals += 3;
+					normals += 4;
 					n += skip;
 				}
 			}
 			else
-				{
+			{
 
 				U32* __restrict tc = (U32*) tex_coordsp.get();
 				U32* __restrict vw = (U32*) vertex_weightsp.get();
@@ -745,18 +755,20 @@ void LLViewerJointMesh::updateFaceData(LLFace *face, F32 pixel_area, BOOL damp_w
 				
 				do
 				{
-					v[0] = *(coords++); 
-					v[1] = *(coords++); 
-					v[2] = *(coords++);
+					v[0] = coords[0]; 
+					v[1] = coords[1]; 
+					v[2] = coords[2];		
+					coords += 4;
 					v += skip;
 
 					tc[0] = *(tex_coords++); 
 					tc[1] = *(tex_coords++);
 					tc += skip;
 
-					n[0] = *(normals++); 
-					n[1] = *(normals++);
-					n[2] = *(normals++);
+					n[0] = normals[0]; 
+					n[1] = normals[1];
+					n[2] = normals[2];
+					normals += 4;
 					n += skip;
 
 					vw[0] = *(weights++);
@@ -808,17 +820,17 @@ void LLViewerJointMesh::updateGeometryOriginal(LLFace *mFace, LLPolyMesh *mMesh)
 	LLStrider<LLVector3> o_normals;
 
 	//get vertex and normal striders
-	LLVertexBuffer *buffer = mFace->mVertexBuffer;
+	LLVertexBuffer* buffer = mFace->mVertexBuffer;
 	buffer->getVertexStrider(o_vertices,  0);
 	buffer->getNormalStrider(o_normals,   0);
 
-	F32 last_weight = F32_MAX;
-	LLMatrix4 gBlendMat;
-	LLMatrix3 gBlendRotMat;
+	//F32 last_weight = F32_MAX;
+	LLMatrix4a gBlendMat;
+	// read-only per-vertex inputs; __restrict follows the '*' so it qualifies the pointer (same form as updateFaceData)
+	const F32* __restrict weights = mMesh->getWeights();
+	const LLVector4* __restrict coords = mMesh->getCoords();
+	const LLVector4* __restrict normals = mMesh->getNormals();
 
-	const F32* weights = mMesh->getWeights();
-	const LLVector3* coords = mMesh->getCoords();
-	const LLVector3* normals = mMesh->getNormals();
 	for (U32 index = 0; index < mMesh->getNumVertices(); index++)
 	{
 		U32 bidx = index + mMesh->mFaceVertexOffset;
@@ -826,71 +838,54 @@ void LLViewerJointMesh::updateGeometryOriginal(LLFace *mFace, LLPolyMesh *mMesh)
 		// blend by first matrix
 		F32 w = weights[index]; 
 		
+		LLVector4a coord;
+		coord.load4a(coords[index].mV);
+
+		LLVector4a norm;
+		norm.load4a(normals[index].mV);
+
 		// Maybe we don't have to change gBlendMat.
 		// Profiles of a single-avatar scene on a Mac show this to be a very
 		// common case.  JC
-		if (w == last_weight)
+		//if (w != last_weight)
 		{
-			o_vertices[bidx] = coords[index] * gBlendMat;
-			o_normals[bidx] = normals[index] * gBlendRotMat;
-			continue;
-		}
-		
-		last_weight = w;
+			//last_weight = w;
 
-		S32 joint = llfloor(w);
-		w -= joint;
-		
-		// No lerp required in this case.
-		if (w == 1.0f)
-		{
-			gBlendMat = gJointMatUnaligned[joint+1];
-			o_vertices[bidx] = coords[index] * gBlendMat;
-			gBlendRotMat = gJointRotUnaligned[joint+1];
-			o_normals[bidx] = normals[index] * gBlendRotMat;
-			continue;
+			S32 joint = llfloor(w);
+			w -= joint;
+			// w is now only the fractional blend factor between joint matrices
+			// [joint] and [joint+1] (never negative, assuming llfloor rounds down)
+			if (w > 0.f)
+			{
+				// Try to keep all the accesses to the matrix data as close
+				// together as possible.  This function is a hot spot on the
+				// Mac. JC
+				gBlendMat.setLerp(gJointMatAligned[joint+0],
+								  gJointMatAligned[joint+1], w);
+
+				LLVector4a res;
+				gBlendMat.affineTransform(coord, res);
+				o_vertices[bidx].setVec(res[0], res[1], res[2]);
+				gBlendMat.rotate(norm, res);
+				o_normals[bidx].setVec(res[0], res[1], res[2]);
+			}
+			else
+			{  // w == 0: no lerp required, use matrix [joint] directly and never touch [joint+1]
+				LLVector4a res;
+				gJointMatAligned[joint].affineTransform(coord, res);
+				o_vertices[bidx].setVec(res[0], res[1], res[2]);
+				gJointMatAligned[joint].rotate(norm, res);
+				o_normals[bidx].setVec(res[0], res[1], res[2]);
+			}
 		}
-		
-		// Try to keep all the accesses to the matrix data as close
-		// together as possible.  This function is a hot spot on the
-		// Mac. JC
-		LLMatrix4 &m0 = gJointMatUnaligned[joint+1];
-		LLMatrix4 &m1 = gJointMatUnaligned[joint+0];
-		
-		gBlendMat.mMatrix[VX][VX] = lerp(m1.mMatrix[VX][VX], m0.mMatrix[VX][VX], w);
-		gBlendMat.mMatrix[VX][VY] = lerp(m1.mMatrix[VX][VY], m0.mMatrix[VX][VY], w);
-		gBlendMat.mMatrix[VX][VZ] = lerp(m1.mMatrix[VX][VZ], m0.mMatrix[VX][VZ], w);
-
-		gBlendMat.mMatrix[VY][VX] = lerp(m1.mMatrix[VY][VX], m0.mMatrix[VY][VX], w);
-		gBlendMat.mMatrix[VY][VY] = lerp(m1.mMatrix[VY][VY], m0.mMatrix[VY][VY], w);
-		gBlendMat.mMatrix[VY][VZ] = lerp(m1.mMatrix[VY][VZ], m0.mMatrix[VY][VZ], w);
-
-		gBlendMat.mMatrix[VZ][VX] = lerp(m1.mMatrix[VZ][VX], m0.mMatrix[VZ][VX], w);
-		gBlendMat.mMatrix[VZ][VY] = lerp(m1.mMatrix[VZ][VY], m0.mMatrix[VZ][VY], w);
-		gBlendMat.mMatrix[VZ][VZ] = lerp(m1.mMatrix[VZ][VZ], m0.mMatrix[VZ][VZ], w);
-
-		gBlendMat.mMatrix[VW][VX] = lerp(m1.mMatrix[VW][VX], m0.mMatrix[VW][VX], w);
-		gBlendMat.mMatrix[VW][VY] = lerp(m1.mMatrix[VW][VY], m0.mMatrix[VW][VY], w);
-		gBlendMat.mMatrix[VW][VZ] = lerp(m1.mMatrix[VW][VZ], m0.mMatrix[VW][VZ], w);
-
-		o_vertices[bidx] = coords[index] * gBlendMat;
-		
-		LLMatrix3 &n0 = gJointRotUnaligned[joint+1];
-		LLMatrix3 &n1 = gJointRotUnaligned[joint+0];
-		
-		gBlendRotMat.mMatrix[VX][VX] = lerp(n1.mMatrix[VX][VX], n0.mMatrix[VX][VX], w);
-		gBlendRotMat.mMatrix[VX][VY] = lerp(n1.mMatrix[VX][VY], n0.mMatrix[VX][VY], w);
-		gBlendRotMat.mMatrix[VX][VZ] = lerp(n1.mMatrix[VX][VZ], n0.mMatrix[VX][VZ], w);
-
-		gBlendRotMat.mMatrix[VY][VX] = lerp(n1.mMatrix[VY][VX], n0.mMatrix[VY][VX], w);
-		gBlendRotMat.mMatrix[VY][VY] = lerp(n1.mMatrix[VY][VY], n0.mMatrix[VY][VY], w);
-		gBlendRotMat.mMatrix[VY][VZ] = lerp(n1.mMatrix[VY][VZ], n0.mMatrix[VY][VZ], w);
-
-		gBlendRotMat.mMatrix[VZ][VX] = lerp(n1.mMatrix[VZ][VX], n0.mMatrix[VZ][VX], w);
-		gBlendRotMat.mMatrix[VZ][VY] = lerp(n1.mMatrix[VZ][VY], n0.mMatrix[VZ][VY], w);
-		gBlendRotMat.mMatrix[VZ][VZ] = lerp(n1.mMatrix[VZ][VZ], n0.mMatrix[VZ][VZ], w);
-		
-		o_normals[bidx] = normals[index] * gBlendRotMat;
+		/*else
+		{ //weight didn't change
+			LLVector4a res;
+			gBlendMat.affineTransform(coord, res);
+			o_vertices[bidx].setVec(res[0], res[1], res[2]);
+			gBlendMat.rotate(norm, res);
+			o_normals[bidx].setVec(res[0], res[1], res[2]);
+		}*/
 	}
 
 	buffer->setBuffer(0);
diff --git a/indra/newview/llviewerjointmesh_vec.cpp b/indra/newview/llviewerjointmesh_vec.cpp
index 8fb9d1cf68fc745e78b37fe2e172c54fd28b6b08..a1225c9d1c93f7e8d7172105a8bbef246aeb71a4 100644
--- a/indra/newview/llviewerjointmesh_vec.cpp
+++ b/indra/newview/llviewerjointmesh_vec.cpp
@@ -52,6 +52,7 @@
 // static
 void LLViewerJointMesh::updateGeometryVectorized(LLFace *face, LLPolyMesh *mesh)
 {
+#if 0	// NOTE(review): compiles out the body up to the matching #endif, leaving this function a no-op -- confirm no caller still selects it
 	static LLV4Matrix4	sJointMat[32];
 	LLDynamicArray<LLJointRenderData*>& joint_data = mesh->getReferenceMesh()->mJointRenderData;
 	S32 j, joint_num, joint_end = joint_data.count();
@@ -98,4 +99,5 @@ void LLViewerJointMesh::updateGeometryVectorized(LLFace *face, LLPolyMesh *mesh)
 	}
 
 	buffer->setBuffer(0);
+#endif
 }