diff --git a/indra/llcharacter/lljoint.cpp b/indra/llcharacter/lljoint.cpp
index dee642310e0bea014d2569c5f2bf775251981d42..d72282ab4210759dc728632dc99415582bed243a 100644
--- a/indra/llcharacter/lljoint.cpp
+++ b/indra/llcharacter/lljoint.cpp
@@ -922,6 +922,13 @@ const LLMatrix4 &LLJoint::getWorldMatrix()
 	return mXform.getWorldMatrix();
 }
 
+const LLMatrix4a& LLJoint::getWorldMatrix4a()
+{
+    updateWorldMatrixParent();
+
+    return mWorldMatrix;
+}
+
 
 //--------------------------------------------------------------------
 // setWorldMatrix()
@@ -1003,6 +1010,7 @@ void LLJoint::updateWorldMatrix()
 	{
 		sNumUpdates++;
 		mXform.updateMatrix(FALSE);
+        mWorldMatrix.loadu(mXform.getWorldMatrix());
 		mDirtyFlags = 0x0;
 	}
 }
diff --git a/indra/llcharacter/lljoint.h b/indra/llcharacter/lljoint.h
index 1b646b641f4bb39696c7864e18f601fea490175e..ba821667c79449109e08ea69d24ea78c8ecd8c2a 100644
--- a/indra/llcharacter/lljoint.h
+++ b/indra/llcharacter/lljoint.h
@@ -38,6 +38,7 @@
 #include "m4math.h"
 #include "llquaternion.h"
 #include "xform.h"
+#include "llmatrix4a.h"
 
 const S32 LL_CHARACTER_MAX_JOINTS_PER_MESH = 15;
 // Need to set this to count of animate-able joints,
@@ -123,6 +124,7 @@ class LLJoint
 
 	// explicit transformation members
 	LLXformMatrix		mXform;
+    LLMatrix4a          mWorldMatrix;
 
     LLVector3       mDefaultPosition;
     LLVector3       mDefaultScale;
@@ -259,6 +261,8 @@ class LLJoint
 	const LLMatrix4 &getWorldMatrix();
 	void setWorldMatrix( const LLMatrix4& mat );
 
+    const LLMatrix4a& getWorldMatrix4a();
+
 	void updateWorldMatrixChildren();
 	void updateWorldMatrixParent();
 
diff --git a/indra/llmath/llmatrix4a.h b/indra/llmath/llmatrix4a.h
index 7ba347062f724c99b8df1589cb5bee169daa8fa1..5291a05607b5ca6c230c2012263cd893fbc58beb 100644
--- a/indra/llmath/llmatrix4a.h
+++ b/indra/llmath/llmatrix4a.h
@@ -36,6 +36,26 @@ class LLMatrix4a
 public:
 	LL_ALIGN_16(LLVector4a mMatrix[4]);
 
+    LLMatrix4a()
+    {
+
+    }
+
+    explicit LLMatrix4a(const LLMatrix4& val)
+    {
+        loadu(val);
+    }
+
+    inline F32* getF32ptr()
+    {
+        return (F32*) &mMatrix;
+    }
+
+    inline const F32* getF32ptr() const
+    {
+        return (F32*)&mMatrix;
+    }
+
 	inline void clear()
 	{
 		mMatrix[0].clear();
@@ -44,6 +64,14 @@ class LLMatrix4a
 		mMatrix[3].clear();
 	}
 
+    inline void setIdentity()
+    {
+        mMatrix[0].set(1.f, 0.f, 0.f, 0.f);
+        mMatrix[1].set(0.f, 1.f, 0.f, 0.f);
+        mMatrix[2].set(0.f, 0.f, 1.f, 0.f);
+        mMatrix[3].set(0.f, 0.f, 0.f, 1.f);
+    }
+
 	inline void loadu(const LLMatrix4& src)
 	{
 		mMatrix[0] = _mm_loadu_ps(src.mMatrix[0]);
@@ -105,7 +133,7 @@ class LLMatrix4a
 		mMatrix[3].setAdd(a.mMatrix[3],d3);
 	}
 
-	inline void rotate(const LLVector4a& v, LLVector4a& res)
+	inline void rotate(const LLVector4a& v, LLVector4a& res) const
 	{
 		LLVector4a y,z;
 
@@ -151,6 +179,8 @@ class LLMatrix4a
     {
         affineTransformSSE(v,res);
     }
+
+    const LLVector4a& getTranslation() const { return mMatrix[3]; }
 };
 
 inline LLVector4a rowMul(const LLVector4a &row, const LLMatrix4a &mat)
@@ -176,6 +206,15 @@ inline void matMul(const LLMatrix4a &a, const LLMatrix4a &b, LLMatrix4a &res)
     res.mMatrix[3] = row3;
 }
 
+//Faster version of matMul wehere res must not be a or b
+inline void matMulUnsafe(const LLMatrix4a &a, const LLMatrix4a &b, LLMatrix4a &res)
+{
+    res.mMatrix[0] = rowMul(a.mMatrix[0], b);
+    res.mMatrix[1] = rowMul(a.mMatrix[1], b);
+    res.mMatrix[2] = rowMul(a.mMatrix[2], b);
+    res.mMatrix[3] = rowMul(a.mMatrix[3], b);
+}
+
 inline std::ostream& operator<<(std::ostream& s, const LLMatrix4a& m)
 {
     s << "[" << m.mMatrix[0] << ", " << m.mMatrix[1] << ", " << m.mMatrix[2] << ", " << m.mMatrix[3] << "]";
diff --git a/indra/llmath/llvector4a.h b/indra/llmath/llvector4a.h
index 27abf395376b13c0f7cb84a85a27a493382c56af..5a029283740c3172118950439b7763005c0e87f7 100644
--- a/indra/llmath/llvector4a.h
+++ b/indra/llmath/llvector4a.h
@@ -46,10 +46,9 @@ class LLRotation;
 // of this writing, July 08, 2010) about getting it implemented before you resort to
 // LLVector3/LLVector4. 
 /////////////////////////////////
-struct LLVector4a;
 
 LL_ALIGN_PREFIX(16)
-struct LLVector4a
+class LLVector4a
 {
 public:
 
@@ -138,10 +137,10 @@ struct LLVector4a
 	// BASIC GET/SET 
 	////////////////////////////////////
 	
-	// Return a "this" as an F32 pointer. Do not use unless you have a very good reason.  (Not sure? Ask Falcon)
+	// Return a "this" as an F32 pointer.
 	inline F32* getF32ptr();
 	
-	// Return a "this" as a const F32 pointer. Do not use unless you have a very good reason.  (Not sure? Ask Falcon)
+	// Return a "this" as a const F32 pointer.
 	inline const F32* const getF32ptr() const;
 	
 	// Read-only access a single float in this vector. Do not use in proximity to any function call that manipulates
diff --git a/indra/llmath/llvector4a.inl b/indra/llmath/llvector4a.inl
index 69d3d01efe485d3a4c677d1c6e730f0ea573b1e8..8be1c1b11484c3161810fc1616fadfa8540ab886 100644
--- a/indra/llmath/llvector4a.inl
+++ b/indra/llmath/llvector4a.inl
@@ -58,13 +58,13 @@ inline void LLVector4a::store4a(F32* dst) const
 // BASIC GET/SET 
 ////////////////////////////////////
 
-// Return a "this" as an F32 pointer. Do not use unless you have a very good reason.  (Not sure? Ask Falcon)
+// Return a "this" as an F32 pointer.
 F32* LLVector4a::getF32ptr()
 {
 	return (F32*) &mQ;
 }
 
-// Return a "this" as a const F32 pointer. Do not use unless you have a very good reason.  (Not sure? Ask Falcon)
+// Return a "this" as a const F32 pointer.
 const F32* const LLVector4a::getF32ptr() const
 {
 	return (const F32* const) &mQ;
diff --git a/indra/llmath/m4math.cpp b/indra/llmath/m4math.cpp
index 3baf1bad18bc5ac0f6450793f6353831135b6b7a..6e40dae30b7fa35d071ac5546278bd713729f797 100644
--- a/indra/llmath/m4math.cpp
+++ b/indra/llmath/m4math.cpp
@@ -32,8 +32,7 @@
 #include "m4math.h"
 #include "m3math.h"
 #include "llquaternion.h"
-
-
+#include "llmatrix4a.h"
 
 
 // LLMatrix4
@@ -115,6 +114,12 @@ LLMatrix4::LLMatrix4(const LLQuaternion &q)
 	*this = initRotation(q);
 }
 
+LLMatrix4::LLMatrix4(const LLMatrix4a& mat)
+    : LLMatrix4(mat.getF32ptr())
+{
+    
+}
+
 LLMatrix4::LLMatrix4(const LLQuaternion &q, const LLVector4 &pos)
 {
 	*this = initRotTrans(q, pos);
diff --git a/indra/llmath/m4math.h b/indra/llmath/m4math.h
index bf60adb9b6e9083a4bd684c3b8be98306144be71..b9da970cde11cb12c4236e91a7f0d3205a47137b 100644
--- a/indra/llmath/m4math.h
+++ b/indra/llmath/m4math.h
@@ -32,6 +32,7 @@
 class LLVector4;
 class LLMatrix3;
 class LLQuaternion;
+class LLMatrix4a;
 
 // NOTA BENE: Currently assuming a right-handed, x-forward, y-left, z-up universe
 
@@ -104,6 +105,7 @@ class LLMatrix4
 	explicit LLMatrix4(const F32 *mat);								// Initializes Matrix to values in mat
 	explicit LLMatrix4(const LLMatrix3 &mat);						// Initializes Matrix to values in mat and sets position to (0,0,0)
 	explicit LLMatrix4(const LLQuaternion &q);						// Initializes Matrix with rotation q and sets position to (0,0,0)
+    explicit LLMatrix4(const LLMatrix4a& mat);
 
 	LLMatrix4(const LLMatrix3 &mat, const LLVector4 &pos);	// Initializes Matrix to values in mat and pos
 
diff --git a/indra/llmath/v3math.cpp b/indra/llmath/v3math.cpp
index b04c67d92631fac6535c0f161631487c66c66e1f..93010d2250a6471d717f7a2460280a256353fec6 100644
--- a/indra/llmath/v3math.cpp
+++ b/indra/llmath/v3math.cpp
@@ -316,6 +316,12 @@ LLVector3::LLVector3(const LLVector4 &vec)
 	mV[VZ] = (F32)vec.mV[VZ];
 }
 
+LLVector3::LLVector3(const LLVector4a& vec)
+    : LLVector3(vec.getF32ptr())
+{
+
+}
+
 LLVector3::LLVector3(const LLSD& sd)
 {
 	setValue(sd);
diff --git a/indra/llmath/v3math.h b/indra/llmath/v3math.h
index 6f857d70617167ade434c4235e76860124b15465..068f489020667be2b8498e86eedc6fb35a4c5910 100644
--- a/indra/llmath/v3math.h
+++ b/indra/llmath/v3math.h
@@ -33,6 +33,7 @@
 #include "llsd.h"
 class LLVector2;
 class LLVector4;
+class LLVector4a;
 class LLMatrix3;
 class LLMatrix4;
 class LLVector3d;
@@ -62,7 +63,9 @@ class LLVector3
 		explicit LLVector3(const LLVector2 &vec);				// Initializes LLVector3 to (vec[0]. vec[1], 0)
 		explicit LLVector3(const LLVector3d &vec);				// Initializes LLVector3 to (vec[0]. vec[1], vec[2])
 		explicit LLVector3(const LLVector4 &vec);				// Initializes LLVector4 to (vec[0]. vec[1], vec[2])
-		explicit LLVector3(const LLSD& sd);
+        explicit LLVector3(const LLVector4a& vec);              // Initializes LLVector4 to (vec[0]. vec[1], vec[2])
+        explicit LLVector3(const LLSD& sd);
+        
 
 		LLSD getValue() const;
 
diff --git a/indra/llprimitive/lldaeloader.cpp b/indra/llprimitive/lldaeloader.cpp
index dfa29fb539e2664d671d1355c7f8022b5ef1a9ac..8343de0cbc3f63f43fbbb9bf3642b524a6907d26 100644
--- a/indra/llprimitive/lldaeloader.cpp
+++ b/indra/llprimitive/lldaeloader.cpp
@@ -1173,17 +1173,19 @@ void LLDAELoader::processDomModel(LLModel* model, DAE* dae, daeElement* root, do
 
 			LLMeshSkinInfo& skin_info = model->mSkinInfo;
 
+            LLMatrix4 mat;
 			for (int i = 0; i < 4; i++)
 			{
 				for(int j = 0; j < 4; j++)
 				{
-					skin_info.mBindShapeMatrix.mMatrix[i][j] = dom_value[i + j*4];
+                    mat.mMatrix[i][j] = dom_value[i + j*4];
 				}
 			}
 
-			LLMatrix4 trans = normalized_transformation;
-			trans *= skin_info.mBindShapeMatrix;
-			skin_info.mBindShapeMatrix = trans;							
+            skin_info.mBindShapeMatrix.loadu(mat);
+
+			LLMatrix4a trans(normalized_transformation);
+            matMul(trans, skin_info.mBindShapeMatrix, skin_info.mBindShapeMatrix);
 		}
 
 
@@ -1401,7 +1403,7 @@ void LLDAELoader::processDomModel(LLModel* model, DAE* dae, daeElement* root, do
 									mat.mMatrix[i][j] = transform[k*16 + i + j*4];
 								}
 							}
-							model->mSkinInfo.mInvBindMatrix.push_back(mat);
+							model->mSkinInfo.mInvBindMatrix.push_back(LLMatrix4a(mat));
 						}
 					}
 				}
@@ -1475,9 +1477,9 @@ void LLDAELoader::processDomModel(LLModel* model, DAE* dae, daeElement* root, do
 			if (mJointMap.find(lookingForJoint) != mJointMap.end()
 				&& model->mSkinInfo.mInvBindMatrix.size() > i)
 			{
-				LLMatrix4 newInverse = model->mSkinInfo.mInvBindMatrix[i];
+				LLMatrix4 newInverse = LLMatrix4(model->mSkinInfo.mInvBindMatrix[i].getF32ptr());
 				newInverse.setTranslation( mJointList[lookingForJoint].getTranslation() );
-				model->mSkinInfo.mAlternateBindMatrix.push_back( newInverse );
+				model->mSkinInfo.mAlternateBindMatrix.push_back( LLMatrix4a(newInverse) );
             }
 			else
 			{
diff --git a/indra/llprimitive/llmodel.cpp b/indra/llprimitive/llmodel.cpp
index 702a1b523843864d24d1f88690aed0cabfe5008d..a23b991f1d4d0896a521cde5082395526836eb52 100644
--- a/indra/llprimitive/llmodel.cpp
+++ b/indra/llprimitive/llmodel.cpp
@@ -1396,7 +1396,7 @@ void LLMeshSkinInfo::fromLLSD(LLSD& skin)
 				}
 			}
 
-			mInvBindMatrix.push_back(mat);
+			mInvBindMatrix.push_back(LLMatrix4a(mat));
 		}
 
         if (mJointNames.size() != mInvBindMatrix.size())
@@ -1410,13 +1410,15 @@ void LLMeshSkinInfo::fromLLSD(LLSD& skin)
 
 	if (skin.has("bind_shape_matrix"))
 	{
+        LLMatrix4 mat;
 		for (U32 j = 0; j < 4; j++)
 		{
 			for (U32 k = 0; k < 4; k++)
 			{
-				mBindShapeMatrix.mMatrix[j][k] = skin["bind_shape_matrix"][j*4+k].asReal();
+				mat.mMatrix[j][k] = skin["bind_shape_matrix"][j*4+k].asReal();
 			}
 		}
+        mBindShapeMatrix.loadu(mat);
 	}
 
 	if (skin.has("alt_inverse_bind_matrix"))
@@ -1432,7 +1434,7 @@ void LLMeshSkinInfo::fromLLSD(LLSD& skin)
 				}
 			}
 			
-			mAlternateBindMatrix.push_back(mat);
+			mAlternateBindMatrix.push_back(LLMatrix4a(mat));
 		}
 	}
 
diff --git a/indra/llprimitive/llmodel.h b/indra/llprimitive/llmodel.h
index 51fa2f8079cfef5f9182df9a68e24280dce4f88b..96d4582b4f327bffe4dc5e99e0d0963a89f25d1b 100644
--- a/indra/llprimitive/llmodel.h
+++ b/indra/llprimitive/llmodel.h
@@ -33,6 +33,8 @@
 #include "m4math.h"
 #include <queue>
 
+#include <boost/align/aligned_allocator.hpp>
+
 class daeElement;
 class domMesh;
 
@@ -49,10 +51,11 @@ class LLMeshSkinInfo
 	LLUUID mMeshID;
 	std::vector<std::string> mJointNames;
     mutable std::vector<S32> mJointNums;
-	std::vector<LLMatrix4> mInvBindMatrix;
-	std::vector<LLMatrix4> mAlternateBindMatrix;
+    typedef std::vector<LLMatrix4a, boost::alignment::aligned_allocator<LLMatrix4a, 16>> matrix_list_t;
+	matrix_list_t mInvBindMatrix;
+	matrix_list_t mAlternateBindMatrix;
 
-	LLMatrix4 mBindShapeMatrix;
+	LLMatrix4a mBindShapeMatrix;
 	float mPelvisOffset;
     bool mLockScaleIfJointPosition;
     bool mInvalidJointsScrubbed;
diff --git a/indra/llrender/llglslshader.cpp b/indra/llrender/llglslshader.cpp
index 4351f6e2c86cdde783f128c835c95f4f6f4e4604..8bd9dbf9b8c324e50e75ebcea8af4bd8cca9e133 100644
--- a/indra/llrender/llglslshader.cpp
+++ b/indra/llrender/llglslshader.cpp
@@ -1346,6 +1346,7 @@ void LLGLSLShader::uniformMatrix3fv(U32 index, U32 count, GLboolean transpose, c
 
 void LLGLSLShader::uniformMatrix3x4fv(U32 index, U32 count, GLboolean transpose, const GLfloat *v)
 {
+    LL_PROFILE_ZONE_SCOPED;
 	if (mProgramObject)
 	{	
 		if (mUniform.size() <= index)
diff --git a/indra/newview/llcontrolavatar.cpp b/indra/newview/llcontrolavatar.cpp
index fab249f98807a865b7d4c21732eecd4c5443b440..606e6708053039540f7980a33f8daeae593192de 100644
--- a/indra/newview/llcontrolavatar.cpp
+++ b/indra/newview/llcontrolavatar.cpp
@@ -241,7 +241,7 @@ void LLControlAvatar::matchVolumeTransform()
 			if (skin_info)
 			{
                 LL_DEBUGS("BindShape") << getFullname() << " bind shape " << skin_info->mBindShapeMatrix << LL_ENDL;
-                bind_rot = LLSkinningUtil::getUnscaledQuaternion(skin_info->mBindShapeMatrix);
+                bind_rot = LLSkinningUtil::getUnscaledQuaternion(LLMatrix4(skin_info->mBindShapeMatrix));
 			}
 #endif
 			setRotation(bind_rot*obj_rot);
diff --git a/indra/newview/lldrawpoolavatar.cpp b/indra/newview/lldrawpoolavatar.cpp
index 6c4844f9ee4664ee3bf8412735e9fce0524923be..c7aa104ca53ee6034a39748118c158fb7889437f 100644
--- a/indra/newview/lldrawpoolavatar.cpp
+++ b/indra/newview/lldrawpoolavatar.cpp
@@ -163,6 +163,7 @@ void LLDrawPoolAvatar::prerender()
 		{
 			LLVOAvatar* avatarp = (LLVOAvatar *)facep->getDrawable()->getVObj().get();
 			updateRiggedVertexBuffers(avatarp);
+            updateSkinInfoMatrixPalettes(avatarp);
 		}
 	}
 }
@@ -428,221 +429,230 @@ S32 LLDrawPoolAvatar::getNumShadowPasses()
 void LLDrawPoolAvatar::beginShadowPass(S32 pass)
 {
 	LL_RECORD_BLOCK_TIME(FTM_SHADOW_AVATAR);
+    {
+        LL_PROFILE_ZONE_SCOPED;
 
-	if (pass == SHADOW_PASS_AVATAR_OPAQUE)
-	{
-		sVertexProgram = &gDeferredAvatarShadowProgram;
-		
-		if ((sShaderLevel > 0))  // for hardware blending
-		{
-			sRenderingSkinned = TRUE;
-			sVertexProgram->bind();
-		}
+        if (pass == SHADOW_PASS_AVATAR_OPAQUE)
+        {
+            sVertexProgram = &gDeferredAvatarShadowProgram;
 
-		gGL.diffuseColor4f(1,1,1,1);
-	}
-    else if (pass == SHADOW_PASS_AVATAR_ALPHA_BLEND)
-	{
-		sVertexProgram = &gDeferredAvatarAlphaShadowProgram;
+            if ((sShaderLevel > 0))  // for hardware blending
+            {
+                sRenderingSkinned = TRUE;
+                sVertexProgram->bind();
+            }
 
-        // bind diffuse tex so we can reference the alpha channel...
-        S32 loc = sVertexProgram->getUniformLocation(LLViewerShaderMgr::DIFFUSE_MAP);
-        sDiffuseChannel = 0;
-        if (loc != -1)
+            gGL.diffuseColor4f(1, 1, 1, 1);
+        }
+        else if (pass == SHADOW_PASS_AVATAR_ALPHA_BLEND)
         {
-            sDiffuseChannel = sVertexProgram->enableTexture(LLViewerShaderMgr::DIFFUSE_MAP);
-		}
+            sVertexProgram = &gDeferredAvatarAlphaShadowProgram;
 
-		if ((sShaderLevel > 0))  // for hardware blending
-		{
-			sRenderingSkinned = TRUE;
-			sVertexProgram->bind();
-		}
+            // bind diffuse tex so we can reference the alpha channel...
+            S32 loc = sVertexProgram->getUniformLocation(LLViewerShaderMgr::DIFFUSE_MAP);
+            sDiffuseChannel = 0;
+            if (loc != -1)
+            {
+                sDiffuseChannel = sVertexProgram->enableTexture(LLViewerShaderMgr::DIFFUSE_MAP);
+            }
 
-		gGL.diffuseColor4f(1,1,1,1);
-	}
-    else if (pass == SHADOW_PASS_AVATAR_ALPHA_MASK)
-	{
-		sVertexProgram = &gDeferredAvatarAlphaMaskShadowProgram;
+            if ((sShaderLevel > 0))  // for hardware blending
+            {
+                sRenderingSkinned = TRUE;
+                sVertexProgram->bind();
+            }
 
-        // bind diffuse tex so we can reference the alpha channel...
-        S32 loc = sVertexProgram->getUniformLocation(LLViewerShaderMgr::DIFFUSE_MAP);
-        sDiffuseChannel = 0;
-        if (loc != -1)
+            gGL.diffuseColor4f(1, 1, 1, 1);
+        }
+        else if (pass == SHADOW_PASS_AVATAR_ALPHA_MASK)
         {
-            sDiffuseChannel = sVertexProgram->enableTexture(LLViewerShaderMgr::DIFFUSE_MAP);
-		}
+            sVertexProgram = &gDeferredAvatarAlphaMaskShadowProgram;
 
-		if ((sShaderLevel > 0))  // for hardware blending
-		{
-			sRenderingSkinned = TRUE;
-			sVertexProgram->bind();
-		}
+            // bind diffuse tex so we can reference the alpha channel...
+            S32 loc = sVertexProgram->getUniformLocation(LLViewerShaderMgr::DIFFUSE_MAP);
+            sDiffuseChannel = 0;
+            if (loc != -1)
+            {
+                sDiffuseChannel = sVertexProgram->enableTexture(LLViewerShaderMgr::DIFFUSE_MAP);
+            }
 
-		gGL.diffuseColor4f(1,1,1,1);
-	}
-    else if (pass == SHADOW_PASS_ATTACHMENT_ALPHA_BLEND)
-	{
-		sVertexProgram = &gDeferredAttachmentAlphaShadowProgram;
+            if ((sShaderLevel > 0))  // for hardware blending
+            {
+                sRenderingSkinned = TRUE;
+                sVertexProgram->bind();
+            }
 
-        // bind diffuse tex so we can reference the alpha channel...
-        S32 loc = sVertexProgram->getUniformLocation(LLViewerShaderMgr::DIFFUSE_MAP);
-        sDiffuseChannel = 0;
-        if (loc != -1)
+            gGL.diffuseColor4f(1, 1, 1, 1);
+        }
+        else if (pass == SHADOW_PASS_ATTACHMENT_ALPHA_BLEND)
         {
-            sDiffuseChannel = sVertexProgram->enableTexture(LLViewerShaderMgr::DIFFUSE_MAP);
-		}
-		
-		if ((sShaderLevel > 0))  // for hardware blending
-		{
-			sRenderingSkinned = TRUE;
-			sVertexProgram->bind();
-		}
+            sVertexProgram = &gDeferredAttachmentAlphaShadowProgram;
 
-		gGL.diffuseColor4f(1,1,1,1);
-	}
-    else if (pass == SHADOW_PASS_ATTACHMENT_ALPHA_MASK)
-	{
-		sVertexProgram = &gDeferredAttachmentAlphaMaskShadowProgram;
+            // bind diffuse tex so we can reference the alpha channel...
+            S32 loc = sVertexProgram->getUniformLocation(LLViewerShaderMgr::DIFFUSE_MAP);
+            sDiffuseChannel = 0;
+            if (loc != -1)
+            {
+                sDiffuseChannel = sVertexProgram->enableTexture(LLViewerShaderMgr::DIFFUSE_MAP);
+            }
+
+            if ((sShaderLevel > 0))  // for hardware blending
+            {
+                sRenderingSkinned = TRUE;
+                sVertexProgram->bind();
+            }
 
-        // bind diffuse tex so we can reference the alpha channel...
-		S32 loc = sVertexProgram->getUniformLocation(LLViewerShaderMgr::DIFFUSE_MAP);
-        sDiffuseChannel = 0;
-        if (loc != -1)
+            gGL.diffuseColor4f(1, 1, 1, 1);
+        }
+        else if (pass == SHADOW_PASS_ATTACHMENT_ALPHA_MASK)
         {
-            sDiffuseChannel = sVertexProgram->enableTexture(LLViewerShaderMgr::DIFFUSE_MAP);
-		}
+            sVertexProgram = &gDeferredAttachmentAlphaMaskShadowProgram;
 
-		if ((sShaderLevel > 0))  // for hardware blending
-		{
-			sRenderingSkinned = TRUE;
-			sVertexProgram->bind();
-		}
+            // bind diffuse tex so we can reference the alpha channel...
+            S32 loc = sVertexProgram->getUniformLocation(LLViewerShaderMgr::DIFFUSE_MAP);
+            sDiffuseChannel = 0;
+            if (loc != -1)
+            {
+                sDiffuseChannel = sVertexProgram->enableTexture(LLViewerShaderMgr::DIFFUSE_MAP);
+            }
 
-		gGL.diffuseColor4f(1,1,1,1);
-	}
-	else // SHADOW_PASS_ATTACHMENT_OPAQUE
-	{
-		sVertexProgram = &gDeferredAttachmentShadowProgram;
-		S32 loc = sVertexProgram->getUniformLocation(LLViewerShaderMgr::DIFFUSE_MAP);
-        sDiffuseChannel = 0;
-        if (loc != -1)
+            if ((sShaderLevel > 0))  // for hardware blending
+            {
+                sRenderingSkinned = TRUE;
+                sVertexProgram->bind();
+            }
+
+            gGL.diffuseColor4f(1, 1, 1, 1);
+        }
+        else // SHADOW_PASS_ATTACHMENT_OPAQUE
         {
-            sDiffuseChannel = sVertexProgram->enableTexture(LLViewerShaderMgr::DIFFUSE_MAP);
-		}
-		sVertexProgram->bind();
-	}
+            sVertexProgram = &gDeferredAttachmentShadowProgram;
+            S32 loc = sVertexProgram->getUniformLocation(LLViewerShaderMgr::DIFFUSE_MAP);
+            sDiffuseChannel = 0;
+            if (loc != -1)
+            {
+                sDiffuseChannel = sVertexProgram->enableTexture(LLViewerShaderMgr::DIFFUSE_MAP);
+            }
+            sVertexProgram->bind();
+        }
+    }
 }
 
 void LLDrawPoolAvatar::endShadowPass(S32 pass)
 {
 	LL_RECORD_BLOCK_TIME(FTM_SHADOW_AVATAR);
+    {
+        LL_PROFILE_ZONE_SCOPED;
 
-	if (pass == SHADOW_PASS_ATTACHMENT_OPAQUE)
-	{
-		LLVertexBuffer::unbind();
-	}
+        if (pass == SHADOW_PASS_ATTACHMENT_OPAQUE)
+        {
+            LLVertexBuffer::unbind();
+        }
 
-    if (sShaderLevel > 0)
-	{			
-		sVertexProgram->unbind();
-	}
-    sVertexProgram = NULL;
-    sRenderingSkinned = FALSE;
-    LLDrawPoolAvatar::sShadowPass = -1;
+        if (sShaderLevel > 0)
+        {
+            sVertexProgram->unbind();
+        }
+        sVertexProgram = NULL;
+        sRenderingSkinned = FALSE;
+        LLDrawPoolAvatar::sShadowPass = -1;
+    }
 }
 
 void LLDrawPoolAvatar::renderShadow(S32 pass)
 {
-	LL_RECORD_BLOCK_TIME(FTM_SHADOW_AVATAR);
+    LL_RECORD_BLOCK_TIME(FTM_SHADOW_AVATAR);
+    {
+        LL_PROFILE_ZONE_SCOPED;
 
-	if (mDrawFace.empty())
-	{
-		return;
-	}
+        if (mDrawFace.empty())
+        {
+            return;
+        }
 
-	const LLFace *facep = mDrawFace[0];
-	if (!facep->getDrawable())
-	{
-		return;
-	}
-	LLVOAvatar *avatarp = (LLVOAvatar *)facep->getDrawable()->getVObj().get();
+        const LLFace *facep = mDrawFace[0];
+        if (!facep->getDrawable())
+        {
+            return;
+        }
+        LLVOAvatar *avatarp = (LLVOAvatar *)facep->getDrawable()->getVObj().get();
 
-	if (avatarp->isDead() || avatarp->isUIAvatar() || avatarp->mDrawable.isNull())
-	{
-		return;
-	}
-	LLVOAvatar::AvatarOverallAppearance oa = avatarp->getOverallAppearance();
-	BOOL impostor = !LLPipeline::sImpostorRender && avatarp->isImpostor();
-	if (oa == LLVOAvatar::AOA_INVISIBLE ||
-		(impostor && oa == LLVOAvatar::AOA_JELLYDOLL))
-	{
-		// No shadows for jellydolled or invisible avs.
-		return;
-	}
-	
-    LLDrawPoolAvatar::sShadowPass = pass;
+        if (avatarp->isDead() || avatarp->isUIAvatar() || avatarp->mDrawable.isNull())
+        {
+            return;
+        }
+        LLVOAvatar::AvatarOverallAppearance oa = avatarp->getOverallAppearance();
+        BOOL impostor = !LLPipeline::sImpostorRender && avatarp->isImpostor();
+        if (oa == LLVOAvatar::AOA_INVISIBLE ||
+            (impostor && oa == LLVOAvatar::AOA_JELLYDOLL))
+        {
+            // No shadows for jellydolled or invisible avs.
+            return;
+        }
 
-	if (pass == SHADOW_PASS_AVATAR_OPAQUE)
-	{
-        LLDrawPoolAvatar::sSkipTransparent = true;
-		avatarp->renderSkinned();
-        LLDrawPoolAvatar::sSkipTransparent = false;
-	}
-    else if (pass == SHADOW_PASS_AVATAR_ALPHA_BLEND)
-	{
-        LLDrawPoolAvatar::sSkipOpaque = true;
-		avatarp->renderSkinned();
-        LLDrawPoolAvatar::sSkipOpaque = false;
-	}
-    else if (pass == SHADOW_PASS_AVATAR_ALPHA_MASK)
-	{
-        LLDrawPoolAvatar::sSkipOpaque = true;
-		avatarp->renderSkinned();
-        LLDrawPoolAvatar::sSkipOpaque = false;
-	}
-    else if (pass == SHADOW_PASS_ATTACHMENT_ALPHA_BLEND) // rigged alpha
-	{
-        LLDrawPoolAvatar::sSkipOpaque = true;
-        renderRigged(avatarp, RIGGED_MATERIAL_ALPHA);
-        renderRigged(avatarp, RIGGED_MATERIAL_ALPHA_EMISSIVE);
-        renderRigged(avatarp, RIGGED_ALPHA);
-        renderRigged(avatarp, RIGGED_FULLBRIGHT_ALPHA);
-        renderRigged(avatarp, RIGGED_GLOW);
-        renderRigged(avatarp, RIGGED_SPECMAP_BLEND);
-        renderRigged(avatarp, RIGGED_NORMMAP_BLEND);
-        renderRigged(avatarp, RIGGED_NORMSPEC_BLEND);
-        LLDrawPoolAvatar::sSkipOpaque = false;
-	}
-    else if (pass == SHADOW_PASS_ATTACHMENT_ALPHA_MASK) // rigged alpha mask
-	{
-        LLDrawPoolAvatar::sSkipOpaque = true;
-        renderRigged(avatarp, RIGGED_MATERIAL_ALPHA_MASK);
-        renderRigged(avatarp, RIGGED_NORMMAP_MASK);
-        renderRigged(avatarp, RIGGED_SPECMAP_MASK);
-		renderRigged(avatarp, RIGGED_NORMSPEC_MASK);    
-        renderRigged(avatarp, RIGGED_GLOW);
-        LLDrawPoolAvatar::sSkipOpaque = false;
-	}
-	else // rigged opaque (SHADOW_PASS_ATTACHMENT_OPAQUE
-	{
-        LLDrawPoolAvatar::sSkipTransparent = true;
-		renderRigged(avatarp, RIGGED_MATERIAL);
-        renderRigged(avatarp, RIGGED_SPECMAP);
-		renderRigged(avatarp, RIGGED_SPECMAP_EMISSIVE);
-		renderRigged(avatarp, RIGGED_NORMMAP);		
-		renderRigged(avatarp, RIGGED_NORMMAP_EMISSIVE);
-		renderRigged(avatarp, RIGGED_NORMSPEC);
-		renderRigged(avatarp, RIGGED_NORMSPEC_EMISSIVE);
-		renderRigged(avatarp, RIGGED_SIMPLE);
-		renderRigged(avatarp, RIGGED_FULLBRIGHT);
-		renderRigged(avatarp, RIGGED_SHINY);
-		renderRigged(avatarp, RIGGED_FULLBRIGHT_SHINY);
-		renderRigged(avatarp, RIGGED_GLOW);
-		renderRigged(avatarp, RIGGED_DEFERRED_BUMP);
-		renderRigged(avatarp, RIGGED_DEFERRED_SIMPLE);
-        LLDrawPoolAvatar::sSkipTransparent = false;
-	}
+        LLDrawPoolAvatar::sShadowPass = pass;
+
+        if (pass == SHADOW_PASS_AVATAR_OPAQUE)
+        {
+            LLDrawPoolAvatar::sSkipTransparent = true;
+            avatarp->renderSkinned();
+            LLDrawPoolAvatar::sSkipTransparent = false;
+        }
+        else if (pass == SHADOW_PASS_AVATAR_ALPHA_BLEND)
+        {
+            LLDrawPoolAvatar::sSkipOpaque = true;
+            avatarp->renderSkinned();
+            LLDrawPoolAvatar::sSkipOpaque = false;
+        }
+        else if (pass == SHADOW_PASS_AVATAR_ALPHA_MASK)
+        {
+            LLDrawPoolAvatar::sSkipOpaque = true;
+            avatarp->renderSkinned();
+            LLDrawPoolAvatar::sSkipOpaque = false;
+        }
+        else if (pass == SHADOW_PASS_ATTACHMENT_ALPHA_BLEND) // rigged alpha
+        {
+            LLDrawPoolAvatar::sSkipOpaque = true;
+            renderRigged(avatarp, RIGGED_MATERIAL_ALPHA);
+            renderRigged(avatarp, RIGGED_MATERIAL_ALPHA_EMISSIVE);
+            renderRigged(avatarp, RIGGED_ALPHA);
+            renderRigged(avatarp, RIGGED_FULLBRIGHT_ALPHA);
+            renderRigged(avatarp, RIGGED_GLOW);
+            renderRigged(avatarp, RIGGED_SPECMAP_BLEND);
+            renderRigged(avatarp, RIGGED_NORMMAP_BLEND);
+            renderRigged(avatarp, RIGGED_NORMSPEC_BLEND);
+            LLDrawPoolAvatar::sSkipOpaque = false;
+        }
+        else if (pass == SHADOW_PASS_ATTACHMENT_ALPHA_MASK) // rigged alpha mask
+        {
+            LLDrawPoolAvatar::sSkipOpaque = true;
+            renderRigged(avatarp, RIGGED_MATERIAL_ALPHA_MASK);
+            renderRigged(avatarp, RIGGED_NORMMAP_MASK);
+            renderRigged(avatarp, RIGGED_SPECMAP_MASK);
+            renderRigged(avatarp, RIGGED_NORMSPEC_MASK);
+            renderRigged(avatarp, RIGGED_GLOW);
+            LLDrawPoolAvatar::sSkipOpaque = false;
+        }
+        else // rigged opaque (SHADOW_PASS_ATTACHMENT_OPAQUE
+        {
+            LLDrawPoolAvatar::sSkipTransparent = true;
+            renderRigged(avatarp, RIGGED_MATERIAL);
+            renderRigged(avatarp, RIGGED_SPECMAP);
+            renderRigged(avatarp, RIGGED_SPECMAP_EMISSIVE);
+            renderRigged(avatarp, RIGGED_NORMMAP);
+            renderRigged(avatarp, RIGGED_NORMMAP_EMISSIVE);
+            renderRigged(avatarp, RIGGED_NORMSPEC);
+            renderRigged(avatarp, RIGGED_NORMSPEC_EMISSIVE);
+            renderRigged(avatarp, RIGGED_SIMPLE);
+            renderRigged(avatarp, RIGGED_FULLBRIGHT);
+            renderRigged(avatarp, RIGGED_SHINY);
+            renderRigged(avatarp, RIGGED_FULLBRIGHT_SHINY);
+            renderRigged(avatarp, RIGGED_GLOW);
+            renderRigged(avatarp, RIGGED_DEFERRED_BUMP);
+            renderRigged(avatarp, RIGGED_DEFERRED_SIMPLE);
+            LLDrawPoolAvatar::sSkipTransparent = false;
+        }
+    }
 }
 
 S32 LLDrawPoolAvatar::getNumPasses()
@@ -1794,7 +1804,7 @@ void LLDrawPoolAvatar::getRiggedGeometry(
 
 	U16 offset = 0;
 		
-	LLMatrix4 mat_vert = skin->mBindShapeMatrix;
+	LLMatrix4 mat_vert = LLMatrix4(skin->mBindShapeMatrix);
 	glh::matrix4f m((F32*) mat_vert.mMatrix);
 	m = m.inverse().transpose();
 		
@@ -1968,14 +1978,11 @@ void LLDrawPoolAvatar::updateRiggedFaceVertexBuffer(
             skin = vobj->getSkinInfo();
         }
 
-		//build matrix palette
-		LLMatrix4a mat[LL_MAX_JOINTS_PER_MESH_OBJECT];
-        U32 count = LLSkinningUtil::getMeshJointCount(skin);
-        LLSkinningUtil::initSkinningMatrixPalette((LLMatrix4*)mat, count, skin, avatar);
-        LLSkinningUtil::checkSkinWeights(weights, buffer->getNumVerts(), skin);
+        const MatrixPaletteCache& mpc = updateSkinInfoMatrixPalette(avatar, skin);
+        const LLMatrix4a* mat = &(mpc.mMatrixPalette[0]);
 
-		LLMatrix4a bind_shape_matrix;
-		bind_shape_matrix.loadu(skin->mBindShapeMatrix);
+        LLSkinningUtil::checkSkinWeights(weights, buffer->getNumVerts(), skin);
+		const LLMatrix4a& bind_shape_matrix = skin->mBindShapeMatrix;
 
 #if USE_SEPARATE_JOINT_INDICES_AND_WEIGHTS
         U8* joint_indices_cursor = vol_face.mJointIndices;
@@ -2043,6 +2050,8 @@ void LLDrawPoolAvatar::updateRiggedFaceVertexBuffer(
 
 void LLDrawPoolAvatar::renderRigged(LLVOAvatar* avatar, U32 type, bool glow)
 {
+    LL_PROFILE_ZONE_SCOPED;
+
 	if (!avatar->shouldRenderRigged())
 	{
 		return;
@@ -2050,15 +2059,18 @@ void LLDrawPoolAvatar::renderRigged(LLVOAvatar* avatar, U32 type, bool glow)
 
 	stop_glerror();
 
+    const LLMeshSkinInfo* lastSkin = nullptr;
+
 	for (U32 i = 0; i < mRiggedFace[type].size(); ++i)
 	{
+        LL_PROFILE_ZONE_NAMED("Render Rigged Face");
 		LLFace* face = mRiggedFace[type][i];
 
         S32 offset = face->getIndicesStart();
 		U32 count = face->getIndicesCount();
 
         U16 start = face->getGeomStart();
-		U16 end = start + face->getGeomCount()-1;			
+		U16 end = start + face->getGeomCount()-1;
 
 		LLDrawable* drawable = face->getDrawable();
 		if (!drawable)
@@ -2180,52 +2192,32 @@ void LLDrawPoolAvatar::renderRigged(LLVOAvatar* avatar, U32 type, bool glow)
         }
 
 		if (buff)
-		{        
+		{
 			if (sShaderLevel > 0)
 			{
-                // upload matrix palette to shader
-				LLMatrix4a mat[LL_MAX_JOINTS_PER_MESH_OBJECT];
-				U32 count = LLSkinningUtil::getMeshJointCount(skin);
-                LLSkinningUtil::initSkinningMatrixPalette((LLMatrix4*)mat, count, skin, avatar);
-
-				stop_glerror();
-
-				F32 mp[LL_MAX_JOINTS_PER_MESH_OBJECT*12];
-
-				for (U32 i = 0; i < count; ++i)
-				{
-					F32* m = (F32*) mat[i].mMatrix[0].getF32ptr();
-
-					U32 idx = i*12;
-
-					mp[idx+0] = m[0];
-					mp[idx+1] = m[1];
-					mp[idx+2] = m[2];
-					mp[idx+3] = m[12];
-
-					mp[idx+4] = m[4];
-					mp[idx+5] = m[5];
-					mp[idx+6] = m[6];
-					mp[idx+7] = m[13];
+                if (lastSkin != skin) // <== only upload matrix palette to GL if the skininfo changed
+                {
+                    // upload matrix palette to shader
+                    const MatrixPaletteCache& mpc = updateSkinInfoMatrixPalette(avatar, skin);
+                    U32 count = mpc.mMatrixPalette.size();
 
-					mp[idx+8] = m[8];
-					mp[idx+9] = m[9];
-					mp[idx+10] = m[10];
-					mp[idx+11] = m[14];
-				}
+                    stop_glerror();
 
-				LLDrawPoolAvatar::sVertexProgram->uniformMatrix3x4fv(LLViewerShaderMgr::AVATAR_MATRIX, 
-					count,
-					FALSE,
-					(GLfloat*) mp);
+                    LLDrawPoolAvatar::sVertexProgram->uniformMatrix3x4fv(LLViewerShaderMgr::AVATAR_MATRIX,
+                        count,
+                        FALSE,
+                        (GLfloat*) &(mpc.mGLMp[0]));
 
-				stop_glerror();
+                    stop_glerror();
+                }
 			}
 			else
 			{
 				data_mask &= ~LLVertexBuffer::MAP_WEIGHT4;
 			}
 
+            lastSkin = skin;
+
 			/*if (glow)
 			{
 				gGL.diffuseColor4f(0,0,0,face->getTextureEntry()->getGlow());
@@ -2391,6 +2383,70 @@ void LLDrawPoolAvatar::updateRiggedVertexBuffers(LLVOAvatar* avatar)
 	}
 }
 
+void LLDrawPoolAvatar::updateSkinInfoMatrixPalettes(LLVOAvatar* avatarp)
+{
+    LL_PROFILE_ZONE_SCOPED;
+    //evict matrix palettes from the cache that haven't been updated in 10 frames
+    for (matrix_palette_cache_t::iterator iter = mMatrixPaletteCache.begin(); iter != mMatrixPaletteCache.end(); )
+    {
+        if (gFrameCount - iter->second.mFrame > 10)
+        {
+            iter = mMatrixPaletteCache.erase(iter);
+        }
+        else
+        {
+            ++iter;
+        }
+    }
+}
+
+const LLDrawPoolAvatar::MatrixPaletteCache& LLDrawPoolAvatar::updateSkinInfoMatrixPalette(LLVOAvatar * avatarp, const LLMeshSkinInfo* skin)
+{
+    MatrixPaletteCache& entry = mMatrixPaletteCache[skin];
+
+    if (entry.mFrame != gFrameCount)
+    {
+        LL_PROFILE_ZONE_SCOPED;
+        entry.mFrame = gFrameCount;
+        //build matrix palette
+        U32 count = LLSkinningUtil::getMeshJointCount(skin);
+        entry.mMatrixPalette.resize(count);
+        LLSkinningUtil::initSkinningMatrixPalette(&(entry.mMatrixPalette[0]), count, skin, avatarp);
+
+        const LLMatrix4a* mat = &(entry.mMatrixPalette[0]);
+
+        stop_glerror();
+        
+        entry.mGLMp.resize(count * 12);
+
+        F32* mp = &(entry.mGLMp[0]);
+        
+        for (U32 i = 0; i < count; ++i)
+        {
+            F32* m = (F32*)mat[i].mMatrix[0].getF32ptr();
+
+            U32 idx = i * 12;
+
+            mp[idx + 0] = m[0];
+            mp[idx + 1] = m[1];
+            mp[idx + 2] = m[2];
+            mp[idx + 3] = m[12];
+
+            mp[idx + 4] = m[4];
+            mp[idx + 5] = m[5];
+            mp[idx + 6] = m[6];
+            mp[idx + 7] = m[13];
+
+            mp[idx + 8] = m[8];
+            mp[idx + 9] = m[9];
+            mp[idx + 10] = m[10];
+            mp[idx + 11] = m[14];
+        }
+    }
+
+    return entry;
+}
+
 void LLDrawPoolAvatar::renderRiggedSimple(LLVOAvatar* avatar)
 {
 	renderRigged(avatar, RIGGED_SIMPLE);
diff --git a/indra/newview/lldrawpoolavatar.h b/indra/newview/lldrawpoolavatar.h
index 9b26266ced3481f740c837974bcb1f62851ef594..0c1ee2cced6d17249a788b261e5c0caab04412f4 100644
--- a/indra/newview/lldrawpoolavatar.h
+++ b/indra/newview/lldrawpoolavatar.h
@@ -28,15 +28,18 @@
 #define LL_LLDRAWPOOLAVATAR_H
 
 #include "lldrawpool.h"
+#include "llmodel.h"
+
+#include <unordered_map>
 
 class LLVOAvatar;
 class LLVOVolume;
 class LLGLSLShader;
 class LLFace;
-class LLMeshSkinInfo;
 class LLVolume;
 class LLVolumeFace;
 
+extern U32 gFrameCount;
 
 class LLDrawPoolAvatar : public LLFacePool
 {
@@ -259,6 +262,8 @@ typedef enum
 									  LLVolumeFace& vol_face);
 	void updateRiggedVertexBuffers(LLVOAvatar* avatar);
 
+    void updateSkinInfoMatrixPalettes(LLVOAvatar* avatarp);
+
 	void renderRigged(LLVOAvatar* avatar, U32 type, bool glow = false);
 	void renderRiggedSimple(LLVOAvatar* avatar);
 	void renderRiggedAlpha(LLVOAvatar* avatar);
@@ -278,6 +283,26 @@ typedef enum
 
 	std::vector<LLFace*> mRiggedFace[NUM_RIGGED_PASSES];
 
+    class MatrixPaletteCache
+    {
+    public:
+        U32 mFrame;
+        LLMeshSkinInfo::matrix_list_t mMatrixPalette;
+        
+        // Float array ready to be sent to GL
+        std::vector<F32> mGLMp;
+
+        MatrixPaletteCache() :
+            mFrame(gFrameCount-1)
+        {
+        }
+    };
+    
+    const MatrixPaletteCache& updateSkinInfoMatrixPalette(LLVOAvatar* avatarp, const LLMeshSkinInfo* skin);
+
+    typedef std::unordered_map<const LLMeshSkinInfo*, MatrixPaletteCache> matrix_palette_cache_t;
+    matrix_palette_cache_t mMatrixPaletteCache;
+
 	/*virtual*/ LLViewerTexture *getDebugTexture();
 	/*virtual*/ LLColor3 getDebugColor() const; // For AGP debug display
 
diff --git a/indra/newview/llfloatermodelpreview.cpp b/indra/newview/llfloatermodelpreview.cpp
index d9edd4dc30b16b6a7f3f03431c70e92c7b267edf..0e54b66ea959bd8bec32b59c3282d374b4c9e9f2 100644
--- a/indra/newview/llfloatermodelpreview.cpp
+++ b/indra/newview/llfloatermodelpreview.cpp
@@ -1357,31 +1357,31 @@ void LLFloaterModelPreview::clearAvatarTab()
 			}
 
 void LLFloaterModelPreview::updateAvatarTab(bool highlight_overrides)
-			{
+{
     S32 display_lod = mModelPreview->mPreviewLOD;
     if (mModelPreview->mModel[display_lod].empty())
-				{
+    {
         mSelectedJointName.clear();
         return;
-					}
+    }
 
     // Joints will be listed as long as they are listed in mAlternateBindMatrix
     // even if they are for some reason identical to defaults.
     // Todo: Are overrides always identical for all lods? They normally are, but there might be situations where they aren't.
     if (mJointOverrides[display_lod].empty())
-					{
+    {
         // populate map
         for (LLModelLoader::scene::iterator iter = mModelPreview->mScene[display_lod].begin(); iter != mModelPreview->mScene[display_lod].end(); ++iter)
-					{
+        {
             for (LLModelLoader::model_instance_list::iterator model_iter = iter->second.begin(); model_iter != iter->second.end(); ++model_iter)
-					{
+            {
                 LLModelInstance& instance = *model_iter;
                 LLModel* model = instance.mModel;
                 const LLMeshSkinInfo *skin = &model->mSkinInfo;
                 U32 joint_count = LLSkinningUtil::getMeshJointCount(skin);
                 U32 bind_count = highlight_overrides ? skin->mAlternateBindMatrix.size() : 0; // simply do not include overrides if data is not needed
                 if (bind_count > 0 && bind_count != joint_count)
-						{
+                {
                     std::ostringstream out;
                     out << "Invalid joint overrides for model " << model->getName();
                     out << ". Amount of joints " << joint_count;
@@ -1390,68 +1390,68 @@ void LLFloaterModelPreview::updateAvatarTab(bool highlight_overrides)
                     addStringToLog(out.str(), true);
                     // Disable overrides for this model
                     bind_count = 0;
-						}
+                }
                 if (bind_count > 0)
-						{
+                {
                     for (U32 j = 0; j < joint_count; ++j)
-							{
-                        const LLVector3& joint_pos = skin->mAlternateBindMatrix[j].getTranslation();
+                    {
+                        const LLVector3& joint_pos = LLVector3(skin->mAlternateBindMatrix[j].getTranslation());
                         LLJointOverrideData &data = mJointOverrides[display_lod][skin->mJointNames[j]];
 
                         LLJoint* pJoint = LLModelPreview::lookupJointByName(skin->mJointNames[j], mModelPreview);
                         if (pJoint)
-							{
+                        {
                             // see how voavatar uses aboveJointPosThreshold
                             if (pJoint->aboveJointPosThreshold(joint_pos))
-				{
+                            {
                                 // valid override
                                 if (data.mPosOverrides.size() > 0
                                     && (data.mPosOverrides.begin()->second - joint_pos).lengthSquared() > (LL_JOINT_TRESHOLD_POS_OFFSET * LL_JOINT_TRESHOLD_POS_OFFSET))
-					{
+                                {
                                     // File contains multiple meshes with conflicting joint offsets
                                     // preview may be incorrect, upload result might wary (depends onto
                                     // mesh_id that hasn't been generated yet).
                                     data.mHasConflicts = true;
-							}
+                                }
                                 data.mPosOverrides[model->getName()] = joint_pos;
-						}
-						else
-						{
+                            }
+                            else
+                            {
                                 // default value, it won't be accounted for by avatar
                                 data.mModelsNoOverrides.insert(model->getName());
-					}
-					}
-				}
-			}
-			else
-			{
+                            }
+                        }
+                    }
+                }
+                else
+                {
                     for (U32 j = 0; j < joint_count; ++j)
-				{				
+                    {
                         LLJointOverrideData &data = mJointOverrides[display_lod][skin->mJointNames[j]];
                         data.mModelsNoOverrides.insert(model->getName());
                     }
                 }
-			}
-		}
-	}
+            }
+        }
+    }
 
     LLPanel *panel = mTabContainer->getPanelByName("rigging_panel");
     LLScrollListCtrl *joints_list = panel->getChild<LLScrollListCtrl>("joints_list");
 
     if (joints_list->isEmpty())
-	{
+    {
         // Populate table
 
-    std::map<std::string, std::string> joint_alias_map;
+        std::map<std::string, std::string> joint_alias_map;
         mModelPreview->getJointAliases(joint_alias_map);
-    
+
         S32 conflicts = 0;
         joint_override_data_map_t::iterator joint_iter = mJointOverrides[display_lod].begin();
         joint_override_data_map_t::iterator joint_end = mJointOverrides[display_lod].end();
         while (joint_iter != joint_end)
-	{
+        {
             const std::string& listName = joint_iter->first;
-        
+
             LLScrollListItem::Params item_params;
             item_params.value(listName);
 
@@ -1459,38 +1459,38 @@ void LLFloaterModelPreview::updateAvatarTab(bool highlight_overrides)
             cell_params.font = LLFontGL::getFontSansSerif();
             cell_params.value = listName;
             if (joint_alias_map.find(listName) == joint_alias_map.end())
-	{
+            {
                 // Missing names
                 cell_params.color = LLColor4::red;
-	}
+            }
             if (joint_iter->second.mHasConflicts)
-	{
+            {
                 // Conflicts
                 cell_params.color = LLColor4::orange;
                 conflicts++;
-	}
+            }
             if (highlight_overrides && joint_iter->second.mPosOverrides.size() > 0)
-	{
+            {
                 cell_params.font.style = "BOLD";
-	}
+            }
 
             item_params.columns.add(cell_params);
 
             joints_list->addRow(item_params, ADD_BOTTOM);
             joint_iter++;
-	}
+        }
         joints_list->selectFirstItem();
         LLScrollListItem *selected = joints_list->getFirstSelected();
         if (selected)
-{
+        {
             mSelectedJointName = selected->getValue().asString();
-	}
+        }
 
         LLTextBox *joint_conf_descr = panel->getChild<LLTextBox>("conflicts_description");
         joint_conf_descr->setTextArg("[CONFLICTS]", llformat("%d", conflicts));
         joint_conf_descr->setTextArg("[JOINTS_COUNT]", llformat("%d", mJointOverrides[display_lod].size()));
-		}
-	}
+    }
+}
 
 //-----------------------------------------------------------------------------
 // addStringToLogTab()
diff --git a/indra/newview/llmodelpreview.cpp b/indra/newview/llmodelpreview.cpp
index a9e80ab5da107dcd7ea1ae10c3edaf030561c50a..01bddd781d2798cfd6740c925b30ec4bfffadbb0 100644
--- a/indra/newview/llmodelpreview.cpp
+++ b/indra/newview/llmodelpreview.cpp
@@ -591,7 +591,7 @@ void LLModelPreview::rebuildUploadData()
                 bool upload_skinweights = fmp && fmp->childGetValue("upload_skin").asBoolean();
                 if (upload_skinweights && high_lod_model->mSkinInfo.mJointNames.size() > 0)
                 {
-                    LLQuaternion bind_rot = LLSkinningUtil::getUnscaledQuaternion(high_lod_model->mSkinInfo.mBindShapeMatrix);
+                    LLQuaternion bind_rot = LLSkinningUtil::getUnscaledQuaternion(LLMatrix4(high_lod_model->mSkinInfo.mBindShapeMatrix));
                     LLQuaternion identity;
                     if (!bind_rot.isEqualEps(identity, 0.01))
                     {
@@ -3298,7 +3298,7 @@ BOOL LLModelPreview::render()
                                 LLJoint *joint = getPreviewAvatar()->getJoint(skin->mJointNums[j]);
                                 if (joint)
                                 {
-                                    const LLVector3& jointPos = skin->mAlternateBindMatrix[j].getTranslation();
+                                    const LLVector3& jointPos = LLVector3(skin->mAlternateBindMatrix[j].getTranslation());
                                     if (joint->aboveJointPosThreshold(jointPos))
                                     {
                                         bool override_changed;
@@ -3340,11 +3340,10 @@ BOOL LLModelPreview::render()
                             //build matrix palette
 
                             LLMatrix4a mat[LL_MAX_JOINTS_PER_MESH_OBJECT];
-                            LLSkinningUtil::initSkinningMatrixPalette((LLMatrix4*)mat, joint_count,
+                            LLSkinningUtil::initSkinningMatrixPalette(mat, joint_count,
                                 skin, getPreviewAvatar());
 
-                            LLMatrix4a bind_shape_matrix;
-                            bind_shape_matrix.loadu(skin->mBindShapeMatrix);
+                            const LLMatrix4a& bind_shape_matrix = skin->mBindShapeMatrix;
                             U32 max_joints = LLSkinningUtil::getMaxJointCount();
                             for (U32 j = 0; j < buffer->getNumVerts(); ++j)
                             {
diff --git a/indra/newview/llskinningutil.cpp b/indra/newview/llskinningutil.cpp
index e02b21f0365369119df3949dda251febaa397583..dc12de29fb3f1a2862e09e1c7257b122ec6a30ca 100644
--- a/indra/newview/llskinningutil.cpp
+++ b/indra/newview/llskinningutil.cpp
@@ -35,7 +35,6 @@
 #include "llrigginginfo.h"
 
 #define DEBUG_SKINNING  LL_DEBUG
-#define MAT_USE_SSE     1
 
 void dump_avatar_and_skin_state(const std::string& reason, LLVOAvatar *avatar, const LLMeshSkinInfo *skin)
 {
@@ -120,36 +119,26 @@ void LLSkinningUtil::scrubInvalidJoints(LLVOAvatar *avatar, LLMeshSkinInfo* skin
     skin->mInvalidJointsScrubbed = true;
 }
 
-#define MAT_USE_SSE 1
-
 void LLSkinningUtil::initSkinningMatrixPalette(
-    LLMatrix4* mat,
+    LLMatrix4a* mat,
     S32 count, 
     const LLMeshSkinInfo* skin,
     LLVOAvatar *avatar)
 {
+    LL_PROFILE_ZONE_SCOPED;
+
     initJointNums(const_cast<LLMeshSkinInfo*>(skin), avatar);
+
+    LLMatrix4a world[LL_CHARACTER_MAX_ANIMATED_JOINTS];
+
     for (U32 j = 0; j < count; ++j)
     {
         S32 joint_num = skin->mJointNums[j];
-        LLJoint *joint = NULL;
-        if (joint_num >= 0 && joint_num < LL_CHARACTER_MAX_ANIMATED_JOINTS)
-        {
-            joint = avatar->getJoint(joint_num);
-        }
-        llassert(joint);
+        LLJoint *joint = avatar->getJoint(joint_num);
+
         if (joint)
         {
-#ifdef MAT_USE_SSE
-            LLMatrix4a bind, world, res;
-            bind.loadu(skin->mInvBindMatrix[j]);
-            world.loadu(joint->getWorldMatrix());
-            matMul(bind,world,res);
-            memcpy(mat[j].mMatrix,res.mMatrix,16*sizeof(float));
-#else
-            mat[j] = skin->mInvBindMatrix[j];
-            mat[j] *= joint->getWorldMatrix();
-#endif
+            world[j] = joint->getWorldMatrix4a();
         }
         else
         {
@@ -159,16 +148,27 @@ void LLSkinningUtil::initSkinningMatrixPalette(
             // rendering  should  be disabled  unless  all joints  are
             // valid.  In other  cases of  skinned  rendering, invalid
             // joints should already have  been removed during scrubInvalidJoints().
-            LL_WARNS_ONCE("Avatar") << avatar->getFullname() 
-                                    << " rigged to invalid joint name " << skin->mJointNames[j] 
-                                    << " num " << skin->mJointNums[j] << LL_ENDL;
-            LL_WARNS_ONCE("Avatar") << avatar->getFullname() 
-                                    << " avatar build state: isBuilt() " << avatar->isBuilt() 
-                                    << " mInitFlags " << avatar->mInitFlags << LL_ENDL;
+            LL_WARNS_ONCE("Avatar") << avatar->getFullname()
+                << " rigged to invalid joint name " << skin->mJointNames[j]
+                << " num " << skin->mJointNums[j] << LL_ENDL;
+            LL_WARNS_ONCE("Avatar") << avatar->getFullname()
+                << " avatar build state: isBuilt() " << avatar->isBuilt()
+                << " mInitFlags " << avatar->mInitFlags << LL_ENDL;
 #endif
             dump_avatar_and_skin_state("initSkinningMatrixPalette joint not found", avatar, skin);
         }
     }
+
+    //NOTE: pointer striders used here as a micro-optimization over vector/array lookups
+    const LLMatrix4a* invBind = &(skin->mInvBindMatrix[0]);
+    const LLMatrix4a* w = world;
+    LLMatrix4a* m = mat;
+    LLMatrix4a* end = m + count;
+
+    while (m < end)
+    {
+        matMulUnsafe(*(invBind++), *(w++), *(m++));
+    }
 }
 
 void LLSkinningUtil::checkSkinWeights(LLVector4a* weights, U32 num_vertices, const LLMeshSkinInfo* skin)
@@ -212,7 +212,7 @@ void LLSkinningUtil::scrubSkinWeights(LLVector4a* weights, U32 num_vertices, con
 
 void LLSkinningUtil::getPerVertexSkinMatrix(
     F32* weights,
-    LLMatrix4a* mat,
+    const LLMatrix4a* mat,
     bool handle_bad_scale,
     LLMatrix4a& final_mat,
     U32 max_joints)
@@ -270,6 +270,7 @@ void LLSkinningUtil::initJointNums(LLMeshSkinInfo* skin, LLVOAvatar *avatar)
 {
     if (!skin->mJointNumsInitialized)
     {
+        LL_PROFILE_ZONE_SCOPED;
         for (U32 j = 0; j < skin->mJointNames.size(); ++j)
         {
     #if DEBUG_SKINNING     
@@ -357,13 +358,11 @@ void LLSkinningUtil::updateRiggingInfo(const LLMeshSkinInfo* skin, LLVOAvatar *a
                                 rig_info_tab[joint_num].setIsRiggedTo(true);
 
                                 // FIXME could precompute these matMuls.
-                                LLMatrix4a bind_shape;
-                                LLMatrix4a inv_bind;
+                                const LLMatrix4a& bind_shape = skin->mBindShapeMatrix;
+                                const LLMatrix4a& inv_bind = skin->mInvBindMatrix[joint_index];
                                 LLMatrix4a mat;
                                 LLVector4a pos_joint_space;
 
-                                bind_shape.loadu(skin->mBindShapeMatrix);
-                                inv_bind.loadu(skin->mInvBindMatrix[joint_index]);
                                 matMul(bind_shape, inv_bind, mat);
 
                                 mat.affineTransform(pos, pos_joint_space);
@@ -426,3 +425,4 @@ LLQuaternion LLSkinningUtil::getUnscaledQuaternion(const LLMatrix4& mat4)
     bind_rot.normalize();
     return bind_rot;
 }
+
diff --git a/indra/newview/llskinningutil.h b/indra/newview/llskinningutil.h
index efe7c85997d4281623d955a116e81f8db032a039..807418f9831e60ab0b2496f9a2b8204d130ae7e6 100644
--- a/indra/newview/llskinningutil.h
+++ b/indra/newview/llskinningutil.h
@@ -42,10 +42,10 @@ namespace LLSkinningUtil
     S32 getMaxJointCount();
     U32 getMeshJointCount(const LLMeshSkinInfo *skin);
     void scrubInvalidJoints(LLVOAvatar *avatar, LLMeshSkinInfo* skin);
-    void initSkinningMatrixPalette(LLMatrix4* mat, S32 count, const LLMeshSkinInfo* skin, LLVOAvatar *avatar);
+    void initSkinningMatrixPalette(LLMatrix4a* mat, S32 count, const LLMeshSkinInfo* skin, LLVOAvatar *avatar);
     void checkSkinWeights(LLVector4a* weights, U32 num_vertices, const LLMeshSkinInfo* skin);
     void scrubSkinWeights(LLVector4a* weights, U32 num_vertices, const LLMeshSkinInfo* skin);
-    void getPerVertexSkinMatrix(F32* weights, LLMatrix4a* mat, bool handle_bad_scale, LLMatrix4a& final_mat, U32 max_joints);
+    void getPerVertexSkinMatrix(F32* weights, const LLMatrix4a* mat, bool handle_bad_scale, LLMatrix4a& final_mat, U32 max_joints);
 
     LL_FORCE_INLINE void getPerVertexSkinMatrixWithIndices(
         F32*        weights,
diff --git a/indra/newview/llviewertexture.cpp b/indra/newview/llviewertexture.cpp
index ca01bb46aaa3958e6f8a6b36acc0a012020179e7..af55f68cd2af192c367a1f331632f8c01a81a860 100644
--- a/indra/newview/llviewertexture.cpp
+++ b/indra/newview/llviewertexture.cpp
@@ -208,6 +208,7 @@ void  LLViewerTextureManager::findTextures(const LLUUID& id, std::vector<LLViewe
 
 LLViewerFetchedTexture* LLViewerTextureManager::findFetchedTexture(const LLUUID& id, S32 tex_type)
 {
+    LL_PROFILE_ZONE_SCOPED;
     return gTextureList.findImage(id, (ETexListType)tex_type);
 }
 
diff --git a/indra/newview/llviewertexturelist.cpp b/indra/newview/llviewertexturelist.cpp
index 561319ca5d2b4eb61e555f3b4e9bbbc4271d7c17..12495078e99c9ea8d1874364a01b618bd3886096 100644
--- a/indra/newview/llviewertexturelist.cpp
+++ b/indra/newview/llviewertexturelist.cpp
@@ -620,6 +620,7 @@ void LLViewerTextureList::findTexturesByID(const LLUUID &image_id, std::vector<L
 
 LLViewerFetchedTexture *LLViewerTextureList::findImage(const LLTextureKey &search_key)
 {
+    LL_PROFILE_ZONE_SCOPED;
     uuid_map_t::iterator iter = mUUIDMap.find(search_key);
     if (iter == mUUIDMap.end())
         return NULL;
diff --git a/indra/newview/llvoavatar.cpp b/indra/newview/llvoavatar.cpp
index ba9f8d14cb96f2f245cd893c26f2ad1e1e5445a1..f260e4f7505269c498b36648119012755ccccf4c 100644
--- a/indra/newview/llvoavatar.cpp
+++ b/indra/newview/llvoavatar.cpp
@@ -4886,6 +4886,8 @@ bool LLVOAvatar::shouldAlphaMask()
 //-----------------------------------------------------------------------------
 U32 LLVOAvatar::renderSkinned()
 {
+    LL_PROFILE_ZONE_SCOPED;
+
 	U32 num_indices = 0;
 
 	if (!mIsBuilt)
@@ -6160,27 +6162,29 @@ LLJoint *LLVOAvatar::getJoint( const std::string &name )
 LLJoint *LLVOAvatar::getJoint( S32 joint_num )
 {
     LLJoint *pJoint = NULL;
-    S32 collision_start = mNumBones;
-    S32 attachment_start = mNumBones + mNumCollisionVolumes;
-    if (joint_num>=attachment_start)
+    if (joint_num >= 0)
     {
-        // Attachment IDs start at 1
-        S32 attachment_id = joint_num - attachment_start + 1;
-        attachment_map_t::iterator iter = mAttachmentPoints.find(attachment_id);
-        if (iter != mAttachmentPoints.end())
+        if (joint_num < mNumBones)
         {
-            pJoint = iter->second;
+            pJoint = mSkeleton[joint_num];
+        }
+        else if (joint_num < mNumBones + mNumCollisionVolumes)
+        {
+            S32 collision_id = joint_num - mNumBones;
+            pJoint = &mCollisionVolumes[collision_id];
+        }
+        else
+        {
+            // Attachment IDs start at 1
+            S32 attachment_id = joint_num - (mNumBones + mNumCollisionVolumes) + 1;
+            attachment_map_t::iterator iter = mAttachmentPoints.find(attachment_id);
+            if (iter != mAttachmentPoints.end())
+            {
+                pJoint = iter->second;
+            }
         }
     }
-    else if (joint_num>=collision_start)
-    {
-        S32 collision_id = joint_num-collision_start;
-        pJoint = &mCollisionVolumes[collision_id];
-    }
-    else if (joint_num>=0)
-    {
-        pJoint = mSkeleton[joint_num];
-    }
+    
 	llassert(!pJoint || pJoint->getJointNum() == joint_num);
     return pJoint;
 }
@@ -6515,7 +6519,7 @@ void LLVOAvatar::addAttachmentOverridesForObject(LLViewerObject *vo, std::set<LL
 					LLJoint* pJoint = getJoint( lookingForJoint );
 					if (pJoint)
 					{   									
-						const LLVector3& jointPos = pSkinData->mAlternateBindMatrix[i].getTranslation();									
+						const LLVector3& jointPos = LLVector3(pSkinData->mAlternateBindMatrix[i].getTranslation());
                         if (pJoint->aboveJointPosThreshold(jointPos))
                         {
                             bool override_changed;
@@ -7864,6 +7868,8 @@ void LLVOAvatar::onGlobalColorChanged(const LLTexGlobalColor* global_color)
 // Do rigged mesh attachments display with this av?
 bool LLVOAvatar::shouldRenderRigged() const
 {
+    LL_PROFILE_ZONE_SCOPED;
+
 	if (getOverallAppearance() == AOA_NORMAL)
 	{
 		return true;
@@ -10951,6 +10957,7 @@ void LLVOAvatar::updateOverallAppearanceAnimations()
 // Based on isVisuallyMuted(), but has 3 possible results.
 LLVOAvatar::AvatarOverallAppearance LLVOAvatar::getOverallAppearance() const
 {
+    LL_PROFILE_ZONE_SCOPED;
 	AvatarOverallAppearance result = AOA_NORMAL;
 
 	// Priority order (highest priority first)
diff --git a/indra/newview/llvoavatar.h b/indra/newview/llvoavatar.h
index 74ef589ca47a9532e7fc242c9d9e67fbf2acdf36..39adaab8cac366471dd5156b89cd4010e139081b 100644
--- a/indra/newview/llvoavatar.h
+++ b/indra/newview/llvoavatar.h
@@ -209,6 +209,11 @@ class LLVOAvatar :
 	virtual LLJoint*		getJoint(const std::string &name);
 	LLJoint*		        getJoint(S32 num);
 
+    //if you KNOW joint_num is a valid animated joint index, use getSkeletonJoint for efficiency
+    inline LLJoint* getSkeletonJoint(S32 joint_num) { return mSkeleton[joint_num]; }
+    inline size_t getSkeletonJointCount() const { return mSkeleton.size(); }
+
+
 	void 					addAttachmentOverridesForObject(LLViewerObject *vo, std::set<LLUUID>* meshes_seen = NULL, bool recursive = true);
 	void					removeAttachmentOverridesForObject(const LLUUID& mesh_id);
 	void					removeAttachmentOverridesForObject(LLViewerObject *vo);
diff --git a/indra/newview/llvovolume.cpp b/indra/newview/llvovolume.cpp
index f9b2285989b5d65252e27071637e9b02ea981008..e5a4b0f37466af91f674d9cfcd857c3b807f346b 100644
--- a/indra/newview/llvovolume.cpp
+++ b/indra/newview/llvovolume.cpp
@@ -4807,7 +4807,7 @@ void LLRiggedVolume::update(const LLMeshSkinInfo* skin, LLVOAvatar* avatar, cons
 
 	LLMatrix4a mat[kMaxJoints];
 	U32 maxJoints = LLSkinningUtil::getMeshJointCount(skin);
-    LLSkinningUtil::initSkinningMatrixPalette((LLMatrix4*)mat, maxJoints, skin, avatar);
+    LLSkinningUtil::initSkinningMatrixPalette(mat, maxJoints, skin, avatar);
 
     S32 rigged_vert_count = 0;
     S32 rigged_face_count = 0;
@@ -4823,8 +4823,7 @@ void LLRiggedVolume::update(const LLMeshSkinInfo* skin, LLVOAvatar* avatar, cons
 		if ( weight )
 		{
             LLSkinningUtil::checkSkinWeights(weight, dst_face.mNumVertices, skin);
-			LLMatrix4a bind_shape_matrix;
-			bind_shape_matrix.loadu(skin->mBindShapeMatrix);
+			const LLMatrix4a& bind_shape_matrix = skin->mBindShapeMatrix;
 
 			LLVector4a* pos = dst_face.mPositions;