diff --git a/indra/llprimitive/lldaeloader.cpp b/indra/llprimitive/lldaeloader.cpp
index 012641ad29a2723b2da60ef98a35c27c1c54404b..75fdbe00569f3237c5ff1b2f9176e1af04a900dc 100644
--- a/indra/llprimitive/lldaeloader.cpp
+++ b/indra/llprimitive/lldaeloader.cpp
@@ -1162,9 +1162,11 @@ void LLDAELoader::processDomModel(LLModel* model, DAE* dae, daeElement* root, do
 		mesh_scale *= normalized_transformation;
 		normalized_transformation = mesh_scale;
 
-		glh::matrix4f inv_mat((F32*) normalized_transformation.mMatrix);
-		inv_mat = inv_mat.inverse();
-		LLMatrix4 inverse_normalized_transformation(inv_mat.m);
+		LLMatrix4a inv_mat;
+		inv_mat.loadu(normalized_transformation);
+		inv_mat.invert();
+
+		LLMatrix4 inverse_normalized_transformation(inv_mat.getF32ptr());
 
 		domSkin::domBind_shape_matrix* bind_mat = skin->getBind_shape_matrix();
 
diff --git a/indra/llrender/llcubemap.cpp b/indra/llrender/llcubemap.cpp
index 41cf726f60384e8a2fbda384b9fa845d4e34dd86..6ca1f8c8edbc91141dcef693b43fe1cc552abda3 100644
--- a/indra/llrender/llcubemap.cpp
+++ b/indra/llrender/llcubemap.cpp
@@ -263,18 +263,14 @@ void LLCubeMap::setMatrix(S32 stage)
 		gGL.getTexUnit(stage)->activate();
 	}
 
-	LLVector3 x(gGLModelView+0);
-	LLVector3 y(gGLModelView+4);
-	LLVector3 z(gGLModelView+8);
-
-	LLMatrix3 mat3;
-	mat3.setRows(x,y,z);
-	LLMatrix4 trans(mat3);
+	LLMatrix4a trans;
+	trans.loadu(gGLModelView); // Start from the current modelview matrix.
+	trans.setRow<3>(LLVector4a(0.f,0.f,0.f,1.f)); // Drop the translation but keep w = 1 so the texture matrix stays a valid affine transform.
 	trans.transpose();
 
 	gGL.matrixMode(LLRender::MM_TEXTURE);
 	gGL.pushMatrix();
-	gGL.loadMatrix((F32 *)trans.mMatrix);
+	gGL.loadMatrix(trans);
 	gGL.matrixMode(LLRender::MM_MODELVIEW);
 	
 	/*if (stage > 0)
diff --git a/indra/llrender/llgl.cpp b/indra/llrender/llgl.cpp
index 575e37b4881fc0de797dd2ed3a3ff8113d91004c..3cd4f0f6eb0b3ad6574a66466095afa0354102f7 100644
--- a/indra/llrender/llgl.cpp
+++ b/indra/llrender/llgl.cpp
@@ -1885,7 +1885,7 @@ void parse_glsl_version(S32& major, S32& minor)
 	LLStringUtil::convertToS32(minor_str, minor);
 }
 
-LLGLUserClipPlane::LLGLUserClipPlane(const LLPlane& p, const glh::matrix4f& modelview, const glh::matrix4f& projection, bool apply)
+LLGLUserClipPlane::LLGLUserClipPlane(const LLPlane& p, const LLMatrix4a& modelview, const LLMatrix4a& projection, bool apply)
 {
 	mApply = apply;
 
@@ -1911,26 +1911,41 @@ void LLGLUserClipPlane::disable()
 
 void LLGLUserClipPlane::setPlane(F32 a, F32 b, F32 c, F32 d)
 {
-	glh::matrix4f& P = mProjection;
-	glh::matrix4f& M = mModelview;
-    
-	glh::matrix4f invtrans_MVP = (P * M).inverse().transpose();
-    glh::vec4f oplane(a,b,c,d);
-    glh::vec4f cplane;
-    invtrans_MVP.mult_matrix_vec(oplane, cplane);
+    LLMatrix4a& P = mProjection; // Short aliases for the stored projection and modelview matrices.
+	LLMatrix4a& M = mModelview;
+
+	LLMatrix4a invtrans_MVP;
+	invtrans_MVP.setMul(P,M); // Combine projection and modelview...
+	invtrans_MVP.invert(); // ...then invert...
+	invtrans_MVP.transpose(); // ...and transpose: the inverse-transpose is what transforms a plane equation.
+
+	LLVector4a oplane(a,b,c,d); // Plane coefficients as passed in.
+	LLVector4a cplane; // Plane after transformation by invtrans_MVP.
+	LLVector4a cplane_splat;
+	LLVector4a cplane_neg;
+
+	invtrans_MVP.rotate4(oplane,cplane);
+	
+	cplane_splat.splat<2>(cplane); // Broadcast component 2 (z) to all lanes.
+	cplane_splat.setAbs(cplane_splat);
+	cplane.div(cplane_splat); // Normalize by |z| so that depth is not scaled.
+	cplane.sub(LLVector4a(0.f,0.f,0.f,1.f)); // Subtract 1 from the w component only.
+
+	cplane_splat.splat<2>(cplane); // Re-broadcast z (now normalized) for the sign test below.
+	cplane_neg = cplane;
+	cplane_neg.negate();
 
-    cplane /= fabs(cplane[2]); // normalize such that depth is not scaled
-    cplane[3] -= 1;
+	cplane.setSelectWithMask( cplane_splat.lessThan( _mm_setzero_ps() ), cplane_neg, cplane ); // Branch-free sign flip: use the negated plane when z < 0.
 
-    if(cplane[2] < 0)
-        cplane *= -1;
+	LLMatrix4a suffix;
+	suffix.setIdentity();
+	suffix.setColumn<2>(cplane); // Identity with column 2 replaced by the clip plane.
+	LLMatrix4a newP;
+	newP.setMul(suffix,P); // Fold the clip plane into the projection matrix.
 
-    glh::matrix4f suffix;
-    suffix.set_row(2, cplane);
-    glh::matrix4f newP = suffix * P;
     gGL.matrixMode(LLRender::MM_PROJECTION);
 	gGL.pushMatrix();
-    gGL.loadMatrix(newP.m);
+    gGL.loadMatrix(newP); // Pushed projection now carries the clip plane; matrix mode is restored to modelview below.
     gGL.matrixMode(LLRender::MM_MODELVIEW);
 }
 
@@ -2026,31 +2041,32 @@ void LLGLDepthTest::checkState()
 
 LLGLSquashToFarClip::LLGLSquashToFarClip()
 {
-    glh::matrix4f proj = get_current_projection();
+	LLMatrix4a proj;
+	proj.loadu(gGLProjection); // Default constructor: squash using the global projection matrix, layer 0.
     setProjectionMatrix(proj, 0);
 }
 
-LLGLSquashToFarClip::LLGLSquashToFarClip(glh::matrix4f& P, U32 layer)
+LLGLSquashToFarClip::LLGLSquashToFarClip(const LLMatrix4a& projection, U32 layer) // Squash a caller-supplied projection at the given layer.
 {
-    setProjectionMatrix(P, layer);
+    setProjectionMatrix(projection, layer);
 }
 
 
-void LLGLSquashToFarClip::setProjectionMatrix(glh::matrix4f& projection, U32 layer)
+void LLGLSquashToFarClip::setProjectionMatrix(const LLMatrix4a& P_in, U32 layer) // Pushes a modified copy of P_in onto the projection stack.
 {
+	LLMatrix4a P = P_in; // Work on a local copy; the caller's matrix is taken by const reference and left untouched.
 
 	F32 depth = 0.99999f - 0.0001f * layer;
 
-	for (U32 i = 0; i < 4; i++)
-	{
-		projection.element(2, i) = projection.element(3, i) * depth;
-	}
+	LLVector4a col = P.getColumn<3>();
+	col.mul(depth); // Scale column 3 by the per-layer depth factor...
+	P.setColumn<2>(col); // ...and overwrite column 2 with it.
 
     LLRender::eMatrixMode last_matrix_mode = gGL.getMatrixMode();
 
 	gGL.matrixMode(LLRender::MM_PROJECTION);
 	gGL.pushMatrix();
-	gGL.loadMatrix(projection.m);
+	gGL.loadMatrix(P); // Load the squashed projection; the previous matrix mode is restored below.
 
 	gGL.matrixMode(last_matrix_mode);
 }
diff --git a/indra/llrender/llgl.h b/indra/llrender/llgl.h
index 69c4f8a56a1f2765323c242010671a4f2ad6b0d6..c11aa237cf12b4c8388417b59753397553780924 100644
--- a/indra/llrender/llgl.h
+++ b/indra/llrender/llgl.h
@@ -38,6 +38,7 @@
 #include "llstring.h"
 #include "stdtypes.h"
 #include "v4math.h"
+#include "llmatrix4a.h"
 #include "llplane.h"
 #include "llgltypes.h"
 #include "llinstancetracker.h"
@@ -352,22 +353,24 @@ class LLGLDisable : public LLGLState
   leaves this class.
   Does not stack.
 */
+LL_ALIGN_PREFIX(16) // Class-level 16-byte alignment request, paired with LL_ALIGN_POSTFIX(16) at the closing brace.
 class LLGLUserClipPlane 
 {
 public:
 	
-	LLGLUserClipPlane(const LLPlane& plane, const glh::matrix4f& modelview, const glh::matrix4f& projection, bool apply = true);
+	LLGLUserClipPlane(const LLPlane& plane, const LLMatrix4a& modelview, const LLMatrix4a& projection, bool apply = true); // Matrices are taken by const reference.
 	~LLGLUserClipPlane();
 
 	void setPlane(F32 a, F32 b, F32 c, F32 d);
     void disable();
 
 private:
-	bool mApply;
 
-	glh::matrix4f mProjection;
-	glh::matrix4f mModelview;
-};
+	LL_ALIGN_16(LLMatrix4a mProjection); // Projection matrix used by setPlane().
+	LL_ALIGN_16(LLMatrix4a mModelview); // Modelview matrix used by setPlane().
+
+	bool mApply; // Declared after the aligned matrices - presumably to avoid padding ahead of them; confirm.
+} LL_ALIGN_POSTFIX(16);
 
 /*
   Modify and load projection matrix to push depth values to far clip plane.
@@ -380,9 +383,9 @@ class LLGLSquashToFarClip
 {
 public:
     LLGLSquashToFarClip();
-	LLGLSquashToFarClip(glh::matrix4f& projection, U32 layer = 0);
+	LLGLSquashToFarClip(const LLMatrix4a& projection, U32 layer = 0);
 
-    void setProjectionMatrix(glh::matrix4f& projection, U32 layer);
+    void setProjectionMatrix(const LLMatrix4a& P_in, U32 layer);
 
 	~LLGLSquashToFarClip();
 };
diff --git a/indra/llrender/llrender.cpp b/indra/llrender/llrender.cpp
index e8f787b9a28aa63256f5eb5285e6893611e692d4..b74a3fde62bd426dcf5cfe7e3e9588865493116e 100644
--- a/indra/llrender/llrender.cpp
+++ b/indra/llrender/llrender.cpp
@@ -35,6 +35,8 @@
 #include "llrendertarget.h"
 #include "lltexture.h"
 #include "llshadermgr.h"
+#include "llmatrix4a.h"
+#include "alglmath.h"
 
 LLRender gGL;
 
@@ -982,12 +984,12 @@ void LLLightState::setPosition(const LLVector4& position)
 	}
 	else
 	{ //transform position by current modelview matrix
-		glh::vec4f pos(position.mV);
+		LLVector4a pos;
+		pos.loadua(position.mV);
 
-		const glh::matrix4f& mat = gGL.getModelviewMatrix();
-		mat.mult_matrix_vec(pos);
+		gGL.getModelviewMatrix().rotate4(pos,pos);
 
-		mPosition.set(pos.v);
+		mPosition.set(pos.getF32ptr());
 	}
 
 }
@@ -1068,12 +1070,12 @@ void LLLightState::setSpotDirection(const LLVector3& direction)
 	}
 	else
 	{ //transform direction by current modelview matrix
-		glh::vec3f dir(direction.mV);
+		LLVector4a dir;
+		dir.load3(direction.mV);
 
-		const glh::matrix4f& mat = gGL.getModelviewMatrix();
-		mat.mult_matrix_dir(dir);
+		gGL.getModelviewMatrix().rotate(dir,dir);
 
-		mSpotDirection.set(dir.v);
+		mSpotDirection.set(dir.getF32ptr());
 	}
 }
 
@@ -1120,6 +1122,13 @@ LLRender::LLRender()
 	}
 
 	mLightHash = 0;
+	
+	//Init base matrix for each mode
+	for(S32 i = 0; i < NUM_MATRIX_MODES; ++i)
+	{
+		mMatrix[i][0].setIdentity();
+	}
+
 }
 
 LLRender::~LLRender()
@@ -1291,8 +1300,7 @@ void LLRender::syncMatrices()
 		U32 i = MM_MODELVIEW;
 		if (mMatHash[MM_MODELVIEW] != shader->mMatHash[MM_MODELVIEW])
 		{ //update modelview, normal, and MVP
-			LLMatrix4a mat;
-			mat.loadu(mMatrix[MM_MODELVIEW][mMatIdx[MM_MODELVIEW]].m);
+			const LLMatrix4a& mat = mMatrix[MM_MODELVIEW][mMatIdx[MM_MODELVIEW]];
 
 			shader->uniformMatrix4fv(name[MM_MODELVIEW], 1, GL_FALSE, mat.getF32ptr());
 			shader->mMatHash[MM_MODELVIEW] = mMatHash[MM_MODELVIEW];
@@ -1326,9 +1334,7 @@ void LLRender::syncMatrices()
 			{
 				if (cached_mvp_mdv_hash != mMatHash[i] || cached_mvp_proj_hash != mMatHash[MM_PROJECTION])
 				{
-					LLMatrix4a proj;
-					proj.loadu(mMatrix[MM_PROJECTION][mMatIdx[MM_PROJECTION]].m);
-					cached_mvp.setMul(proj, mat);
+					cached_mvp.setMul(mMatrix[MM_PROJECTION][mMatIdx[MM_PROJECTION]], mat);
 					cached_mvp_mdv_hash = mMatHash[i];
 					cached_mvp_proj_hash = mMatHash[MM_PROJECTION];
 				}
@@ -1340,8 +1346,7 @@ void LLRender::syncMatrices()
 		i = MM_PROJECTION;
 		if (mMatHash[MM_PROJECTION] != shader->mMatHash[MM_PROJECTION])
 		{ //update projection matrix, normal, and MVP
-			LLMatrix4a mat;
-			mat.loadu(mMatrix[MM_PROJECTION][mMatIdx[MM_PROJECTION]].m);
+			const LLMatrix4a& mat = mMatrix[MM_PROJECTION][mMatIdx[MM_PROJECTION]];
 
             // it would be nice to have this automatically track the state of the proj matrix
             // but certain render paths (deferred lighting) require it to be mismatched *sigh*
@@ -1362,9 +1367,7 @@ void LLRender::syncMatrices()
 				{
 					if (cached_mvp_mdv_hash != mMatHash[MM_PROJECTION] || cached_mvp_proj_hash != mMatHash[MM_PROJECTION])
 					{
-						LLMatrix4a mdv;
-						mdv.loadu(mMatrix[MM_MODELVIEW][mMatIdx[MM_MODELVIEW]].m);
-						cached_mvp.setMul(mat, mdv);
+						cached_mvp.setMul(mat, mMatrix[MM_MODELVIEW][mMatIdx[MM_MODELVIEW]]);
 						cached_mvp_mdv_hash = mMatHash[MM_MODELVIEW];
 						cached_mvp_proj_hash = mMatHash[MM_PROJECTION];
 					}
@@ -1378,7 +1381,7 @@ void LLRender::syncMatrices()
 		{
 			if (mMatHash[i] != shader->mMatHash[i])
 			{
-				shader->uniformMatrix4fv(name[i], 1, GL_FALSE, mMatrix[i][mMatIdx[i]].m);
+				shader->uniformMatrix4fv(name[i], 1, GL_FALSE, mMatrix[i][mMatIdx[i]].getF32ptr());
 				shader->mMatHash[i] = mMatHash[i];
 			}
 		}
@@ -1406,7 +1409,7 @@ void LLRender::syncMatrices()
 			if (mMatHash[i] != mCurMatHash[i])
 			{
 				glMatrixMode(mode[i]);
-				glLoadMatrixf(mMatrix[i][mMatIdx[i]].m);
+				glLoadMatrixf(mMatrix[i][mMatIdx[i]].getF32ptr());
 				mCurMatHash[i] = mMatHash[i];
 			}
 		}
@@ -1417,7 +1420,7 @@ void LLRender::syncMatrices()
 			{
 				gGL.getTexUnit(i-MM_TEXTURE0)->activate();
 				glMatrixMode(mode[i]);
-				glLoadMatrixf(mMatrix[i][mMatIdx[i]].m);
+				glLoadMatrixf(mMatrix[i][mMatIdx[i]].getF32ptr());
 				mCurMatHash[i] = mMatHash[i];
 			}
 		}
@@ -1428,30 +1431,32 @@ void LLRender::syncMatrices()
 
 void LLRender::translatef(const GLfloat& x, const GLfloat& y, const GLfloat& z)
 {
-	flush();
-
+	if(	llabs(x) < F_APPROXIMATELY_ZERO && // Early-out test: every component is within F_APPROXIMATELY_ZERO of zero.
+		llabs(y) < F_APPROXIMATELY_ZERO &&
+		llabs(z) < F_APPROXIMATELY_ZERO)
 	{
-		glh::matrix4f trans_mat(1,0,0,x,
-								0,1,0,y,
-								0,0,1,z,
-								0,0,0,1);
-	
-		mMatrix[mMatrixMode][mMatIdx[mMatrixMode]].mult_right(trans_mat);
-		mMatHash[mMatrixMode]++;
+		return; // Negligible translation: skip the flush, the matrix update and the hash bump.
 	}
+
+	flush(); // Flush before the current matrix changes.
+
+	mMatrix[mMatrixMode][mMatIdx[mMatrixMode]].applyTranslation_affine(x,y,z); // Apply the translation to the top of the active matrix stack.
+	mMatHash[mMatrixMode]++; // Bump the hash so syncMatrices() sees the change.
+
 }
 
 void LLRender::scalef(const GLfloat& x, const GLfloat& y, const GLfloat& z)
 {
+	if(	(llabs(x-1.f)) < F_APPROXIMATELY_ZERO && // Early-out test: every factor is within F_APPROXIMATELY_ZERO of 1.
+		(llabs(y-1.f)) < F_APPROXIMATELY_ZERO &&
+		(llabs(z-1.f)) < F_APPROXIMATELY_ZERO)
+	{
+		return; // Scale is effectively identity: skip the flush, the matrix update and the hash bump.
+	}
 	flush();
 	
 	{
-		glh::matrix4f scale_mat(x,0,0,0,
-								0,y,0,0,
-								0,0,z,0,
-								0,0,0,1);
-	
-		mMatrix[mMatrixMode][mMatIdx[mMatrixMode]].mult_right(scale_mat);
+		mMatrix[mMatrixMode][mMatIdx[mMatrixMode]].applyScale_affine(x,y,z); // Apply the scale to the top of the active matrix stack.
 		mMatHash[mMatrixMode]++;
 	}
 }
@@ -1461,37 +1466,36 @@ void LLRender::ortho(F32 left, F32 right, F32 bottom, F32 top, F32 zNear, F32 zF
 	flush();
 
 	{
+		LLMatrix4a ortho_mat;
+		ortho_mat.setRow<0>(LLVector4a(2.f/(right-left),0,0));
+		ortho_mat.setRow<1>(LLVector4a(0,2.f/(top-bottom),0));
+		ortho_mat.setRow<2>(LLVector4a(0,0,-2.f/(zFar-zNear)));
+		ortho_mat.setRow<3>(LLVector4a(-(right+left)/(right-left),-(top+bottom)/(top-bottom),-(zFar+zNear)/(zFar-zNear),1));	
 
-		glh::matrix4f ortho_mat(2.f/(right-left),0,0,	-(right+left)/(right-left),
-								0,2.f/(top-bottom),0,	-(top+bottom)/(top-bottom),
-								0,0,-2.f/(zFar-zNear),	-(zFar+zNear)/(zFar-zNear),
-								0,0,0,1);
-	
-		mMatrix[mMatrixMode][mMatIdx[mMatrixMode]].mult_right(ortho_mat);
+		mMatrix[mMatrixMode][mMatIdx[mMatrixMode]].mul_affine(ortho_mat);
 		mMatHash[mMatrixMode]++;
 	}
 }
 
-void LLRender::rotatef(const GLfloat& a, const GLfloat& x, const GLfloat& y, const GLfloat& z)
+void LLRender::rotatef(const LLMatrix4a& rot) // Applies a caller-built rotation matrix, so no sine/cosine is evaluated here.
 {
 	flush();
 
-	{
-		F32 r = a * DEG_TO_RAD;
-
-		F32 c = cosf(r);
-		F32 s = sinf(r);
-
-		F32 ic = 1.f-c;
+	mMatrix[mMatrixMode][mMatIdx[mMatrixMode]].mul_affine(rot); // Multiply the top of the active matrix stack by rot.
+	mMatHash[mMatrixMode]++; // Bump the hash so syncMatrices() sees the change.
+}
 
-		glh::matrix4f rot_mat(x*x*ic+c,		x*y*ic-z*s,		x*z*ic+y*s,		0,
-							  x*y*ic+z*s,	y*y*ic+c,		y*z*ic-x*s,		0,
-							  x*z*ic-y*s,	y*z*ic+x*s,		z*z*ic+c,		0,
-							  0,0,0,1);
-	
-		mMatrix[mMatrixMode][mMatIdx[mMatrixMode]].mult_right(rot_mat);
-		mMatHash[mMatrixMode]++;
+void LLRender::rotatef(const GLfloat& a, const GLfloat& x, const GLfloat& y, const GLfloat& z)
+{
+	if(	llabs(a) < F_APPROXIMATELY_ZERO ||
+		llabs(a-360.f) < F_APPROXIMATELY_ZERO)
+	{
+		return; // An angle of ~0 or ~360 degrees is treated as identity: nothing to flush or rehash.
 	}
+	
+	// No flush() here: rotatef(const LLMatrix4a&) flushes before it touches the
+	// matrix stack, and building the rotation matrix does not use render state.
+	rotatef(ALGLMath::genRot(a,x,y,z));
 }
 
 void LLRender::pushMatrix()
@@ -1513,36 +1517,47 @@ void LLRender::pushMatrix()
 
 void LLRender::popMatrix()
 {
-	flush();
 	{
 		if (mMatIdx[mMatrixMode] > 0)
 		{
+			if ( memcmp(mMatrix[mMatrixMode][mMatIdx[mMatrixMode]].getF32ptr(), mMatrix[mMatrixMode][mMatIdx[mMatrixMode] - 1].getF32ptr(), sizeof(LLMatrix4a)) ) // Bytewise compare of the current matrix with the one below it on the stack.
+			{
+				flush(); // Only flush when the pop actually changes the active matrix.
+			}
 			--mMatIdx[mMatrixMode];
 			mMatHash[mMatrixMode]++;
 		}
 		else
 		{
+			flush(); // Underflow path: always flush before warning.
 			LL_WARNS() << "Matrix stack underflow." << LL_ENDL;
 		}
 	}
 }
 
-void LLRender::loadMatrix(const GLfloat* m)
+void LLRender::loadMatrix(const LLMatrix4a& mat) // Replaces the top of the active matrix stack with mat.
 {
 	flush();
 	{
-		mMatrix[mMatrixMode][mMatIdx[mMatrixMode]].set_value((GLfloat*) m);
+		mMatrix[mMatrixMode][mMatIdx[mMatrixMode]] = mat;
 		mMatHash[mMatrixMode]++;
 	}
 }
 
-void LLRender::multMatrix(const GLfloat* m)
+void LLRender::loadMatrix(const F32* mat) // Raw-float overload; presumably expects 16 floats and goes through loadu() so the pointer need not be aligned - confirm.
 {
 	flush();
 	{
-		glh::matrix4f mat((GLfloat*) m);
-	
-		mMatrix[mMatrixMode][mMatIdx[mMatrixMode]].mult_right(mat);
+		mMatrix[mMatrixMode][mMatIdx[mMatrixMode]].loadu(mat);
+		mMatHash[mMatrixMode]++;
+	}
+}
+
+void LLRender::multMatrix(const LLMatrix4a& mat) // Multiplies the top of the active matrix stack by mat.
+{
+	flush();
+	{
+		mMatrix[mMatrixMode][mMatIdx[mMatrixMode]].mul_affine(mat); // NOTE(review): mul_affine presumably assumes mat is affine - confirm no caller passes a projective matrix.
 		mMatHash[mMatrixMode]++;
 	}
 }
@@ -1585,17 +1600,17 @@ void LLRender::loadIdentity()
 	{
 		llassert_always(mMatrixMode < NUM_MATRIX_MODES) ;
 
-		mMatrix[mMatrixMode][mMatIdx[mMatrixMode]].make_identity();
+		mMatrix[mMatrixMode][mMatIdx[mMatrixMode]].setIdentity();
 		mMatHash[mMatrixMode]++;
 	}
 }
 
-const glh::matrix4f& LLRender::getModelviewMatrix()
+const LLMatrix4a& LLRender::getModelviewMatrix()
 {
 	return mMatrix[MM_MODELVIEW][mMatIdx[MM_MODELVIEW]];
 }
 
-const glh::matrix4f& LLRender::getProjectionMatrix()
+const LLMatrix4a& LLRender::getProjectionMatrix()
 {
 	return mMatrix[MM_PROJECTION][mMatIdx[MM_PROJECTION]];
 }
@@ -2117,9 +2132,9 @@ void LLRender::vertexBatchPreTransformed(LLVector4a* verts, S32 vert_count)
 	}
 	else
 	{
+		mVerticesp.copyArray(mCount, verts, vert_count);
 		for (S32 i = 0; i < vert_count; i++)
 		{
-			mVerticesp[mCount] = verts[i];
 
 			mCount++;
 			mTexcoordsp[mCount] = mTexcoordsp[mCount-1];
@@ -2175,11 +2190,11 @@ void LLRender::vertexBatchPreTransformed(LLVector4a* verts, LLVector2* uvs, S32
 	}
 	else
 	{
+		mVerticesp.copyArray(mCount, verts, vert_count);
+		mTexcoordsp.copyArray(mCount, uvs, vert_count);
+	
 		for (S32 i = 0; i < vert_count; i++)
 		{
-			mVerticesp[mCount] = verts[i];
-			mTexcoordsp[mCount] = uvs[i];
-
 			mCount++;
 			mColorsp[mCount] = mColorsp[mCount-1];
 		}
@@ -2237,14 +2252,11 @@ void LLRender::vertexBatchPreTransformed(LLVector4a* verts, LLVector2* uvs, LLCo
 	}
 	else
 	{
-		for (S32 i = 0; i < vert_count; i++)
-		{
-			mVerticesp[mCount] = verts[i];
-			mTexcoordsp[mCount] = uvs[i];
-			mColorsp[mCount] = colors[i];
-
-			mCount++;
-		}
+		// Note: Batch copies instead of iterating.
+		mVerticesp.copyArray(mCount, verts, vert_count);
+		mTexcoordsp.copyArray(mCount, uvs, vert_count);
+		mColorsp.copyArray(mCount, colors, vert_count);
+		mCount += vert_count;
 	}
 
 	if (mCount > 0)
diff --git a/indra/llrender/llrender.h b/indra/llrender/llrender.h
index 16df9d65ba35f4e5e130d408e66cc83bd8db34b8..79ad9a8ba93b0c44030299d2d15594d9568d38b2 100644
--- a/indra/llrender/llrender.h
+++ b/indra/llrender/llrender.h
@@ -58,6 +58,7 @@ class LLCubeMap;
 class LLImageGL;
 class LLRenderTarget;
 class LLTexture ;
+class LLMatrix4a;
 
 #define LL_MATRIX_STACK_DEPTH 32
 
@@ -381,19 +382,28 @@ class LLRender
 
 	void translatef(const GLfloat& x, const GLfloat& y, const GLfloat& z);
 	void scalef(const GLfloat& x, const GLfloat& y, const GLfloat& z);
+	//rotatef requires generation of a transform matrix involving sine/cosine. If rotating by a constant value, use genRot, store the result in a static variable, and pass that var to rotatef.
+	void rotatef(const LLMatrix4a& rot);
 	void rotatef(const GLfloat& a, const GLfloat& x, const GLfloat& y, const GLfloat& z);
 	void ortho(F32 left, F32 right, F32 bottom, F32 top, F32 zNear, F32 zFar);
 
 	void pushMatrix();
 	void popMatrix();
-	void loadMatrix(const GLfloat* m);
+	void loadMatrix(const LLMatrix4a& mat);
+	void loadMatrix(const F32* mat);
 	void loadIdentity();
-	void multMatrix(const GLfloat* m);
+	void multMatrix(const LLMatrix4a& mat);
+	inline void multMatrix(const F32* mat) // Convenience overload for raw float arrays; forwards to the LLMatrix4a overload.
+	{
+		LLMatrix4a inmat; // NOTE(review): needs the complete LLMatrix4a type, not just a forward declaration - confirm llmatrix4a.h is included before this point.
+		inmat.loadu(mat); // Copy the raw floats into a local LLMatrix4a.
+		multMatrix(inmat);
+	}
 	void matrixMode(eMatrixMode mode);	
 	eMatrixMode getMatrixMode();
 
-	const glh::matrix4f& getModelviewMatrix();
-	const glh::matrix4f& getProjectionMatrix();
+	const LLMatrix4a& getModelviewMatrix();
+	const LLMatrix4a& getProjectionMatrix();
 
 	void syncMatrices();
 	void syncLightState();
@@ -487,7 +497,7 @@ class LLRender
 	eMatrixMode mMatrixMode;
 	U32 mMatIdx[NUM_MATRIX_MODES];
 	U32 mMatHash[NUM_MATRIX_MODES];
-	glh::matrix4f mMatrix[NUM_MATRIX_MODES][LL_MATRIX_STACK_DEPTH];
+	LL_ALIGN_16(LLMatrix4a mMatrix[NUM_MATRIX_MODES][LL_MATRIX_STACK_DEPTH]);
 	U32 mCurMatHash[NUM_MATRIX_MODES];
 	U32 mLightHash;
 	LLColor4 mAmbientLightColor;
diff --git a/indra/newview/lldrawpoolwater.cpp b/indra/newview/lldrawpoolwater.cpp
index d54adf37037485450fbf36a51bc2935e2958f2c8..d42592b8ef009e3ab605e0edd74e363a77a1acb7 100644
--- a/indra/newview/lldrawpoolwater.cpp
+++ b/indra/newview/lldrawpoolwater.cpp
@@ -279,11 +279,12 @@ void LLDrawPoolWater::render(S32 pass)
 
 		gGL.matrixMode(LLRender::MM_TEXTURE);
 		gGL.loadIdentity();
-		LLMatrix4 camera_mat = LLViewerCamera::getInstanceFast()->getModelview();
-		LLMatrix4 camera_rot(camera_mat.getMat3());
+		LLMatrix4a camera_rot;
+		camera_rot.loadu(LLViewerCamera::getInstanceFast()->getModelview()); // Typed LLMatrix4 overload; avoids a C-style cast through a raw float pointer.
+		camera_rot.extractRotation_affine(); // Keep only the rotation part of the camera modelview.
 		camera_rot.invert();
 
-		gGL.loadMatrix((F32 *)camera_rot.mMatrix);
+		gGL.loadMatrix(camera_rot);
 
 		gGL.matrixMode(LLRender::MM_MODELVIEW);
 		LLOverrideFaceColor overrid(this, 1.f, 1.f, 1.f,  0.5f*up_dot);
@@ -481,8 +482,11 @@ void LLDrawPoolWater::shade2(bool edge, LLGLSLShader* shader, const LLColor3& li
 	{
         if (shader->getUniformLocation(LLShaderMgr::DEFERRED_NORM_MATRIX) >= 0)
 	    {
-		    glh::matrix4f norm_mat = get_current_modelview().inverse().transpose();
-		    shader->uniformMatrix4fv(LLShaderMgr::DEFERRED_NORM_MATRIX, 1, FALSE, norm_mat.m);
+			LLMatrix4a norm_mat;
+			norm_mat.loadu(gGLModelView);
+			norm_mat.invert();
+			norm_mat.transpose();
+		    shader->uniformMatrix4fv(LLShaderMgr::DEFERRED_NORM_MATRIX, 1, FALSE, norm_mat.getF32ptr());
 	    }
 	}
 
@@ -628,7 +632,8 @@ void LLDrawPoolWater::shade2(bool edge, LLGLSLShader* shader, const LLColor3& li
 			}
 			else
 			{
-				auto proj = get_current_projection();
+				LLMatrix4a proj;
+				proj.loadu(gGLProjection);
 				LLGLSquashToFarClip far_clip(proj);
 				face->renderIndexed();
 			}
diff --git a/indra/newview/llmaniptranslate.cpp b/indra/newview/llmaniptranslate.cpp
index 43126dafa8e3290b68eac602f661d9230fa79444..d7b8ae3fcd695b320e06cf00b50c806afc0c83ca 100644
--- a/indra/newview/llmaniptranslate.cpp
+++ b/indra/newview/llmaniptranslate.cpp
@@ -1689,12 +1689,15 @@ void LLManipTranslate::highlightIntersection(LLVector3 normal,
 			normal = -normal;
 		}
 		F32 d = -(selection_center * normal);
-		glh::vec4f plane(normal.mV[0], normal.mV[1], normal.mV[2], d );
+		LLVector4a plane(normal.mV[0], normal.mV[1], normal.mV[2], d ); // Plane packed as (normal.x, normal.y, normal.z, d).
 
-		gGL.getModelviewMatrix().inverse().mult_vec_matrix(plane);
+		LLMatrix4a inv_mat = gGL.getModelviewMatrix(); // Copy, since the getter returns a const reference.
+		inv_mat.invert();
+		inv_mat.transpose(); // Inverse-transpose of the modelview: the transform used for plane equations.
+		inv_mat.rotate4(plane,plane); // Transform the plane in place, all four components.
 
 		static LLStaticHashedString sClipPlane("clip_plane");
-		gClipProgram.uniform4fv(sClipPlane, 1, plane.v);
+		gClipProgram.uniform4fv(sClipPlane, 1, plane.getF32ptr());
 		
 		BOOL particles = gPipeline.hasRenderType(LLPipeline::RENDER_TYPE_PARTICLES);
 		BOOL clouds = gPipeline.hasRenderType(LLPipeline::RENDER_TYPE_CLOUDS);
diff --git a/indra/newview/pipeline.cpp b/indra/newview/pipeline.cpp
index 030ce30bd5d4a998709a2326848010e2e57286a8..0c22bd51c4d93ef87ea229858ea9d7ce156ee5a9 100644
--- a/indra/newview/pipeline.cpp
+++ b/indra/newview/pipeline.cpp
@@ -8597,7 +8597,7 @@ void LLPipeline::bindDeferredShader(LLGLSLShader& shader, LLRenderTarget* light_
 
     if (sReflectionRender && !shader.getUniformLocation(LLShaderMgr::MODELVIEW_MATRIX))
     {
-        shader.uniformMatrix4fv(LLShaderMgr::MODELVIEW_MATRIX, 1, FALSE, mReflectionModelView.m);  
+        shader.uniformMatrix4fv(LLShaderMgr::MODELVIEW_MATRIX, 1, FALSE, mReflectionModelView.getF32ptr());  
     }
 
 	channel = shader.enableTexture(LLShaderMgr::DEFERRED_NOISE);
@@ -9560,9 +9560,11 @@ void LLPipeline::generateWaterReflection(LLCamera& camera_in)
 
         gPipeline.pushRenderTypeMask();
 
-        glh::matrix4f saved_modelview  = get_current_modelview();
-        glh::matrix4f saved_projection = get_current_projection();
-        glh::matrix4f mat;
+		LLMatrix4a saved_modelview;
+		saved_modelview.loadu(gGLModelView);
+		LLMatrix4a saved_projection;
+		saved_projection.loadu(gGLProjection);
+        LLMatrix4a mat;
 
         S32 reflection_detail  = RenderReflectionDetail;
 
@@ -9610,21 +9612,25 @@ void LLPipeline::generateWaterReflection(LLCamera& camera_in)
             gGL.matrixMode(LLRender::MM_MODELVIEW);
             gGL.pushMatrix();
 
-            mat.set_scale(glh::vec3f(1, 1, -1));
-            mat.set_translate(glh::vec3f(0,0,water_height*2.f));
-            mat = saved_modelview * mat;
+			mat.setIdentity(); // Build the reflection transform starting from identity.
+			mat.getRow<2>().negate(); // Negate row 2 (the z axis right after setIdentity) to mirror in z.
+			mat.setTranslate_affine(LLVector3(0.f,0.f, water_height *2.f)); // Translate by twice the water height along z.
+			mat.setMul(saved_modelview,mat); // NOTE(review): mat is both an operand and the destination; assumes setMul tolerates aliasing - confirm.
 
             mReflectionModelView = mat;
 
-            set_current_modelview(mat);
-            gGL.loadMatrix(mat.m);
+            set_current_modelview(mat.getF32ptr()); // Publish the reflected modelview as the current one...
+            gGL.loadMatrix(mat); // ...and load it onto the render matrix stack.
 
             LLViewerCamera::updateFrustumPlanes(camera, FALSE, TRUE);
 
-            glh::vec3f    origin(0, 0, 0);
-            glh::matrix4f inv_mat = mat.inverse();
-            inv_mat.mult_matrix_vec(origin);
-            camera.setOrigin(origin.v);
+			LLMatrix4a inv_mat = mat;
+			inv_mat.invert();
+
+			LLVector4a origin;
+			origin.clear();
+			inv_mat.affineTransform(origin,origin);
+            camera.setOrigin(origin.getF32ptr());
 
             glCullFace(GL_FRONT);
 
@@ -9712,7 +9718,7 @@ void LLPipeline::generateWaterReflection(LLCamera& camera_in)
             gGL.matrixMode(LLRender::MM_MODELVIEW);
             gGL.popMatrix();
 
-            set_current_modelview(saved_modelview);
+            set_current_modelview(saved_modelview.getF32ptr());
         }
 
         //LLPipeline::sUseOcclusion = occlusion;
diff --git a/indra/newview/pipeline.h b/indra/newview/pipeline.h
index 5724ec4fe9db865b0d53b097d62fe2408d974166..35ebe24b70aa648f4d812851dcea89bcd57fa6ba 100644
--- a/indra/newview/pipeline.h
+++ b/indra/newview/pipeline.h
@@ -656,7 +656,7 @@ class LLPipeline
 	LLMatrix4a				mSunShadowMatrix[6];
 	LLMatrix4a				mShadowModelview[6];
 	LLMatrix4a				mShadowProjection[6];
-    glh::matrix4f           mReflectionModelView;
+	LLMatrix4a				mReflectionModelView;
 
 	LLPointer<LLDrawable>	mShadowSpotLight[2];
 	F32						mSpotLightFade[2];