diff --git a/indra/llmath/llmatrix4a.h b/indra/llmath/llmatrix4a.h
index 5e8abaa73f0b2fc5f46c3224b78b09eb77266aec..ce978bf4b958ac297db51d3f86fccb9cb0c1b2be 100644
--- a/indra/llmath/llmatrix4a.h
+++ b/indra/llmath/llmatrix4a.h
@@ -66,11 +66,42 @@ public:
 		mMatrix[3] = rhs.getRow<3>();
 	}
 
+	LLMatrix4a(const LLQuad& q1,const LLQuad& q2,const LLQuad& q3,const LLQuad& q4)	// Construct from four raw quads; q1..q4 are stored unchanged in mMatrix[0..3].
+	{
+		mMatrix[0] = q1;
+		mMatrix[1] = q2;
+		mMatrix[2] = q3;
+		mMatrix[3] = q4;
+	}
+
 	LLMatrix4a(const LLMatrix4& rhs)
 	{
 		loadu(rhs);
 	}
 
+	LLMatrix4a(const LLQuaternion2& quat)	// Builds the matrix for 'quat' as the product of two 4x4 factors made from sign/lane permutations of (x,y,z,w). Left non-explicit: this change assigns LLQuaternion2 values directly to LLMatrix4a.
+	{
+		const LLVector4a& xyzw = quat.getVector4a(); // raw quaternion lanes, referred to as (x,y,z,w) in the lane notes below
+		LLVector4a nyxwz = _mm_shuffle_ps(xyzw, xyzw, _MM_SHUFFLE(2,3,0,1));	// ( y, x, w, z)
+		nyxwz.negate();	// (-y,-x,-w,-z)
+		const LLVector4a xnyynx = _mm_unpacklo_ps(xyzw,nyxwz);	// ( x,-y, y,-x)
+		const LLVector4a znwwnz = _mm_unpackhi_ps(xyzw,nyxwz);	// ( z,-w, w,-z)
+
+		LLMatrix4a mata;	// first factor
+		mata.setRow<0>(_mm_shuffle_ps(xyzw, xnyynx, _MM_SHUFFLE(0,1,2,3)));	// ( w, z,-y, x)
+		mata.setRow<1>(_mm_shuffle_ps(znwwnz, xyzw, _MM_SHUFFLE(1,0,2,3)));	// (-z, w, x, y)
+		mata.setRow<2>(_mm_shuffle_ps(xnyynx, xyzw, _MM_SHUFFLE(2,3,3,2)));	// ( y,-x, w, z)
+		mata.setRow<3>(_mm_shuffle_ps(xnyynx, znwwnz, _MM_SHUFFLE(2,3,1,3)));	// (-x,-y,-z, w)
+
+		LLMatrix4a matb;	// second factor
+		matb.setRow<0>(_mm_shuffle_ps(xyzw, xnyynx, _MM_SHUFFLE(3,1,2,3)));	// ( w, z,-y,-x)
+		matb.setRow<1>(_mm_shuffle_ps(znwwnz, xnyynx, _MM_SHUFFLE(1,0,2,3)));	// (-z, w, x,-y)
+		matb.setRow<2>(_mm_shuffle_ps(xnyynx, znwwnz, _MM_SHUFFLE(3,2,3,2)));	// ( y,-x, w,-z)
+		matb.setRow<3>(xyzw);	// ( x, y, z, w)
+
+		setMul(matb,mata);	// *this = product of the two factors (operand order is whatever setMul defines; not visible here)
+	}
+
 	// Do NOT add aditional operators without consulting someone with SSE experience
 	inline const LLMatrix4a& operator= (const LLMatrix4a& rhs)
 	{
@@ -509,7 +540,153 @@ public:
 		return ret;
 	}
 
-		//======================Logic====================
+	//=============Affine transformation matrix only=========================
+
+	//Multiply matrix with a pure translation matrix: row3 += x*row0 + y*row1 + z*row2 (given splat<N> broadcasts lane N). Rows 0-2 are not modified.
+	inline void applyTranslation_affine(const F32& x, const F32& y, const F32& z)
+	{
+		const LLVector4a xyz0(x,y,z,0);	//load
+		LLVector4a xxxx;
+		xxxx.splat<0>(xyz0);	// presumably x in every lane (splat is defined elsewhere)
+		LLVector4a yyyy;
+		yyyy.splat<1>(xyz0);	// presumably y in every lane
+		LLVector4a zzzz;
+		zzzz.splat<2>(xyz0);	// presumably z in every lane
+
+		LLVector4a sum1;
+		LLVector4a sum2;
+		LLVector4a sum3;
+
+		sum1.setMul(xxxx,mMatrix[0]);	// x * row0
+		sum2.setMul(yyyy,mMatrix[1]);	// y * row1
+		sum3.setMul(zzzz,mMatrix[2]);	// z * row2
+
+		mMatrix[3].add(sum1);	// accumulate the three scaled rows into row3, one after another
+		mMatrix[3].add(sum2);
+		mMatrix[3].add(sum3);
+	}
+
+	//Multiply matrix with a pure translation matrix (LLVector3 overload: forwards trans.mV[VX], [VY], [VZ] to the three-float overload).
+	inline void applyTranslation_affine(const LLVector3& trans)
+	{
+		applyTranslation_affine(trans.mV[VX],trans.mV[VY],trans.mV[VZ]);
+	}
+
+	//Multiply matrix with a pure scale matrix: row0 *= x, row1 *= y, row2 *= z (given splat<N> broadcasts lane N). Row3 is not modified.
+	inline void applyScale_affine(const F32& x, const F32& y, const F32& z)
+	{
+		const LLVector4a xyz0(x,y,z,0);	//load
+		LLVector4a xxxx;
+		xxxx.splat<0>(xyz0);	// presumably x in every lane (splat is defined elsewhere)
+		LLVector4a yyyy;
+		yyyy.splat<1>(xyz0);	// presumably y in every lane
+		LLVector4a zzzz;
+		zzzz.splat<2>(xyz0);	// presumably z in every lane
+
+		mMatrix[0].mul(xxxx);	// all four lanes of each row are multiplied, not just xyz
+		mMatrix[1].mul(yyyy);
+		mMatrix[2].mul(zzzz);
+	}
+
+	//Multiply matrix with a pure scale matrix (LLVector3 overload: forwards scale.mV[VX], [VY], [VZ] to the three-float overload).
+	inline void applyScale_affine(const LLVector3& scale)
+	{
+		applyScale_affine(scale.mV[VX],scale.mV[VY],scale.mV[VZ]);
+	}
+
+	//Multiply matrix with a pure scale matrix (uniform overload: rows 0-2 are each multiplied by the vector built from 's'; row3 is not modified).
+	inline void applyScale_affine(const F32& s)
+	{
+		const LLVector4a scale(s);	//load - presumably 's' in every lane; confirm against the LLVector4a(F32) constructor
+		mMatrix[0].mul(scale);
+		mMatrix[1].mul(scale);
+		mMatrix[2].mul(scale);
+	}
+
+	//Direct addition to row3: adds (trans x, trans y, trans z, 0) to row3 without touching rows 0-2.
+	inline void translate_affine(const LLVector3& trans)
+	{
+		LLVector4a translation;
+		translation.load3(trans.mV);	// no w argument, so load3 uses its 0.f default for the fourth lane
+		mMatrix[3].add(translation);
+	}
+
+	//Direct assignment of row3 from 'trans'; the lane(s) selected by 'mask' keep their previous row3 value.
+	inline void setTranslate_affine(const LLVector3& trans)
+	{
+		static const LLVector4Logical mask = _mm_load_ps((F32*)&S_V4LOGICAL_MASK_TABLE[3*4]);	// four floats starting at table index 12 - presumably the W-lane mask; TODO confirm against S_V4LOGICAL_MASK_TABLE
+
+		LLVector4a translation;
+		translation.load3(trans.mV);	// (x, y, z, 0): fourth lane is load3's default w
+		
+		mMatrix[3].setSelectWithMask(mask,mMatrix[3],translation);	// presumably: masked lanes from old row3, remaining lanes from 'translation'
+	}
+
+	inline void mul_affine(const LLMatrix4a& rhs)	// In place: row_i = rhs[i].x*row0 + rhs[i].y*row1 + rhs[i].z*row2, plus old row3 when i == 3 (given splat<N> broadcasts lane N). Lane 3 of the rhs rows is never read.
+	{
+		LLVector4a x0,y0,z0;
+		LLVector4a x1,y1,z1;
+		LLVector4a x2,y2,z2;
+		LLVector4a x3,y3,z3;
+
+		//12 shuffles
+		x0.splat<0>(rhs.mMatrix[0]);	// xN/yN/zN = lane 0/1/2 of rhs row N
+		x1.splat<0>(rhs.mMatrix[1]);
+		x2.splat<0>(rhs.mMatrix[2]);
+		x3.splat<0>(rhs.mMatrix[3]);
+
+		y0.splat<1>(rhs.mMatrix[0]);
+		y1.splat<1>(rhs.mMatrix[1]);
+		y2.splat<1>(rhs.mMatrix[2]);
+		y3.splat<1>(rhs.mMatrix[3]);
+
+		z0.splat<2>(rhs.mMatrix[0]);
+		z1.splat<2>(rhs.mMatrix[1]);
+		z2.splat<2>(rhs.mMatrix[2]);
+		z3.splat<2>(rhs.mMatrix[3]);
+
+		//12 muls
+		x0.mul(mMatrix[0]);	// every x term is scaled by this matrix's row0
+		x1.mul(mMatrix[0]);
+		x2.mul(mMatrix[0]);
+		x3.mul(mMatrix[0]);
+
+		y0.mul(mMatrix[1]);	// every y term by row1
+		y1.mul(mMatrix[1]);
+		y2.mul(mMatrix[1]);
+		y3.mul(mMatrix[1]);
+
+		z0.mul(mMatrix[2]);	// every z term by row2
+		z1.mul(mMatrix[2]);
+		z2.mul(mMatrix[2]);
+		z3.mul(mMatrix[2]);
+
+		//9 adds
+		x0.add(y0);
+
+		x1.add(y1);
+
+		x2.add(y2);
+
+		x3.add(y3);
+		z3.add(mMatrix[3]);	// the old row3 is added only to the last row's sum
+
+		mMatrix[0].setAdd(x0,z0);	// rows are overwritten only here, after every read of the old rows above
+		mMatrix[1].setAdd(x1,z1);
+		mMatrix[2].setAdd(x2,z2);
+		mMatrix[3].setAdd(x3,z3);
+	}
+
+	inline void extractRotation_affine()	// In place: zeroes the mask-selected lane of rows 0-2 and rebuilds row3 from constants (1 in the masked lane, 0 elsewhere), assuming setSelectWithMask(mask,a,b) takes 'a' where the mask is set.
+	{
+		static const LLVector4Logical mask = _mm_load_ps((F32*)&S_V4LOGICAL_MASK_TABLE[3*4]);	// four floats starting at table index 12 - presumably the W-lane mask; TODO confirm against S_V4LOGICAL_MASK_TABLE
+		mMatrix[0].setSelectWithMask(mask,_mm_setzero_ps(),mMatrix[0]);	// masked lane -> 0, other lanes keep their value
+		mMatrix[1].setSelectWithMask(mask,_mm_setzero_ps(),mMatrix[1]);
+		mMatrix[2].setSelectWithMask(mask,_mm_setzero_ps(),mMatrix[2]);
+		mMatrix[3].setSelectWithMask(mask,LLVector4a(1.f),_mm_setzero_ps());	// old row3 contents are discarded entirely
+	}
+
+	//======================Logic====================
 private:
 	template<bool mins> inline void init_foos(LLMatrix4a& foos) const
 	{
diff --git a/indra/llmath/llquaternion2.h b/indra/llmath/llquaternion2.h
index fc32165b092c16c12a69e16d52cc2bdbacd4e769..6cfe91a024a4cf784af664dc3364f89d5b65dc0a 100644
--- a/indra/llmath/llquaternion2.h
+++ b/indra/llmath/llquaternion2.h
@@ -85,6 +85,8 @@ public:
 	// Quantize this quaternion to 16 bit precision
 	inline void quantize16();
 
+	inline void mul(const LLQuaternion2& b);
+
 	/////////////////////////
 	// Quaternion inspection
 	/////////////////////////
diff --git a/indra/llmath/llquaternion2.inl b/indra/llmath/llquaternion2.inl
index 2a6987552d2949dcf392c1c00603ff70a97a86ea..52d67620f13d40a037fa986809900a3a8bc3517d 100644
--- a/indra/llmath/llquaternion2.inl
+++ b/indra/llmath/llquaternion2.inl
@@ -50,6 +50,39 @@ inline LLVector4a& LLQuaternion2::getVector4aRw()
 	return mQ;
 }
 
+inline void LLQuaternion2::mul(const LLQuaternion2& b)	// In place: mQ = prod1 + prod2 + prod3 - prod4, with lane 3 (VW) of prod2+prod3 negated first. 'A' below is this quaternion, 'B' is 'b'.
+{
+	static LL_ALIGN_16(const unsigned int signMask[4]) = { 0x0, 0x0, 0x0, 0x80000000 };	// sign bit set for the fourth element only
+
+	LLVector4a sum1, sum2, prod1, prod2, prod3, prod4;
+	const LLVector4a& va = mQ;
+	const LLVector4a& vb = b.getVector4a();
+
+	//			[VX] [VY] [VZ] [VW]
+	//prod1:	+wx  +wy  +wz  +ww  Bwwww*Axyzw 
+	//prod2:	+xw  +yw  +zw  -xx  Bxyzx*Awwwx		[VW] sign flip
+	//prod3:	+yz  +zx  +xy  -yy  Byzxy*Azxyy		[VW] sign flip
+	//prod4:	-zy  -xz  -yx  -zz  Bzxyz*Ayzxz		(signs come from the setSub below)
+
+	const LLVector4a Bwwww = _mm_shuffle_ps(vb,vb,_MM_SHUFFLE(3,3,3,3));
+	const LLVector4a Bxyzx = _mm_shuffle_ps(vb,vb,_MM_SHUFFLE(0,2,1,0));
+	const LLVector4a Awwwx = _mm_shuffle_ps(va,va,_MM_SHUFFLE(0,3,3,3));
+	const LLVector4a Byzxy = _mm_shuffle_ps(vb,vb,_MM_SHUFFLE(1,0,2,1));
+	const LLVector4a Azxyy = _mm_shuffle_ps(va,va,_MM_SHUFFLE(1,1,0,2));
+	const LLVector4a Bzxyz = _mm_shuffle_ps(vb,vb,_MM_SHUFFLE(2,1,0,2));
+	const LLVector4a Ayzxz = _mm_shuffle_ps(va,va,_MM_SHUFFLE(2,0,2,1));
+
+	prod1.setMul(Bwwww,va);
+	prod2.setMul(Bxyzx,Awwwx);
+	prod3.setMul(Byzxy,Azxyy);
+	prod4.setMul(Bzxyz,Ayzxz);
+
+	sum1.setAdd(prod2,prod3);
+	sum1 = _mm_xor_ps(sum1, _mm_load_ps((const float*)signMask));	// negate lane 3 only: VW becomes -(xx + yy)
+	sum2.setSub(prod1,prod4);	// VW becomes ww - zz
+	mQ.setAdd(sum1,sum2);	// all shuffles and products were taken from the old mQ (via va) before this overwrite
+}
+
 /////////////////////////
 // Quaternion modification
 /////////////////////////
diff --git a/indra/llmath/llvector4a.h b/indra/llmath/llvector4a.h
index dd1a71efe9528fea7f7ac6a65f2f25976827f800..e553119a37207511714833ff6ef1084bd2a84d88 100644
--- a/indra/llmath/llvector4a.h
+++ b/indra/llmath/llvector4a.h
@@ -128,7 +128,7 @@ public:
 	inline void loadua(const F32* src);
 	
 	// Load only three floats beginning at address 'src'. Slowest method.
-	inline void load3(const F32* src);
+	inline void load3(const F32* src, const F32 w=0.f);
 	
 	// Store to a 16-byte aligned memory address
 	inline void store4a(F32* dst) const;
@@ -285,6 +285,8 @@ public:
 	void quantize8( const LLVector4a& low, const LLVector4a& high );
 	void quantize16( const LLVector4a& low, const LLVector4a& high );
 
+	void negate();
+
 	////////////////////////////////////
 	// LOGICAL
 	////////////////////////////////////	
diff --git a/indra/llmath/llvector4a.inl b/indra/llmath/llvector4a.inl
index f7062b6e367aecc343b3d2b3f5bb1497b3603e72..4eba273471ebd5d5f7db062cc837b82edbc9c0c3 100644
--- a/indra/llmath/llvector4a.inl
+++ b/indra/llmath/llvector4a.inl
@@ -41,11 +41,11 @@ inline void LLVector4a::loadua(const F32* src)
 }
 
 // Load only three floats beginning at address 'src'. Slowest method.
-inline void LLVector4a::load3(const F32* src)
+inline void LLVector4a::load3(const F32* src, const F32 w)	// w: value stored in the fourth lane (the declaration defaults it to 0.f)
 {
-	// mQ = { 0.f, src[2], src[1], src[0] } = { W, Z, Y, X }
+	// mQ = { w, src[2], src[1], src[0] } = { W, Z, Y, X }
 	// NB: This differs from the convention of { Z, Y, X, W }
-	mQ = _mm_set_ps(0.f, src[2], src[1], src[0]);
+	mQ = _mm_set_ps(w, src[2], src[1], src[0]);
 }	
 
 // Store to a 16-byte aligned memory address
@@ -536,6 +536,11 @@ inline void LLVector4a::clamp( const LLVector4a& low, const LLVector4a& high )
 	setSelectWithMask( lowMask, low, *this );
 }
 
+inline void LLVector4a::negate()	// In place: flips the sign bit of all four lanes of mQ.
+{
+	static LL_ALIGN_16(const U32 signMask[4]) = {0x80000000, 0x80000000, 0x80000000, 0x80000000 };	// sign bit only, in every element
+	mQ = _mm_xor_ps(*reinterpret_cast<const LLQuad*>(signMask), mQ);	// XOR with the sign bit toggles the sign and leaves the other bits untouched
+}
 
 ////////////////////////////////////
 // LOGICAL
diff --git a/indra/llmath/llvolume.cpp b/indra/llmath/llvolume.cpp
index c6ceaefcbe34326a9fb5dcd474e52bb544981e7c..8953d79c2ce5c90690132fabdedc6f4f2cf7c2f7 100644
--- a/indra/llmath/llvolume.cpp
+++ b/indra/llmath/llvolume.cpp
@@ -3661,15 +3661,12 @@ S32 LLVolume::getNumTriangles(S32* vcount) const
 void LLVolume::generateSilhouetteVertices(std::vector<LLVector3> &vertices,
 										  std::vector<LLVector3> &normals,
 										  const LLVector3& obj_cam_vec_in,
-										  const LLMatrix4& mat_in,
-										  const LLMatrix3& norm_mat_in,
+										  const LLMatrix4a& mat_in,
+										  const LLMatrix4a& norm_mat_in,
 										  S32 face_mask)
 {
-	LLMatrix4a mat;
-	mat.loadu(mat_in);
-
-	LLMatrix4a norm_mat;
-	norm_mat.loadu(norm_mat_in);
+	const LLMatrix4a& mat = mat_in;
+	const LLMatrix4a& norm_mat = norm_mat_in;
 		
 	LLVector4a obj_cam_vec;
 	obj_cam_vec.load3(obj_cam_vec_in.mV);
diff --git a/indra/llmath/llvolume.h b/indra/llmath/llvolume.h
index 7bb31f929cadd2385945ac6630f4312718fba50b..fac694cb4cfdd49281ca9d374bf84cdb2bfedb4c 100644
--- a/indra/llmath/llvolume.h
+++ b/indra/llmath/llvolume.h
@@ -1025,8 +1025,8 @@ public:
 	void generateSilhouetteVertices(std::vector<LLVector3> &vertices, 
 									std::vector<LLVector3> &normals, 
 									const LLVector3& view_vec,
-									const LLMatrix4& mat,
-									const LLMatrix3& norm_mat,
+									const LLMatrix4a& mat,
+									const LLMatrix4a& norm_mat,
 									S32 face_index);
 
 	//get the face index of the face that intersects with the given line segment at the point 
diff --git a/indra/newview/llface.cpp b/indra/newview/llface.cpp
index 241d5da4133a771b76e396fe87dbc51d1f5b03b5..6f09e54b3deba99f9c8ba0d7c985d4de95e30c43 100644
--- a/indra/newview/llface.cpp
+++ b/indra/newview/llface.cpp
@@ -578,7 +578,7 @@ void LLFace::renderSelected(LLViewerTexture *imagep, const LLColor4& color)
                     // called when selecting a face during edit of a mesh object
 					LLGLEnable offset(GL_POLYGON_OFFSET_FILL);
 					glPolygonOffset(-1.f, -1.f);
-					gGL.multMatrix((F32*) volume->getRelativeXform().mMatrix);
+					gGL.multMatrix(volume->getRelativeXform().getF32ptr());
 					const LLVolumeFace& vol_face = rigged->getVolumeFace(getTEOffset());
 					LLVertexBuffer::drawElements(LLRender::TRIANGLES, vol_face.mNumVertices, vol_face.mPositions, vol_face.mTexCoords, vol_face.mNumIndices, vol_face.mIndices);
 				}
@@ -807,14 +807,13 @@ bool less_than_max_mag(const LLVector4a& vec)
 }
 
 BOOL LLFace::genVolumeBBoxes(const LLVolume &volume, S32 f,
-								const LLMatrix4& mat_vert_in, BOOL global_volume)
+								const LLMatrix4a& mat_vert_in, BOOL global_volume)
 {
 	//get bounding box
 	if (mDrawablep->isState(LLDrawable::REBUILD_VOLUME | LLDrawable::REBUILD_POSITION | LLDrawable::REBUILD_RIGGED))
 	{
 		//VECTORIZE THIS
-		LLMatrix4a mat_vert;
-		mat_vert.loadu(mat_vert_in);
+		const LLMatrix4a& mat_vert = mat_vert_in;
 
 		LLVector4a min,max;
 	
diff --git a/indra/newview/llface.h b/indra/newview/llface.h
index 0997c186cc4d6b918696151c5bff3bef5c26d01f..cbe2834fb837a28fd8a151f82a7f91df7a8a6509 100644
--- a/indra/newview/llface.h
+++ b/indra/newview/llface.h
@@ -187,7 +187,7 @@ public:
 	void		setSize(S32 numVertices, S32 num_indices = 0, bool align = false);
 	
 	BOOL		genVolumeBBoxes(const LLVolume &volume, S32 f,
-									const LLMatrix4& mat_vert_in, BOOL global_volume = FALSE);
+									const LLMatrix4a& mat_vert_in, BOOL global_volume = FALSE);
 	
 	void		init(LLDrawable* drawablep, LLViewerObject* objp);
 	void		destroy();
@@ -233,7 +233,7 @@ public:
 	void	notifyAboutMissingAsset(LLViewerTexture *texture);
 
 public: //aligned members
-	LLVector4a		mExtents[2];
+	LL_ALIGN_16(LLVector4a		mExtents[2]);
 
 private:
 	F32         adjustPartialOverlapPixelArea(F32 cos_angle_to_view_dir, F32 radius );
diff --git a/indra/newview/llflexibleobject.cpp b/indra/newview/llflexibleobject.cpp
index db42fb161777484f7d421a05ed8fbd0d8b88449a..ea05fa04ca0b8d954880a325fecb0372b7af3627 100644
--- a/indra/newview/llflexibleobject.cpp
+++ b/indra/newview/llflexibleobject.cpp
@@ -905,32 +905,35 @@ LLQuaternion LLVolumeImplFlexible::getEndRotation()
 
 void LLVolumeImplFlexible::updateRelativeXform(bool force_identity)
 {
-	LLQuaternion delta_rot;
-	LLVector3 delta_pos, delta_scale;
+
 	LLVOVolume* vo = (LLVOVolume*) mVO;
 
 	bool use_identity = vo->mDrawable->isSpatialRoot() || force_identity;
 
+	vo->mRelativeXform.setIdentity();
+
 	//matrix from local space to parent relative/global space
-	delta_rot = use_identity ? LLQuaternion() : vo->mDrawable->getRotation();
-	delta_pos = use_identity ? LLVector3(0,0,0) : vo->mDrawable->getPosition();
-	delta_scale = LLVector3(1,1,1);
+	LLVector4a delta_pos;	// becomes row 3 of mRelativeXform after this if/else
+	LLQuaternion2 delta_rot;
+	if(use_identity)
+	{
+		delta_pos.set(0,0,0,1.f);	// zero translation with w = 1; mRelativeXform rows 0-2 stay as set by setIdentity() above
+		delta_rot.getVector4aRw() = delta_pos;	// raw (0,0,0,1); delta_rot is not read again on this path in the visible code
+	}
+	else
+	{
+		delta_pos.load3(vo->mDrawable->getPosition().mV,1.f);	// (px, py, pz, 1)
+		delta_rot.getVector4aRw().loadua(vo->mDrawable->getRotation().mQ);	// copy the drawable rotation's four floats into the SSE quaternion
+		vo->mRelativeXform.getRow<0>().setRotated(delta_rot,vo->mRelativeXform.getRow<0>());	// rotate each identity row in place. NOTE(review): needs getRow<N>() to return a mutable reference - confirm a non-const overload exists
+		vo->mRelativeXform.getRow<1>().setRotated(delta_rot,vo->mRelativeXform.getRow<1>());
+		vo->mRelativeXform.getRow<2>().setRotated(delta_rot,vo->mRelativeXform.getRow<2>());
+	}
 
-	// Vertex transform (4x4)
-	LLVector3 x_axis = LLVector3(delta_scale.mV[VX], 0.f, 0.f) * delta_rot;
-	LLVector3 y_axis = LLVector3(0.f, delta_scale.mV[VY], 0.f) * delta_rot;
-	LLVector3 z_axis = LLVector3(0.f, 0.f, delta_scale.mV[VZ]) * delta_rot;
+	vo->mRelativeXform.setRow<3>(delta_pos);
 
-	vo->mRelativeXform.initRows(LLVector4(x_axis, 0.f),
-							LLVector4(y_axis, 0.f),
-							LLVector4(z_axis, 0.f),
-							LLVector4(delta_pos, 1.f));
-			
-	x_axis.normVec();
-	y_axis.normVec();
-	z_axis.normVec();
-	
-	vo->mRelativeXformInvTrans.setRows(x_axis, y_axis, z_axis);
+	vo->mRelativeXformInvTrans = vo->mRelativeXform;
+	vo->mRelativeXformInvTrans.invert();
+	vo->mRelativeXformInvTrans.transpose();
 }
 
 const LLMatrix4& LLVolumeImplFlexible::getWorldMatrix(LLXformMatrix* xform) const
diff --git a/indra/newview/llselectmgr.cpp b/indra/newview/llselectmgr.cpp
index 61eb159ae24097468c0f8db14b1ef29ecdcff9b9..a3c82450653fd745cbda2cdcbc7f0af8f6210fb8 100644
--- a/indra/newview/llselectmgr.cpp
+++ b/indra/newview/llselectmgr.cpp
@@ -6257,7 +6257,7 @@ void pushWireframe(LLDrawable* drawable)
 	{
 		LLVertexBuffer::unbind();
 		gGL.pushMatrix();
-		gGL.multMatrix((F32*) vobj->getRelativeXform().mMatrix);
+		gGL.multMatrix(vobj->getRelativeXform().getF32ptr());
 
 		LLVolume* volume = NULL;
 
diff --git a/indra/newview/llspatialpartition.cpp b/indra/newview/llspatialpartition.cpp
index 0d33447fc53e3ce695d454d2364f458469085f97..298c57ac58e7f7b432bc933097aa1550bcd2eccf 100644
--- a/indra/newview/llspatialpartition.cpp
+++ b/indra/newview/llspatialpartition.cpp
@@ -2078,7 +2078,7 @@ void renderNormals(LLDrawable* drawablep)
 	{
 		LLVolume* volume = vol->getVolume();
 		gGL.pushMatrix();
-		gGL.multMatrix((F32*) vol->getRelativeXform().mMatrix);
+		gGL.multMatrix(vol->getRelativeXform().getF32ptr());
 		
 		gGL.getTexUnit(0)->unbind(LLTexUnit::TT_TEXTURE);
 
@@ -2230,7 +2230,7 @@ void renderPhysicsShape(LLDrawable* drawable, LLVOVolume* volume)
 	LLVector3 size(0.25f,0.25f,0.25f);
 
 	gGL.pushMatrix();
-	gGL.multMatrix((F32*) volume->getRelativeXform().mMatrix);
+	gGL.multMatrix(volume->getRelativeXform().getF32ptr());
 		
 	if (type == LLPhysicsShapeBuilderUtil::PhysicsShapeSpecification::USER_MESH)
 	{
@@ -2963,7 +2963,7 @@ void renderRaycast(LLDrawable* drawablep)
 					
 					gGL.pushMatrix();
 					gGL.translatef(trans.mV[0], trans.mV[1], trans.mV[2]);					
-					gGL.multMatrix((F32*) vobj->getRelativeXform().mMatrix);
+					gGL.multMatrix(vobj->getRelativeXform().getF32ptr());
 
 					LLVector4a start, end;
 					if (transform)
diff --git a/indra/newview/llviewercamera.cpp b/indra/newview/llviewercamera.cpp
index 50398ffeb15d411d3b282c1a5272113866b3431a..fb24dbe31c4570ac1ad3684c2758c3d0d3742338 100644
--- a/indra/newview/llviewercamera.cpp
+++ b/indra/newview/llviewercamera.cpp
@@ -716,14 +716,12 @@ BOOL LLViewerCamera::areVertsVisible(LLViewerObject* volumep, BOOL all_verts)
 	LLVOVolume* vo_volume = (LLVOVolume*) volumep;
 
 	vo_volume->updateRelativeXform();
-	LLMatrix4 mat = vo_volume->getRelativeXform();
 	
 	LLMatrix4 render_mat(vo_volume->getRenderRotation(), LLVector4(vo_volume->getRenderPosition()));
 
 	LLMatrix4a render_mata;
 	render_mata.loadu(render_mat);
-	LLMatrix4a mata;
-	mata.loadu(mat);
+	const LLMatrix4a& mata = vo_volume->getRelativeXform();	// bound by reference: getRelativeXform() now returns const LLMatrix4a&, so no copy or loadu is needed
 
 	num_faces = volume->getNumVolumeFaces();
 	for (i = 0; i < num_faces; i++)
diff --git a/indra/newview/llvovolume.cpp b/indra/newview/llvovolume.cpp
index b72228121ef3993a8911f9d3c83e15fa8ec41dee..ef50150f71c170d1a7981c5aa9dff57a74ab075d 100644
--- a/indra/newview/llvovolume.cpp
+++ b/indra/newview/llvovolume.cpp
@@ -1522,93 +1522,53 @@ void LLVOVolume::updateRelativeXform(bool force_identity)
 	{ //rigged volume (which is in agent space) is used for generating bounding boxes etc
 	  //inverse of render matrix should go to partition space
 		mRelativeXform = getRenderMatrix();
-
-		F32* dst = (F32*) mRelativeXformInvTrans.mMatrix;
-		F32* src = (F32*) mRelativeXform.mMatrix;
-		dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2];
-		dst[3] = src[4]; dst[4] = src[5]; dst[5] = src[6];
-		dst[6] = src[8]; dst[7] = src[9]; dst[8] = src[10];
-		
+		mRelativeXformInvTrans = mRelativeXform;
 		mRelativeXform.invert();
 		mRelativeXformInvTrans.transpose();
 	}
 	else if (drawable->isActive() || force_identity)
 	{				
 		// setup relative transforms
-		LLQuaternion delta_rot;
-		LLVector3 delta_pos, delta_scale;
-		
 		//matrix from local space to parent relative/global space
 		bool use_identity = force_identity || drawable->isSpatialRoot();
-		delta_rot = use_identity ? LLQuaternion() : mDrawable->getRotation();
-		delta_pos = use_identity ? LLVector3(0,0,0) : mDrawable->getPosition();
-		delta_scale = mDrawable->getScale();
-
-		// Vertex transform (4x4)
-		LLVector3 x_axis = LLVector3(delta_scale.mV[VX], 0.f, 0.f) * delta_rot;
-		LLVector3 y_axis = LLVector3(0.f, delta_scale.mV[VY], 0.f) * delta_rot;
-		LLVector3 z_axis = LLVector3(0.f, 0.f, delta_scale.mV[VZ]) * delta_rot;
-
-		mRelativeXform.initRows(LLVector4(x_axis, 0.f),
-								LLVector4(y_axis, 0.f),
-								LLVector4(z_axis, 0.f),
-								LLVector4(delta_pos, 1.f));
 
-		
-		// compute inverse transpose for normals
-		// mRelativeXformInvTrans.setRows(x_axis, y_axis, z_axis);
-		// mRelativeXformInvTrans.invert(); 
-		// mRelativeXformInvTrans.setRows(x_axis, y_axis, z_axis);
-		// grumble - invert is NOT a matrix invert, so we do it by hand:
-
-		LLMatrix3 rot_inverse = LLMatrix3(~delta_rot);
-
-		LLMatrix3 scale_inverse;
-		scale_inverse.setRows(LLVector3(1.0, 0.0, 0.0) / delta_scale.mV[VX],
-							  LLVector3(0.0, 1.0, 0.0) / delta_scale.mV[VY],
-							  LLVector3(0.0, 0.0, 1.0) / delta_scale.mV[VZ]);
-							   
-		
-		mRelativeXformInvTrans = rot_inverse * scale_inverse;
+		if(use_identity)
+		{
+			mRelativeXform.setIdentity();
+			mRelativeXform.applyScale_affine(mDrawable->getScale());
+		}
+		else
+		{
+			mRelativeXform = LLQuaternion2(mDrawable->getRotation());
+			mRelativeXform.applyScale_affine(mDrawable->getScale());
+			mRelativeXform.setTranslate_affine(mDrawable->getPosition());
+		}
 
+		mRelativeXformInvTrans = mRelativeXform;
+		mRelativeXformInvTrans.invert();
 		mRelativeXformInvTrans.transpose();
 	}
 	else
 	{
-		LLVector3 pos = getPosition();
-		LLVector3 scale = getScale();
-		LLQuaternion rot = getRotation();
-	
+		LLVector4a pos;
+		pos.load3(getPosition().mV);
+		LLQuaternion2 rot(getRotation());
 		if (mParent)
 		{
-			pos *= mParent->getRotation();
-			pos += mParent->getPosition();
-			rot *= mParent->getRotation();
+			LLMatrix4a lrot = LLQuaternion2(mParent->getRotation());
+			lrot.rotate(pos,pos);
+			LLVector4a lpos;
+			lpos.load3(mParent->getPosition().mV);
+			pos.add(lpos);
+			rot.mul(LLQuaternion2(mParent->getRotation()));
 		}
-		
-		//LLViewerRegion* region = getRegion();
-		//pos += region->getOriginAgent();
-		
-		LLVector3 x_axis = LLVector3(scale.mV[VX], 0.f, 0.f) * rot;
-		LLVector3 y_axis = LLVector3(0.f, scale.mV[VY], 0.f) * rot;
-		LLVector3 z_axis = LLVector3(0.f, 0.f, scale.mV[VZ]) * rot;
-
-		mRelativeXform.initRows(LLVector4(x_axis, 0.f),
-								LLVector4(y_axis, 0.f),
-								LLVector4(z_axis, 0.f),
-								LLVector4(pos, 1.f));
-
-		// compute inverse transpose for normals
-		LLMatrix3 rot_inverse = LLMatrix3(~rot);
-
-		LLMatrix3 scale_inverse;
-		scale_inverse.setRows(LLVector3(1.0, 0.0, 0.0) / scale.mV[VX],
-							  LLVector3(0.0, 1.0, 0.0) / scale.mV[VY],
-							  LLVector3(0.0, 0.0, 1.0) / scale.mV[VZ]);
-							   
-		
-		mRelativeXformInvTrans = rot_inverse * scale_inverse;
 
+		mRelativeXform = rot;
+		mRelativeXform.applyScale_affine(getScale());
+		mRelativeXform.setTranslate_affine(LLVector3(pos.getF32ptr()));
+
+		mRelativeXformInvTrans = mRelativeXform;
+		mRelativeXformInvTrans.invert();
 		mRelativeXformInvTrans.transpose();
 	}
 }
@@ -3285,10 +3245,10 @@ void LLVOVolume::generateSilhouette(LLSelectNode* nodep, const LLVector3& view_p
 		}
 		
 		updateRelativeXform();
-		LLMatrix4 trans_mat = mRelativeXform;
+		LLMatrix4a trans_mat = mRelativeXform;
 		if (mDrawable->isStatic())
 		{
-			trans_mat.translate(getRegion()->getOriginAgent());
+			trans_mat.translate_affine(getRegion()->getOriginAgent());
 		}
 
 		volume->generateSilhouetteVertices(nodep->mSilhouetteVertices, nodep->mSilhouetteNormals, view_vector, trans_mat, mRelativeXformInvTrans, nodep->getTESelectMask());
@@ -5554,11 +5514,8 @@ void LLVolumeGeometryManager::rebuildMesh(LLSpatialGroup* group)
 						{
 							llassert(!face->isState(LLFace::RIGGED));
 
-							LLMatrix4a temprelxformmat(vobj->getRelativeXform());
-							LLMatrix4a temprelxformmatinv; 
-							temprelxformmatinv.loadu(vobj->getRelativeXformInvTrans());
 							if (!face->getGeometryVolume(*volume, face->getTEOffset(), 
-								temprelxformmat, temprelxformmatinv, face->getGeomIndex()))
+								vobj->getRelativeXform(), vobj->getRelativeXformInvTrans(), face->getGeomIndex()))
 							{ //something's gone wrong with the vertex buffer accounting, rebuild this group 
 								group->dirtyGeom();
 								gPipeline.markRebuild(group, TRUE);
@@ -6496,11 +6453,8 @@ void LLVolumeGeometryManager::genDrawInfo(LLSpatialGroup* group, U32 mask, LLFac
 				U32 te_idx = facep->getTEOffset();
 
 				llassert(!facep->isState(LLFace::RIGGED));
-				LLMatrix4a temprelxformmat(vobj->getRelativeXform());
-				LLMatrix4a temprelxformmatinv;
-				temprelxformmatinv.loadu(vobj->getRelativeXformInvTrans());
 				if (!facep->getGeometryVolume(*volume, te_idx,
-					temprelxformmat, temprelxformmatinv, index_offset,true))
+					vobj->getRelativeXform(), vobj->getRelativeXformInvTrans(), index_offset,true))
 				{
 					LL_WARNS() << "Failed to get geometry for face!" << LL_ENDL;
 				}
diff --git a/indra/newview/llvovolume.h b/indra/newview/llvovolume.h
index 6eb04f649ce8f9ce30f733a3ebbf0541eb7b0715..edd983b498318d192a63a60fa71e1dd544f1f1b7 100644
--- a/indra/newview/llvovolume.h
+++ b/indra/newview/llvovolume.h
@@ -27,6 +27,7 @@
 #ifndef LL_LLVOVOLUME_H
 #define LL_LLVOVOLUME_H
 
+#include "llmemory.h"
 #include "llviewerobject.h"
 #include "llviewertexture.h"
 #include "llviewermedia.h"
@@ -126,8 +127,8 @@ public:
 	/*virtual*/	BOOL	setParent(LLViewerObject* parent) override;
 				S32		getLOD() const override { return mLOD; }
 	const LLVector3		getPivotPositionAgent() const override;
-	const LLMatrix4&	getRelativeXform() const				{ return mRelativeXform; }
-	const LLMatrix3&	getRelativeXformInvTrans() const		{ return mRelativeXformInvTrans; }
+	const LLMatrix4a&	getRelativeXform() const				{ return mRelativeXform; }
+	const LLMatrix4a&	getRelativeXformInvTrans() const		{ return mRelativeXformInvTrans; }
 	/*virtual*/	const LLMatrix4	getRenderMatrix() const override;
 				typedef std::map<LLUUID, S32> texture_cost_t;
 				U32 	getRenderCost(texture_cost_t &textures) const;
@@ -361,8 +362,8 @@ private:
 	BOOL		mLODChanged;
 	BOOL		mSculptChanged;
 	F32			mSpotLightPriority;
-	LLMatrix4	mRelativeXform;
-	LLMatrix3	mRelativeXformInvTrans;
+	LL_ALIGN_16(LLMatrix4a	mRelativeXform);
+	LL_ALIGN_16(LLMatrix4a	mRelativeXformInvTrans);
 	BOOL		mVolumeChanged;
 	F32			mVObjRadius;
 	LLVolumeInterface *mVolumeImpl;