From 05a23f8dbaa45c64bcf6c55dd09a468ba2b1f144 Mon Sep 17 00:00:00 2001
From: Dave Parks <davep@lindenlab.com>
Date: Fri, 21 May 2010 04:49:12 -0500
Subject: [PATCH] Vectorized memcpy, 16-byte aligned vertex buffers, and (almost)
 fully vectorized avatar vertex buffer updating; index buffers still need
 to be vectorized.

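For reviewers: the new copy paths assume 16-byte aligned source and
destination pointers and vertex counts padded to a multiple of 4, which is
why buffers are now allocated with _mm_malloc(..., 16) and num_vertices is
rounded up in updateFaceSizes(). A minimal sketch of the kind of aligned SSE
copy this relies on (illustrative only, not the actual memcpyNonAliased16
implementation):

    #include <xmmintrin.h>

    // Copy num_floats floats (assumed to be a multiple of 4) between
    // 16-byte aligned, non-overlapping buffers using aligned SSE loads
    // and stores.
    static void copy_aligned_sse(float* __restrict dst,
                                 const float* __restrict src,
                                 unsigned num_floats)
    {
        for (unsigned i = 0; i < num_floats; i += 4)
        {
            _mm_store_ps(dst + i, _mm_load_ps(src + i));
        }
    }
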
---
 indra/llrender/llvertexbuffer.cpp   | 195 +++++++++++++++++++++++++++-
 indra/llrender/llvertexbuffer.h     |   7 +-
 indra/newview/lldrawpoolavatar.cpp  |   4 +-
 indra/newview/llpolymesh.cpp        |  25 ++--
 indra/newview/llviewerjointmesh.cpp | 169 +++++++++---------------
 5 files changed, 273 insertions(+), 127 deletions(-)

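Reviewer note (the helper name below is illustrative, not viewer API): the
allocation pattern used throughout this patch pads the byte count up to a
multiple of 16 and requests 16-byte alignment, so the resulting buffers can
be read and written with aligned SSE instructions and must be released with
_mm_free():

    #include <cstddef>
    #include <xmmintrin.h>

    // Allocate 'count' floats in storage padded to a 16-byte multiple and
    // aligned to 16 bytes; release with _mm_free().
    static float* alloc_floats_aligned16(size_t count)
    {
        size_t bytes = (count * sizeof(float) + 0xF) & ~(size_t)0xF;
        return (float*) _mm_malloc(bytes, 16);
    }
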
diff --git a/indra/llrender/llvertexbuffer.cpp b/indra/llrender/llvertexbuffer.cpp
index 7fa47cd171b..a50eb7211cf 100644
--- a/indra/llrender/llvertexbuffer.cpp
+++ b/indra/llrender/llvertexbuffer.cpp
@@ -39,6 +39,7 @@
 #include "llglheaders.h"
 #include "llmemtype.h"
 #include "llrender.h"
+#include "llvector4a.h"
 
 //============================================================================
 
@@ -66,6 +67,27 @@ S32	LLVertexBuffer::sWeight4Loc = -1;
 
 std::vector<U32> LLVertexBuffer::sDeleteList;
 
+#define LL_ALIGNED_VB 1
+
+#if LL_ALIGNED_VB
+
+S32 LLVertexBuffer::sTypeOffsets[LLVertexBuffer::TYPE_MAX] =
+{
+	sizeof(LLVector4), // TYPE_VERTEX,
+	sizeof(LLVector4), // TYPE_NORMAL,
+	sizeof(LLVector2), // TYPE_TEXCOORD0,
+	sizeof(LLVector2), // TYPE_TEXCOORD1,
+	sizeof(LLVector2), // TYPE_TEXCOORD2,
+	sizeof(LLVector2), // TYPE_TEXCOORD3,
+	sizeof(LLColor4U), // TYPE_COLOR,
+	sizeof(LLVector4), // TYPE_BINORMAL,
+	sizeof(F32),	   // TYPE_WEIGHT,
+	sizeof(LLVector4), // TYPE_WEIGHT4,
+	sizeof(LLVector4), // TYPE_CLOTHWEIGHT,
+};
+
+#else
+
 S32 LLVertexBuffer::sTypeOffsets[LLVertexBuffer::TYPE_MAX] =
 {
 	sizeof(LLVector3), // TYPE_VERTEX,
@@ -81,6 +103,8 @@ S32 LLVertexBuffer::sTypeOffsets[LLVertexBuffer::TYPE_MAX] =
 	sizeof(LLVector4), // TYPE_CLOTHWEIGHT,
 };
 
+#endif
+
 U32 LLVertexBuffer::sGLMode[LLRender::NUM_MODES] = 
 {
 	GL_TRIANGLES,
@@ -428,11 +452,41 @@ LLVertexBuffer::LLVertexBuffer(U32 typemask, S32 usage) :
 
 	mTypeMask = typemask;
 	mStride = stride;
+	mAlignedOffset = 0;
+
 	sCount++;
 }
 
+#if LL_ALIGNED_VB
+//static
+S32 LLVertexBuffer::calcStride(const U32& typemask, S32* offsets, S32 num_vertices)
+{
+	S32 offset = 0;
+	for (S32 i=0; i<TYPE_MAX; i++)
+	{
+		U32 mask = 1<<i;
+		if (typemask & mask)
+		{
+			if (offsets)
+			{
+				offsets[i] = offset;
+				offset += LLVertexBuffer::sTypeOffsets[i]*num_vertices;
+				offset = (offset + 0xF) & ~0xF;
+			}
+		}
+	}
+
+	return offset+16;
+}
+
+S32 LLVertexBuffer::getSize() const
+{
+	return mStride;
+}
+
+#else
 //static
-S32 LLVertexBuffer::calcStride(const U32& typemask, S32* offsets)
+S32 LLVertexBuffer::calcStride(const U32& typemask, S32* offsets, S32 num_vertices)
 {
 	S32 stride = 0;
 	for (S32 i=0; i<TYPE_MAX; i++)
@@ -451,6 +505,12 @@ S32 LLVertexBuffer::calcStride(const U32& typemask, S32* offsets)
 	return stride;
 }
 
+S32 LLVertexBuffer::getSize() const
+{ 
+	return mNumVerts*mStride; 
+}
+
+#endif
 // protected, use unref()
 //virtual
 LLVertexBuffer::~LLVertexBuffer()
@@ -560,7 +620,7 @@ void LLVertexBuffer::createGLBuffer()
 	{
 		static int gl_buffer_idx = 0;
 		mGLBuffer = ++gl_buffer_idx;
-		mMappedData = new U8[size];
+		mMappedData = (U8*) _mm_malloc(size, 16);
 		memset(mMappedData, 0, size);
 	}
 }
@@ -612,7 +672,7 @@ void LLVertexBuffer::destroyGLBuffer()
 		}
 		else
 		{
-			delete [] mMappedData;
+			_mm_free(mMappedData);
 			mMappedData = NULL;
 			mEmpty = TRUE;
 		}
@@ -664,7 +724,7 @@ void LLVertexBuffer::updateNumVerts(S32 nverts)
 	}
 
 	mRequestedNumVerts = nverts;
-	
+
 	if (!mDynamicSize)
 	{
 		mNumVerts = nverts;
@@ -679,6 +739,9 @@ void LLVertexBuffer::updateNumVerts(S32 nverts)
 		}
 		mNumVerts = nverts;
 	}
+#if LL_ALIGNED_VB
+	mStride = calcStride(mTypeMask, mOffsets, mNumVerts);
+#endif
 
 }
 
@@ -886,7 +949,11 @@ U8* LLVertexBuffer::mapBuffer(S32 access)
 			setBuffer(0);
 			mLocked = TRUE;
 			stop_glerror();	
-			mMappedData = (U8*) glMapBufferARB(GL_ARRAY_BUFFER_ARB, GL_WRITE_ONLY_ARB);
+
+			U8* src = (U8*) glMapBufferARB(GL_ARRAY_BUFFER_ARB, GL_WRITE_ONLY_ARB);
+			mMappedData = LL_NEXT_ALIGNED_ADDRESS<U8>(src);
+			mAlignedOffset = mMappedData - src;
+			
 			stop_glerror();
 		}
 		{
@@ -975,6 +1042,45 @@ void LLVertexBuffer::unmapBuffer()
 
 //----------------------------------------------------------------------------
 
+#if LL_ALIGNED_VB
+
+template <class T,S32 type> struct VertexBufferStrider
+{
+	typedef LLStrider<T> strider_t;
+	static bool get(LLVertexBuffer& vbo, 
+					strider_t& strider, 
+					S32 index)
+	{
+		if (vbo.mapBuffer() == NULL)
+		{
+			llwarns << "mapBuffer failed!" << llendl;
+			return FALSE;
+		}
+
+		if (type == LLVertexBuffer::TYPE_INDEX)
+		{
+			S32 stride = sizeof(T);
+			strider = (T*)(vbo.getMappedIndices() + index*stride);
+			strider.setStride(0);
+			return TRUE;
+		}
+		else if (vbo.hasDataType(type))
+		{
+			S32 stride = LLVertexBuffer::sTypeOffsets[type];
+			strider = (T*)(vbo.getMappedData() + vbo.getOffset(type)+index*stride);
+			strider.setStride(stride);
+			return TRUE;
+		}
+		else
+		{
+			llerrs << "VertexBufferStrider could not find valid vertex data." << llendl;
+		}
+		return FALSE;
+	}
+};
+
+#else
+
 template <class T,S32 type> struct VertexBufferStrider
 {
 	typedef LLStrider<T> strider_t;
@@ -1010,6 +1116,7 @@ template <class T,S32 type> struct VertexBufferStrider
 	}
 };
 
+#endif
 
 bool LLVertexBuffer::getVertexStrider(LLStrider<LLVector3>& strider, S32 index)
 {
@@ -1272,6 +1379,82 @@ void LLVertexBuffer::setBuffer(U32 data_mask)
 	}
 }
 
+#if LL_ALIGNED_VB
+
+// virtual (default)
+void LLVertexBuffer::setupVertexBuffer(U32 data_mask) const
+{
+	LLMemType mt2(LLMemType::MTYPE_VERTEX_SETUP_VERTEX_BUFFER);
+	stop_glerror();
+	U8* base = useVBOs() ? (U8*) mAlignedOffset : mMappedData;
+
+	if ((data_mask & mTypeMask) != data_mask)
+	{
+		llerrs << "LLVertexBuffer::setupVertexBuffer missing required components for supplied data mask." << llendl;
+	}
+
+
+	if (data_mask & MAP_NORMAL)
+	{
+		glNormalPointer(GL_FLOAT, LLVertexBuffer::sTypeOffsets[TYPE_NORMAL], (void*)(base + mOffsets[TYPE_NORMAL]));
+	}
+	if (data_mask & MAP_TEXCOORD3)
+	{
+		glClientActiveTextureARB(GL_TEXTURE3_ARB);
+		glTexCoordPointer(2,GL_FLOAT, LLVertexBuffer::sTypeOffsets[TYPE_TEXCOORD3], (void*)(base + mOffsets[TYPE_TEXCOORD3]));
+		glClientActiveTextureARB(GL_TEXTURE0_ARB);
+	}
+	if (data_mask & MAP_TEXCOORD2)
+	{
+		glClientActiveTextureARB(GL_TEXTURE2_ARB);
+		glTexCoordPointer(2,GL_FLOAT, LLVertexBuffer::sTypeOffsets[TYPE_TEXCOORD2], (void*)(base + mOffsets[TYPE_TEXCOORD2]));
+		glClientActiveTextureARB(GL_TEXTURE0_ARB);
+	}
+	if (data_mask & MAP_TEXCOORD1)
+	{
+		glClientActiveTextureARB(GL_TEXTURE1_ARB);
+		glTexCoordPointer(2,GL_FLOAT, LLVertexBuffer::sTypeOffsets[TYPE_TEXCOORD1], (void*)(base + mOffsets[TYPE_TEXCOORD1]));
+		glClientActiveTextureARB(GL_TEXTURE0_ARB);
+	}
+	if (data_mask & MAP_BINORMAL)
+	{
+		glClientActiveTextureARB(GL_TEXTURE2_ARB);
+		glTexCoordPointer(3,GL_FLOAT, LLVertexBuffer::sTypeOffsets[TYPE_BINORMAL], (void*)(base + mOffsets[TYPE_BINORMAL]));
+		glClientActiveTextureARB(GL_TEXTURE0_ARB);
+	}
+	if (data_mask & MAP_TEXCOORD0)
+	{
+		glTexCoordPointer(2,GL_FLOAT, LLVertexBuffer::sTypeOffsets[TYPE_TEXCOORD0], (void*)(base + mOffsets[TYPE_TEXCOORD0]));
+	}
+	if (data_mask & MAP_COLOR)
+	{
+		glColorPointer(4, GL_UNSIGNED_BYTE, LLVertexBuffer::sTypeOffsets[TYPE_COLOR], (void*)(base + mOffsets[TYPE_COLOR]));
+	}
+	
+	if (data_mask & MAP_WEIGHT)
+	{
+		glVertexAttribPointerARB(1, 1, GL_FLOAT, FALSE, LLVertexBuffer::sTypeOffsets[TYPE_WEIGHT], (void*)(base + mOffsets[TYPE_WEIGHT]));
+	}
+
+	if (data_mask & MAP_WEIGHT4 && sWeight4Loc != -1)
+	{
+		glVertexAttribPointerARB(sWeight4Loc, 4, GL_FLOAT, FALSE, LLVertexBuffer::sTypeOffsets[TYPE_WEIGHT4], (void*)(base+mOffsets[TYPE_WEIGHT4]));
+	}
+
+	if (data_mask & MAP_CLOTHWEIGHT)
+	{
+		glVertexAttribPointerARB(4, 4, GL_FLOAT, TRUE,  LLVertexBuffer::sTypeOffsets[TYPE_CLOTHWEIGHT], (void*)(base + mOffsets[TYPE_CLOTHWEIGHT]));
+	}
+	if (data_mask & MAP_VERTEX)
+	{
+		glVertexPointer(3,GL_FLOAT, LLVertexBuffer::sTypeOffsets[TYPE_VERTEX], (void*)(base + 0));
+	}
+
+	llglassertok();
+}
+
+#else
+
 // virtual (default)
 void LLVertexBuffer::setupVertexBuffer(U32 data_mask) const
 {
@@ -1344,6 +1527,8 @@ void LLVertexBuffer::setupVertexBuffer(U32 data_mask) const
 	llglassertok();
 }
 
+#endif
+
 void LLVertexBuffer::markDirty(U32 vert_index, U32 vert_count, U32 indices_index, U32 indices_count)
 {
 	// TODO: use GL_APPLE_flush_buffer_range here
diff --git a/indra/llrender/llvertexbuffer.h b/indra/llrender/llvertexbuffer.h
index d1700aa54ab..03799af9789 100644
--- a/indra/llrender/llvertexbuffer.h
+++ b/indra/llrender/llvertexbuffer.h
@@ -98,7 +98,7 @@ class LLVertexBuffer : public LLRefCount
 	//if offsets is not NULL, its contents will be filled
 	//with the offset of each vertex component in the buffer, 
 	// indexed by the following enum
-	static S32 calcStride(const U32& typemask, S32* offsets = NULL); 										
+	static S32 calcStride(const U32& typemask, S32* offsets = NULL, S32 num_vertices = 0); 										
 
 	enum {
 		TYPE_VERTEX,
@@ -192,7 +192,7 @@ class LLVertexBuffer : public LLRefCount
 	S32 getStride() const					{ return mStride; }
 	U32 getTypeMask() const					{ return mTypeMask; }
 	BOOL hasDataType(S32 type) const		{ return ((1 << type) & getTypeMask()) ? TRUE : FALSE; }
-	S32 getSize() const						{ return mNumVerts*mStride; }
+	S32 getSize() const;
 	S32 getIndicesSize() const				{ return mNumIndices * sizeof(U16); }
 	U8* getMappedData() const				{ return mMappedData; }
 	U8* getMappedIndices() const			{ return mMappedIndexData; }
@@ -213,6 +213,7 @@ class LLVertexBuffer : public LLRefCount
 	S32		mRequestedNumVerts;  // Number of vertices requested
 	S32		mRequestedNumIndices;  // Number of indices requested
 
+	ptrdiff_t mAlignedOffset;
 	S32		mStride;
 	U32		mTypeMask;
 	S32		mUsage;			// GL usage
@@ -227,7 +228,7 @@ class LLVertexBuffer : public LLRefCount
 	S32		mOffsets[TYPE_MAX];
 	BOOL	mResized;		// if TRUE, client buffer has been resized and GL buffer has not
 	BOOL	mDynamicSize;	// if TRUE, buffer has been resized at least once (and should be padded)
-
+	
 	class DirtyRegion
 	{
 	public:
diff --git a/indra/newview/lldrawpoolavatar.cpp b/indra/newview/lldrawpoolavatar.cpp
index d1f4be71f55..1e9053239d1 100644
--- a/indra/newview/lldrawpoolavatar.cpp
+++ b/indra/newview/lldrawpoolavatar.cpp
@@ -1542,7 +1542,7 @@ LLVertexBufferAvatar::LLVertexBufferAvatar()
 
 void LLVertexBufferAvatar::setupVertexBuffer(U32 data_mask) const
 {
-	if (sRenderingSkinned)
+/*	if (sRenderingSkinned)
 	{
 		U8* base = useVBOs() ? NULL : mMappedData;
 
@@ -1562,7 +1562,7 @@ void LLVertexBufferAvatar::setupVertexBuffer(U32 data_mask) const
 			set_vertex_clothing_weights(LLDrawPoolAvatar::sVertexProgram->mAttribute[LLViewerShaderMgr::AVATAR_CLOTHING], mStride, (LLVector4*)(base + mOffsets[TYPE_CLOTHWEIGHT]));
 		}
 	}
-	else
+	else*/
 	{
 		LLVertexBuffer::setupVertexBuffer(data_mask);
 	}
diff --git a/indra/newview/llpolymesh.cpp b/indra/newview/llpolymesh.cpp
index b8bdbfb2f81..98c01913974 100644
--- a/indra/newview/llpolymesh.cpp
+++ b/indra/newview/llpolymesh.cpp
@@ -140,7 +140,7 @@ void LLPolyMeshSharedData::freeMeshData()
 		delete [] mDetailTexCoords;
 		mDetailTexCoords = NULL;
 
-		delete [] mWeights;
+		_mm_free(mWeights);
 		mWeights = NULL;
 	}
 
@@ -230,7 +230,7 @@ BOOL LLPolyMeshSharedData::allocateVertexData( U32 numVertices )
 	mBaseBinormals = new LLVector3[ numVertices ];
 	mTexCoords = new LLVector2[ numVertices ];
 	mDetailTexCoords = new LLVector2[ numVertices ];
-	mWeights = new F32[ numVertices ];
+	mWeights = (F32*) _mm_malloc((numVertices*sizeof(F32)+0xF) & ~0xF, 16);
 	for (i = 0; i < numVertices; i++)
 	{
 		mWeights[i] = 0.f;
@@ -717,13 +717,20 @@ LLPolyMesh::LLPolyMesh(LLPolyMeshSharedData *shared_data, LLPolyMesh *reference_
 		//use aligned vertex data to make LLPolyMesh SSE friendly
 		mVertexData = (F32*) _mm_malloc(nfloats*4, 16);
 		int offset = 0;
-		mCoords = 				(LLVector4*)(mVertexData + offset); offset += 4*nverts;
-		mNormals = 				(LLVector4*)(mVertexData + offset); offset += 4*nverts;
-		mScaledNormals = 		(LLVector3*)(mVertexData + offset); offset += 3*nverts;
-		mBinormals = 			(LLVector3*)(mVertexData + offset); offset += 3*nverts;
-		mScaledBinormals = 		(LLVector3*)(mVertexData + offset); offset += 3*nverts;
-		mTexCoords = 			(LLVector2*)(mVertexData + offset); offset += 2*nverts;
-		mClothingWeights = 	(LLVector4*)(mVertexData + offset); offset += 4*nverts;
+
+		//all members must be 16-byte aligned except the last 3
+		mCoords				= 	(LLVector4*)(mVertexData + offset); offset += 4*nverts;
+		mNormals			=	(LLVector4*)(mVertexData + offset); offset += 4*nverts;
+		mClothingWeights	= 	(LLVector4*)(mVertexData + offset); offset += 4*nverts;
+		mTexCoords			= 	(LLVector2*)(mVertexData + offset); offset += 2*nverts;
+
+		// these members don't need to be 16-byte aligned, but the first one might be
+		// read during an aligned memcpy of mTexCoords
+		mScaledNormals		=	(LLVector3*)(mVertexData + offset); offset += 3*nverts;
+		mBinormals			=	(LLVector3*)(mVertexData + offset); offset += 3*nverts;
+		mScaledBinormals	=	(LLVector3*)(mVertexData + offset); offset += 3*nverts;
+		
+		
 #else
 		mCoords = new LLVector3[mSharedData->mNumVertices];
 		mNormals = new LLVector3[mSharedData->mNumVertices];
diff --git a/indra/newview/llviewerjointmesh.cpp b/indra/newview/llviewerjointmesh.cpp
index 236ad98d686..a7e7bfadd69 100644
--- a/indra/newview/llviewerjointmesh.cpp
+++ b/indra/newview/llviewerjointmesh.cpp
@@ -655,6 +655,9 @@ U32 LLViewerJointMesh::drawShape( F32 pixelArea, BOOL first_pass, BOOL is_dummy)
 //-----------------------------------------------------------------------------
 void LLViewerJointMesh::updateFaceSizes(U32 &num_vertices, U32& num_indices, F32 pixel_area)
 {
+	//bump num_vertices to next multiple of 4
+	num_vertices = (num_vertices + 0x3) & ~0x3;
+
 	// Do a pre-alloc pass to determine sizes of data.
 	if (mMesh && mValid)
 	{
@@ -677,6 +680,8 @@ static LLFastTimer::DeclareTimer FTM_AVATAR_FACE("Avatar Face");
 
 void LLViewerJointMesh::updateFaceData(LLFace *face, F32 pixel_area, BOOL damp_wind, bool terse_update)
 {
+	//IF THIS FUNCTION BREAKS, SEE LLPOLYMESH CONSTRUCTOR AND CHECK ALIGNMENT OF INPUT ARRAYS
+
 	mFace = face;
 
 	if (mFace->mVertexBuffer.isNull())
@@ -684,6 +689,16 @@ void LLViewerJointMesh::updateFaceData(LLFace *face, F32 pixel_area, BOOL damp_w
 		return;
 	}
 
+	LLDrawPool *poolp = mFace->getPool();
+	BOOL hardware_skinning = (poolp && poolp->getVertexShaderLevel() > 0) ? TRUE : FALSE;
+
+	if (!hardware_skinning && terse_update)
+	{ //no need to do terse updates if we're doing software vertex skinning,
+	  //since mMesh is being copied into mVertexBuffer every frame
+		return;
+	}
+
+
 	LLFastTimer t(FTM_AVATAR_FACE);
 
 	LLStrider<LLVector3> verticesp;
@@ -696,108 +711,52 @@ void LLViewerJointMesh::updateFaceData(LLFace *face, F32 pixel_area, BOOL damp_w
 	// Copy data into the faces from the polymesh data.
 	if (mMesh && mValid)
 	{
-		if (mMesh->getNumVertices())
+		const U32 num_verts = mMesh->getNumVertices();
+
+		if (num_verts)
 		{
-			stop_glerror();
 			face->getGeometryAvatar(verticesp, normalsp, tex_coordsp, vertex_weightsp, clothing_weightsp);
-			stop_glerror();
 			face->mVertexBuffer->getIndexStrider(indicesp);
-			stop_glerror();
 
 			verticesp += mMesh->mFaceVertexOffset;
-			tex_coordsp += mMesh->mFaceVertexOffset;
 			normalsp += mMesh->mFaceVertexOffset;
-			vertex_weightsp += mMesh->mFaceVertexOffset;
-			clothing_weightsp += mMesh->mFaceVertexOffset;
-
-			const U32* __restrict coords = (U32*) mMesh->getCoords();
-			const U32* __restrict tex_coords = (U32*) mMesh->getTexCoords();
-			const U32* __restrict normals = (U32*) mMesh->getNormals();
-			const U32* __restrict weights = (U32*) mMesh->getWeights();
-			const U32* __restrict cloth_weights = (U32*) mMesh->getClothingWeights();
-
-			const U32 num_verts = mMesh->getNumVertices();
-
-			U32 i = 0;
-
-			const U32 skip = verticesp.getSkip()/sizeof(U32);
+			
+			F32* v = (F32*) verticesp.get();
+			F32* n = (F32*) normalsp.get();
+			
+			U32 words = num_verts*4;
 
-			U32* __restrict v = (U32*) verticesp.get();
-			U32* __restrict n = (U32*) normalsp.get();
+			LLVector4a::memcpyNonAliased16(v, (F32*) mMesh->getCoords(), words);
+			LLVector4a::memcpyNonAliased16(n, (F32*) mMesh->getNormals(), words);
+						
 			
-			if (terse_update)
+			if (!terse_update)
 			{
-				for (S32 i = num_verts; i > 0; --i)
-				{
-					//morph target application only, only update positions and normals
-					v[0] = coords[0]; 
-					v[1] = coords[1]; 
-					v[2] = coords[2];		
-					coords += 4;
-					v += skip;
-				}
+				vertex_weightsp += mMesh->mFaceVertexOffset;
+				clothing_weightsp += mMesh->mFaceVertexOffset;
+				tex_coordsp += mMesh->mFaceVertexOffset;
+		
+				F32* tc = (F32*) tex_coordsp.get();
+				F32* vw = (F32*) vertex_weightsp.get();
+				F32* cw = (F32*) clothing_weightsp.get();	
 
-				for (S32 i = num_verts; i > 0; --i)
-				{
-					n[0] = normals[0]; 
-					n[1] = normals[1];
-					n[2] = normals[2];
-					normals += 4;
-					n += skip;
-				}
+				LLVector4a::memcpyNonAliased16(tc, (F32*) mMesh->getTexCoords(), num_verts*2);
+				LLVector4a::memcpyNonAliased16(vw, (F32*) mMesh->getWeights(), num_verts);	
+				LLVector4a::memcpyNonAliased16(cw, (F32*) mMesh->getClothingWeights(), num_verts*4);	
 			}
-			else
-			{
-
-				U32* __restrict tc = (U32*) tex_coordsp.get();
-				U32* __restrict vw = (U32*) vertex_weightsp.get();
-				U32* __restrict cw = (U32*) clothing_weightsp.get();
-				
-				do
-				{
-					v[0] = coords[0]; 
-					v[1] = coords[1]; 
-					v[2] = coords[2];		
-					coords += 4;
-					v += skip;
-
-					tc[0] = *(tex_coords++); 
-					tc[1] = *(tex_coords++);
-					tc += skip;
-
-					n[0] = normals[0]; 
-					n[1] = normals[1];
-					n[2] = normals[2];
-					normals += 4;
-					n += skip;
-
-					vw[0] = *(weights++);
-					vw += skip;
-
-					cw[0] = *(cloth_weights++);
-					cw[1] = *(cloth_weights++);
-					cw[2] = *(cloth_weights++);
-					cw[3] = *(cloth_weights++);
-					cw += skip;
-				}
-				while (++i < num_verts);
-
-				const U32 idx_count = mMesh->getNumFaces()*3;
 
-				indicesp += mMesh->mFaceIndexOffset;
+			const U32 idx_count = mMesh->getNumFaces()*3;
 
-				U16* __restrict idx = indicesp.get();
-				S32* __restrict src_idx = (S32*) mMesh->getFaces();
+			indicesp += mMesh->mFaceIndexOffset;
 
-				i = 0;
+			U16* __restrict idx = indicesp.get();
+			S32* __restrict src_idx = (S32*) mMesh->getFaces();	
 
-				const S32 offset = (S32) mMesh->mFaceVertexOffset;
+			const S32 offset = (S32) mMesh->mFaceVertexOffset;
 
-				do
-				{
-					*(idx++) = *(src_idx++)+offset;
-				}
-				while (++i < idx_count);
+			for (S32 i = 0; i < idx_count; ++i)
+			{
+				*(idx++) = *(src_idx++)+offset;
 			}
 		}
 	}
@@ -824,50 +783,44 @@ void LLViewerJointMesh::updateGeometryOriginal(LLFace *mFace, LLPolyMesh *mMesh)
 	buffer->getVertexStrider(o_vertices,  0);
 	buffer->getNormalStrider(o_normals,   0);
 
-	//F32 last_weight = F32_MAX;
-	LLMatrix4a gBlendMat;
+	F32* __restrict vert = o_vertices[0].mV;
+	F32* __restrict norm = o_normals[0].mV;
 
 	const F32* __restrict weights = mMesh->getWeights();
 	const LLVector4a* __restrict coords = (LLVector4a*) mMesh->getCoords();
 	const LLVector4a* __restrict normals = (LLVector4a*) mMesh->getNormals();
 
+	U32 offset = mMesh->mFaceVertexOffset*4;
+	vert += offset;
+	norm += offset;
+
 	for (U32 index = 0; index < mMesh->getNumVertices(); index++)
 	{
-		U32 bidx = index + mMesh->mFaceVertexOffset;
-		
-		// blend by first matrix
-		F32 w = weights[index]; 
-		
-		//LLVector4a coord;
-		//coord.load4a(coords[index].mV);
+		// equivalent to joint = floorf(weights[index]);
+		S32 joint = _mm_cvtt_ss2si(_mm_load_ss(weights+index));
+		F32 w = weights[index] - joint;		
 
-		//LLVector4a norm;
-		//norm.load4a(normals[index].mV);
+		LLMatrix4a gBlendMat;
 
-		S32 joint = llfloor(w);
-		w -= joint;
-				
-		if (w > 0.f)
+		if (w != 0.f)
 		{
-			// Try to keep all the accesses to the matrix data as close
-			// together as possible.  This function is a hot spot on the
-			// Mac. JC
+			// blend between matrices and apply
 			gBlendMat.setLerp(gJointMatAligned[joint+0],
 							  gJointMatAligned[joint+1], w);
 
 			LLVector4a res;
 			gBlendMat.affineTransform(coords[index], res);
-			o_vertices[bidx].setVec(res[0], res[1], res[2]);
+			res.store4a(vert+index*4);
 			gBlendMat.rotate(normals[index], res);
-			o_normals[bidx].setVec(res[0], res[1], res[2]);
+			res.store4a(norm+index*4);
 		}
 		else
 		{  // No lerp required in this case.
 			LLVector4a res;
 			gJointMatAligned[joint].affineTransform(coords[index], res);
-			o_vertices[bidx].setVec(res[0], res[1], res[2]);
+			res.store4a(vert+index*4);
 			gJointMatAligned[joint].rotate(normals[index], res);
-			o_normals[bidx].setVec(res[0], res[1], res[2]);
+			res.store4a(norm+index*4);
 		}
 	}
 
-- 
GitLab