Skip to content
Snippets Groups Projects
Commit d41a5ea3 authored by Rye Mutt's avatar Rye Mutt :bread:
Browse files

SSE optimizations

parent f8ccb750
No related branches found
No related tags found
No related merge requests found
...@@ -237,41 +237,20 @@ inline void LLVector4a::mul(const F32 x) ...@@ -237,41 +237,20 @@ inline void LLVector4a::mul(const F32 x)
// Set this to (a x b) (geometric cross-product) // Set this to (a x b) (geometric cross-product)
inline void LLVector4a::setCross3(const LLVector4a& a, const LLVector4a& b) inline void LLVector4a::setCross3(const LLVector4a& a, const LLVector4a& b)
{ {
// Vectors are stored in memory in w, z, y, x order from high to low LLQuad tmp0 = _mm_shuffle_ps(b.mQ, b.mQ, _MM_SHUFFLE(3, 0, 2, 1));
// Set vector1 = { a[W], a[X], a[Z], a[Y] } LLQuad tmp1 = _mm_shuffle_ps(a.mQ, a.mQ, _MM_SHUFFLE(3, 0, 2, 1));
const LLQuad vector1 = _mm_shuffle_ps( a.mQ, a.mQ, _MM_SHUFFLE( 3, 0, 2, 1 )); tmp0 = _mm_mul_ps(tmp0, a.mQ);
// Set vector2 = { b[W], b[Y], b[X], b[Z] } tmp1 = _mm_mul_ps(tmp1, b.mQ);
const LLQuad vector2 = _mm_shuffle_ps( b.mQ, b.mQ, _MM_SHUFFLE( 3, 1, 0, 2 )); LLQuad tmp2 = _mm_sub_ps(tmp0, tmp1);
// mQ = { a[W]*b[W], a[X]*b[Y], a[Z]*b[X], a[Y]*b[Z] } mQ = _mm_shuffle_ps(tmp2, tmp2, _MM_SHUFFLE(3, 0, 2, 1));
mQ = _mm_mul_ps( vector1, vector2 ); }
// vector3 = { a[W], a[Y], a[X], a[Z] }
const LLQuad vector3 = _mm_shuffle_ps( a.mQ, a.mQ, _MM_SHUFFLE( 3, 1, 0, 2 ));
// vector4 = { b[W], b[X], b[Z], b[Y] }
const LLQuad vector4 = _mm_shuffle_ps( b.mQ, b.mQ, _MM_SHUFFLE( 3, 0, 2, 1 ));
// mQ = { 0, a[X]*b[Y] - a[Y]*b[X], a[Z]*b[X] - a[X]*b[Z], a[Y]*b[Z] - a[Z]*b[Y] }
mQ = _mm_sub_ps( mQ, _mm_mul_ps( vector3, vector4 ));
}
/* This function works, but may be slightly slower than the one below on older machines
inline void LLVector4a::setAllDot3(const LLVector4a& a, const LLVector4a& b)
{
// ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] }
const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ );
// yzxw = { a[W]*b[W], a[Z]*b[Z], a[X]*b[X], a[Y]*b[Y] }
const LLQuad wzxy = _mm_shuffle_ps( ab, ab, _MM_SHUFFLE(3, 2, 0, 1 ));
// xPlusY = { 2*a[W]*b[W], 2 * a[Z] * b[Z], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] }
const LLQuad xPlusY = _mm_add_ps(ab, wzxy);
// xPlusYSplat = { a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] }
const LLQuad xPlusYSplat = _mm_movelh_ps(xPlusY, xPlusY);
// zSplat = { a[Z]*b[Z], a[Z]*b[Z], a[Z]*b[Z], a[Z]*b[Z] }
const LLQuad zSplat = _mm_shuffle_ps( ab, ab, _MM_SHUFFLE( 2, 2, 2, 2 ));
// mQ = { a[Z] * b[Z] + a[Y] * b[Y] + a[X] * b[X], same, same, same }
mQ = _mm_add_ps(zSplat, xPlusYSplat);
}*/
// Set all elements to the dot product of the x, y, and z elements in a and b // Set all elements to the dot product of the x, y, and z elements in a and b
inline void LLVector4a::setAllDot3(const LLVector4a& a, const LLVector4a& b) inline void LLVector4a::setAllDot3(const LLVector4a& a, const LLVector4a& b)
{ {
#if AL_AVX
mQ = _mm_dp_ps(a.mQ, b.mQ, 0x7f);
#else
// ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] } // ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] }
const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ ); const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ );
// yzxw = { a[W]*b[W], a[Z]*b[Z], a[X]*b[X], a[Y]*b[Y] } // yzxw = { a[W]*b[W], a[Z]*b[Z], a[X]*b[X], a[Y]*b[Y] }
...@@ -284,11 +263,15 @@ inline void LLVector4a::setAllDot3(const LLVector4a& a, const LLVector4a& b) ...@@ -284,11 +263,15 @@ inline void LLVector4a::setAllDot3(const LLVector4a& a, const LLVector4a& b)
const __m128i zSplat = _mm_shuffle_epi32(_mm_castps_si128(ab), _MM_SHUFFLE( 2, 2, 2, 2 )); const __m128i zSplat = _mm_shuffle_epi32(_mm_castps_si128(ab), _MM_SHUFFLE( 2, 2, 2, 2 ));
// mQ = { a[Z] * b[Z] + a[Y] * b[Y] + a[X] * b[X], same, same, same } // mQ = { a[Z] * b[Z] + a[Y] * b[Y] + a[X] * b[X], same, same, same }
mQ = _mm_add_ps(_mm_castsi128_ps(zSplat), xPlusYSplat); mQ = _mm_add_ps(_mm_castsi128_ps(zSplat), xPlusYSplat);
#endif
} }
// Set all elements to the dot product of the x, y, z, and w elements in a and b // Set all elements to the dot product of the x, y, z, and w elements in a and b
inline void LLVector4a::setAllDot4(const LLVector4a& a, const LLVector4a& b) inline void LLVector4a::setAllDot4(const LLVector4a& a, const LLVector4a& b)
{ {
#if AL_AVX
mQ = _mm_dp_ps(a.mQ, b.mQ, 0xff);
#else
// ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] } // ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] }
const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ ); const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ );
// yzxw = { a[W]*b[W], a[Z]*b[Z], a[X]*b[X], a[Y]*b[Y] } // yzxw = { a[W]*b[W], a[Z]*b[Z], a[X]*b[X], a[Y]*b[Y] }
...@@ -301,21 +284,29 @@ inline void LLVector4a::setAllDot4(const LLVector4a& a, const LLVector4a& b) ...@@ -301,21 +284,29 @@ inline void LLVector4a::setAllDot4(const LLVector4a& a, const LLVector4a& b)
// mQ = { a[W]*b[W] + a[Z] * b[Z] + a[Y] * b[Y] + a[X] * b[X], same, same, same } // mQ = { a[W]*b[W] + a[Z] * b[Z] + a[Y] * b[Y] + a[X] * b[X], same, same, same }
mQ = _mm_add_ps(xPlusYSplat, zPlusWSplat); mQ = _mm_add_ps(xPlusYSplat, zPlusWSplat);
#endif
} }
// Return the 3D dot product of this vector and b // Return the 3D dot product of this vector and b
inline LLSimdScalar LLVector4a::dot3(const LLVector4a& b) const inline LLSimdScalar LLVector4a::dot3(const LLVector4a& b) const
{ {
#if AL_AVX
return _mm_dp_ps(mQ, b.mQ, 0x7f);
#else
const LLQuad ab = _mm_mul_ps( mQ, b.mQ ); const LLQuad ab = _mm_mul_ps( mQ, b.mQ );
const LLQuad splatY = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128(ab), _MM_SHUFFLE(1, 1, 1, 1) ) ); const LLQuad splatY = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128(ab), _MM_SHUFFLE(1, 1, 1, 1) ) );
const LLQuad splatZ = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128(ab), _MM_SHUFFLE(2, 2, 2, 2) ) ); const LLQuad splatZ = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128(ab), _MM_SHUFFLE(2, 2, 2, 2) ) );
const LLQuad xPlusY = _mm_add_ps( ab, splatY ); const LLQuad xPlusY = _mm_add_ps( ab, splatY );
return _mm_add_ps( xPlusY, splatZ ); return _mm_add_ps( xPlusY, splatZ );
#endif
} }
// Return the 4D dot product of this vector and b // Return the 4D dot product of this vector and b
inline LLSimdScalar LLVector4a::dot4(const LLVector4a& b) const inline LLSimdScalar LLVector4a::dot4(const LLVector4a& b) const
{ {
#if AL_AVX
return _mm_dp_ps(mQ, b.mQ, 0xff);
#else
// ab = { w, z, y, x } // ab = { w, z, y, x }
const LLQuad ab = _mm_mul_ps( mQ, b.mQ ); const LLQuad ab = _mm_mul_ps( mQ, b.mQ );
// upperProdsInLowerElems = { y, x, y, x } // upperProdsInLowerElems = { y, x, y, x }
...@@ -325,6 +316,7 @@ inline LLSimdScalar LLVector4a::dot4(const LLVector4a& b) const ...@@ -325,6 +316,7 @@ inline LLSimdScalar LLVector4a::dot4(const LLVector4a& b) const
// shuffled = { z+x, z+x, z+x, z+x } // shuffled = { z+x, z+x, z+x, z+x }
const LLQuad shuffled = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( sumOfPairs ), _MM_SHUFFLE(1, 1, 1, 1) ) ); const LLQuad shuffled = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( sumOfPairs ), _MM_SHUFFLE(1, 1, 1, 1) ) );
return _mm_add_ss( sumOfPairs, shuffled ); return _mm_add_ss( sumOfPairs, shuffled );
#endif
} }
// Normalize this vector with respect to the x, y, and z components only. Accurate to 22 bites of precision. W component is destroyed // Normalize this vector with respect to the x, y, and z components only. Accurate to 22 bites of precision. W component is destroyed
...@@ -432,11 +424,10 @@ inline void LLVector4a::normalize3fast_checked(LLVector4a* d) ...@@ -432,11 +424,10 @@ inline void LLVector4a::normalize3fast_checked(LLVector4a* d)
// Return true if this vector is normalized with respect to x,y,z up to tolerance // Return true if this vector is normalized with respect to x,y,z up to tolerance
inline LLBool32 LLVector4a::isNormalized3( F32 tolerance ) const inline LLBool32 LLVector4a::isNormalized3( F32 tolerance ) const
{ {
static LL_ALIGN_16(const U32 ones[4]) = { 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 };
LLSimdScalar tol = _mm_load_ss( &tolerance ); LLSimdScalar tol = _mm_load_ss( &tolerance );
tol = _mm_mul_ss( tol, tol ); tol = _mm_mul_ss( tol, tol );
LLVector4a lenSquared; lenSquared.setAllDot3( *this, *this ); LLVector4a lenSquared; lenSquared.setAllDot3( *this, *this );
lenSquared.sub( *reinterpret_cast<const LLVector4a*>(ones) ); lenSquared.sub( _mm_set1_ps(1.f) );
lenSquared.setAbs(lenSquared); lenSquared.setAbs(lenSquared);
return _mm_comile_ss( lenSquared, tol ); return _mm_comile_ss( lenSquared, tol );
} }
...@@ -444,11 +435,10 @@ inline LLBool32 LLVector4a::isNormalized3( F32 tolerance ) const ...@@ -444,11 +435,10 @@ inline LLBool32 LLVector4a::isNormalized3( F32 tolerance ) const
// Return true if this vector is normalized with respect to all components up to tolerance // Return true if this vector is normalized with respect to all components up to tolerance
inline LLBool32 LLVector4a::isNormalized4( F32 tolerance ) const inline LLBool32 LLVector4a::isNormalized4( F32 tolerance ) const
{ {
static LL_ALIGN_16(const U32 ones[4]) = { 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 };
LLSimdScalar tol = _mm_load_ss( &tolerance ); LLSimdScalar tol = _mm_load_ss( &tolerance );
tol = _mm_mul_ss( tol, tol ); tol = _mm_mul_ss( tol, tol );
LLVector4a lenSquared; lenSquared.setAllDot4( *this, *this ); LLVector4a lenSquared; lenSquared.setAllDot4( *this, *this );
lenSquared.sub( *reinterpret_cast<const LLVector4a*>(ones) ); lenSquared.sub(_mm_set1_ps(1.f));
lenSquared.setAbs(lenSquared); lenSquared.setAbs(lenSquared);
return _mm_comile_ss( lenSquared, tol ); return _mm_comile_ss( lenSquared, tol );
} }
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment