Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
Alchemy Viewer
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Iterations
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Locked files
Deploy
Releases
Package Registry
Operate
Terraform modules
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Silent mode is enabled
All outbound communications are blocked.
Learn more
.
Show more breadcrumbs
Alchemy Viewer
Alchemy Viewer
Commits
d41a5ea3
Commit
d41a5ea3
authored
4 years ago
by
Rye Mutt
Browse files
Options
Downloads
Patches
Plain Diff
SSE optimizations
parent
f8ccb750
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
indra/llmath/llvector4a.inl
+26
-36
26 additions, 36 deletions
indra/llmath/llvector4a.inl
with
26 additions
and
36 deletions
indra/llmath/llvector4a.inl
+
26
−
36
View file @
d41a5ea3
...
@@ -237,41 +237,20 @@ inline void LLVector4a::mul(const F32 x)
...
@@ -237,41 +237,20 @@ inline void LLVector4a::mul(const F32 x)
// Set this to (a x b) (geometric cross-product)
// Set this to (a x b) (geometric cross-product)
inline void LLVector4a::setCross3(const LLVector4a& a, const LLVector4a& b)
inline void LLVector4a::setCross3(const LLVector4a& a, const LLVector4a& b)
{
{
// Vectors are stored in memory in w, z, y, x order from high to low
LLQuad tmp0 = _mm_shuffle_ps(b.mQ, b.mQ, _MM_SHUFFLE(3, 0, 2, 1));
// Set vector1 = { a[W], a[X], a[Z], a[Y] }
LLQuad tmp1 = _mm_shuffle_ps(a.mQ, a.mQ, _MM_SHUFFLE(3, 0, 2, 1));
const LLQuad vector1 = _mm_shuffle_ps( a.mQ, a.mQ, _MM_SHUFFLE( 3, 0, 2, 1 ));
tmp0 = _mm_mul_ps(tmp0, a.mQ);
// Set vector2 = { b[W], b[Y], b[X], b[Z] }
tmp1 = _mm_mul_ps(tmp1, b.mQ);
const LLQuad vector2 = _mm_shuffle_ps( b.mQ, b.mQ, _MM_SHUFFLE( 3, 1, 0, 2 ));
LLQuad tmp2 = _mm_sub_ps(tmp0, tmp1);
// mQ = { a[W]*b[W], a[X]*b[Y], a[Z]*b[X], a[Y]*b[Z] }
mQ = _mm_shuffle_ps(tmp2, tmp2, _MM_SHUFFLE(3, 0, 2, 1));
mQ = _mm_mul_ps( vector1, vector2 );
}
// vector3 = { a[W], a[Y], a[X], a[Z] }
const LLQuad vector3 = _mm_shuffle_ps( a.mQ, a.mQ, _MM_SHUFFLE( 3, 1, 0, 2 ));
// vector4 = { b[W], b[X], b[Z], b[Y] }
const LLQuad vector4 = _mm_shuffle_ps( b.mQ, b.mQ, _MM_SHUFFLE( 3, 0, 2, 1 ));
// mQ = { 0, a[X]*b[Y] - a[Y]*b[X], a[Z]*b[X] - a[X]*b[Z], a[Y]*b[Z] - a[Z]*b[Y] }
mQ = _mm_sub_ps( mQ, _mm_mul_ps( vector3, vector4 ));
}
/* This function works, but may be slightly slower than the one below on older machines
inline void LLVector4a::setAllDot3(const LLVector4a& a, const LLVector4a& b)
{
// ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] }
const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ );
// wzxy = { a[W]*b[W], a[Z]*b[Z], a[X]*b[X], a[Y]*b[Y] }
const LLQuad wzxy = _mm_shuffle_ps( ab, ab, _MM_SHUFFLE(3, 2, 0, 1 ));
// xPlusY = { 2*a[W]*b[W], 2 * a[Z] * b[Z], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] }
const LLQuad xPlusY = _mm_add_ps(ab, wzxy);
// xPlusYSplat = { a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] }
const LLQuad xPlusYSplat = _mm_movelh_ps(xPlusY, xPlusY);
// zSplat = { a[Z]*b[Z], a[Z]*b[Z], a[Z]*b[Z], a[Z]*b[Z] }
const LLQuad zSplat = _mm_shuffle_ps( ab, ab, _MM_SHUFFLE( 2, 2, 2, 2 ));
// mQ = { a[Z] * b[Z] + a[Y] * b[Y] + a[X] * b[X], same, same, same }
mQ = _mm_add_ps(zSplat, xPlusYSplat);
}*/
// Set all elements to the dot product of the x, y, and z elements in a and b
// Set all elements to the dot product of the x, y, and z elements in a and b
inline void LLVector4a::setAllDot3(const LLVector4a& a, const LLVector4a& b)
inline void LLVector4a::setAllDot3(const LLVector4a& a, const LLVector4a& b)
{
{
#if AL_AVX
mQ = _mm_dp_ps(a.mQ, b.mQ, 0x7f);
#else
// ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] }
// ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] }
const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ );
const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ );
// yzxw = { a[W]*b[W], a[Z]*b[Z], a[X]*b[X], a[Y]*b[Y] }
// yzxw = { a[W]*b[W], a[Z]*b[Z], a[X]*b[X], a[Y]*b[Y] }
...
@@ -284,11 +263,15 @@ inline void LLVector4a::setAllDot3(const LLVector4a& a, const LLVector4a& b)
...
@@ -284,11 +263,15 @@ inline void LLVector4a::setAllDot3(const LLVector4a& a, const LLVector4a& b)
const __m128i zSplat = _mm_shuffle_epi32(_mm_castps_si128(ab), _MM_SHUFFLE( 2, 2, 2, 2 ));
const __m128i zSplat = _mm_shuffle_epi32(_mm_castps_si128(ab), _MM_SHUFFLE( 2, 2, 2, 2 ));
// mQ = { a[Z] * b[Z] + a[Y] * b[Y] + a[X] * b[X], same, same, same }
// mQ = { a[Z] * b[Z] + a[Y] * b[Y] + a[X] * b[X], same, same, same }
mQ = _mm_add_ps(_mm_castsi128_ps(zSplat), xPlusYSplat);
mQ = _mm_add_ps(_mm_castsi128_ps(zSplat), xPlusYSplat);
#endif
}
}
// Set all elements to the dot product of the x, y, z, and w elements in a and b
// Set all elements to the dot product of the x, y, z, and w elements in a and b
inline void LLVector4a::setAllDot4(const LLVector4a& a, const LLVector4a& b)
inline void LLVector4a::setAllDot4(const LLVector4a& a, const LLVector4a& b)
{
{
#if AL_AVX
mQ = _mm_dp_ps(a.mQ, b.mQ, 0xff);
#else
// ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] }
// ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] }
const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ );
const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ );
// yzxw = { a[W]*b[W], a[Z]*b[Z], a[X]*b[X], a[Y]*b[Y] }
// yzxw = { a[W]*b[W], a[Z]*b[Z], a[X]*b[X], a[Y]*b[Y] }
...
@@ -301,21 +284,29 @@ inline void LLVector4a::setAllDot4(const LLVector4a& a, const LLVector4a& b)
...
@@ -301,21 +284,29 @@ inline void LLVector4a::setAllDot4(const LLVector4a& a, const LLVector4a& b)
// mQ = { a[W]*b[W] + a[Z] * b[Z] + a[Y] * b[Y] + a[X] * b[X], same, same, same }
// mQ = { a[W]*b[W] + a[Z] * b[Z] + a[Y] * b[Y] + a[X] * b[X], same, same, same }
mQ = _mm_add_ps(xPlusYSplat, zPlusWSplat);
mQ = _mm_add_ps(xPlusYSplat, zPlusWSplat);
#endif
}
}
// Return the 3D dot product of this vector and b
// Return the 3D dot product of this vector and b
inline LLSimdScalar LLVector4a::dot3(const LLVector4a& b) const
inline LLSimdScalar LLVector4a::dot3(const LLVector4a& b) const
{
{
#if AL_AVX
return _mm_dp_ps(mQ, b.mQ, 0x7f);
#else
const LLQuad ab = _mm_mul_ps( mQ, b.mQ );
const LLQuad ab = _mm_mul_ps( mQ, b.mQ );
const LLQuad splatY = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128(ab), _MM_SHUFFLE(1, 1, 1, 1) ) );
const LLQuad splatY = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128(ab), _MM_SHUFFLE(1, 1, 1, 1) ) );
const LLQuad splatZ = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128(ab), _MM_SHUFFLE(2, 2, 2, 2) ) );
const LLQuad splatZ = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128(ab), _MM_SHUFFLE(2, 2, 2, 2) ) );
const LLQuad xPlusY = _mm_add_ps( ab, splatY );
const LLQuad xPlusY = _mm_add_ps( ab, splatY );
return _mm_add_ps( xPlusY, splatZ );
return _mm_add_ps( xPlusY, splatZ );
#endif
}
}
// Return the 4D dot product of this vector and b
// Return the 4D dot product of this vector and b
inline LLSimdScalar LLVector4a::dot4(const LLVector4a& b) const
inline LLSimdScalar LLVector4a::dot4(const LLVector4a& b) const
{
{
#if AL_AVX
return _mm_dp_ps(mQ, b.mQ, 0xff);
#else
// ab = { w, z, y, x }
// ab = { w, z, y, x }
const LLQuad ab = _mm_mul_ps( mQ, b.mQ );
const LLQuad ab = _mm_mul_ps( mQ, b.mQ );
// upperProdsInLowerElems = { y, x, y, x }
// upperProdsInLowerElems = { y, x, y, x }
...
@@ -325,6 +316,7 @@ inline LLSimdScalar LLVector4a::dot4(const LLVector4a& b) const
...
@@ -325,6 +316,7 @@ inline LLSimdScalar LLVector4a::dot4(const LLVector4a& b) const
// shuffled = { z+x, z+x, z+x, z+x }
// shuffled = { z+x, z+x, z+x, z+x }
const LLQuad shuffled = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( sumOfPairs ), _MM_SHUFFLE(1, 1, 1, 1) ) );
const LLQuad shuffled = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( sumOfPairs ), _MM_SHUFFLE(1, 1, 1, 1) ) );
return _mm_add_ss( sumOfPairs, shuffled );
return _mm_add_ss( sumOfPairs, shuffled );
#endif
}
}
// Normalize this vector with respect to the x, y, and z components only. Accurate to 22 bits of precision. W component is destroyed
// Normalize this vector with respect to the x, y, and z components only. Accurate to 22 bits of precision. W component is destroyed
...
@@ -432,11 +424,10 @@ inline void LLVector4a::normalize3fast_checked(LLVector4a* d)
...
@@ -432,11 +424,10 @@ inline void LLVector4a::normalize3fast_checked(LLVector4a* d)
// Return true if this vector is normalized with respect to x,y,z up to tolerance
// Return true if this vector is normalized with respect to x,y,z up to tolerance
inline LLBool32 LLVector4a::isNormalized3( F32 tolerance ) const
inline LLBool32 LLVector4a::isNormalized3( F32 tolerance ) const
{
{
static LL_ALIGN_16(const U32 ones[4]) = { 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 };
LLSimdScalar tol = _mm_load_ss( &tolerance );
LLSimdScalar tol = _mm_load_ss( &tolerance );
tol = _mm_mul_ss( tol, tol );
tol = _mm_mul_ss( tol, tol );
LLVector4a lenSquared; lenSquared.setAllDot3( *this, *this );
LLVector4a lenSquared; lenSquared.setAllDot3( *this, *this );
lenSquared.sub(
*reinterpret_cast<const LLVector4a*>(ones
) );
lenSquared.sub(
_mm_set1_ps(1.f
) );
lenSquared.setAbs(lenSquared);
lenSquared.setAbs(lenSquared);
return _mm_comile_ss( lenSquared, tol );
return _mm_comile_ss( lenSquared, tol );
}
}
...
@@ -444,11 +435,10 @@ inline LLBool32 LLVector4a::isNormalized3( F32 tolerance ) const
...
@@ -444,11 +435,10 @@ inline LLBool32 LLVector4a::isNormalized3( F32 tolerance ) const
// Return true if this vector is normalized with respect to all components up to tolerance
// Return true if this vector is normalized with respect to all components up to tolerance
inline LLBool32 LLVector4a::isNormalized4( F32 tolerance ) const
inline LLBool32 LLVector4a::isNormalized4( F32 tolerance ) const
{
{
static LL_ALIGN_16(const U32 ones[4]) = { 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 };
LLSimdScalar tol = _mm_load_ss( &tolerance );
LLSimdScalar tol = _mm_load_ss( &tolerance );
tol = _mm_mul_ss( tol, tol );
tol = _mm_mul_ss( tol, tol );
LLVector4a lenSquared; lenSquared.setAllDot4( *this, *this );
LLVector4a lenSquared; lenSquared.setAllDot4( *this, *this );
lenSquared.sub(
*reinterpret_cast<const LLVector4a*>(ones)
);
lenSquared.sub(
_mm_set1_ps(1.f)
);
lenSquared.setAbs(lenSquared);
lenSquared.setAbs(lenSquared);
return _mm_comile_ss( lenSquared, tol );
return _mm_comile_ss( lenSquared, tol );
}
}
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment