I have played around with SSE. I am using VS2012, and I wanted to know what was the fastest way to calculate the length of a vector.
The code looks very bad.
/// Returns the Euclidean length of this 4-component vector,
/// i.e. sqrt(x*x + y*y + z*z + w*w).
///
/// The implementation is chosen at build time:
///   - SSE && SSE_ASM : hand-written inline assembly (MSVC `__asm` syntax),
///   - SSE only       : SSE intrinsics operating on `components`,
///   - otherwise      : plain scalar arithmetic.
/// Exactly one branch is always compiled, so every configuration returns a value.
float Magnitude() const
{
#if SSE && SSE_ASM
    float result;
    // NOTE(review): the load below reads 16 bytes from the address of `this`
    // with MOVAPS, so it relies on the packed floats sitting at the very start
    // of the object and on the object being 16-byte aligned - confirm against
    // the class layout.
    __asm
    {
        MOV EAX, this                                 // EAX = address of this object
        MOVAPS XMM2, [EAX]                            // XMM2 = four packed floats (aligned load)
        MULPS XMM2, XMM2                              // square every lane
        MOVAPS XMM1, XMM2                             // keep a copy of the squares
        SHUFPS XMM2, XMM1, _MM_SHUFFLE(1, 0, 3, 2)    // swap the low and high lane pairs
        ADDPS XMM2, XMM1                              // lanes now hold pairwise sums (0+2, 1+3, ...)
        MOVAPS XMM1, XMM2                             // copy the partial sums
        SHUFPS XMM1, XMM1, _MM_SHUFFLE(0, 1, 0, 1)    // swap neighbouring lanes
        ADDPS XMM2, XMM1                              // every lane now holds the full sum of squares
        SQRTSS XMM2, XMM2                             // only lane 0 is stored, so a scalar sqrt suffices
        MOVSS [result], XMM2                          // write lane 0 to the local float
    }
    return result;
#elif SSE
    // Square each lane, then fold the four lanes together with two shuffle+add steps.
    const __m128 squares = _mm_mul_ps(components, components);
    const __m128 pairSums =
        _mm_add_ps(_mm_shuffle_ps(squares, squares, _MM_SHUFFLE(1, 0, 3, 2)), squares);
    const __m128 total =
        _mm_add_ps(pairSums, _mm_shuffle_ps(pairSums, pairSums, _MM_SHUFFLE(0, 1, 0, 1)));

    // Only lane 0 is returned, so take the scalar square root instead of four packed ones.
    float result;
    _mm_store_ss(&result, _mm_sqrt_ss(total));
    return result;
#else
    // Scalar fallback; also covers the SSE_ASM-without-SSE configuration,
    // which would otherwise leave the function without a return statement.
    return sqrtf(__x * __x + __y * __y + __z * __z + __w * __w);
#endif
}
Guess what: the normal method was as fast as the SSE intrinsics, while my assembly code was actually slower.
So yeah, you can't beat the compiler :)