Hey guys, I've done a "quick" first implementation of vector normalize and dot product using SSE intrinsics, and was wondering if there's anything that could be optimized further.
Here's my code:
// 4-component float vector declared with 16-byte alignment.
// Normalize() and Dot() only use x, y and z as input; the w component is
// masked out of the dot products.
_declspec(align(16))
struct Vec4
{
    float x, y, z, w;

    // Returns a copy of this vector with x, y, z scaled by an approximation
    // of 1 / sqrt(x*x + y*y + z*z). The result's w is always set to 1.0f.
    //
    // _mm_rsqrt_ps is a fast reciprocal-square-root approximation, so the
    // result is only accurate to roughly 11-12 bits. For a zero-length
    // vector it yields +inf, and 0 * inf produces NaN in x, y and z.
    inline Vec4 Normalize() const
    {
        // copy data into the 128bit register (x ends up in lane 0)
        const __m128 v = _mm_set_ps(w, z, y, x);

        // 0x7F = 0111 1111: the high nibble (0111) multiplies only x, y, z;
        // the low nibble (1111) writes the sum to all 4 lanes so it can be
        // used directly as a per-lane scale factor.
        const __m128 lenSq = _mm_dp_ps(v, v, 0x7F);

        // vec * rsqrt(dot(vec, vec))
        const __m128 scaled = _mm_mul_ps(v, _mm_rsqrt_ps(lenSq));

        // Store to a plain float array instead of reading the inactive
        // member of a union, which ISO C++ does not guarantee to work.
        float out[4];
        _mm_storeu_ps(out, scaled);

        Vec4 vec;
        vec.x = out[0];
        vec.y = out[1];
        vec.z = out[2];
        vec.w = 1.0f;
        return vec;
    }

    // Returns the 3-component dot product x*v2.x + y*v2.y + z*v2.z.
    // The w components of both vectors are ignored.
    inline float Dot(const Vec4 &v2) const
    {
        // copy data into the 128bit registers (x ends up in lane 0)
        const __m128 a = _mm_set_ps(w, z, y, x);
        const __m128 b = _mm_set_ps(v2.w, v2.z, v2.y, v2.x);

        // 0x71 = 0111 0001: multiply only x, y, z and write the sum to
        // lane 0, which is the only lane read below.
        const __m128 dp = _mm_dp_ps(a, b, 0x71);

        // extract lane 0 as a scalar
        return _mm_cvtss_f32(dp);
    }
};
Thanks in advance