Hey guys, I've done a "quick" first implementation of vector normalization and a dot product using SSE intrinsics, and I was wondering whether there is anything that could still be optimized further.

Here's my code:

/// 4-component float vector laid out as x, y, z, w.
///
/// The struct is 16-byte aligned so its four floats can be moved to and from
/// an SSE register with a single aligned load/store instead of being gathered
/// one scalar at a time.
///
/// Both operations treat the vector as a 3D quantity: the w component never
/// contributes to the dot product.
struct alignas(16) Vec4
{
    float x, y, z, w;

    /// Returns this vector scaled by the approximate reciprocal of its xyz
    /// length, with w forced to 1.0f.
    ///
    /// The scale factor comes from _mm_rsqrt_ps, which is a fast hardware
    /// approximation (Intel documents a maximum relative error of
    /// 1.5 * 2^-12), so the result is only approximately unit length.
    ///
    /// No special handling for a zero-length input: rsqrt(0) is +inf and
    /// 0 * inf is NaN, so x, y and z of the result are NaN in that case.
    Vec4 Normalize() const
    {
        const __m128 self = Load();

        // rsqrt(dot(v, v)) replicated into all four lanes.
        const __m128 inv_len = _mm_rsqrt_ps(Dot3(self, self));

        Vec4 result;
        // One aligned store writes all four lanes; lane 3 (w * inv_len) is
        // overwritten on the next line.
        _mm_store_ps(&result.x, _mm_mul_ps(self, inv_len));
        result.w = 1.0f;
        return result;
    }

    /// Returns x*v2.x + y*v2.y + z*v2.z. The w components are ignored.
    float Dot(const Vec4 &v2) const
    {
        // Every lane holds the same sum; read lane 0.
        return _mm_cvtss_f32(Dot3(Load(), v2.Load()));
    }

private:
    /// Loads x, y, z, w into lanes 0..3 of an SSE register with one aligned
    /// load. Relies on the 16-byte alignment of the struct and on the four
    /// floats being contiguous (checked by the static_assert below).
    __m128 Load() const
    {
        return _mm_load_ps(&x);
    }

    /// Three-component dot product of a and b, replicated into all four lanes.
    static __m128 Dot3(__m128 a, __m128 b)
    {
#if defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(__clang__))
        // 0x7F = 0111 1111: the high nibble (0111) selects lanes x, y, z as
        // inputs to the sum, the low nibble (1111) writes the sum to all
        // four lanes of the result.
        return _mm_dp_ps(a, b, 0x7F);
#else
        // SSE1 fallback for builds without SSE4.1 enabled. Summation order is
        // (x + y) + z, matching what dpps does when the w product is masked
        // to zero.
        const __m128 prod = _mm_mul_ps(a, b);
        const __m128 px = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(0, 0, 0, 0));
        const __m128 py = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(1, 1, 1, 1));
        const __m128 pz = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(2, 2, 2, 2));
        return _mm_add_ps(_mm_add_ps(px, py), pz);
#endif
    }
};

static_assert(sizeof(Vec4) == 4 * sizeof(float),
              "Vec4 must be exactly four packed floats for the aligned load/store");

Thanks in advance