/// Four-component float vector stored as plain floats but aligned to 16 bytes,
/// so the members can be loaded/stored as a single SSE register with the
/// aligned load/store intrinsics.
///
/// Requires SSE4.1 (`_mm_dp_ps`, declared in <smmintrin.h>).
struct alignas(16) Vec4 {
    float x, y, z, w;

    /// Returns a copy of this vector with (x, y, z) scaled by the reciprocal
    /// of their length and w forced to 1.0f. The object itself is not modified.
    ///
    /// The reciprocal square root comes from `_mm_rsqrt_ps`, which is a fast
    /// approximation, so the result is only approximately unit length.
    /// There is no guard for a zero-length input.
    inline Vec4 Normalize() const {
        const __m128 self = _mm_load_ps(&x);
        // Mask 0x7F: high nibble 0x7 multiplies only lanes x, y, z;
        // low nibble 0xF broadcasts the sum to all four lanes.
        const __m128 lengthSq = _mm_dp_ps(self, self, 0x7F);
        const __m128 invLength = _mm_rsqrt_ps(lengthSq);

        Vec4 vec;
        _mm_store_ps(&vec.x, _mm_mul_ps(self, invLength));
        vec.w = 1.0f;  // overwrite the scaled w lane
        return vec;
    }

    /// Returns the three-component dot product x*v2.x + y*v2.y + z*v2.z.
    /// The w components of both vectors are ignored (mask 0x7F).
    inline float Dot(const Vec4 &v2) const {
        const __m128 a = _mm_load_ps(&x);
        const __m128 b = _mm_load_ps(&v2.x);
        const __m128 dp = _mm_dp_ps(a, b, 0x7F);
        // Read lane 0 directly instead of storing to a stack temporary.
        return _mm_cvtss_f32(dp);
    }
};
// That should be a bit faster since it removes the unneeded register flushes
// and leverages the aligned load speeds, given that the class is 16-byte
// aligned. In effect, even though you are not using an __m128 for storage in
// the class, this is treating the class as one anyway.

NOTE: These two functions are full of wait states due to the latencies of the operations being performed. If you are doing batches of normalizations/dot products, running 2 or 4 at a time, interleaved, will effectively triple the throughput of the function. Given SSE4, you don't actually need the SoA (structure-of-arrays) data reorganization Hodge suggests; you just need to keep more than one operation in flight at a time.