Edit: VC2012 turns out to be almost twice as fast. The problem is that I'm forced to use VC2008 for this project.
Here is the disassembly of the new code:
// Normalize(): scales v in place by rsqrt(dot(v, v)), where the dot product
// uses only the x, y and z lanes (dpps mask 7Fh). Mixed source/disassembly
// listing, 32-bit x86, Intel operand order (destination first).
// v is read and written with movaps through the pointer received in ecx, so
// that address must be 16-byte aligned.
// NOTE(review): the 0CCCCCCCCh fill of the locals and the store/reload of every
// intermediate through the stack suggest an unoptimized (debug) build -- confirm
// the build configuration before drawing performance conclusions from this.
// NOTE(review): no guard against a zero-length v is visible here; confirm that
// callers never pass one.
inline const void Normalize()
{
// Prologue: keep the caller's esp in ebx, then round esp down to a 16-byte
// boundary so the frame (and the __m128 temporaries in it) is aligned.
00413480 push ebx
00413481 mov ebx,esp
00413483 sub esp,8
00413486 and esp,0FFFFFFF0h
00413489 add esp,4
0041348C push ebp
// [ebx+4] is the return address; it is copied to [esp+4], just above the
// saved ebp in the realigned frame.
0041348D mov ebp,dword ptr [ebx+4]
00413490 mov dword ptr [esp+4],ebp
// ebp is 16-byte aligned from here on; 158h bytes of locals are reserved.
00413494 mov ebp,esp
00413496 sub esp,158h
0041349C push esi
0041349D push edi
// ecx is saved around the fill because rep stos uses it as the count.
0041349E push ecx
// Fill all 56h dwords (= 158h bytes) of locals with 0CCCCCCCCh.
0041349F lea edi,[ebp-158h]
004134A5 mov ecx,56h
004134AA mov eax,0CCCCCCCCh
004134AF rep stos dword ptr es:[edi]
004134B1 pop ecx
// Store (security cookie XOR ebp) at [ebp-4]; it is checked before returning.
004134B2 mov eax,dword ptr [___security_cookie (42F0D0h)]
004134B7 xor eax,ebp
004134B9 mov dword ptr [ebp-4],eax
// Spill the incoming ecx to [ebp-0Ch]; every access to v goes through it.
// Presumably this is the `this` pointer with v at offset 0 -- confirm against
// the class declaration.
004134BC mov dword ptr [ebp-0Ch],ecx
// Mask 0x7F = 0111 1111b: the high nibble (0111) selects x, y and z for the
// multiply, excluding w; the low nibble (1111) writes the sum to all 4 lanes.
__m128 dp = _mm_dp_ps(v, v, 0x7F);
// v is loaded twice (once per operand), each time re-reading the pointer.
004134BF mov eax,dword ptr [ebp-0Ch]
004134C2 movaps xmm0,xmmword ptr [eax]
004134C5 mov ecx,dword ptr [ebp-0Ch]
004134C8 movaps xmm1,xmmword ptr [ecx]
004134CB dpps xmm1,xmm0,7Fh
// The result goes to a temporary at [ebp-150h] and is then copied into dp at
// [ebp-30h].
004134D1 movaps xmmword ptr [ebp-150h],xmm1
004134D8 movaps xmm0,xmmword ptr [ebp-150h]
004134DF movaps xmmword ptr [ebp-30h],xmm0
// Reciprocal square root of the dot product (rsqrtps is an approximation; see
// the instruction reference for its error bound).
dp = _mm_rsqrt_ps(dp);
// dp is read from memory; the result goes through a temporary at [ebp-130h]
// and back into dp.
004134E3 rsqrtps xmm0,xmmword ptr [ebp-30h]
004134E7 movaps xmmword ptr [ebp-130h],xmm0
004134EE movaps xmm0,xmmword ptr [ebp-130h]
004134F5 movaps xmmword ptr [ebp-30h],xmm0
// v = v * rsqrt(dot(v, v)); all four lanes, w included, are scaled.
v = _mm_mul_ps(v, dp);
// Reload dp and v, multiply, pass the product through a temporary at
// [ebp-110h], then store it back to v through the saved pointer.
004134F9 movaps xmm0,xmmword ptr [ebp-30h]
004134FD mov eax,dword ptr [ebp-0Ch]
00413500 movaps xmm1,xmmword ptr [eax]
00413503 mulps xmm1,xmm0
00413506 movaps xmmword ptr [ebp-110h],xmm1
0041350D mov ecx,dword ptr [ebp-0Ch]
00413510 movaps xmm0,xmmword ptr [ebp-110h]
00413517 movaps xmmword ptr [ecx],xmm0
}
// Epilogue: restore edi/esi, then recover the cookie (stored value XOR ebp)
// and pass it in ecx to __security_check_cookie.
0041351A pop edi
0041351B pop esi
0041351C mov ecx,dword ptr [ebp-4]
0041351F xor ecx,ebp
00413521 call @ILT+140(@__security_check_cookie@4) (411091h)
// Tear down the aligned frame, then restore the caller's esp from ebx.
00413526 mov esp,ebp
00413528 pop ebp
00413529 mov esp,ebx
0041352B pop ebx
0041352C ret