Jump to content

  • Log In with Google      Sign In   
  • Create Account

#Actuallipsryme

Posted 20 September 2013 - 02:58 PM

Edit: Holy cow, VC2012 is almost twice as fast... The problem is that I'm forced to use VC2008 for this project.

 

Here's the disassembly from the new code:

   inline const void Normalize()
    {
00413480  push        ebx  
00413481  mov         ebx,esp 
00413483  sub         esp,8 
00413486  and         esp,0FFFFFFF0h 
00413489  add         esp,4 
0041348C  push        ebp  
0041348D  mov         ebp,dword ptr [ebx+4] 
00413490  mov         dword ptr [esp+4],ebp 
00413494  mov         ebp,esp 
00413496  sub         esp,158h 
0041349C  push        esi  
0041349D  push        edi  
0041349E  push        ecx  
0041349F  lea         edi,[ebp-158h] 
004134A5  mov         ecx,56h 
004134AA  mov         eax,0CCCCCCCCh 
004134AF  rep stos    dword ptr es:[edi] 
004134B1  pop         ecx  
004134B2  mov         eax,dword ptr [___security_cookie (42F0D0h)] 
004134B7  xor         eax,ebp 
004134B9  mov         dword ptr [ebp-4],eax 
004134BC  mov         dword ptr [ebp-0Ch],ecx 
        // 0x7F = 0111 1111: the high nibble (0111) multiplies only x, y and z (w is excluded),
        // and the low nibble (1111) writes the resulting sum to all 4 components
        __m128 dp = _mm_dp_ps(v, v, 0x7F); 
004134BF  mov         eax,dword ptr [ebp-0Ch] 
004134C2  movaps      xmm0,xmmword ptr [eax] 
004134C5  mov         ecx,dword ptr [ebp-0Ch] 
004134C8  movaps      xmm1,xmmword ptr [ecx] 
004134CB  dpps        xmm1,xmm0,7Fh 
004134D1  movaps      xmmword ptr [ebp-150h],xmm1 
004134D8  movaps      xmm0,xmmword ptr [ebp-150h] 
004134DF  movaps      xmmword ptr [ebp-30h],xmm0 

        // compute rsqrt of the dot product
        dp = _mm_rsqrt_ps(dp);
004134E3  rsqrtps     xmm0,xmmword ptr [ebp-30h] 
004134E7  movaps      xmmword ptr [ebp-130h],xmm0 
004134EE  movaps      xmm0,xmmword ptr [ebp-130h] 
004134F5  movaps      xmmword ptr [ebp-30h],xmm0 

        // vec * rsqrt(dot(vec, vec))
        v = _mm_mul_ps(v, dp);
004134F9  movaps      xmm0,xmmword ptr [ebp-30h] 
004134FD  mov         eax,dword ptr [ebp-0Ch] 
00413500  movaps      xmm1,xmmword ptr [eax] 
00413503  mulps       xmm1,xmm0 
00413506  movaps      xmmword ptr [ebp-110h],xmm1 
0041350D  mov         ecx,dword ptr [ebp-0Ch] 
00413510  movaps      xmm0,xmmword ptr [ebp-110h] 
00413517  movaps      xmmword ptr [ecx],xmm0 
	  }
0041351A  pop         edi  
0041351B  pop         esi  
0041351C  mov         ecx,dword ptr [ebp-4] 
0041351F  xor         ecx,ebp 
00413521  call        @ILT+140(@__security_check_cookie@4) (411091h) 
00413526  mov         esp,ebp 
00413528  pop         ebp  
00413529  mov         esp,ebx 
0041352B  pop         ebx  
0041352C  ret              

#2lipsryme

Posted 20 September 2013 - 02:58 PM

Edit: Holy cow, VC2012 is almost twice as fast... The problem is that I'm kind of forced to use VC2008.

 

Here's the disassembly from the new code:

   inline const void Normalize()
    {
00413480  push        ebx  
00413481  mov         ebx,esp 
00413483  sub         esp,8 
00413486  and         esp,0FFFFFFF0h 
00413489  add         esp,4 
0041348C  push        ebp  
0041348D  mov         ebp,dword ptr [ebx+4] 
00413490  mov         dword ptr [esp+4],ebp 
00413494  mov         ebp,esp 
00413496  sub         esp,158h 
0041349C  push        esi  
0041349D  push        edi  
0041349E  push        ecx  
0041349F  lea         edi,[ebp-158h] 
004134A5  mov         ecx,56h 
004134AA  mov         eax,0CCCCCCCCh 
004134AF  rep stos    dword ptr es:[edi] 
004134B1  pop         ecx  
004134B2  mov         eax,dword ptr [___security_cookie (42F0D0h)] 
004134B7  xor         eax,ebp 
004134B9  mov         dword ptr [ebp-4],eax 
004134BC  mov         dword ptr [ebp-0Ch],ecx 
        // 0x7F = 0111 1111: the high nibble (0111) multiplies only x, y and z (w is excluded),
        // and the low nibble (1111) writes the resulting sum to all 4 components
        __m128 dp = _mm_dp_ps(v, v, 0x7F); 
004134BF  mov         eax,dword ptr [ebp-0Ch] 
004134C2  movaps      xmm0,xmmword ptr [eax] 
004134C5  mov         ecx,dword ptr [ebp-0Ch] 
004134C8  movaps      xmm1,xmmword ptr [ecx] 
004134CB  dpps        xmm1,xmm0,7Fh 
004134D1  movaps      xmmword ptr [ebp-150h],xmm1 
004134D8  movaps      xmm0,xmmword ptr [ebp-150h] 
004134DF  movaps      xmmword ptr [ebp-30h],xmm0 

        // compute rsqrt of the dot product
        dp = _mm_rsqrt_ps(dp);
004134E3  rsqrtps     xmm0,xmmword ptr [ebp-30h] 
004134E7  movaps      xmmword ptr [ebp-130h],xmm0 
004134EE  movaps      xmm0,xmmword ptr [ebp-130h] 
004134F5  movaps      xmmword ptr [ebp-30h],xmm0 

        // vec * rsqrt(dot(vec, vec))
        v = _mm_mul_ps(v, dp);
004134F9  movaps      xmm0,xmmword ptr [ebp-30h] 
004134FD  mov         eax,dword ptr [ebp-0Ch] 
00413500  movaps      xmm1,xmmword ptr [eax] 
00413503  mulps       xmm1,xmm0 
00413506  movaps      xmmword ptr [ebp-110h],xmm1 
0041350D  mov         ecx,dword ptr [ebp-0Ch] 
00413510  movaps      xmm0,xmmword ptr [ebp-110h] 
00413517  movaps      xmmword ptr [ecx],xmm0 
	  }
0041351A  pop         edi  
0041351B  pop         esi  
0041351C  mov         ecx,dword ptr [ebp-4] 
0041351F  xor         ecx,ebp 
00413521  call        @ILT+140(@__security_check_cookie@4) (411091h) 
00413526  mov         esp,ebp 
00413528  pop         ebp  
00413529  mov         esp,ebx 
0041352B  pop         ebx  
0041352C  ret              

#1lipsryme

Posted 20 September 2013 - 02:54 PM

Here's the disassembly from the new code:

   inline const void Normalize()
    {
00413480  push        ebx  
00413481  mov         ebx,esp 
00413483  sub         esp,8 
00413486  and         esp,0FFFFFFF0h 
00413489  add         esp,4 
0041348C  push        ebp  
0041348D  mov         ebp,dword ptr [ebx+4] 
00413490  mov         dword ptr [esp+4],ebp 
00413494  mov         ebp,esp 
00413496  sub         esp,158h 
0041349C  push        esi  
0041349D  push        edi  
0041349E  push        ecx  
0041349F  lea         edi,[ebp-158h] 
004134A5  mov         ecx,56h 
004134AA  mov         eax,0CCCCCCCCh 
004134AF  rep stos    dword ptr es:[edi] 
004134B1  pop         ecx  
004134B2  mov         eax,dword ptr [___security_cookie (42F0D0h)] 
004134B7  xor         eax,ebp 
004134B9  mov         dword ptr [ebp-4],eax 
004134BC  mov         dword ptr [ebp-0Ch],ecx 
        // 0x7F = 0111 1111: the high nibble (0111) multiplies only x, y and z (w is excluded),
        // and the low nibble (1111) writes the resulting sum to all 4 components
        __m128 dp = _mm_dp_ps(v, v, 0x7F); 
004134BF  mov         eax,dword ptr [ebp-0Ch] 
004134C2  movaps      xmm0,xmmword ptr [eax] 
004134C5  mov         ecx,dword ptr [ebp-0Ch] 
004134C8  movaps      xmm1,xmmword ptr [ecx] 
004134CB  dpps        xmm1,xmm0,7Fh 
004134D1  movaps      xmmword ptr [ebp-150h],xmm1 
004134D8  movaps      xmm0,xmmword ptr [ebp-150h] 
004134DF  movaps      xmmword ptr [ebp-30h],xmm0 

        // compute rsqrt of the dot product
        dp = _mm_rsqrt_ps(dp);
004134E3  rsqrtps     xmm0,xmmword ptr [ebp-30h] 
004134E7  movaps      xmmword ptr [ebp-130h],xmm0 
004134EE  movaps      xmm0,xmmword ptr [ebp-130h] 
004134F5  movaps      xmmword ptr [ebp-30h],xmm0 

        // vec * rsqrt(dot(vec, vec))
        v = _mm_mul_ps(v, dp);
004134F9  movaps      xmm0,xmmword ptr [ebp-30h] 
004134FD  mov         eax,dword ptr [ebp-0Ch] 
00413500  movaps      xmm1,xmmword ptr [eax] 
00413503  mulps       xmm1,xmm0 
00413506  movaps      xmmword ptr [ebp-110h],xmm1 
0041350D  mov         ecx,dword ptr [ebp-0Ch] 
00413510  movaps      xmm0,xmmword ptr [ebp-110h] 
00413517  movaps      xmmword ptr [ecx],xmm0 
	  }
0041351A  pop         edi  
0041351B  pop         esi  
0041351C  mov         ecx,dword ptr [ebp-4] 
0041351F  xor         ecx,ebp 
00413521  call        @ILT+140(@__security_check_cookie@4) (411091h) 
00413526  mov         esp,ebp 
00413528  pop         ebp  
00413529  mov         esp,ebx 
0041352B  pop         ebx  
0041352C  ret              

PARTNERS