Jump to content
  • Advertisement

Archived

This topic is now archived and is closed to further replies.

kddak

SSE2 vector "optimizations"

This topic is 5303 days old which is more than the 365 day threshold we allow for new replies. Please post a new topic.

If you intended to correct an error in the post then please contact us.

Recommended Posts

I'm doing research on photorealistic rendering and, in an attempt to get as much performance as possible, I am looking into creating an SSE2-optimized 3D vector class. I expected SSE2 to speed things up, albeit only slightly, but it is actually about 4/3 slower. There are obviously more function calls (to the intrinsics), but I assumed that the optimizations would more than make up for this. Does anyone have any ideas why this might be? I'm using MSVS.NET 2003. I'll post code if that will help.

Share this post


Link to post
Share on other sites
Advertisement
It's the Visual Studio .NET compiler that slows things down. I suggest trying the same code with Visual Studio 6.0; I hope that will speed things up. If that does not work, just email me. I have some SSE- and MMX-optimized routines, although the SSE ones are still untested. I have no info on SSE2, though.

Share this post


Link to post
Share on other sites
Check the asm that's being generated. It's possible that you're loading and storing the xmm registers on each call to your SSE vector class. Perhaps you can post the asm generated by the following program?
// Minimal test case for inspecting the generated assembly: two vectors are
// constructed, then three additions reuse the same three objects.
Vector v1(1,2,3);
Vector v2(4,5,6);
// v3 is initialised from the sum of the two freshly constructed vectors.
Vector v3 = v1 + v2;
// Both remaining additions read v3 and write back into an existing operand.
v1 = v1 + v3;
v2 = v2 + v3;


Ideally, it should be something like
load v1, blah
load v2, blah
load v3, v1
add v3, v2
add v1, v3
add v2, v3
If it looks like this:
load v1, blah
load v2, blah
add v2, v1
store v2, blah
load v1, blah
load v3, blah
store v1, blah
load v2, blah
load v3, blah
store v2, blah

then you're in trouble.

Share this post


Link to post
Share on other sites
Unoptimized vector addition:


; Compiler listing for the scalar operator+ (source line 60): three
; independent single-float additions followed by three single-float stores.
; 60 : return Vector( x + rkVector.x, y + rkVector.y, z + rkVector.z );

; eax = address of the rkVector argument.
mov eax, DWORD PTR _rkVector$[esp]
; Load the three floats stored at rkVector+12, +8 and +4.
movss xmm0, DWORD PTR [eax+12]
movss xmm1, DWORD PTR [eax+8]
movss xmm2, DWORD PTR [eax+4]
; eax = address of the caller-provided return object.
mov eax, DWORD PTR ___$ReturnUdt$[esp]
; Add the floats at the same offsets from ecx.
; NOTE(review): ecx is never set in this excerpt; presumably it holds `this` - confirm.
addss xmm0, DWORD PTR [ecx+12]
addss xmm1, DWORD PTR [ecx+8]
addss xmm2, DWORD PTR [ecx+4]
; Zero a compiler-generated temporary; its purpose is not visible in this excerpt.
mov DWORD PTR $T21584[esp+4], 0
; Write the Vector vtable pointer into offset 0 of the return object.
mov DWORD PTR [eax], OFFSET FLAT:??_7Vector@@6B@
; Store the three sums into the return object at +4, +8 and +12.
movss DWORD PTR [eax+4], xmm2
movss DWORD PTR [eax+8], xmm1
movss DWORD PTR [eax+12], xmm0




SSE2 "optimized":
; Compiler listing for the SSE operator+ (source lines 54-59): one packed
; add, followed by extra single-float copies into the return object.
; 54 : Vector kVector;
; 55 : __m128 vec;
; 56 :
; 57 : vec = _mm_add_ps( xyz, rkVector.xyz );

; Load 16 bytes from ecx+16 and from rkVector+16 (the two xyz operands).
; NOTE(review): ecx is never set in this excerpt; presumably it holds `this` - confirm.
movaps xmm0, XMMWORD PTR [ecx+16]
mov eax, DWORD PTR _rkVector$[ebp]
movaps xmm1, XMMWORD PTR [eax+16]

; 58 : kVector.xyz = vec;
; 59 : return kVector;

; eax = address of the caller-provided return object.
mov eax, DWORD PTR ___$ReturnUdt$[ebp]
; Packed add of all four lanes, stored straight into the return object at +16.
addps xmm0, xmm1
movaps XMMWORD PTR [eax+16], xmm0
; Copy three floats from the local kVector's stack slots into the return
; object at +32, +36 and +40, one movss load/store pair at a time.
; NOTE(review): the posted class declares only `xyz`, which does not account
; for these offsets; presumably the build that produced this listing had
; additional float members - confirm.
movss xmm0, DWORD PTR _kVector$[esp+96]
movss DWORD PTR [eax+32], xmm0
movss xmm0, DWORD PTR _kVector$[esp+100]
movss DWORD PTR [eax+36], xmm0
movss xmm0, DWORD PTR _kVector$[esp+104]
; Zero a compiler-generated temporary; its purpose is not visible in this excerpt.
mov DWORD PTR $T21612[esp+64], 0
; Write the Vector vtable pointer into offset 0 of the return object.
mov DWORD PTR [eax], OFFSET FLAT:??_7Vector@@6B@
movss DWORD PTR [eax+40], xmm0


[edited by - kddak on April 12, 2004 12:07:02 PM]

[edited by - kddak on April 12, 2004 12:07:38 PM]

Share this post


Link to post
Share on other sites
It looks like it's doing way too much data shuffling. Can we see the code for your vector assignment operator and copy constructor? We might be able to suggest a way to fix it.

Share this post


Link to post
Share on other sites
#ifndef VECTOR_H
#define VECTOR_H
#include <emmintrin.h>
//#include <math.h>

// 3D vector whose components live in a single __m128.
//
// Lane layout of `xyz`, as produced by the constructor's
// `_mm_set_ps( fx, fy, fz, 0 )` (its arguments run from the highest lane
// down to the lowest):
//     lane 3 = x, lane 2 = y, lane 1 = z, lane 0 = 0 (padding)
//
// The commented-out lines are the scalar (float x, y, z) variant of the same
// class and the operations that have not been written for the SSE variant.
class Vector
{
public:
    // Builds the vector (fx, fy, fz); the padding lane is set to zero.
    Vector( float fx = 0, float fy = 0, float fz = 0 ) : xyz( _mm_set_ps( fx, fy, fz, 0 ) ) { }
//  Vector( float fx = 0, float fy = 0, float fz = 0 ) : x( fx ), y( fy ), z( fz ) { }

    // Copies all four lanes of rkVector.
    Vector( const Vector &rkVector ) : xyz( rkVector.xyz ) { }
//  Vector( const Vector &rkVector ) : x( rkVector.x ), y( rkVector.y ), z( rkVector.z ) { }

    virtual ~Vector() { }

    // Returns the Euclidean length sqrt( x*x + y*y + z*z ).
    __inline float Length() const;
//  Vector Normalize();
//  Vector Cross( const Vector &rkVector );

    // Returns the dot product x*x' + y*y' + z*z' with rkVector.
    __inline float Dot( const Vector &rkVector ) const;
//  __inline Vector operator *( float fMul ) { Vector kVec; kVec.x = x * fMul; kVec.y = y * fMul; kVec.z = z * fMul; return kVec; }

    // Vector * Vector is the dot product.
    __inline float operator *( const Vector &rkVector ) const { return Dot( rkVector ); }
//  __inline Vector operator %( const Vector &rkVector ) { return Cross( rkVector ); }

    // Component-wise sum and difference.
    __inline Vector operator +( const Vector &rkVector ) const;
    __inline Vector operator -( const Vector &rkVector ) const;
//  __inline float operator []( unsigned int uiIndex ) { unsigned uiValue = uiIndex % 3; if( uiValue == 0 ) return x; if( uiValue == 1 ) return y; return z; }

    __m128 xyz;
//  float x, y, z;

private:
    // Adds lanes 3, 2 and 1 (x, y, z) of kLanes and returns the total in
    // lane 0 of the result; the other result lanes are not meaningful.
    //
    // Each component is broadcast first so that the scalar adds operate on
    // lane 0. The padding lane of kLanes is deliberately left out of the sum.
    static __inline __m128 SumXYZ( __m128 kLanes )
    {
        const __m128 kX = _mm_shuffle_ps( kLanes, kLanes, _MM_SHUFFLE( 3, 3, 3, 3 ) );
        const __m128 kY = _mm_shuffle_ps( kLanes, kLanes, _MM_SHUFFLE( 2, 2, 2, 2 ) );
        const __m128 kZ = _mm_shuffle_ps( kLanes, kLanes, _MM_SHUFFLE( 1, 1, 1, 1 ) );

        // Same association as the scalar reference: ( x + y ) + z.
        return _mm_add_ss( _mm_add_ss( kX, kY ), kZ );
    }
};

// Dot product of *this and rkVector.
//
// The previous implementation stored lane 0 of a shuffled sum, but with the
// lane layout above that lane only ever received the padding and z products,
// so the x and y terms were silently dropped.
__inline float Vector::Dot( const Vector &rkVector ) const
{
    float fRet;

    _mm_store_ss( &fRet, SumXYZ( _mm_mul_ps( xyz, rkVector.xyz ) ) );
    return fRet;
//  return ( x * rkVector.x ) + ( y * rkVector.y ) + ( z * rkVector.z );
}

// Euclidean length of *this. The sum of squares is formed in lane 0 and the
// square root is taken on that lane only.
__inline float Vector::Length() const
{
    float fRet;

    _mm_store_ss( &fRet, _mm_sqrt_ss( SumXYZ( _mm_mul_ps( xyz, xyz ) ) ) );
    return fRet;
//  return sqrtf( ( x * x ) + ( y * y ) + ( z * z ) );
}

// Component-wise sum. The padding lanes are added too, so a zero padding
// lane in both operands stays zero in the result.
__inline Vector Vector::operator +( const Vector &rkVector ) const
{
    Vector kSum;

    kSum.xyz = _mm_add_ps( xyz, rkVector.xyz );
    return kSum;
//  return Vector( x + rkVector.x, y + rkVector.y, z + rkVector.z );
}

// Component-wise difference. The padding lanes are subtracted too, so a zero
// padding lane in both operands stays zero in the result.
__inline Vector Vector::operator -( const Vector &rkVector ) const
{
    Vector kDifference;

    kDifference.xyz = _mm_sub_ps( xyz, rkVector.xyz );
    return kDifference;
//  return Vector( x - rkVector.x, y - rkVector.y, z - rkVector.z );
}

#endif

Share this post


Link to post
Share on other sites
Further "research" has shown that the SSE2 vector class is about 2-3x faster with the actual calculations, but loading the floating point values into the __m128 in the constructor takes significantly longer to the point where the entire process takes quite a bit longer than just using straight float operations. Is there a better (read "faster") way to load the floats into the __m128?

Share this post


Link to post
Share on other sites

  • Advertisement
×

Important Information

By using GameDev.net, you agree to our community Guidelines, Terms of Use, and Privacy Policy.

We are the game development community.

Whether you are an indie, hobbyist, AAA developer, or just trying to learn, GameDev.net is the place for you to learn, share, and connect with the games industry. Learn more About Us or sign up!

Sign me up!