# fast vector class (source) * not so fast

## Recommended Posts

just wrote a fast vector class using SSE... free to use for anyone who needs it... it only uses up to SSE2 instructions.. the dot product could be further improved with an SSE3 instruction that sums the values in the register (the horizontal add, HADDPS).. i hadn't found the name of this instruction though.. suggestions for improvement are welcome! EDIT:: changed some stuff... EDIT2:: added some stuff.. fixed crash bugs EDIT3:: seems like people trying this code get worse results than regular code... which means that this is a not-so-fast vector class...

//////////////////////////////////////
/// vec3 BY Dragon_Strike, Robert Nagy
/// 2007-03-01
//////////////////////////////////////

#ifndef vec3_H
#define vec3_H

#include <math.h>

static bool UseSSE = true;

// Tolerance used by the approximate ==/!= comparisons below.
// BUGFIX: EPSILON was referenced by operator!= but never defined in this header.
#ifndef EPSILON
#define EPSILON 0.0001f
#endif

/// 3-component float vector padded with a w component out to 16 bytes.
/// alignas(16) (the standard C++11 equivalent of MSVC's
/// __declspec(align(16))) keeps instances aligned so the SSE routines
/// below can load them with movaps.
class alignas(16) vec3
{

public:

    float x, y, z, w;   // w defaults to 1 and is ignored by all arithmetic

    /// Component-wise constructor; defaults to the origin with w = 1.
    vec3(float _x = 0, float _y = 0, float _z = 0, float _w = 1)
    {
        x = _x; y = _y; z = _z; w = _w;
    }

    /// Copy constructor: copies all four components, including w.
    vec3(const vec3 &vec)
    {
        x = vec.x; y = vec.y; z = vec.z; w = vec.w;
    }

    ~vec3() {}

    /// Sets x/y/z, leaving w unchanged.
    void Set(float ex, float why, float zee)
    {
        x = ex; y = why; z = zee;
    }

    // --- component-wise arithmetic; results are fresh vec3s with w = 1 ---

    vec3 operator*(float num) const
    {
        return vec3(x*num, y*num, z*num);
    }

    vec3 operator*(const vec3 &vec) const
    {
        return vec3(x*vec.x, y*vec.y, z*vec.z);
    }

    // Division performs no divide-by-zero check (same as the original code).
    vec3 operator/(float num) const
    {
        return vec3(x/num, y/num, z/num);
    }

    vec3 operator/(const vec3 &vec) const
    {
        return vec3(x/vec.x, y/vec.y, z/vec.z);
    }

    vec3 operator+(const vec3 &vec) const
    {
        return vec3(x+vec.x, y+vec.y, z+vec.z);
    }

    vec3 operator+(float num) const
    {
        return vec3(x+num, y+num, z+num);
    }

    vec3 operator-(const vec3 &vec) const
    {
        return vec3(x-vec.x, y-vec.y, z-vec.z);
    }
    vec3 operator-(float num) const
    {
        return vec3(x-num, y-num, z-num);
    }
    vec3 operator-() const
    {
        return vec3(-x, -y, -z);
    }

    // Compound assignments now return *this so they can be chained,
    // the conventional C++ signature (they previously returned void;
    // existing callers are unaffected).

    vec3& operator+=(const vec3 &vec)
    {
        x += vec.x; y += vec.y; z += vec.z;
        return *this;
    }
    vec3& operator+=(float f)
    {
        x += f; y += f; z += f;
        return *this;
    }

    vec3& operator-=(const vec3 &vec)
    {
        x -= vec.x; y -= vec.y; z -= vec.z;
        return *this;
    }
    vec3& operator-=(float f)
    {
        x -= f; y -= f; z -= f;
        return *this;
    }

    vec3& operator/=(float f)
    {
        x /= f; y /= f; z /= f;
        return *this;
    }

    vec3& operator*=(float f)
    {
        x *= f; y *= f; z *= f;
        return *this;
    }

    /// BUGFIX: assignment previously skipped w, diverging from the copy
    /// constructor; it now copies all four components and returns *this.
    vec3& operator=(const vec3 &vec)
    {
        x = vec.x; y = vec.y; z = vec.z; w = vec.w;
        return *this;
    }
    /// Broadcast-assigns f to x/y/z; w is left unchanged (as before).
    vec3& operator=(float f)
    {
        x = f; y = f; z = f;
        return *this;
    }

    /// Approximate inequality: true if any of x/y/z differ by more than
    /// EPSILON. w is deliberately not compared. (Now const-qualified.)
    bool operator != (const vec3& vec) const
    {
        return ((fabs(x - vec.x) > EPSILON) || (fabs(y - vec.y) > EPSILON) || (fabs(z - vec.z) > EPSILON));
    }

    /// Approximate equality, defined via operator!= so the tolerance
    /// logic lives in exactly one place.
    bool operator == (const vec3& vec) const
    {
        return !(*this != vec);
    }

};

/// Euclidean length of the xyz part of vec; w is ignored.
///
/// BUGFIX: the old SSE path was wrong — it copied the squared components
/// into xmm1 and shuffled them twice, but overwrote xmm1 between the two
/// shuffles and never added it back into xmm0, so it computed
/// sqrt(x*x) = |x| instead of the vector length. The MSVC-only __asm
/// block was also non-portable (rejected by GCC/Clang and by 64-bit MSVC
/// builds), so the correct scalar expression is now used unconditionally.
inline float length(const vec3 &vec)
{
    return (float)sqrt(vec.x*vec.x + vec.y*vec.y + vec.z*vec.z);
}

inline float distance(const vec3 &vec1, const vec3 &vec2)
{
return length(vec1-vec2);
}

/// Cross product of the xyz parts of vec1 and vec2; the result's w is 1.
///
/// The MSVC inline-asm SSE path is now compiled only where it is legal
/// (32-bit x86 MSVC builds) — previously the __asm block broke the build
/// on GCC/Clang and on 64-bit MSVC, which does not support inline asm.
inline vec3 cross(const vec3 &vec1, const vec3 &vec2)
{
    vec3 vec;

#if defined(_MSC_VER) && defined(_M_IX86)
    if (UseSSE)
    {
        // Classic SSE cross product: (a.yzx * b.zxy) - (a.zxy * b.yzx)
        __asm {
            mov esi, vec1
            mov edi, vec2
            movaps xmm0, [esi]
            movaps xmm1, [edi]

            movaps xmm2, xmm0
            movaps xmm3, xmm1
            shufps xmm0, xmm0, 11001001b   ; a -> (y, z, x, w)
            shufps xmm1, xmm1, 11010010b   ; b -> (z, x, y, w)
            mulps xmm0, xmm1
            shufps xmm2, xmm2, 11010010b   ; a -> (z, x, y, w)
            shufps xmm3, xmm3, 11001001b   ; b -> (y, z, x, w)
            mulps xmm2, xmm3
            subps xmm0, xmm2

            movaps vec, xmm0
        }
        // BUGFIX: the asm leaves lane 3 as a.w*b.w - a.w*b.w = 0, while
        // the scalar path below produces w = 1; restore the convention
        // so both paths return identical results.
        vec.w = 1.0f;
    }
    else
#endif
    {
        vec = vec3(vec1.y * vec2.z - vec1.z * vec2.y,
                   vec1.z * vec2.x - vec1.x * vec2.z,
                   vec1.x * vec2.y - vec1.y * vec2.x);
    }

    return vec;
}

/// Dot product of the xyz parts of vec1 and vec2; w takes no part.
inline float dot(const vec3 &vec1, const vec3 &vec2)
{
    float sum = vec1.x * vec2.x;
    sum += vec1.y * vec2.y;
    sum += vec1.z * vec2.z;
    return sum;
}

/// Returns vec1 scaled to unit length, or a zero vector (w = 1) when
/// vec1 has zero length — matching the original scalar fallback.
/// The parameter is now const& (backward compatible; previously it was a
/// non-const reference for no reason).
///
/// BUGFIX: the old SSE path repeated the same mistake as length() — the
/// shuffled squared components in xmm1 were overwritten and never summed
/// into xmm0 — and then used the low-precision rsqrtps approximation, so
/// it effectively scaled by ~1/|x| instead of 1/length. The MSVC-only
/// __asm block is gone; the correct scalar path is used unconditionally.
inline vec3 normalize(const vec3 &vec1)
{
    vec3 vec;
    const float l = length(vec1);
    if (l > 0)
        vec = vec1 / l;
    return vec;
}

/// GLSL-style faceforward: returns N when Nref points against the
/// incident vector I (dot(Nref, I) < 0), otherwise returns -N.
inline vec3 faceforward(const vec3 &N, const vec3 &I, const vec3 &Nref)
{
    return (dot(Nref, I) < 0) ? vec3(N) : -N;
}

// BUGFIX: EPSILON was used here but never defined anywhere in this header.
#ifndef EPSILON
#define EPSILON 0.0001f
#endif

/// True if at least one of x/y/z exceeds EPSILON.
/// NOTE(review): components that are negative (or merely tiny) never
/// count — presumably intentional, but confirm if inputs can be negative.
inline bool any(const vec3 &vec)
{
    return (vec.x > EPSILON || vec.y > EPSILON || vec.z > EPSILON );
}

// BUGFIX: EPSILON was used here but never defined anywhere in this header.
#ifndef EPSILON
#define EPSILON 0.0001f
#endif

/// True only if every one of x/y/z exceeds EPSILON.
/// NOTE(review): negative components always fail this test — presumably
/// intentional, but confirm if inputs can be negative.
inline bool all(const vec3 &vec)
{
    return (vec.x > EPSILON && vec.y > EPSILON && vec.z > EPSILON );
}

/// Component-wise absolute value of the xyz part; the result's w is 1.
///
/// BUGFIX: the original only assigned -component for negative components
/// and never copied positive ones, leaving them at the default-constructed
/// value of 0 — so abs(vec3(1, 2, 3)) wrongly returned (0, 0, 0).
inline vec3 abs(const vec3 &vec1)
{
    return vec3(vec1.x < 0 ? -vec1.x : vec1.x,
                vec1.y < 0 ? -vec1.y : vec1.y,
                vec1.z < 0 ? -vec1.z : vec1.z);
}

#endif


[Edited by - Dragon_Strike on March 3, 2007 8:09:58 PM]

##### Share on other sites
vec3& operator=(const vec3 &vec) {   x = vec.x ; y = vec.y ; z = vec.z;	   return *this;}

##### Share on other sites
Just my 2 cents on a more abstract, but still high performance, data structure:

enum {x, y, z, w};struct vec3{...   float m[3];...   float& operator[](int i)   {       return m[i];   }   float operator[](int i) const   {       return m[i];   }

Then you can do stuff like:

vec3 foo;height = foo[y]; // etc...

And RE: the overloaded == function, you may want to take an epsilon into account:

#define EPSILON 0.0001f   friend bool operator != (const vec3& a, const vec3& b)   {      const float epsilon = 0.0001f;      return ((fabs(a[x] - b[x]) > EPSILON) || (fabs(a[y] - b[y]) > EPSILON) || (fabs(a[z] - b[z]) > EPSILON));   }   friend bool operator == (const vec3& a, const vec3& b)   {      return !(a != b); // Do this so you only need to update one function if == or != changes...   }

HTH

##### Share on other sites
The binary operators should be implemented as non-member functions.

I don't know a lot about the x86 vector unit, but would it be faster if the vector elements were stored in the vector registers rather than main memory? That would eliminate copies to and from memory between consecutive operations (though perhaps the optimizer already does this).

##### Share on other sites
Quote:
 Original post by x452AlbaJust my 2 cents on a more abstract, but still high performance, data structure:*** Source Snippet Removed ***

why is that better?
Quote:
 Then you can do stuff like:*** Source Snippet Removed ***And RE: the overloaded == function, you may want to take an epsilon into account:

i dont quite understand what that would be good for?
Quote:
 *** Source Snippet Removed ***HTH

thx for the suggestions

##### Share on other sites
Quote:
 Original post by JohnBoltonThe binary operators should be implemented as non-member functions.I don't know a lot about the x86 vector unit, but would it be faster if the vector elements were stored in the vector registers rather than main memory? That would eliminate copies to and from memory between consecutive operations (though perhaps the optimizer already does this).

could u explain this a bit more in detail?

##### Share on other sites
Quote:
 Original post by x452AlbaAnd RE: the overloaded == function, you may want to take an epsilon into account

- this goes for the "any(..)" functions too.

##### Share on other sites
I'm a lazy bum so I'll ask: have you benchmarked this against say, the DirectX vectors?

##### Share on other sites
the epsilon code is wrong :P

to compare two floats:
inline bool float_equal( float a, float b ) { return fabsf( a - b ) <= std::max( 1.0f, std::max(a,b) ) * FLT_EPSILON; }

Comparing the individual elements in the vectors against each other probably isn't the best way to compare two vectors. Should probably compare the length and angle between the vectors.

And it's much better to use templates for creating vector classes, then you can have vectors of arbitrary size and only need to code a single class :)

##### Share on other sites
Quote:
 Original post by _swx_the epsilon code is wrong :P

ummmmmmmmmm.... They're both correct, used in the industry, and acceptable. It's just that yours is more *mathematically* correct. No big deal. :)

##### Share on other sites
Quote:
 Original post by _swx_And it's much better to use templates for creating vector classes, then you can have vectors of arbitrary size and only need to code a single class :)

Yes, more flexible, but you'd lose all the hardcoded and highly optimized SSE code (and that's the whole point).

##### Share on other sites
^^^

Not only that, but how many differently sized vectors have you used? Almost everything is done with 3 & 4 dimensions, some with 2, and I've never had the need for anything over 4. If I did, I'd probably be doing some sort of heavy math-related work, in which case I would just use some random math package like PARI/GP or MATLAB or something.

You're not even really getting flexibility by using templates.

##### Share on other sites
Quote:

And RE: the overloaded == function, you may want to take an epsilon into account:
Quote:
 i dont quite understand what that would be good for?

Well, in a simple example (and assuming in the context of a 3D game), what's the difference between A and B?:

vec3 A(0.000000089f, 0.00000012f, 0.0000000f);vec3 B(0.0f, 0.0f, 0.0f);

A and B are so close that they can be considered to be the same. Your code does not allow for that.

And now for a slightly more complex example:

...typedef vertex_t vector_t;struct normal_vertex_t{// GL_N3F_V3F   vector_t n;   vertex_t v;};...void Model_md2::GenerateShadowVolume(){   bool* bEdgeUsed = NULL;   int i = 0;   bEdgeUsed = new bool[3 * nTriangles];   for (int j = 0; j < 3 * nTriangles; bEdgeUsed[j++] = false);   volume = new triangle_face_normals_t[4 * nTriangles];   for (int t1 = 0; t1 < nTriangles; ++t1)   {      vector_t n1 = CalculateNormal(vertices[triangles[t1].vertex[0]].v, vertices[triangles[t1].vertex[1]].v, vertices[triangles[t1].vertex[2]].v);      volume[i  ].vertex[0] = triangles[t1].vertex[0];      volume[i  ].n[0]      = n1;      volume[i  ].vertex[1] = triangles[t1].vertex[1];      volume[i  ].n[1]      = n1;      volume[i  ].vertex[2] = triangles[t1].vertex[2];      volume[i++].n[2]      = n1;      for (int t2 = (t1 + 1); t2 < nTriangles; ++t2)      {         vector_t n2 = CalculateNormal(vertices[triangles[t2].vertex[0]].v, vertices[triangles[t2].vertex[1]].v, vertices[triangles[t2].vertex[2]].v);         for (int v1 = 0; v1 < 3; ++v1)         {            if (bEdgeUsed[t1 * 3 + v1] == false)            {               for (int v2 = 0; v2 < 3; ++v2)               {                  if (bEdgeUsed[t2 * 3 + v2] == false)                  {                     int v1_next = (v1 + 1) % 3; /* Wrap around. */                     int v2_next = (v2 + 1) % 3; /* Wrap around. */                     normal_vertex_t pEdge1[2], pEdge2[2];                     pEdge1[0] = vertices[triangles[t1].vertex[v1]];                     pEdge1[1] = vertices[triangles[t1].vertex[v1_next]];                     pEdge2[0] = vertices[triangles[t2].vertex[v2]];                     pEdge2[1] = vertices[triangles[t2].vertex[v2_next]];                     if (pEdge1[1].v == pEdge2[0].v && pEdge1[0].v == pEdge2[1].v) //// This test is where it really matters! //////                     {                     /* Build a quad. */                        volume[i  ].vertex[0] = triangles[t1].vertex[v1]; /* Triangle 1. 
*/                        volume[i  ].n[0]      = n1;                        volume[i  ].vertex[1] = triangles[t2].vertex[v2_next];                        volume[i  ].n[1]      = n2;                        volume[i  ].vertex[2] = triangles[t2].vertex[v2];                        volume[i++].n[2]      = n2;                        volume[i  ].vertex[0] = triangles[t2].vertex[v2]; /* Triangle 2. */                        volume[i  ].n[0]      = n2;                        volume[i  ].vertex[1] = triangles[t1].vertex[v1_next];                        volume[i  ].n[1]      = n1;                        volume[i  ].vertex[2] = triangles[t1].vertex[v1];                        volume[i++].n[2]      = n1;                        bEdgeUsed[t1 * 3 + v1] = true;                        bEdgeUsed[t2 * 3 + v2] = true;                        break;                     }                  }               }            }         }      }                    }   delete [] bEdgeUsed;   nVolume = i;   return;}

##### Share on other sites
Quote:
Original post by x452Alba
Quote:
 Original post by _swx_And it's much better to use templates for creating vector classes, then you can have vectors of arbitrary size and only need to code a single class :)

Yes, more flexible, but you'd lose all the hardcoded and highly optimized SSE code (and that's the whole point).

So use template specialization for the functions that you can produce better code for than the compiler. And IMO it's kinda ugly to overload the == operator for other things than exact equality, especially if you dont use the proper epsilon value. Use a function that accepts a relative error value to compare vectors for near equality.

And if you want a really fast vector class you should be using expression templates.

##### Share on other sites
Quote:
 Original post by x452AlbaYes, more flexible, but you'd lose all the hardcoded and highly optimized SSE code (and that's the whole point).

You don't really lose any of the custom hardcoded optimizations with templates:

template< typename BaseType >class TypeTraits{public:...    typedef BaseType Type;    typedef TypeTraits< Type > MyType;...public:...    static void Add (Type & sum, Type a, Type b);    static void AddArray (Type * sum, Type const * a, Type const * b, unsigned int size);...};template< typename BaseType, unsigned int Dimension, typename BaseTypeTraits = TypeTraits< BaseType > >class Vector{public:...    typedef BaseType Type;    typedef BaseTypeTraits Traits;    typedef Vector< Type, Dimension, Traits > MyType;...private:...    Type m_fields[Dimension];...public:...    friend MyType const operator+(MyType const & a, MyType const & b)    {        MyType result;        Traits::AddArray(result.m_fields, a.m_fields, b.m_fields, Dimension);        return result;    }...};

there would be a default TypeTraits template that could be used with any type. After you get some profiler results, you can specialize the TypeTraits template and define the hard coded, highly optimized SSE code. Though, I would leave the optimizations for the compiler - it is pretty awesome what it can do! :)

##### Share on other sites
Quote:
 Original post by _swx_the epsilon code is wrong :P

Sorry, but your code does not work well for numbers between 0 and 1, or negative numbers. FLT_EPSILON yields the smallest number such that 1.0f + FLT_EPSILON != 1.0f.

##### Share on other sites
Quote:
 Original post by stanloI'm a lazy bum so I'll ask: have you benchmarked this against say, the DirectX vectors?

no... i don't use directx... but if somebody tries it i'd like to know the results

##### Share on other sites
Quote:
Original post by Dragon_Strike
Quote:
 Original post by stanloI'm a lazy bum so I'll ask: have you benchmarked this against say, the DirectX vectors?

no... i dont use directx... but if somebody tires it id like to know the results

How about against a similar implementation that doesn't use SSE?

##### Share on other sites
Quote:
Original post by stanlo
Quote:
Original post by Dragon_Strike
Quote:
 Original post by stanloI'm a lazy bum so I'll ask: have you benchmarked this against say, the DirectX vectors?

no... i dont use directx... but if somebody tires it id like to know the results

How about against a similar implimentation that doesn't use SSE?

well i got it to about 30-40% faster...

##### Share on other sites
You should use intrinsics (if supported by your compiler) instead of straight assembly. That way you get the generated instructions you want without having to worry about instruction scheduling, register allocation and loading/storing from/to memory, all of which will cause speed penalties in your code when calling several inlined SSE-using methods in a row (which is a fairly common case with core math classes).

Regards.

##### Share on other sites
Quote:
 Original post by AbominacionYou should use intrinsics (if supported by your compiler) instead of straight assembly. That way you get the generated instructions you want without having to worry about instruction scheduling, register allocation and loading/storing from/to memory, all of which will cause speed penalties in your code when calling several inlined SSE-using methods in a row (which is a fairly common case with core math classes).Regards.

interesting.. where can i find some good tutorial on this?

##### Share on other sites
I'm assuming you use Visual Studio. As I recall gcc syntax is different.

This is a simple tutorial about SSE intrinsics. Seeing as you already know assembly you mainly need a intrinsic listing for reference, with that it's just a matter of replacing each of the assembly opcodes with its equivalent intrinsic.

Regards.

Edit: fixed a couple of typos.

[Edited by - Abominacion on March 3, 2007 8:27:08 AM]

##### Share on other sites
Quote:
 Original post by Dragon_Strikewell i got it to about 30-40% faster...

Holy moly

##### Share on other sites
well i did another benchmark using windows time and 50000000 normalize, distance and cross operation...

Quote:
 13:43:19 Start SSE13:44:23 End SSE13:44:23 Start13:45:35 End

4 seconds with SSE
12 seconds without SSE

thats 300% faster

i probably got somethin wrong with the previous one..

[Edited by - Dragon_Strike on March 3, 2007 7:52:45 AM]

##### Share on other sites
Gosh!

What processor are you using?

## Create an account

Register a new account

• ### Forum Statistics

• Total Topics
628379
• Total Posts
2982344

• 10
• 9
• 15
• 24
• 11