
fast vector class (source) * not so fast


Dragon_Strike    264
Just wrote a fast vector class using SSE... free to use for anyone who needs it. It only goes up to SSE2 instructions; the dot product could be further improved with an SSE3 instruction that sums the values in the register, but I haven't found the name of that instruction (see the note after the listing). Suggestions for improvement are welcome!

EDIT: changed some stuff...
EDIT2: added some stuff, fixed crash bugs.
EDIT3: it seems people trying this code get worse results than regular code... which means this is a not-so-fast vector class...

//////////////////////////////////////
/// vec3 BY Dragon_Strike, Robert Nagy
/// 2007-03-01 
//////////////////////////////////////


#ifndef vec3_H
#define vec3_H

#include <math.h>

// Tolerance used by operator!=, operator==, any() and all().
#define EPSILON 0.0001f

static bool UseSSE = true;

__declspec(align(16)) class vec3
{
	
	public:

		float x,y,z,w;		

  		vec3(float _x = 0, float _y = 0, float _z = 0, float _w = 1)
		{
			x = _x; y = _y; z = _z; w = _w;
		}

		vec3(const vec3 &vec)
		{
			x = vec.x; y = vec.y; z = vec.z; w = vec.w;
		}

		~vec3() {}

		void Set(float ex, float why, float zee)
		{
			x = ex; y = why; z = zee;
		}

		vec3 operator*(float num) const
		{
			return vec3(x*num, y*num, z*num);	
		}

		vec3 operator*(const vec3 &vec) const
		{
			return vec3(x*vec.x, y*vec.y, z*vec.z);		
		}

		vec3 operator/(float num) const
		{
			return vec3(x/num, y/num, z/num);	
		}

		vec3 operator/(const vec3 &vec) const
		{
			return vec3(x/vec.x, y/vec.y, z/vec.z);		
		}

		vec3 operator+(const vec3 &vec) const
		{
			return vec3(x+vec.x, y+vec.y, z+vec.z);		
		}

		vec3 operator+(float num) const
		{
			return vec3(x+num, y+num, z+num);	
		}

		vec3 operator-(const vec3 &vec) const
		{
			return vec3(x-vec.x, y-vec.y, z-vec.z);		
		}
		vec3 operator-(float num) const
		{
			return vec3(x-num, y-num, z-num);	
		}
		vec3 operator-() const
		{
			return vec3(-x, -y, -z);	
		}

		void operator+=(const vec3 &vec)
		{
			x += vec.x; y += vec.y; z += vec.z;	
		}
		void operator+=(float f)
		{
			x += f; y += f; z += f;	
		}

		void operator-=(const vec3 &vec)
		{
			x -= vec.x; y -= vec.y; z -= vec.z;	
		}
		void operator-=(float f)
		{
			x -= f; y -= f; z -= f;	
		}

		void operator/=(float f)
		{
			x /= f; y /= f; z /= f;	
		}

		void operator*=(float f)
		{
			x *= f; y *= f; z *= f;	
		}

		void operator=(const vec3 &vec) 
		{
			x = vec.x ; y = vec.y ; z = vec.z;	
		}
		void operator=(float f) 
		{
			x = f ; y = f ; z = f;	
		}

		bool operator != (const vec3& vec) const
		{
			return ((fabs(x - vec.x) > EPSILON) || (fabs(y - vec.y) > EPSILON) || (fabs(z - vec.z) > EPSILON));
		}

		bool operator == (const vec3& vec) const
		{
			return !(*this != vec);
		}

};

inline float length(const vec3 &vec)
{
	float f;
	if (UseSSE)
	{
		// Note: the horizontal sum below includes w*w, so this branch only
		// matches the scalar branch when w is 0 (the constructor defaults w to 1).
		__asm {
			mov esi, vec			// esi = address of vec
			movaps xmm0, [esi]		// xmm0 = (x, y, z, w)

			mulps xmm0, xmm0		// (x*x, y*y, z*z, w*w)
			movaps xmm1, xmm0
			shufps xmm1, xmm1, 01001110b	// (z*z, w*w, x*x, y*y)
			addps xmm0, xmm1		// (x*x+z*z, y*y+w*w, ...)
			movaps xmm1, xmm0
			shufps xmm1, xmm1, 00010001b	// (y*y+w*w, x*x+z*z, ...)
			addps xmm0, xmm1		// every lane = x*x + y*y + z*z + w*w
			sqrtss xmm0, xmm0		// square root of lane 0
			movss f, xmm0
		}
	}
	else
	{
		f = sqrt(vec.x*vec.x + vec.y*vec.y + vec.z*vec.z);
	}

	return f;
}

inline float distance(const vec3 &vec1, const vec3 &vec2)
{
	return length(vec1-vec2);
}

inline vec3 cross(const vec3 &vec1, const vec3 &vec2)
{
	vec3 vec;

	if (UseSSE)
	{
		__asm {
			mov esi, vec1
			mov edi, vec2
			movaps xmm0, [esi]		// xmm0 = (x1, y1, z1, w1)
			movaps xmm1, [edi]		// xmm1 = (x2, y2, z2, w2)

			movaps xmm2, xmm0
			movaps xmm3, xmm1
			shufps xmm0, xmm0, 11001001b	// (y1, z1, x1, w1)
			shufps xmm1, xmm1, 11010010b	// (z2, x2, y2, w2)
			mulps xmm0, xmm1		// (y1*z2, z1*x2, x1*y2, w1*w2)
			shufps xmm2, xmm2, 11010010b	// (z1, x1, y1, w1)
			shufps xmm3, xmm3, 11001001b	// (y2, z2, x2, w2)
			mulps xmm2, xmm3		// (z1*y2, x1*z2, y1*x2, w1*w2)
			subps xmm0, xmm2		// cross product, w lane becomes 0

			movaps vec, xmm0
		}
	}
	else
	{
		vec = vec3(vec1.y * vec2.z - vec1.z * vec2.y, 
				   vec1.z * vec2.x - vec1.x * vec2.z, 
				   vec1.x * vec2.y - vec1.y * vec2.x);
	}

	return vec;	
}

inline float dot(const vec3 &vec1, const vec3 &vec2)
{
	return (vec1.x*vec2.x + vec1.y*vec2.y + vec1.z*vec2.z );
}

inline vec3 normalize(const vec3 &vec1)
{
	vec3 vec;
	if (UseSSE)
	{
		// Note: as in length(), w*w is included in the sum, and rsqrtps is
		// only an approximation (roughly 12 bits of precision).
		__asm {
			mov esi, vec1
			movaps xmm0, [esi]		// xmm0 = (x, y, z, w)

			movaps xmm2, xmm0		// keep a copy of the original vector
			mulps xmm0, xmm0		// (x*x, y*y, z*z, w*w)
			movaps xmm1, xmm0
			shufps xmm1, xmm1, 01001110b	// (z*z, w*w, x*x, y*y)
			addps xmm0, xmm1
			movaps xmm1, xmm0
			shufps xmm1, xmm1, 00010001b
			addps xmm0, xmm1		// every lane = x*x + y*y + z*z + w*w

			rsqrtps xmm0, xmm0		// approximate 1/sqrt in every lane
			mulps xmm2, xmm0		// scale the vector (including w)
			movaps vec, xmm2
		}	
	}
	else
	{
		float l = length(vec1);
		if (l > 0)		
			vec = vec1 / l;
	}

	return vec;
}

inline vec3 faceforward(const vec3 &N, const vec3 &I, const vec3 &Nref)
{
	vec3 vec = N;
	if (dot(Nref, I) < 0)
		return vec;
	else
		return -vec;
}

inline bool any(const vec3 &vec)
{
	return (vec.x > EPSILON || vec.y > EPSILON || vec.z > EPSILON );
}

inline bool all(const vec3 &vec)
{
	return (vec.x > EPSILON && vec.y > EPSILON && vec.z > EPSILON );
}

inline vec3 abs(const vec3 &vec1)
{
	// Component-wise absolute value.
	return vec3(fabsf(vec1.x), fabsf(vec1.y), fabsf(vec1.z));
}

#endif
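
As a side note on the SSE3 instruction mentioned at the top: the horizontal add is HADDPS, exposed as the _mm_hadd_ps intrinsic in <pmmintrin.h>. A rough sketch of a dot product using it might look like the following (dot_sse3 is just an illustrative name, not part of the header; the w lane is cleared first because the class defaults w to 1):

#include <pmmintrin.h>	// SSE3 intrinsics (_mm_hadd_ps)

inline float dot_sse3(const vec3 &a, const vec3 &b)
{
	__m128 p = _mm_mul_ps(_mm_load_ps(&a.x), _mm_load_ps(&b.x));	// (ax*bx, ay*by, az*bz, aw*bw)
	p = _mm_mul_ps(p, _mm_set_ps(0.0f, 1.0f, 1.0f, 1.0f));		// clear the w product
	p = _mm_hadd_ps(p, p);						// (ax*bx + ay*by, az*bz, ...)
	p = _mm_hadd_ps(p, p);						// full sum in every lane
	float f;
	_mm_store_ss(&f, p);
	return f;
}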







[Edited by - Dragon_Strike on March 3, 2007 8:09:58 PM]

x452Alba    126
Just my 2 cents on a more abstract, but still high performance, data structure:


enum {x, y, z, w};

struct vec3
{
	...
	float m[3];
	...

	float& operator[](int i)
	{
		return m[i];
	}

	float operator[](int i) const
	{
		return m[i];
	}
};




Then you can do stuff like:


vec3 foo;

float height = foo[y]; // etc...



And RE: the overloaded == function, you may want to take an epsilon into account:



#define EPSILON 0.0001f

friend bool operator != (const vec3& a, const vec3& b)
{
	return ((fabs(a[x] - b[x]) > EPSILON) || (fabs(a[y] - b[y]) > EPSILON) || (fabs(a[z] - b[z]) > EPSILON));
}

friend bool operator == (const vec3& a, const vec3& b)
{
	return !(a != b); // Do this so you only need to update one function if == or != changes...
}



HTH

JohnBolton    1372
The binary operators should be implemented as non-member functions.

I don't know a lot about the x86 vector unit, but would it be faster if the vector elements were stored in the vector registers rather than main memory? That would eliminate copies to and from memory between consecutive operations (though perhaps the optimizer already does this).
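
For illustration, a non-member operator+ for the vec3 above might look like this (just a sketch; the other binary operators would follow the same pattern):

inline vec3 operator+(const vec3 &a, const vec3 &b)
{
	// Both operands are taken by const reference, so the operator treats
	// the left- and right-hand sides symmetrically.
	return vec3(a.x + b.x, a.y + b.y, a.z + b.z);
}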

Dragon_Strike    264
Quote:
Original post by x452Alba
Just my 2 cents on a more abstract, but still high performance, data structure:

*** Source Snippet Removed ***

Why is that better?
Quote:

Then you can do stuff like:

*** Source Snippet Removed ***

And RE: the overloaded == function, you may want to take an epsilon into account:

I don't quite understand what that would be good for?
Quote:

*** Source Snippet Removed ***

HTH


Thanks for the suggestions.

Dragon_Strike    264
Quote:
Original post by JohnBolton
The binary operators should be implemented as non-member functions.

I don't know a lot about the x86 vector unit, but would it be faster if the vector elements were stored in the vector registers rather than main memory? That would eliminate copies to and from memory between consecutive operations (though perhaps the optimizer already does this).


Could you explain this in a bit more detail?

gjoel    144
Quote:
Original post by x452Alba
And RE: the overloaded == function, you may want to take an epsilon into account


- this goes for the "any(..)" functions too.

_swx_    1138
the epsilon code is wrong :P

To compare two floats:

#include <algorithm>	// std::max
#include <cfloat>	// FLT_EPSILON
#include <cmath>	// fabsf

inline bool float_equal( float a, float b )
{
	return fabsf( a - b ) <= std::max( 1.0f, std::max(a, b) ) * FLT_EPSILON;
}




Comparing the individual elements of the vectors against each other probably isn't the best way to compare two vectors. You should probably compare the lengths of and the angle between the vectors.

And it's much better to use templates for creating vector classes; then you can have vectors of arbitrary size and only need to code a single class :)
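
As a rough illustration of the length-and-angle idea (a sketch only: nearly_equal and the tolerance values are made up for the example, and it relies on the length() and dot() functions from the class posted above):

#include <algorithm>	// std::max
#include <cmath>	// fabsf

inline bool nearly_equal(const vec3 &a, const vec3 &b,
                         float lengthTol = 1e-4f, float cosTol = 0.9999f)
{
	float la = length(a);
	float lb = length(b);

	// The lengths must agree to within a relative tolerance.
	if (fabsf(la - lb) > lengthTol * std::max(1.0f, std::max(la, lb)))
		return false;

	// If either vector is (near) zero length, the length test is all we can do.
	if (la == 0.0f || lb == 0.0f)
		return true;

	// The directions must agree: cosine of the angle between them close to 1.
	return dot(a, b) / (la * lb) >= cosTol;
}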

x452Alba    126
Quote:

Original post by _swx_
the epsilon code is wrong :P


Ummm... they're both correct, used in the industry, and acceptable. It's just that yours is more *mathematically* correct. No big deal. :)

x452Alba    126
Quote:
Original post by _swx_
And it's much better to use templates for creating vector classes, then you can have vectors of arbitrary size and only need to code a single class :)


Yes, more flexible, but you'd lose all the hardcoded and highly optimized SSE code (and that's the whole point).

stanlo    212
^^^

Not only that, but how many differently sized vectors have you used? Almost everything is done with 3 and 4 dimensions, some with 2, and I've never had the need for anything over 4. If I did, I'd probably be doing some sort of heavy math-related work, in which case I would just use some math package like pari/gp or matlab or something.

You're not even really getting flexibility by using templates.

x452Alba    126
Quote:

And RE: the overloaded == function, you may want to take an epsilon into account:
Quote:

I don't quite understand what that would be good for?



Well, in a simple example (assuming the context of a 3D game), what's the difference between A and B?


vec3 A(0.000000089f, 0.00000012f, 0.0000000f);
vec3 B(0.0f, 0.0f, 0.0f);



A and B are so close that they can be considered to be the same. Your code does not allow for that.

And now for a slightly more complex example:


...
typedef vertex_t vector_t;

struct normal_vertex_t
{
	// GL_N3F_V3F
	vector_t n;
	vertex_t v;
};
...
void Model_md2::GenerateShadowVolume()
{
	bool* bEdgeUsed = NULL;

	int i = 0;

	bEdgeUsed = new bool[3 * nTriangles];
	for (int j = 0; j < 3 * nTriangles; bEdgeUsed[j++] = false);
	volume = new triangle_face_normals_t[4 * nTriangles];

	for (int t1 = 0; t1 < nTriangles; ++t1)
	{
		vector_t n1 = CalculateNormal(vertices[triangles[t1].vertex[0]].v,
		                              vertices[triangles[t1].vertex[1]].v,
		                              vertices[triangles[t1].vertex[2]].v);
		volume[i  ].vertex[0] = triangles[t1].vertex[0];
		volume[i  ].n[0] = n1;
		volume[i  ].vertex[1] = triangles[t1].vertex[1];
		volume[i  ].n[1] = n1;
		volume[i  ].vertex[2] = triangles[t1].vertex[2];
		volume[i++].n[2] = n1;

		for (int t2 = (t1 + 1); t2 < nTriangles; ++t2)
		{
			vector_t n2 = CalculateNormal(vertices[triangles[t2].vertex[0]].v,
			                              vertices[triangles[t2].vertex[1]].v,
			                              vertices[triangles[t2].vertex[2]].v);

			for (int v1 = 0; v1 < 3; ++v1)
			{
				if (bEdgeUsed[t1 * 3 + v1] == false)
				{
					for (int v2 = 0; v2 < 3; ++v2)
					{
						if (bEdgeUsed[t2 * 3 + v2] == false)
						{
							int v1_next = (v1 + 1) % 3; /* Wrap around. */
							int v2_next = (v2 + 1) % 3; /* Wrap around. */

							normal_vertex_t pEdge1[2], pEdge2[2];
							pEdge1[0] = vertices[triangles[t1].vertex[v1]];
							pEdge1[1] = vertices[triangles[t1].vertex[v1_next]];
							pEdge2[0] = vertices[triangles[t2].vertex[v2]];
							pEdge2[1] = vertices[triangles[t2].vertex[v2_next]];

							if (pEdge1[1].v == pEdge2[0].v && pEdge1[0].v == pEdge2[1].v) //// This test is where it really matters! //////
							{
								/* Build a quad. */
								volume[i  ].vertex[0] = triangles[t1].vertex[v1]; /* Triangle 1. */
								volume[i  ].n[0] = n1;
								volume[i  ].vertex[1] = triangles[t2].vertex[v2_next];
								volume[i  ].n[1] = n2;
								volume[i  ].vertex[2] = triangles[t2].vertex[v2];
								volume[i++].n[2] = n2;
								volume[i  ].vertex[0] = triangles[t2].vertex[v2]; /* Triangle 2. */
								volume[i  ].n[0] = n2;
								volume[i  ].vertex[1] = triangles[t1].vertex[v1_next];
								volume[i  ].n[1] = n1;
								volume[i  ].vertex[2] = triangles[t1].vertex[v1];
								volume[i++].n[2] = n1;

								bEdgeUsed[t1 * 3 + v1] = true;
								bEdgeUsed[t2 * 3 + v2] = true;
								break;
							}
						}
					}
				}
			}
		}
	}

	delete [] bEdgeUsed;

	nVolume = i;

	return;
}

_swx_    1138
Quote:
Original post by x452Alba
Quote:
Original post by _swx_
And it's much better to use templates for creating vector classes, then you can have vectors of arbitrary size and only need to code a single class :)


Yes, more flexible, but you'd lose all the hardcoded and highly optimized SSE code (and that's the whole point).


So use template specialization for the functions that you can produce better code for than the compiler. And IMO it's kind of ugly to overload the == operator for anything other than exact equality, especially if you don't use the proper epsilon value. Use a function that accepts a relative error value to compare vectors for near equality.

And if you want a really fast vector class you should be using expression templates.
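
For what it's worth, a bare-bones sketch of the expression-template idea (the names VecExpr, AddExpr and Vec4 are invented for the example; a real implementation would cover the other operators and sizes):

#include <cstddef>

template <typename E>
struct VecExpr			// CRTP base so the operators can accept any expression
{
	float operator[](std::size_t i) const
	{
		return static_cast<const E&>(*this)[i];
	}
};

struct Vec4 : VecExpr<Vec4>
{
	float m[4];

	float  operator[](std::size_t i) const { return m[i]; }
	float& operator[](std::size_t i)       { return m[i]; }

	template <typename E>
	Vec4& operator=(const VecExpr<E>& e)	// evaluation happens here, element by element
	{
		for (std::size_t i = 0; i < 4; ++i)
			m[i] = e[i];
		return *this;
	}
};

template <typename L, typename R>
struct AddExpr : VecExpr< AddExpr<L, R> >
{
	const L& l;
	const R& r;

	AddExpr(const L& lhs, const R& rhs) : l(lhs), r(rhs) {}

	float operator[](std::size_t i) const { return l[i] + r[i]; }
};

template <typename L, typename R>
AddExpr<L, R> operator+(const VecExpr<L>& l, const VecExpr<R>& r)
{
	// No arithmetic yet: just record the operands.
	return AddExpr<L, R>(static_cast<const L&>(l), static_cast<const R&>(r));
}

// With this, an expression like d = a + b + c builds a single AddExpr tree and
// runs one loop in Vec4::operator=, with no intermediate Vec4 temporaries.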

Quote:
Original post by x452Alba
Yes, more flexible, but you'd lose all the hardcoded and highly optimized SSE code (and that's the whole point).


You don't really lose any of the custom hardcoded optimizations with templates:


template< typename BaseType >
class TypeTraits
{
public:
	...
	typedef BaseType Type;
	typedef TypeTraits< Type > MyType;
	...
public:
	...
	static void Add (Type & sum, Type a, Type b);
	static void AddArray (Type * sum, Type const * a, Type const * b, unsigned int size);
	...
};

template< typename BaseType, unsigned int Dimension, typename BaseTypeTraits = TypeTraits< BaseType > >
class Vector
{
public:
	...
	typedef BaseType Type;
	typedef BaseTypeTraits Traits;
	typedef Vector< Type, Dimension, Traits > MyType;
	...
private:
	...
	Type m_fields[Dimension];
	...
public:
	...
	friend MyType const operator+(MyType const & a, MyType const & b)
	{
		MyType result;
		Traits::AddArray(result.m_fields, a.m_fields, b.m_fields, Dimension);
		return result;
	}
	...
};

There would be a default TypeTraits template that could be used with any type. After you get some profiler results, you can specialize the TypeTraits template and define the hard-coded, highly optimized SSE code. Though I would leave the optimizations to the compiler; it is pretty awesome what it can do! :)
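
To make that concrete, a specialization along these lines is roughly what it could look like for float (a sketch only: it assumes the TypeTraits interface above and uses unaligned loads so it doesn't have to care about the arrays' alignment):

#include <xmmintrin.h>	// SSE intrinsics

template<>
class TypeTraits< float >
{
public:
	typedef float Type;

	static void Add (Type & sum, Type a, Type b) { sum = a + b; }

	static void AddArray (Type * sum, Type const * a, Type const * b, unsigned int size)
	{
		unsigned int i = 0;
		for (; i + 4 <= size; i += 4)		// four floats per iteration
		{
			__m128 va = _mm_loadu_ps(a + i);
			__m128 vb = _mm_loadu_ps(b + i);
			_mm_storeu_ps(sum + i, _mm_add_ps(va, vb));
		}
		for (; i < size; ++i)			// scalar tail for the leftovers
			sum[i] = a[i] + b[i];
	}
};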

dalleboy    324
Quote:
Original post by _swx_
the epsilon code is wrong :P

Sorry, but your code does not work well for numbers between 0 and 1, or for negative numbers. FLT_EPSILON yields the smallest number such that 1.0f + FLT_EPSILON != 1.0f.
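
For what it's worth, one common way around that is to combine an absolute tolerance (for values near zero) with a relative one scaled by the magnitudes, something like this sketch (the tolerance values are illustrative, not recommendations):

#include <algorithm>	// std::max
#include <cfloat>	// FLT_EPSILON
#include <cmath>	// fabsf

inline bool float_equal(float a, float b,
                        float absTol = 1e-6f, float relTol = 4.0f * FLT_EPSILON)
{
	// The absolute test covers values near zero; the relative test scales with
	// the magnitude of the inputs (fabsf also handles negative values).
	float diff  = fabsf(a - b);
	float scale = std::max(fabsf(a), fabsf(b));
	return diff <= std::max(absTol, relTol * scale);
}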

Dragon_Strike    264
Quote:
Original post by stanlo
I'm a lazy bum so I'll ask: have you benchmarked this against say, the DirectX vectors?


No... I don't use DirectX... but if somebody tries it I'd like to know the results.

stanlo    212
Quote:
Original post by Dragon_Strike
Quote:
Original post by stanlo
I'm a lazy bum so I'll ask: have you benchmarked this against say, the DirectX vectors?


No... I don't use DirectX... but if somebody tries it I'd like to know the results.


How about against a similar implementation that doesn't use SSE?

Dragon_Strike    264
Quote:
Original post by stanlo
Quote:
Original post by Dragon_Strike
Quote:
Original post by stanlo
I'm a lazy bum so I'll ask: have you benchmarked this against say, the DirectX vectors?


No... I don't use DirectX... but if somebody tries it I'd like to know the results.


How about against a similar implementation that doesn't use SSE?


Well, I got it to about 30-40% faster...

Abominacion    207
You should use intrinsics (if supported by your compiler) instead of straight assembly. That way you get the generated instructions you want without having to worry about instruction scheduling, register allocation and loading/storing from/to memory, all of which will cause speed penalties in your code when calling several inlined SSE-using methods in a row (which is a fairly common case with core math classes).

Regards.

Dragon_Strike    264
Quote:
Original post by Abominacion
You should use intrinsics (if supported by your compiler) instead of straight assembly. That way you get the generated instructions you want without having to worry about instruction scheduling, register allocation and loading/storing from/to memory, all of which will cause speed penalties in your code when calling several inlined SSE-using methods in a row (which is a fairly common case with core math classes).

Regards.


Interesting... where can I find a good tutorial on this?

Abominacion    207
I'm assuming you use Visual Studio; as I recall, gcc's syntax is different.

This is a simple tutorial about SSE intrinsics. Seeing as you already know assembly, you mainly need an intrinsic listing for reference; with that, it's just a matter of replacing each of the assembly opcodes with its equivalent intrinsic.
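
For example, the length() function from the first post might look roughly like this with intrinsics instead of inline assembly (a sketch with the same shuffle/add sequence; length_intrin is an illustrative name, it assumes <xmmintrin.h>, relies on vec3 being 16-byte aligned, and w still takes part in the sum):

#include <xmmintrin.h>	// SSE intrinsics

inline float length_intrin(const vec3 &vec)
{
	__m128 v  = _mm_load_ps(&vec.x);				// (x, y, z, w)
	__m128 sq = _mm_mul_ps(v, v);					// (x*x, y*y, z*z, w*w)
	__m128 t  = _mm_add_ps(sq, _mm_shuffle_ps(sq, sq, 0x4E));	// (x*x+z*z, y*y+w*w, ...)
	t = _mm_add_ps(t, _mm_shuffle_ps(t, t, 0x11));			// full sum in every lane
	t = _mm_sqrt_ss(t);						// square root of lane 0
	float f;
	_mm_store_ss(&f, t);
	return f;
}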

Regards.


Edit: fixed a couple of typos.

[Edited by - Abominacion on March 3, 2007 8:27:08 AM]

Dragon_Strike    264
Well, I did another benchmark using the Windows time and 50,000,000 normalize, distance and cross operations...

Quote:

13:43:19 Start SSE
13:44:23 End SSE
13:44:23 Start
13:45:35 End


4 seconds with SSE
12 seconds without SSE

that's about three times as fast

I probably got something wrong with the previous one...

[Edited by - Dragon_Strike on March 3, 2007 7:52:45 AM]

