Jump to content

  • Log In with Google      Sign In   
  • Create Account

#Actualjbadams

Posted 10 March 2014 - 06:50 AM

Hey guys, I've done a "quick" first implementation of a vector normalize and dot product using SSE intrinsics and was wondering whether there's anything that could still be optimized further.

 

Here's my code:

// 4-component float vector, aligned to 16 bytes (the width of one __m128).
// Normalize() and Dot() only consider x, y and z; w never enters the sums.
__declspec(align(16))
struct Vec4
{
	float x, y, z, w;

	// Returns a copy of this vector with (x, y, z) scaled by the reciprocal
	// square root of x*x + y*y + z*z, and with w set to 1.0f. The object
	// itself is not modified, so the method is callable on const vectors.
	//
	// NOTE: _mm_rsqrt_ps is an approximation, so the result is only
	// approximately unit length (see the Intel intrinsics guide for the
	// error bound). A zero-length (x, y, z) is not guarded against; the
	// result is not finite in that case.
	inline const Vec4 Normalize() const
	{
		// copy data into the 128bit register (lane 0 = x ... lane 3 = w)
		const __m128 xyzw = _mm_set_ps(w, z, y, x);

		// 0x7F = 0111 1111: the high nibble multiplies x, y, z but not w,
		// the low nibble writes the sum to all 4 lanes so that it can
		// scale every component in a single multiply
		const __m128 lenSq = _mm_dp_ps(xyzw, xyzw, 0x7F);

		// vec * rsqrt(dot(vec, vec))
		const __m128 scaled = _mm_mul_ps(xyzw, _mm_rsqrt_ps(lenSq));

		// store to a plain array instead of reading through a union;
		// the unaligned store has no alignment requirement on the local
		float lanes[4];
		_mm_storeu_ps(lanes, scaled);

		Vec4 vec;
		vec.x = lanes[0];
		vec.y = lanes[1];
		vec.z = lanes[2];
		vec.w = 1.0f; // w is reset to 1 rather than scaled

		return vec;
	}

	// Returns x*v2.x + y*v2.y + z*v2.z. The w components are ignored.
	inline float Dot(const Vec4 &v2) const
	{
		// copy data into the 128bit registers
		const __m128 a = _mm_set_ps(w, z, y, x);
		const __m128 b = _mm_set_ps(v2.w, v2.z, v2.y, v2.x);

		// 0x71 = 0111 0001: multiply x, y, z but not w, and write the sum
		// to lane 0 only, which is the only lane read back
		return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x71));
	}
};

Thanks in advance :)

: Added tags.


#2lipsryme

Posted 20 September 2013 - 09:27 AM

Hey guys, I've done a "quick" first implementation of a vector normalize and dot product using SSE intrinsics and was wondering whether there's anything that could still be optimized further.

 

Here's my code:

// 4-component float vector, aligned to 16 bytes (the width of one __m128).
// Normalize() and Dot() only consider x, y and z; w never enters the sums.
__declspec(align(16))
struct Vec4
{
	float x, y, z, w;

	// Returns a copy of this vector with (x, y, z) scaled by the reciprocal
	// square root of x*x + y*y + z*z, and with w set to 1.0f. The object
	// itself is not modified, so the method is callable on const vectors.
	//
	// NOTE: _mm_rsqrt_ps is an approximation, so the result is only
	// approximately unit length (see the Intel intrinsics guide for the
	// error bound). A zero-length (x, y, z) is not guarded against; the
	// result is not finite in that case.
	inline const Vec4 Normalize() const
	{
		// copy data into the 128bit register (lane 0 = x ... lane 3 = w)
		const __m128 xyzw = _mm_set_ps(w, z, y, x);

		// 0x7F = 0111 1111: the high nibble multiplies x, y, z but not w,
		// the low nibble writes the sum to all 4 lanes so that it can
		// scale every component in a single multiply
		const __m128 lenSq = _mm_dp_ps(xyzw, xyzw, 0x7F);

		// vec * rsqrt(dot(vec, vec))
		const __m128 scaled = _mm_mul_ps(xyzw, _mm_rsqrt_ps(lenSq));

		// store to a plain array instead of reading through a union;
		// the unaligned store has no alignment requirement on the local
		float lanes[4];
		_mm_storeu_ps(lanes, scaled);

		Vec4 vec;
		vec.x = lanes[0];
		vec.y = lanes[1];
		vec.z = lanes[2];
		vec.w = 1.0f; // w is reset to 1 rather than scaled

		return vec;
	}

	// Returns x*v2.x + y*v2.y + z*v2.z. The w components are ignored.
	inline float Dot(const Vec4 &v2) const
	{
		// copy data into the 128bit registers
		const __m128 a = _mm_set_ps(w, z, y, x);
		const __m128 b = _mm_set_ps(v2.w, v2.z, v2.y, v2.x);

		// 0x71 = 0111 0001: multiply x, y, z but not w, and write the sum
		// to lane 0 only, which is the only lane read back
		return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x71));
	}
};

Thanks in advance :)


#1lipsryme

Posted 20 September 2013 - 09:26 AM

Hey guys, I've done a "quick" first implementation of a vector normalize and dot product using SSE intrinsics and was wondering whether there's anything that could still be optimized further.

 

Here's my code:

// 4-component float vector, aligned to 16 bytes (the width of one __m128).
// Normalize() and Dot() only consider x, y and z; w never enters the sums.
__declspec(align(16))
struct Vec4
{
	float x, y, z, w;

	// Returns a copy of this vector with (x, y, z) scaled by the reciprocal
	// square root of x*x + y*y + z*z, and with w set to 1.0f. The object
	// itself is not modified, so the method is callable on const vectors.
	//
	// NOTE: _mm_rsqrt_ps is an approximation, so the result is only
	// approximately unit length (see the Intel intrinsics guide for the
	// error bound). A zero-length (x, y, z) is not guarded against; the
	// result is not finite in that case.
	inline const Vec4 Normalize() const
	{
		// copy data into the 128bit register (lane 0 = x ... lane 3 = w)
		const __m128 xyzw = _mm_set_ps(w, z, y, x);

		// 0x7F = 0111 1111: the high nibble multiplies x, y, z but not w,
		// the low nibble writes the sum to all 4 lanes so that it can
		// scale every component in a single multiply
		const __m128 lenSq = _mm_dp_ps(xyzw, xyzw, 0x7F);

		// vec * rsqrt(dot(vec, vec))
		const __m128 scaled = _mm_mul_ps(xyzw, _mm_rsqrt_ps(lenSq));

		// store to a plain array instead of reading through a union;
		// the unaligned store has no alignment requirement on the local
		float lanes[4];
		_mm_storeu_ps(lanes, scaled);

		Vec4 vec;
		vec.x = lanes[0];
		vec.y = lanes[1];
		vec.z = lanes[2];
		vec.w = 1.0f; // w is reset to 1 rather than scaled

		return vec;
	}

	// Returns x*v2.x + y*v2.y + z*v2.z. The w components are ignored.
	inline float Dot(const Vec4 &v2) const
	{
		// copy data into the 128bit registers
		const __m128 a = _mm_set_ps(w, z, y, x);
		const __m128 b = _mm_set_ps(v2.w, v2.z, v2.y, v2.x);

		// 0x71 = 0111 0001: multiply x, y, z but not w, and write the sum
		// to lane 0 only, which is the only lane read back
		return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x71));
	}
};

Thanks in advance :)


PARTNERS