d3dx library sucks! please read this!

Started by
22 comments, last by DrunkenHyena 19 years, 1 month ago
I tested your code and the loop does not run forever... Post the exact code with SSE and 3DNow! and I'll try to find the error.
Advertisement
Quote:Original post by ashade
I tested your code and the loop does not run forever... Post the exact code with SSE and 3DNow! and I'll try to find the error.

#include <windows.h>
#include <stdio.h>
#include <d3dx9math.h>

// 4-float vector, declared 16-byte aligned; both constructors set w to 0.
// The SIMD routines below read and write all 16 bytes of it.
__declspec(align(16)) struct myVec3
{
    float x, y, z, w;
    myVec3() {x = y = z = w = 0.0f;}
    myVec3(float _x, float _y, float _z)
        {x = _x; y = _y; z = _z; w = 0.0f;}
};

#define MATHinl inline

// SSE cross product: *pOut = *v1 x *v2 (w lane ends up as w1*w2 - w1*w2).
//
// __fastcall: v1 arrives in ECX, v2 in EDX, pOut on the stack at [esp+4].
// The function is naked, so there is no compiler prologue/epilogue and
// nothing saves registers for us. The output pointer is therefore held in
// EAX (a scratch register) instead of EBX, which the caller expects to be
// preserved -- clobbering EBX can corrupt a loop counter the compiler keeps
// there.
// All three pointers must be 16-byte aligned (movaps).
MATHinl void __declspec(naked) __fastcall MATHMyVec3Cross2(myVec3 *v1, myVec3 *v2, myVec3* pOut)
{
    __asm
    {
        mov eax, [esp+4]              // eax = pOut
        movaps xmm0, [ecx]            // xmm0 = v1
        movaps xmm1, xmm0             // xmm1 = v1
        movaps xmm2, [edx]            // xmm2 = v2
        movaps xmm3, xmm2             // xmm3 = v2
        shufps xmm0, xmm0, 11001001b  // xmm0 = (a.y, a.z, a.x, a.w)
        shufps xmm1, xmm1, 11010010b  // xmm1 = (a.z, a.x, a.y, a.w)
        shufps xmm2, xmm2, 11010010b  // xmm2 = (b.z, b.x, b.y, b.w)
        shufps xmm3, xmm3, 11001001b  // xmm3 = (b.y, b.z, b.x, b.w)
        mulps xmm0, xmm2              // (a.y*b.z, a.z*b.x, a.x*b.y, ..)
        mulps xmm1, xmm3              // (a.z*b.y, a.x*b.z, a.y*b.x, ..)
        subps xmm0, xmm1              // xmm0 = a x b
        movaps [eax], xmm0
        ret 4                         // pop the one stack argument
    }
}

// 3DNow! cross product: *pOut = *v1 x *v2, pOut->w is written as 0.
//
// Same calling convention and register rules as MATHMyVec3Cross2: ECX = v1,
// EDX = v2, [esp+4] = pOut, and EAX (not EBX) holds the output pointer.
// Register comments are written "high dword | low dword"; a = *v1, b = *v2,
// components 1..3 = x..z. Requires a CPU with 3DNow! support.
MATHinl void __declspec(naked) __fastcall MATHMyVec3Cross(myVec3 *v1, myVec3 *v2, myVec3* pOut)
{
    __asm
    {
        mov eax, [esp+4]        // eax = pOut
        movq mm0, [ecx+4]       // a3 | a2
        movd mm1, [ecx]         // 0  | a1
        movd mm2, [ecx+8]       // 0  | a3
        movd mm4, [edx+8]       // 0  | b3
        punpckldq mm4, [edx]    // b1 | b3
        movd mm5, [edx+4]       // 0  | b2
        movq mm6, [edx+4]       // b3 | b2
        movd mm7, [edx]         // 0  | b1
        movq mm3, mm0           // a3 | a2 (high lane is zeroed by the multiply with mm7)
        punpckldq mm2, mm1      // a1 | a3
        pfmul mm0, mm4          // a3*b1 | a2*b3
        pfmul mm1, mm5          // 0     | a1*b2
        pfmul mm2, mm6          // a1*b3 | a3*b2
        pfmul mm3, mm7          // 0     | a2*b1
        pfsub mm0, mm2          // y | x
        pfsub mm1, mm3          // 0 | z
        movq [eax], mm0         // store x, y
        movq [eax+8], mm1       // store z, w (= 0)
        femms                   // leave MMX/3DNow! state
        ret 4                   // pop the one stack argument
    }
}

// Times 100,000,000 calls of each cross-product implementation with
// QueryPerformanceCounter and prints the elapsed ticks and milliseconds.
int main(int argc, char** argv)
{
    myVec3 v1(1.5f, 2.0f, 1.0f);
    myVec3 v2(5.0f, 4.0f, 1.0f), fd;

    D3DXVECTOR3 v3(1.5f, 2.0f, 1.0f), v4(5.0f, 4.0f, 1.0f), v5;

    LARGE_INTEGER liFreq, liStart, liEnd;
    double dTime;

    printf("Profiling custom 3DNow!... ");
    QueryPerformanceFrequency(&liFreq);
    QueryPerformanceCounter(&liStart);
    for(DWORD i=0; i<100000000; ++i)
    {
        MATHMyVec3Cross(&v1, &v2, &fd);
    }
    QueryPerformanceCounter(&liEnd);
    liEnd.QuadPart -= liStart.QuadPart;
    dTime = (double)liEnd.QuadPart / ((double)liFreq.QuadPart / 1000.0);
    // Pass the 64-bit integer member, not the whole LARGE_INTEGER union.
    printf("%I64d ticks, %d ms\n",liEnd.QuadPart,(int)dTime);

    printf("Profiling custom SSE!... ");
    QueryPerformanceFrequency(&liFreq);
    QueryPerformanceCounter(&liStart);
    for(DWORD i=0; i<100000000; ++i)
    {
        MATHMyVec3Cross2(&v1, &v2, &fd);
    }
    QueryPerformanceCounter(&liEnd);
    liEnd.QuadPart -= liStart.QuadPart;
    dTime = (double)liEnd.QuadPart / ((double)liFreq.QuadPart / 1000.0);
    printf("%I64d ticks, %d ms\n",liEnd.QuadPart,(int)dTime);

    printf("Profiling D3DX... ");
    QueryPerformanceFrequency(&liFreq);
    QueryPerformanceCounter(&liStart);
    for(DWORD i=0; i<100000000; ++i)
    {
        // Write to v5, not back into v3: feeding the result back in as an
        // input makes the values grow every iteration until they overflow,
        // so this loop would no longer measure the same work as the two
        // loops above (which keep their inputs constant).
        D3DXVec3Cross(&v5, &v4, &v3);
    }
    QueryPerformanceCounter(&liEnd);
    liEnd.QuadPart -= liStart.QuadPart;
    dTime = (double)liEnd.QuadPart / ((double)liFreq.QuadPart / 1000.0);
    printf("%I64d ticks, %d ms\n",liEnd.QuadPart,(int)dTime);
}

But I suspect it's MSVC playing silly buggers :P
Quote:Original post by ashade
Hey Coder, I'm planning to check the processor's capabilities at the beginning of the program and build a function pointer table holding 3 versions of each function (D3DX code, SSE, 3DNow!). I was only giving an example with the D3DXVec3 function, which is just a simple and very frequently used function... For more complex functions, I think I could get a noticeable speed gain, let's say, 2 fps...


That's actually exactly what D3DX does. The first time you call a math function it queries the CPU type and builds a function table. So it can't really inline the core math parts, since it doesn't know until runtime which ones to use. (It can inline the call to the variable fpointer though.)

Also, note that MS doesn't necessarily write those. The CPU vendors (Intel, AMD) write and maintain those pieces of CPU-specific code. If they don't provide an implementation for certain functions, a non-SIMD version gets used.
[sub]My spoon is too big.[/sub]
Quote:Original post by Coder
2) D3DX Functions select optimum versions for the current hardware (i.e. SSE, 3DNow, SSE2). Your code doesn't, it assumes support for something and uses it.


I have a hard time believing that as this is in the D3DX-include file ;)

// Cross product: *pOut = *pV1 x *pV2. Returns pOut.
// The result is built in a local temporary and copied out at the end, so
// pOut may point at the same vector as pV1 or pV2.
// When D3DX_DEBUG is defined, returns NULL if any pointer argument is NULL.
D3DXINLINE D3DXVECTOR3* D3DXVec3Cross
    ( D3DXVECTOR3 *pOut, CONST D3DXVECTOR3 *pV1, CONST D3DXVECTOR3 *pV2 )
{
    D3DXVECTOR3 v;

#ifdef D3DX_DEBUG
    if(!pOut || !pV1 || !pV2)
        return NULL;
#endif

    v.x = pV1->y * pV2->z - pV1->z * pV2->y;
    v.y = pV1->z * pV2->x - pV1->x * pV2->z;
    v.z = pV1->x * pV2->y - pV1->y * pV2->x;

    *pOut = v;
    return pOut;
}


EDIT: However, some functions are taken from the library instead, like D3DXVec3Normalize which makes them slower... I don't know the reason for this though as normalize is a very small function too.


Here is a *legendary* [wink] thread that (a) establishes that game studios use D3DX and (b) goes over a bunch of D3DX optimizations. It was quite like this thread.
Dustin Franklin ( circlesoft :: KBase :: Mystic GD :: ApolloNL )
The 1st call to D3DX is the slowest because during the 1st call D3DX runs a test to see what the CPU of the user is capable of, thus, making the 1st call the slowest

So to get a good reading you should call D3DX twice and compare the second call to your function.
Haven't read the other posts. On the first call to a D3DX function, a test is run to see which path it should take: SSE, SSE2, 3DNow! and whatever else there is.

It would be faster to use function pointers but they are too lazy to do that.

EDIT: Here is my cross routine

; SSE cross product of the two vectors referenced by v1 and v2, stored to
; the object referenced by `this`. Unaligned loads/stores (movups) move 16
; bytes each, so every operand must have 16 readable/writable bytes.
; NOTE(review): presumably v1, v2 and this are pointers to 4-float vectors -- confirm.
mov esi, v1 ; esi = v1
mov edi, v2 ; edi = v2

movups xmm0, [esi] ; xmm0 = a (first vector)
movups xmm1, [edi] ; xmm1 = b (second vector)
movaps xmm2, xmm0 ; xmm2 = copy of a
movaps xmm3, xmm1 ; xmm3 = copy of b

shufps xmm0, xmm0, 0xC9 ; xmm0 = (a.y, a.z, a.x, a.w)
shufps xmm1, xmm1, 0xD2 ; xmm1 = (b.z, b.x, b.y, b.w)
mulps xmm0, xmm1 ; xmm0 = (a.y*b.z, a.z*b.x, a.x*b.y, a.w*b.w)

shufps xmm2, xmm2, 0xD2 ; xmm2 = (a.z, a.x, a.y, a.w)
shufps xmm3, xmm3, 0xC9 ; xmm3 = (b.y, b.z, b.x, b.w)
mulps xmm2, xmm3 ; xmm2 = (a.z*b.y, a.x*b.z, a.y*b.x, a.w*b.w)

subps xmm0, xmm2 ; xmm0 = a x b in the low three lanes

mov esi, this ; esi = destination
movups [esi], xmm0 ; store all four lanes of the result
I just have to point this out...

"don't forget that the 3dnow! version only works on amd processors!!!"

Sorry D3DX doesn't suck, your 3dnow version sucks. No offense meant that comes over harsh but you opened the door to this. I'll explain the problem with your assertion.

First off to assume a solution is better, it must be better for usage, as well as compatiblity and of course speed.

If you're programming a game, would you rather support

a. all windows machines.

b. Only windows machines running AMD processors?

I've never heard of any major game going "We're only going to make a game that works for X processors." In fact many go "we're only going to make a game for all x86 processors, running such and such OS."

Now I can understand using this for a very specialized application that only a few people and machines use, all using AMD, BUT in this case you don't have a reason to use DirectX, directX is basically a GAME API.

Second, as people have mentioned, the code actually is not always faster, so in that case it's not better.

Third, DirectX watches for a lot of errors and is optimized for many of the major devices and systems. It almost never fails on its end unless bad data was pushed in. Error checking and redundancy might seem like a waste of time if you know what you're doing, but when writing an API you have to do a little more of that because every type of monkey will try to do stupid things.

Other people have mentioned other problems too.

In conclusion, I'd like to say congratulations. Even though your code has proven to be 100 percent better, beating a standardized API in anything is always a big step and shows at least some mastery of your skill in coding. The fact you use assembly proves you have at least confidence. Congratulations again, and I hope you don't take everything I said as negative, just that you can't just assume you're better at something than a company the size of Microsoft (though I wish someone could prove they were and take them down a couple notches :) )

I hope you don't take this as anything really other than advice/explanation of what really goes on.

P.S. And as someone mentioned, major companies don't really use DirectX that often. (They may if they are making just PC versions of games, but other than that, no.)
Quote:
// Benchmark fragment: times 100,000,000 calls of one cross-product routine
// with timeGetTime(). Exactly one of the three calls in the loop body is
// left uncommented per run.
myVec3 v1(1.5f, 2.0f, 1.0f);
myVec3 v2(5.0f, 4.0f, 1.0f), fd;

D3DXVECTOR3 v3(1.5f, 2.0f, 1.0f), v4(5.0f, 4.0f, 1.0f), v5;

// Start time.
unsigned long i1 = timeGetTime();

for(unsigned int i = 0; i<100000000; i++) {

MATHMyVec3Cross(&v1, &v2, &fd);
//MATHMyVec3Cross2(&v1, &v2, &fd);
//D3DXVec3Cross(&v5, &v4, &v3);

}

// End time; after the subtraction i2 holds the elapsed time.
unsigned long i2 = timeGetTime();
i2-=i1;


HEY, COMPILE ON VC++ 2003!!


Quite frankly, I'm surprised you got any real meaningful data. VC should have removed the loop since it doesn't do anything, unless you are running a debug build. You can't just put a function in a loop and call it a million times - you have to do a non-trivial operation the compiler can't just optimize away.

Cross product is also a very uninteresting test. Since the call goes through a dispatch table, you always have a small amount of overhead for any math routine. This shows up in very simple math operations. My suggestion: Try beating the Matrix Multiply.

EvilDecl81
I suspect the D3DX library has two versions: one lives inside d3dx9math.inl and the other is already compiled into d3dx9.lib. This would explain why I got vector crossing too slow (it was using d3dx9math.inl, which is not optimized). Now I'm getting really fast results from the D3DX code. Moreover, it's much faster than any processor-specific optimization, so I think the d3dx9.lib functions are using the graphics hardware to make the calculations instead of 3DNow!, SSE or the standard FPU... Do you think I am right?
Hey, please test this code for me and tell me how fast it is:


[iframe]

// Byte offsets of element [row][col] in a 4x4 matrix of 4-byte floats laid
// out row by row: Mrc = (4*r + c) * 4. Used as displacements from the
// matrix base pointer in the inline assembly of applyTransform.
#define M00 0
#define M01 4
#define M02 8
#define M03 12
#define M10 16
#define M11 20
#define M12 24
#define M13 28
#define M20 32
#define M21 36
#define M22 40
#define M23 44
#define M30 48
#define M31 52
#define M32 56
#define M33 60
// Transforms NUMVERTS 4-component vectors by a 4x4 float matrix using 3DNow!:
//   RES[i] = V[i] * M   (row vector times matrix, M stored row by row).
//
// RES      destination array; 16 bytes are written per vertex.
// V        source array; 16 bytes (x, y, z, w) are read per vertex. Each
//          source vertex is fully loaded before its result is stored, so
//          RES may be the same array as V.
// M        pointer to 16 floats; element [r][c] is read at byte offset Mrc.
// NUMVERTS number of vertices; values <= 0 do nothing.
//
// Requires a CPU with 3DNow! support. Register comments are written
// "high dword | low dword".
void applyTransform (myVec3 *RES, const myVec3 *V, const float *M, int NUMVERTS)
{
// The loop below is a do/while that decrements before testing, so entering
// it with a count of zero (or a negative count) would wrap ECX and walk far
// past the ends of the arrays. Bail out before touching any MMX state.
if (NUMVERTS <= 0)
    return;

_asm {
MOV EDX, [V] //EDX = source vector ptr
MOV EAX, [M] //EAX = matrix ptr
MOV EBX, [RES] //EBX = destination vector ptr
MOV ECX, [NUMVERTS] //ECX = number of vertices to transform
//3DNow! version of fully general 3D vertex transformation.
//Optimal for AMD Athlon (completes in 16 cycles)
FEMMS //clear MMX state
ALIGN 16 //for optimal branch alignment
$$xform:
ADD EBX, 16 //res++
MOVQ MM0, QWORD PTR [EDX] // v->y | v->x
MOVQ MM1, QWORD PTR [EDX+8] // v->w | v->z
ADD EDX, 16 //v++
MOVQ MM2, MM0 // v->y | v->x
MOVQ MM3, QWORD PTR [EAX+M00] // m[0][1] | m[0][0]
PUNPCKLDQ MM0, MM0 // v->x | v->x
MOVQ MM4, QWORD PTR [EAX+M10] // m[1][1] | m[1][0]
PFMUL MM3, MM0 //v->x*m[0][1] | v->x*m[0][0]
PUNPCKHDQ MM2, MM2 // v->y | v->y
PFMUL MM4, MM2 //v->y*m[1][1] | v->y*m[1][0]
MOVQ MM5, QWORD PTR [EAX+M02] // m[0][3] | m[0][2]
MOVQ MM7, QWORD PTR [EAX+M12] // m[1][3] | m[1][2]
MOVQ MM6, MM1 // v->w | v->z
PFMUL MM5, MM0 //v->x*m[0][3] | v->x*m[0][2]
MOVQ MM0, QWORD PTR [EAX+M20] // m[2][1] | m[2][0]
PUNPCKLDQ MM1, MM1 // v->z | v->z
PFMUL MM7, MM2 //v->y*m[1][3] | v->y*m[1][2]
MOVQ MM2, QWORD PTR [EAX+M22] // m[2][3] | m[2][2]
PFMUL MM0, MM1 //v->z*m[2][1] | v->z*m[2][0]
PFADD MM3, MM4 //v->x*m[0][1]+v->y*m[1][1] |
// v->x*m[0][0]+v->y*m[1][0]
MOVQ MM4, QWORD PTR [EAX+M30] // m[3][1] | m[3][0]
PFMUL MM2, MM1 //v->z*m[2][3] | v->z*m[2][2]
PFADD MM5, MM7 //v->x*m[0][3]+v->y*m[1][3] |
// v->x*m[0][2]+v->y*m[1][2]
MOVQ MM1, QWORD PTR [EAX+M32] // m[3][3] | m[3][2]
PUNPCKHDQ MM6, MM6 // v->w | v->w
PFADD MM3, MM0 //v->x*m[0][1]+v->y*m[1][1]+v->z*m[2][1] |
// v->x*m[0][0]+v->y*m[1][0]+v->z*m[2][0]
PFMUL MM4, MM6 //v->w*m[3][1] | v->w*m[3][0]
PFMUL MM1, MM6 //v->w*m[3][3] | v->w*m[3][2]
PFADD MM5, MM2 //v->x*m[0][3]+v->y*m[1][3]+v->z*m[2][3] |
// v->x*m[0][2]+v->y*m[1][2]+v->z*m[2][2]
PFADD MM3, MM4 //v->x*m[0][1]+v->y*m[1][1]+v->z*m[2][1]+
// v->w*m[3][1] | v->x*m[0][0]+v->y*m[1][0]+
// v->z*m[2][0]+v->w*m[3][0]
MOVQ [EBX-16], MM3 //store res->y | res->x
PFADD MM5, MM1 //v->x*m[0][3]+v->y*m[1][3]+v->z*m[2][3]+
// v->w*m[3][3] | v->x*m[0][2]+v->y*m[1][2]+
// v->z*m[2][2]+v->w*m[3][2]
MOVQ [EBX-8], MM5 //store res->w | res->z
DEC ECX //numverts--
JNZ $$XFORM //until numverts == 0
FEMMS //clear MMX state
}
}
[/iframe]

compare it to D3DXVec3TransformCoordArray like this:


myVec3 *arr = new myVec3[1000000];
// D3DXVECTOR3 * arr2 = new D3DXVECTOR3[1000000];
for(unsigned int i = 0; i<1000000; i++) {

arr.x = (float)i * (float)timeGetTime() * 0.00001f;
arr.y = (float)i * (float)timeGetTime() * 0.00001f;
arr.z = (float)i * (float)timeGetTime() * 0.00001f;
//arr2.x = (float)i * (float)timeGetTime() * 0.00001f;
}

float mm[16] = {15.0f, 55.0f, -9.0f, 1.0f, 5.0f, 15.3f, 7.2f,
15.0f, 55.0f, -9.0f, 1.0f, 5.0f, 15.3f, 7.2f, 5.6f, 8.8f
};

LARGE_INTEGER liFreq, liStart, liEnd;
double dTime;

DWORD i2=0;

printf("Profiling... ");
QueryPerformanceFrequency(&liFreq);
QueryPerformanceCounter(&liStart);

while (i2 < 100) {
applyTransform(arr, arr, &mm[0], 1000000);
//D3DXVec3TransformCoordArray(arr2, 12, arr2, 12, (D3DXMATRIX*)mm, 1000000);
__asm inc i2
}

QueryPerformanceCounter(&liEnd);
liEnd.QuadPart -= liStart.QuadPart;
dTime = (double)liEnd.QuadPart / ((double)liFreq.QuadPart / 1000.0f);
printf("%I64lu ticks, %d ms\n",liEnd,(int)dTime);

This topic is closed to new replies.

Advertisement