Sign in to follow this  
masterbubu

Vertex Arrays Speed Issue

Recommended Posts

Hi, I have built a simple particle engine. Basically, I have an Emitter class that holds a list of particles. On each iteration I tell each particle (another class) to update its transformation (glTranslate(), glRotate()) and draw itself. I wanted to gain speed, so I thought of reducing the number of API calls by changing the structure. In the update function of each particle I now manually multiply the particle's vertices by the rotation and translation matrices, and then save the transformed vertices. After I finish updating all the particles, I create a vertex array and copy the transformed data of all particles into it. Then, with one call, I draw all the particles. The problem is that it is much slower than the old method. What am I doing wrong, and how can I improve the particle engine's speed?

Share this post


Link to post
Share on other sites
You shouldn't be creating a vertex array each frame. Allocate one that's as large as the max number of particles you'll have in a scene and update that one each frame.

It's important to profile your code and see where the bottleneck is. Maybe your math code is slow, or maybe you're wasting time in another part of your code. Also make sure you're not running a debug build :)

Share this post


Link to post
Share on other sites
How much slower is "much slower" ?

Where is the bottleneck? If you're trying to make things fast then you need to profile this stuff so we can tell you how to fix whatever your bottleneck is.

Are you CPU or GPU bound? Has the GPU time decreased over the old method?

Have you considered doing the vertex transform on the verts in the vertex shader, perhaps via instancing?

If you've got a set of transforms and a bunch of verts to apply them to, and want to do it on the CPU, write the transform loop using SIMD intrinsics (probably SSE in your case, but depends on the target processor)

Share this post


Link to post
Share on other sites
hi,

So, the first code is the version without vertex arrays, which performs faster:

void Emitter::Update(long time)
{
for (int i = 0; i < _numEmission; ++i)
addParticle();

glMatrixMode(GL_MODELVIEW);
glPushMatrix();

glTranslatef(_position._x, _position._y, _position._z);
glEnable(GL_TEXTURE_2D);
glBindTexture(GL_TEXTURE_2D, _texture->_textID);

for (list<Particle * >::iterator it = _particles.begin(); it != _particles.end(); )
{
(*it)->Draw(time);

if (!(*it)->_active)
{
delete (*it);
it = _particles.erase(it);
}
else
it++;
}

glEnable(GL_LIGHTING);
glEnable(GL_DEPTH_TEST);

glMatrixMode(GL_MODELVIEW);
glPopMatrix();
}


// Immediate-mode particle step: integrates velocity and position up to `time`,
// draws the particle as a textured four-vertex triangle strip, then ages it
// and marks it inactive once its life is used up.
void Particle::Draw(long time)
{
    glMatrixMode(GL_MODELVIEW);
    glPushMatrix();

    // Elapsed time since the previous call, scaled by 1/1000
    // (presumably milliseconds to seconds - confirm against the caller).
    const float dt = float(time - _lastTime) / 1000.0f;
    _velocity += _acceleration * dt;
    _position += _velocity * dt;

    // The draw position is sampled BEFORE the floor clamp below, so a particle
    // that crossed y = 0 this frame is still drawn at its unclamped height.
    // NOTE(review): confirm this is intended.
    const float drawX = _position._x;
    const float drawY = _position._y;
    const float drawZ = _position._z;

    // Bounce off the y = 0 plane.
    if (_position._y < 0.0f)
    {
        _velocity._y *= -_bounciness;
        _position._y = 0.0f;
    }

    glTranslatef(drawX, drawY, drawZ);

    // Every fifth particle index spins one way, all the others spin the opposite way.
    const float spinRate = (_ind % 5 == 0) ? 100.0f : -100.0f;
    glRotatef(_life * spinRate, 0, 0, 1);

    // Quad of half-extent _size, centred on the particle's origin.
    const float s = _size;
    glBegin(GL_TRIANGLE_STRIP);
        glTexCoord2f(1.0f, 1.0f); glVertex3f( s,  s, 0);
        glTexCoord2f(0.0f, 1.0f); glVertex3f(-s,  s, 0);
        glTexCoord2f(1.0f, 0.0f); glVertex3f( s, -s, 0);
        glTexCoord2f(0.0f, 0.0f); glVertex3f(-s, -s, 0);
    glEnd();

    // Age the particle; it is flagged inactive once its life reaches zero.
    _life -= dt;
    if (_life <= 0.0f)
        _active = false;

    _lastTime = time;

    glMatrixMode(GL_MODELVIEW);
    glPopMatrix();
}

so now the new code:

void Emitter::Update(long time)
{
for (int i = 0; i < _numEmission; ++i)
addParticle();

glMatrixMode(GL_MODELVIEW);
glPushMatrix();

glTranslatef(_position._x, _position._y, _position._z);
glEnable(GL_TEXTURE_2D);
glBindTexture(GL_TEXTURE_2D, _texture->_textID);

for (list<Particle * >::iterator it = _particles.begin(); it != _particles.end(); )
{
(*it)->Draw(time);

if (!(*it)->_active)
{
delete (*it);
it = _particles.erase(it); }
else
it++;
}

glEnableClientState(GL_VERTEX_ARRAY);
glEnableClientState(GL_TEXTURE_COORD_ARRAY);
glEnableClientState(GL_COLOR_ARRAY);

glVertexPointer(3, GL_FLOAT, 0, tmpV );
glTexCoordPointer(2,GL_FLOAT,0, tmpTex);

int p = 0;
for (list<Particle * >::iterator it = _particles.begin(); it != _particles.end(); ++it)
{
for (int c = 0 ; c < 4; ++c){ //matrix cols
for (int r = 0 ; r < 3; ++r) //matrix rows
tmpV[ p*3*4 + c*3 + r] = (*it)->_transfromedObj[r][c];

for (int r = 0 ; r < 2; ++r) //matrix rows
tmpTex[p*2*4 + c*2 + r] = _particleTextCoord[c][r];


a[p*4 + c] = p*4 + c;
}
++p;
}

glDrawElements(GL_QUADS, _particles.size()*4, GL_UNSIGNED_INT, a); //its quad cause polygon and triangle_strip cause problems

glVertexPointer(3, GL_FLOAT, 0, NULL );
glTexCoordPointer(2,GL_FLOAT,0, NULL );

glDisableClientState(GL_VERTEX_ARRAY);
glDisableClientState(GL_TEXTURE_COORD_ARRAY);

glEnable(GL_LIGHTING);
glEnable(GL_DEPTH_TEST);

glMatrixMode(GL_MODELVIEW);
glPopMatrix();
}


// Matrix-based particle step: integrates velocity and position up to `time`,
// rebuilds the particle's rotation and translation matrices, and stores the
// transformed quad in _transfromedObj for the emitter to collect. No GL calls
// are made here.
//
// Fix relative to the previous revision: _rotate is now refreshed BEFORE
// _transfromedObj is composed. Previously the product used the rotation left
// over from the previous call, so the rotation lagged one frame behind and
// the first frame used whatever _rotate initially held.
void Particle::Draw(long time)
{
    // Elapsed time since the previous call, scaled by 1/1000
    // (presumably milliseconds to seconds - confirm against the caller).
    const float change = float(time - _lastTime) / 1000.0f;

    _velocity += _acceleration * change;
    _position += _velocity * change;

    // The translation is sampled before the floor clamp below, exactly as in
    // the original. NOTE(review): this places a particle that crossed y = 0
    // at its unclamped height for this frame - confirm that is intended.
    const float x = _position._x;
    const float y = _position._y;
    const float z = _position._z;

    // Bounce off the y = 0 plane.
    if (_position._y < 0.0f)
    {
        _velocity._y = _velocity._y * -_bounciness;
        _position._y = 0.0f;
    }

    _translate[0][3] = x; _translate[1][3] = y; _translate[2][3] = z;

    // Z-axis rotation by +_life for every fifth particle index and by -_life
    // for the rest. cos(-a) == cos(a) and sin(-a) == -sin(a), so the two
    // original branches differ only in the sign of the sine terms.
    // NOTE(review): the angle is _life taken as radians, whereas the
    // immediate-mode version rotates by _life * 100 degrees - confirm which
    // spin rate is wanted.
    _rotate[0][0] = cos(_life);
    _rotate[1][0] = (_ind % 5 == 0) ? sin(_life) : sin(-_life);
    _rotate[0][1] = -_rotate[1][0];
    _rotate[1][1] = _rotate[0][0];

    // Compose with this frame's rotation and translation.
    _transfromedObj = (_polygon * _rotate) * _translate;

    // Age the particle; it is flagged inactive once its life reaches zero.
    _life -= change;

    if (_life <= 0.0f)
        _active = false;

    _lastTime = time;
}

I am new to this forum, so I am sorry that the posted code looks like that.

The matrices _translate, _rotate and _polygon are all KFbxMatrix (4x4) objects from the Autodesk FBX SDK package.
I allocate the memory for the vertex arrays in the Emitter constructor (there is room for 500 particles).

"a" is the index array. Sorry about the names; I wrote the code quickly just to
see whether I could gain performance.

tnx





Share this post


Link to post
Share on other sites
Use [source][/source] tags to present source code. They will add syntax highlighting, and prevent the code from taking up the whole page.

I very much doubt that you actually need to do all that math for your particles. That is most likely the cause of your slowdowns, as you appear to be performing a large number of trigonometric operations as well as several matrix multiplications per particle.

You can cut a lot of this out entirely by using a vertex shader (or geometry shader) to generate the quads on the GPU from just the list of points. After that, you can simplify the rest by just calculating the vertex positions, which should be a simple vector operation rather than a matrix operation.

Share this post


Link to post
Share on other sites
Thanks, I will definitely try doing the calculations on the GPU.
As you can see, I have tried to imitate the older version of my particle engine while adding the vertex array functionality. The only difference between the versions is that in the newer one I have a single call that draws all the particles. The matrix multiplication is done in both versions, either by OpenGL (glTranslate, glRotate) or manually by me, but it should have the same effect.

Why the first version is faster?


Share this post


Link to post
Share on other sites
Quote:
Original post by masterbubu
Why the first version is faster?


Because it's doing less work than the second one. You have done your own matrix calculations on the CPU instead of relying on the hardware in your system built for the job (the GPU). You'd be better off feeding the rotation and position data to the GPU as varying attributes to a geometry shader.


Dynamically allocating each particle separately, and then storing each of those allocations in yet another separate allocation for each list element, is going to kill your performance. Use a better container - ideally an array! You could use a basic free-list to remove the need for continual allocation/deallocation and the memory fragmentation it will lead to.


// Fixed-capacity pool that keeps its live elements packed at the front of one
// contiguous array, so [begin(), end()) can be walked linearly or handed
// straight to an API that expects a flat array.
//
// alloc() hands out the next unused slot. free() fills the released slot with
// a copy of the last live element, so element order is NOT preserved and
// pointers to the last element are invalidated by a free().
//
// Fixes relative to the previous revision:
//  * `class FreeList<T>` is not valid on a primary template, and `typdef`
//    was a typo - the class did not compile.
//  * free() refused to release the final remaining element (mNumUsed > 1),
//    so a "free until empty" loop could never finish.
//  * free() used memcpy, which is only valid for trivially copyable T and
//    was also called with identical source and destination when the last
//    element was freed; plain assignment with a self-check replaces it.
//  * The class owns a new[] allocation but was copyable, which would delete
//    the array twice; copying is now disabled.
template<typename T>
class FreeList
{
public:
    typedef T* iterator;
    typedef const T* const_iterator;

    // Allocates storage for at most max_size elements; none are live yet.
    explicit FreeList(const unsigned max_size)
        : mArray(new T[max_size]), mNumUsed(0), mMax(max_size)
    {
    }

    ~FreeList() { delete [] mArray; }

    // Returns a pointer to the next unused slot and marks it live.
    // Precondition (checked by assert only): the pool is not full.
    T* alloc()
    {
        assert(mNumUsed < mMax);
        return mArray + (mNumUsed++);
    }

    // Releases `element`, which must point at a live slot of this pool.
    // The last live element is copied into the released slot to keep the
    // live range contiguous. Calling this on an empty pool does nothing.
    void free(T* element)
    {
        if (mNumUsed == 0)
            return;

        --mNumUsed;
        T* last = mArray + mNumUsed;
        if (element != last)
            *element = *last;
    }

    iterator begin() { return mArray; }
    iterator end() { return mArray + mNumUsed; }
    const_iterator begin() const { return mArray; }
    const_iterator end() const { return mArray + mNumUsed; }

private:
    // Non-copyable: declared but intentionally left undefined.
    FreeList(const FreeList&);
    FreeList& operator=(const FreeList&);

    T* mArray;          // backing storage, mMax elements
    unsigned mNumUsed;  // number of live elements at the front of mArray
    unsigned mMax;      // capacity of mArray
};


// Plain per-particle record made of ten float fields, declared in the order
// position, velocity, colour, remaining life. The member order is kept
// unchanged because the fields are addressed directly as array data (e.g.
// &particle->x with a stride of sizeof(Particle)).
struct Particle
{
    float x, y, z;          // position
    float vx, vy, vz;       // velocity
    float r, g, b;          // colour
    float life_remaining;   // remaining life
};

FreeList<Particle> ParticleArray(10000);

// allocate some particles
Particle* p0 = ParticleArray.alloc();
Particle* p1 = ParticleArray.alloc();
Particle* p2 = ParticleArray.alloc();

// process them... - remove any that are dead!
for(FreeList<Particle>::iterator it = ParticleArray.begin();it<ParticleArray.end(); /*nothing*/ )
{
if(it->life_remaining<0)
{
ParticleArray.free(it);
}
else
++it;
}



The advantage of vertex arrays in this situation is pretty simple ;)


glVertexPointer(3, GL_FLOAT, sizeof(Particle), &ParticleArray.begin()->x );
glColorPointer(3, GL_FLOAT, sizeof(Particle), &ParticleArray.begin()->r );



Things bugger up somewhat if you wish to use billboarded quads, however the simplest way around that is to send point data from the particle array to the GPU, and use a geometry shader to generate the actual polygons...

Share this post


Link to post
Share on other sites

Create an account or sign in to comment

You need to be a member in order to leave a comment

Create an account

Sign up for a new account in our community. It's easy!

Register a new account

Sign in

Already have an account? Sign in here.

Sign In Now

Sign in to follow this