Just thought I'd share a loop hoisting optimization technique I came up with the other day, which greatly simplifies loop hoisting in situations where you have many invariants you would like to hoist. For example if you're hoisting 6 invariants from a single inner loop, that's 64 possible combinations to deal with (assuming you want to handle all of them)... what's a lazy programmer to do??
It's highly unlikely nobody has stumbled across this approach before, I've just not personally come across this particular technique. It would have been handy to know about when I last worked on a particle system. Specifically a variant of this technique was used in said particle system, except the hoisting was done manually with monolithic and ugly switch statements (several pages long *shudder*).
I wasn't sure what performance this would squeeze out when used with modern optimizing compilers, certainly it will be heavily platform dependent - though I was quite surprised at how much a jump in performance I got in my own tests. It generally hovered around 5% improvement (worthwhile IMO considering little sacrifice in terms of functionality/readability), though I contrived an example where I got well over a 10% improvement in performance. This on a clean install of visual studio 2010, release mode (all default settings), on an Intel i5-2500K CPU. I'm interested to see what results others get, so I've pasted some code which people can use to test - but I'll leave people to use their own timing methodology, to see if anyone interested can come up with similar results (in case I've done something to bias results..).
So first off, for those unfamiliar with loop hoisting, it's just the following
void function(bool invariantCondition)
{
for(int i = 0; i < 100; ++i)
{
if(invariantCondition)
doSomething;
else
doSomethingElse;
}
versus
// hoist condition out
if(invariantCondition)
{
for(int i = 0; i < 100; ++i)
doSomething;
}
else
{
for(int i = 0; i < 100; ++i)
doSomethingElse;
}
All the following technique does is allow you to automatically generate a flag set parameter to pass into a function, which can be used to replace all the contained if statements. So say I have the following function (thrown together as a simple test case), where flags defines what properties of the particle that need to be updated. Ignore the use of globals and the fact you're going to get a pretty crap looking particle simulation, my aim was to keep it simple. Think of 'flags' as parameters artists have configured in the editor, defining the behavior of the particle system.
void UpdateParticles(unsigned int systemIndex, unsigned int flags)
{
const float dt = 1.0f / 60.0f;
const float accel[3] = {-1.0f, -9.8f, -1.0f};
const float NOISE_SCALE = 2.0f * dt;
VertexPCT* buffer = g_vertices[systemIndex];
for(int i = 0; i < PARTICLE_COUNT; ++i)
{
VertexPCT& vert = buffer[i];
if(flags & F_UPDATE_VEL)
{
vert.vel[0] += accel[0] * dt;
vert.vel[1] += accel[1] * dt;
vert.vel[2] += accel[2] * dt;
}
if(flags & F_VEL_NOISE)
{
vert.vel[0] += NOISE_SCALE * (float(rand()) / RAND_MAX);
vert.vel[1] += NOISE_SCALE * (float(rand()) / RAND_MAX);
vert.vel[2] += NOISE_SCALE * (float(rand()) / RAND_MAX);
}
vert.pos[0] += vert.vel[0] * dt;
vert.pos[1] += vert.vel[1] * dt;
vert.pos[2] += vert.vel[2] * dt;
if(flags & F_UPDATE_COLOR)
{
vert.color = 0xffffff00 | (i % 0xff);
}
if(flags & F_UPDATE_U)
{
vert.uv[0] = fmodf(vert.pos[0], 10.0f);
}
if(flags & F_UPDATE_V)
{
vert.uv[1] = fmodf(vert.pos[2], 10.0f);
}
if(flags & F_UPDATE_SCALE)
{
vert.scale[0] = std::min<float>(vert.scale[0] + dt, 1.0f);
vert.scale[1] = std::min<float>(vert.scale[1] + dt, 1.0f);
}
}
}
Above you can see there are six invariants, for the various things that we update in the loop. We can rewrite this method as:
template <int flags>
void UpdateParticles2(unsigned int systemIndex)
{
const float dt = 1.0f / 60.0f;
const float accel[3] = {-1.0f, -9.8f, -1.0f};
const float NOISE_SCALE = 2.0f * dt;
VertexPCT* buffer = g_vertices[systemIndex];
for(int i = 0; i < PARTICLE_COUNT; ++i)
{
VertexPCT& vert = buffer[i];
if(flags & F_UPDATE_VEL)
{
vert.vel[0] += accel[0] * dt;
vert.vel[1] += accel[1] * dt;
vert.vel[2] += accel[2] * dt;
}
if(flags & F_VEL_NOISE)
{
vert.vel[0] += NOISE_SCALE * (float(rand()) / RAND_MAX);
vert.vel[1] += NOISE_SCALE * (float(rand()) / RAND_MAX);
vert.vel[2] += NOISE_SCALE * (float(rand()) / RAND_MAX);
}
vert.pos[0] += vert.vel[0] * dt;
vert.pos[1] += vert.vel[1] * dt;
vert.pos[2] += vert.vel[2] * dt;
if(flags & F_UPDATE_COLOR)
{
vert.color = 0xffffff00 | (i % 0xff);
}
if(flags & F_UPDATE_U)
{
vert.uv[0] = fmodf(vert.pos[0], 10.0f);
}
if(flags & F_UPDATE_V)
{
vert.uv[1] = fmodf(vert.pos[2], 10.0f);
}
if(flags & F_UPDATE_SCALE)
{
vert.scale[0] = std::min<float>(vert.scale[0] + dt, 1.0f);
vert.scale[1] = std::min<float>(vert.scale[1] + dt, 1.0f);
}
}
}
Now the flags set is defined at compile time, effectively eliminating those branches (effectively hoisting them out) of the loop. By passing in flags as a template parameter, the conditionals become known at compile time, so it's no different from a set of if(true) or if(false) statements. How do we generate the template parameter 'flags' to pass in? A basic implementation would be a huge switch statement that invokes every required variant of UpdateParticles2, such as.
switch(flags)
{
case F_UPDATE_VEL:
UpdateParticles2<F_UPDATE_VEL>(systemIndex);
break;
case F_UPDATE_VEL | F_UPDATE_COLOR:
UpdateParticles2<F_UPDATE_VEL | F_UPDATE_COLOR>(systemIndex);
break;
..etc
Instead we can use the following template class to generate the flags for us (for any given number of flags)
template <int STATE_COUNT, int FLAGS>
class GenerateFlags
{
public:
static const int CHECK_BIT = (1 << (STATE_COUNT - 1));
static void Invoke(unsigned int systemIndex, unsigned int flags)
{
if(flags & CHECK_BIT)
{
GenerateFlags<STATE_COUNT - 1, FLAGS | CHECK_BIT>::Invoke(systemIndex, flags);
}
else
{
GenerateFlags<STATE_COUNT - 1, FLAGS>::Invoke(systemIndex, flags);
}
}
};
template <int FLAGS>
class GenerateFlags<0, FLAGS>
{
public:
static void Invoke(unsigned int systemIndex, unsigned int flags)
{
assert(flags == FLAGS);
UpdateParticles2<FLAGS>(systemIndex);
}
};
and call
GenerateFlags<FLAG_COUNT, 0>::Invoke(flags);
Where FLAG_COUNT is the maximum number of bit flags, in this case, 6. The compiler will now generate 64 specializations of the UpdateParticles2 method, each with the invariants as constant conditionals. As such if you have the relevant compile warnings enabled, you'll get a heap of warnings such as this: 'warning C4127: conditional expression is constant' .
In terms of performance you have to be mindful that you could easily wreak havoc with the instruction cache by bloating the code footprint (lowering performance), but the nice thing is you can easily test functions to see if there is any performance benefit.
Anyway that's about it, interested to hear feedback from others. I've attached the remainder of code that will allow the above to compile, for people who want to test the performance themself. For reference, the non hoisted version (single call to TestFunction) took a pretty consistent 137ms in my tests.
#include <conio.h>
#include <windows.h>
#include <math.h>
#include <numeric>
#include <assert.h>
void TestFunction(unsigned int flags);
void TestFunction2(unsigned int flags);
void UpdateParticles(unsigned int systemIndex, unsigned int flags);
template <int flags>
void UpdateParticles2(unsigned int systemIndex);
enum UpdateFlags
{
F_UPDATE_VEL = (1 << 0),
F_UPDATE_COLOR = (1 << 1),
F_UPDATE_U = (1 << 2),
F_UPDATE_V = (1 << 3),
F_UPDATE_SCALE = (1 << 4),
F_VEL_NOISE = (1 << 5),
FLAG_COUNT = 6,
F_UPDATE_ALL = 0xffffffff,
};
static const int NUM_ITERATIONS = 8;
static const int NUM_PARTICLE_SYSTEMS = 50;
static const int PARTICLE_COUNT = 50000;
struct VertexPCT
{
float pos[3];
float vel[3];
unsigned int color;
float uv[2];
float scale[2];
};
VertexPCT** g_vertices = 0;
//#pragma warning(1: 4127)
int _tmain(int argc, _TCHAR* argv[])
{
// this particular combination of flags demonstrates significant speedup when using loop hoisting
unsigned int flags = F_UPDATE_COLOR | F_UPDATE_VEL;
if(argc == 4)
{
// just in case the compiler magically optimizes our non hoisted version
// using the knowledge 'flags' is in fact constant (as it happens the compiler does, at least a little..)
flags |= 0xff000000;
}
// initialise our particle system memory once off
g_vertices = new VertexPCT*[NUM_PARTICLE_SYSTEMS];
for(int i = 0; i < NUM_PARTICLE_SYSTEMS; ++i)
{
g_vertices[i] = new VertexPCT[PARTICLE_COUNT];
memset(g_vertices[i], 0, sizeof(VertexPCT) * PARTICLE_COUNT);
}
///////////////////////////////
// [timer code here, for timing TestFunction and TestFunction2]
///////////////////////////////
return 0;
}
void TestFunction(unsigned int flags)
{
const unsigned int mask = (1 << FLAG_COUNT) - 1;
for(int i = 0; i < NUM_ITERATIONS; ++i)
{
for(int j = 0; j < NUM_PARTICLE_SYSTEMS; ++j)
{
UpdateParticles(j, flags & mask);
}
}
}
void TestFunction2(unsigned int flags)
{
const unsigned int mask = (1 << FLAG_COUNT) - 1;
for(int i = 0; i < NUM_ITERATIONS; ++i)
{
for(int j = 0; j < NUM_PARTICLE_SYSTEMS; ++j)
{
GenerateFlags<FLAG_COUNT, 0>::Invoke(j, flags & mask);
}
}
}



















