The pixel resolution has a minor impact on the perfromance. One could run this program at 60 fps with a resolution of 1024x768, the quality however would be really bad. This for example runs at 60 fps (without hole filling):
[attachment=29510:60fps.PNG]
I also forgot to mention that my grafics card is kind of old. I use a Nvidia GeForce GTX 550Ti, which has a theoretical performance of 690 gflops (https://en.wikipedia.org/wiki/GeForce_500_series), 10x more than my cpu. It could outperform my cpu on a tree traversal algorithm.
Anyway, here's the whole shader code:
#define B_0000 0
#define B_0001 1
#define B_0010 2
#define B_0011 3
#define B_0100 4
#define B_0101 5
#define B_0110 6
#define B_0111 7
#define LAST 0x80000000
#define MAXDEPTH 2
#define MAXDEPTH2 3
#define CALLS 256
#define MINPIX 2048
cbuffer matrixBuff : register( b0 )
{
uint size : packoffset(c0); //buffer size per thread
uint pxsize : packoffset(c0.y); //pixel size
uint2 puffer : packoffset(c0.z);
};
struct BFSDataINT
{
int4 ab; // quadtree projection points
int4 dadb; // change of the projection
uint pointer; //pointer to the octree node
};
StructuredBuffer<uint> octree : register(t0);
StructuredBuffer<BFSDataINT> intrace : register(t1);
StructuredBuffer<uint> incount : register(t2);
RWTexture2D<float4> BufferOut : register(u0);
RWStructuredBuffer<BFSDataINT> outtrace : register(u1);
RWStructuredBuffer<uint> outcount : register(u2);
/*
Decode morton code from
https://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/
*/
uint Compact1By1(uint x)
{
x &= 0x55555555; // x = -f-e -d-c -b-a -9-8 -7-6 -5-4 -3-2 -1-0
x = (x ^ (x >> 1)) & 0x33333333; // x = --fe --dc --ba --98 --76 --54 --32 --10
x = (x ^ (x >> 2)) & 0x0f0f0f0f; // x = ---- fedc ---- ba98 ---- 7654 ---- 3210
x = (x ^ (x >> 4)) & 0x00ff00ff; // x = ---- ---- fedc ba98 ---- ---- 7654 3210
x = (x ^ (x >> 8)) & 0x0000ffff; // x = ---- ---- ---- ---- fedc ba98 7654 3210
return x;
}
uint DecodeMorton2X(uint code)
{
return Compact1By1(code >> 0);
}
uint DecodeMorton2Y(uint code)
{
return Compact1By1(code >> 1);
}
uint2 DecodeMorton(uint code)
{
return uint2(DecodeMorton2X(code),
DecodeMorton2Y(code));
}
bool hasChildren(int ptr)
{
return octree[ptr] & LAST;
}
uint getChild(int ptr, int now)
{
if (now != 0)
return ptr + octree[ptr + now];
else
return ptr + 8;
}
bool isFilled(int ptr)
{
return !(octree[ptr] & 0x2);
}
float4 getColor(int ptr)
{
uint val = octree[ptr] >> 8;
return float4((val & 0xFF) / (float)0xFF, ((val >> 8) & 0xFF) / (float)0xFF, ((val >> 16) & 0x7F) / (float)0x7F, 1.0f);
}
[numthreads(CALLS, 1, 1)]
void CSPASS( uint3 GID : SV_GroupID, uint3 TID : SV_GroupThreadID )
{
const int I1 = 0x10000;
const int I2 = 0x20000;
const uint indices[8] = { B_0000, B_0001, B_0010, B_0100, B_0011, B_0101, B_0110, B_0111 };
uint resultcount = 0;
uint idx = GID.x * CALLS + TID.x;
for (uint i = 0; i < incount[idx / 4]; ++i)
{
//Prepare
int offset = i + size * (idx / 4);
int startptr = intrace[offset].pointer;
BFSDataINT _strace = intrace[offset];
BFSDataINT strace;
strace.ab =
int4( (idx & 1) == 0 ? _strace.ab.x : (_strace.ab.x + _strace.ab.z) / 2,
(idx & 2) == 0 ? _strace.ab.y : (_strace.ab.y + _strace.ab.w) / 2,
(idx & 1) == 0 ? (_strace.ab.x + _strace.ab.z) / 2 : _strace.ab.z,
(idx & 2) == 0 ? (_strace.ab.y + _strace.ab.w) / 2 : _strace.ab.w);
strace.dadb =
int4( (idx & 1) == 0 ? _strace.dadb.x : (_strace.dadb.x + _strace.dadb.z) / 2,
(idx & 2) == 0 ? _strace.dadb.y : (_strace.dadb.y + _strace.dadb.w) / 2,
(idx & 1) == 0 ? (_strace.dadb.x + _strace.dadb.z) / 2 : _strace.dadb.z,
(idx & 2) == 0 ? (_strace.dadb.y + _strace.dadb.w) / 2 : _strace.dadb.w);
strace.pointer = startptr;
if (strace.ab.z - strace.ab.x > I2)
{
//must split
int resultoffset = resultcount + (size / 4) * idx;
outtrace[resultoffset] = strace;
++resultcount;
if (resultcount == size / 4)
{
outcount[idx] = resultcount;
if(pxsize < MINPIX)
{
uint2 code = DecodeMorton(idx) * pxsize;
for(uint _y = 0; _y < pxsize; ++_y)
for(uint _x = 0; _x < pxsize; ++_x)
{
BufferOut[code + uint2(_x, _y)] = getColor(intrace[size * (idx / 4)].pointer);
}
}
return;
}
}
else
{
//Start trace
int depth = 0;
uint _now[MAXDEPTH + 1]; //0...7
BFSDataINT trace[MAXDEPTH + 1];
bool haschildren[MAXDEPTH + 1];
for (int j = 0; j < MAXDEPTH + 1; ++j)
{
_now[j] = 0;
}
trace[depth] = strace;
haschildren[depth] = hasChildren(trace[depth].pointer);
while (depth >= 0)
{
if (depth < MAXDEPTH)
{
if (haschildren[depth])
{
uint itrace = indices[_now[depth]];
BFSDataINT nexttrace;
nexttrace.pointer = getChild(trace[depth].pointer, itrace);
haschildren[depth + 1] = hasChildren(nexttrace.pointer);
if(haschildren[depth + 1] || isFilled(nexttrace.pointer))
{
BFSDataINT nowtrace = trace[depth];
//compute next projection
nexttrace.ab = (itrace & 0x1 ? nowtrace.ab : nowtrace.ab - strace.dadb) * 2
+ int4((itrace & 0x2) ? -I1 : I1,
(itrace & 0x4) ? -I1 : I1,
(itrace & 0x2) ? -I1 : I1,
(itrace & 0x4) ? -I1 : I1);
//traverse
++_now[depth];
//if octree is inside projection
if (nexttrace.ab.z - nexttrace.ab.x > 0 && nexttrace.ab.z > -I1 && nexttrace.ab.x - strace.dadb.x * 2 <= I1
&& nexttrace.ab.w > -I1 && nexttrace.ab.y - strace.dadb.y * 2 <= I1)
{
if (nexttrace.ab.z - nexttrace.ab.x <= I2)
{
nexttrace.dadb = int4(0, 0, 0, 0);
//push stack
_now[depth + 1] = 0;
trace[depth + 1] = nexttrace;
++depth;
}
else
{
nexttrace.dadb = strace.dadb;
//must split => return result to next pass
int resultoffset = resultcount + (size / 4) * idx;
outtrace[resultoffset] = nexttrace;
++resultcount;
if (resultcount == size / 4)
{
outcount[idx] = resultcount;
if(pxsize < MINPIX)
{
uint2 code = DecodeMorton(idx) * pxsize;
for(uint _y = 0; _y < pxsize; ++_y)
for(uint _x = 0; _x < pxsize; ++_x)
{
BufferOut[code + uint2(_x, _y)] = getColor(intrace[size * (idx / 4)].pointer);
}
}
return;
}
}
}
}
else
++_now[depth];
}
else if (isFilled(trace[depth].pointer))
{
trace[depth].dadb = strace.dadb;
//add to result
int resultoffset = resultcount + (size / 4) * idx;
outtrace[resultoffset] = trace[depth];
++resultcount;
if (resultcount == size / 4)
{
outcount[idx] = resultcount;
if(pxsize < MINPIX)
{
uint2 code = DecodeMorton(idx) * pxsize;
for(uint _y = 0; _y < pxsize; ++_y)
for(uint _x = 0; _x < pxsize; ++_x)
{
BufferOut[code + uint2(_x, _y)] = getColor(intrace[size * (idx / 4)].pointer);
}
}
return;
}
//pop stack
--depth;
}
else
{
//pop stack
--depth;
}
}
else
{
trace[depth].dadb = strace.dadb;
//add to result
int resultoffset = resultcount + (size / 4) * idx;
outtrace[resultoffset] = trace[depth];
++resultcount;
if (resultcount == size / 4)
{
outcount[idx] = resultcount;
if(pxsize < MINPIX)
{
uint2 code = DecodeMorton(idx) * pxsize;
for(uint _y = 0; _y < pxsize; ++_y)
for(uint _x = 0; _x < pxsize; ++_x)
{
BufferOut[code + uint2(_x, _y)] = getColor(intrace[size * (idx / 4)].pointer);
}
}
return;
}
//pop stack
--depth;
}
while (_now[depth] == 8 && depth >= 0)
{
//end of traversion
if (depth == 0)
{
depth = -1;
break;
}
//pop stack
--depth;
}
}
}
}
outcount[idx] = resultcount;
}
[numthreads(CALLS, 1, 1)]
void CSLASTPASS( uint3 GID : SV_GroupID, uint3 TID : SV_GroupThreadID )
{
const int I1 = 0x10000;
const int I2 = 0x20000;
const uint indices[8] = { B_0000, B_0001, B_0010, B_0100, B_0011, B_0101, B_0110, B_0111 };
uint idx = GID.x * CALLS + TID.x;
uint gid = idx / (pxsize * pxsize);
uint pix = idx % (pxsize * pxsize);
uint2 local = uint2(pix % pxsize, pix / pxsize);
uint2 code = DecodeMorton(gid);
uint2 globa = code * pxsize;
float2 div = local / (float)pxsize;
float2 di2 = (local + uint2(1,1)) / (float)pxsize;
for (uint i = 0; i < incount[gid]; ++i)
{
//Prepare
int offset = i + size * gid;
int startptr = intrace[offset].pointer;
BFSDataINT _strace = intrace[offset];
BFSDataINT strace;
strace.ab =
int4(_strace.ab.x * (1.0f - div.x) + _strace.ab.z * div.x,
_strace.ab.y * (1.0f - div.y) + _strace.ab.w * div.y,
_strace.ab.x * (1.0f - di2.x) + _strace.ab.z * di2.x,
_strace.ab.y * (1.0f - di2.y) + _strace.ab.w * di2.y);
strace.dadb =
int4(_strace.dadb.x * (1.0f - div.x) + _strace.dadb.z * div.x,
_strace.dadb.y * (1.0f - div.y) + _strace.dadb.w * div.y,
_strace.dadb.x * (1.0f - di2.x) + _strace.dadb.z * di2.x,
_strace.dadb.y * (1.0f - di2.y) + _strace.dadb.w * di2.y);
strace.pointer = startptr;
if (strace.ab.z - strace.ab.x > I2)
{
BufferOut[globa + local] = getColor(startptr);
return;
}
else
{
//Start trace
int depth = 0;
uint _now[MAXDEPTH2 + 1]; //0...7
BFSDataINT trace[MAXDEPTH2 + 1];
bool haschildren[MAXDEPTH2 + 1];
for (int j = 0; j < MAXDEPTH + 1; ++j)
{
_now[j] = 0;
}
trace[depth] = strace;
haschildren[depth] = hasChildren(trace[depth].pointer);
while (depth >= 0)
{
if(depth < MAXDEPTH2)
{
if (haschildren[depth])
{
uint itrace = indices[_now[depth]];
BFSDataINT nexttrace;
nexttrace.pointer = getChild(trace[depth].pointer, itrace);
haschildren[depth + 1] = hasChildren(nexttrace.pointer);
if(haschildren[depth + 1] || isFilled(nexttrace.pointer))
{
BFSDataINT nowtrace = trace[depth];
//compute next projection
nexttrace.ab = (itrace & 0x1 ? nowtrace.ab : nowtrace.ab - strace.dadb) * 2
+ int4((itrace & 0x2) ? -I1 : I1,
(itrace & 0x4) ? -I1 : I1,
(itrace & 0x2) ? -I1 : I1,
(itrace & 0x4) ? -I1 : I1);
//traverse
++_now[depth];
//if octree is inside projection
if (nexttrace.ab.z - nexttrace.ab.x > 0 && nexttrace.ab.z > -I1 && nexttrace.ab.x - strace.dadb.x * 2 <= I1
&& nexttrace.ab.w > -I1 && nexttrace.ab.y - strace.dadb.y * 2 <= I1)
{
if (nexttrace.ab.z - nexttrace.ab.x <= I2)
{
nexttrace.dadb = int4(0, 0, 0, 0);
//push stack
_now[depth + 1] = 0;
trace[depth + 1] = nexttrace;
++depth;
}
else
{
BufferOut[globa + local] = getColor(nexttrace.pointer);
return;
}
}
}
else
++_now[depth];
}
else if (isFilled(trace[depth].pointer))
{
BufferOut[globa + local] = getColor(trace[depth].pointer);
return;
}
else
{
//pop stack
--depth;
}
}
else
{
BufferOut[globa + local] = getColor(trace[depth].pointer);
return;
}
while (_now[depth] == 8 && depth >= 0)
{
//end of traversion
if (depth == 0)
{
depth = -1;
break;
}
//pop stack
--depth;
}
}
}
}
}
This is the fixed point version. Just replace I1 with 1.0f, and I2 with 2.0f and every int4 with float4 to get the floating point one. The main problem with every tree traversal algorithm is that one has to rely heavily on branching, which is painfully slow on the gpu, because of their SIMD architecture.
If there are questions regarding the code, just ask.