Jump to content

  • Log In with Google      Sign In   
  • Create Account

_undex

Member Since 18 Oct 2012
Offline Last Active May 29 2016 01:23 PM

#5260113 Performance of an experimental SVO raycaster

Posted by on 02 November 2015 - 08:18 AM

The pixel resolution has a minor impact on the perfromance. One could run this program at 60 fps with a resolution of 1024x768, the quality however would be really bad. This for example runs at 60 fps (without hole filling):

 

60fps.PNG

 

I also forgot to mention that my grafics card is kind of old. I use a Nvidia GeForce GTX 550Ti, which has a theoretical performance of 690 gflops (https://en.wikipedia.org/wiki/GeForce_500_series), 10x more than my cpu. It could outperform my cpu on a tree traversal algorithm.

 

Anyway, here's the whole shader code:

#define B_0000  0
#define B_0001  1
#define B_0010  2
#define B_0011  3
#define B_0100  4
#define B_0101  5
#define B_0110  6
#define B_0111  7
#define LAST 0x80000000

#define MAXDEPTH 2
#define MAXDEPTH2 3
#define CALLS 256

#define MINPIX 2048

cbuffer matrixBuff : register( b0 )
{
	uint size : packoffset(c0); //buffer size per thread
	uint pxsize : packoffset(c0.y); //pixel size
	uint2 puffer : packoffset(c0.z);
};

struct BFSDataINT
{
	int4 ab; // quadtree projection points
	int4 dadb; // change of the projection
	uint pointer; //pointer to the octree node
};

StructuredBuffer<uint> octree : register(t0);
StructuredBuffer<BFSDataINT> intrace : register(t1);
StructuredBuffer<uint> incount : register(t2);

RWTexture2D<float4> BufferOut : register(u0);
RWStructuredBuffer<BFSDataINT> outtrace : register(u1);
RWStructuredBuffer<uint> outcount : register(u2);

/*
Decode morton code from
https://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/
*/
uint Compact1By1(uint x)
{
  x &= 0x55555555;                  // x = -f-e -d-c -b-a -9-8 -7-6 -5-4 -3-2 -1-0
  x = (x ^ (x >>  1)) & 0x33333333; // x = --fe --dc --ba --98 --76 --54 --32 --10
  x = (x ^ (x >>  2)) & 0x0f0f0f0f; // x = ---- fedc ---- ba98 ---- 7654 ---- 3210
  x = (x ^ (x >>  4)) & 0x00ff00ff; // x = ---- ---- fedc ba98 ---- ---- 7654 3210
  x = (x ^ (x >>  8)) & 0x0000ffff; // x = ---- ---- ---- ---- fedc ba98 7654 3210
  return x;
}

uint DecodeMorton2X(uint code)
{
  return Compact1By1(code >> 0);
}

uint DecodeMorton2Y(uint code)
{
  return Compact1By1(code >> 1);
}

uint2 DecodeMorton(uint code)
{
	return uint2(DecodeMorton2X(code),
				DecodeMorton2Y(code));
}

bool hasChildren(int ptr)
{
	return octree[ptr] & LAST;
}

uint getChild(int ptr, int now)
{
	if (now != 0)
		return ptr + octree[ptr + now];
	else
		return ptr + 8;
}

bool isFilled(int ptr)
{
	return !(octree[ptr] & 0x2);
}

float4 getColor(int ptr)
{
	uint val = octree[ptr] >> 8;
	return float4((val & 0xFF) / (float)0xFF, ((val >> 8) & 0xFF) / (float)0xFF, ((val >> 16) & 0x7F) / (float)0x7F, 1.0f);
}

[numthreads(CALLS, 1, 1)]
void CSPASS( uint3 GID : SV_GroupID, uint3 TID : SV_GroupThreadID )
{
	const int I1 = 0x10000;
	const int I2 = 0x20000;
	const uint indices[8] = { B_0000, B_0001, B_0010, B_0100, B_0011, B_0101, B_0110, B_0111 };
	uint resultcount = 0;
	uint idx = GID.x * CALLS + TID.x;
	
	for (uint i = 0; i < incount[idx / 4]; ++i)
	{
		//Prepare
		int offset = i + size * (idx / 4);
		int startptr = intrace[offset].pointer;
		BFSDataINT _strace = intrace[offset];
		BFSDataINT strace;

		strace.ab = 
		int4( (idx & 1) == 0 ? _strace.ab.x : (_strace.ab.x + _strace.ab.z) / 2,
				(idx & 2) == 0 ? _strace.ab.y : (_strace.ab.y + _strace.ab.w) / 2,
				(idx & 1) == 0 ? (_strace.ab.x + _strace.ab.z) / 2 : _strace.ab.z,
				(idx & 2) == 0 ? (_strace.ab.y + _strace.ab.w) / 2 : _strace.ab.w);

		strace.dadb = 
		int4( (idx & 1) == 0 ? _strace.dadb.x : (_strace.dadb.x + _strace.dadb.z) / 2,
				(idx & 2) == 0 ? _strace.dadb.y : (_strace.dadb.y + _strace.dadb.w) / 2,
				(idx & 1) == 0 ? (_strace.dadb.x + _strace.dadb.z) / 2 : _strace.dadb.z,
				(idx & 2) == 0 ? (_strace.dadb.y + _strace.dadb.w) / 2 : _strace.dadb.w);

		strace.pointer = startptr;

		if (strace.ab.z - strace.ab.x > I2)
		{
			//must split
			int resultoffset = resultcount + (size / 4) * idx;
			outtrace[resultoffset] = strace;
			++resultcount;
			if (resultcount == size / 4)
			{
				outcount[idx] = resultcount;
				if(pxsize < MINPIX)
				{
					uint2 code = DecodeMorton(idx) * pxsize;
					for(uint _y = 0; _y < pxsize; ++_y)
					for(uint _x = 0; _x < pxsize; ++_x)
					{
						BufferOut[code + uint2(_x, _y)] = getColor(intrace[size * (idx / 4)].pointer);
					}
				}
				return;
			}
		}
		else
		{

		//Start trace
		int depth = 0;
		uint _now[MAXDEPTH + 1]; //0...7
		BFSDataINT trace[MAXDEPTH + 1];
		bool haschildren[MAXDEPTH + 1];
		for (int j = 0; j < MAXDEPTH + 1; ++j)
		{
			_now[j] = 0;
		}
		trace[depth] = strace;
		haschildren[depth] = hasChildren(trace[depth].pointer);

		while (depth >= 0)
		{
			if (depth < MAXDEPTH)
			{
				if (haschildren[depth])
				{
					uint itrace = indices[_now[depth]];
					BFSDataINT nexttrace;
					nexttrace.pointer = getChild(trace[depth].pointer, itrace);
					haschildren[depth + 1] = hasChildren(nexttrace.pointer);
					if(haschildren[depth + 1] || isFilled(nexttrace.pointer))
					{
						BFSDataINT nowtrace = trace[depth];
                                                //compute next projection
						nexttrace.ab = (itrace & 0x1 ? nowtrace.ab : nowtrace.ab - strace.dadb) * 2
									+ int4((itrace & 0x2) ? -I1 : I1,
											(itrace & 0x4) ? -I1 : I1,
											(itrace & 0x2) ? -I1 : I1,
											(itrace & 0x4) ? -I1 : I1);
							
						//traverse
						++_now[depth];
                                                //if octree is inside projection
						if (nexttrace.ab.z - nexttrace.ab.x > 0 && nexttrace.ab.z > -I1 && nexttrace.ab.x - strace.dadb.x * 2 <= I1
							&& nexttrace.ab.w > -I1 && nexttrace.ab.y - strace.dadb.y * 2 <= I1)
						{
								
							if (nexttrace.ab.z - nexttrace.ab.x <= I2)
							{
								nexttrace.dadb = int4(0, 0, 0, 0);
								//push stack
								_now[depth + 1] = 0;
								trace[depth + 1] = nexttrace;
								++depth;
							}
							else
							{
								nexttrace.dadb = strace.dadb;
								//must split => return result to next pass
								int resultoffset = resultcount + (size / 4) * idx;
								outtrace[resultoffset] = nexttrace;
								++resultcount;
								if (resultcount == size / 4)
								{
									outcount[idx] = resultcount;
									if(pxsize < MINPIX)
									{
										uint2 code = DecodeMorton(idx) * pxsize;
										for(uint _y = 0; _y < pxsize; ++_y)
										for(uint _x = 0; _x < pxsize; ++_x)
										{
											BufferOut[code + uint2(_x, _y)] = getColor(intrace[size * (idx / 4)].pointer);
										}
									}
									return;
								}
							}
						}
					}
					else
						++_now[depth];
				}
				else if (isFilled(trace[depth].pointer))
				{
					trace[depth].dadb = strace.dadb;
					//add to result
					int resultoffset = resultcount + (size / 4) * idx;
					outtrace[resultoffset] = trace[depth];
					++resultcount;
					if (resultcount == size / 4)
					{
						outcount[idx] = resultcount;
						if(pxsize < MINPIX)
						{
							uint2 code = DecodeMorton(idx) * pxsize;
							for(uint _y = 0; _y < pxsize; ++_y)
							for(uint _x = 0; _x < pxsize; ++_x)
							{
								BufferOut[code + uint2(_x, _y)] = getColor(intrace[size * (idx / 4)].pointer);
							}
						}
						return;
					}
					//pop stack
					--depth;
				}
				else
				{
					//pop stack
					--depth;
				}
			}
			else
			{
				trace[depth].dadb = strace.dadb;
				//add to result
				int resultoffset = resultcount + (size / 4) * idx;
				outtrace[resultoffset] = trace[depth];
				++resultcount;
				if (resultcount == size / 4)
				{
					outcount[idx] = resultcount;
					if(pxsize < MINPIX)
					{
						uint2 code = DecodeMorton(idx) * pxsize;
						for(uint _y = 0; _y < pxsize; ++_y)
						for(uint _x = 0; _x < pxsize; ++_x)
						{
							BufferOut[code + uint2(_x, _y)] = getColor(intrace[size * (idx / 4)].pointer);
						}
					}
					return;
				}
				//pop stack
				--depth;
			}
			while (_now[depth] == 8 && depth >= 0)
			{
				//end of traversion
				if (depth == 0)
				{
					depth = -1;
					break;
				}
				//pop stack
				--depth;
			}
		}

		}
	}
	outcount[idx] = resultcount;
}


[numthreads(CALLS, 1, 1)]
void CSLASTPASS( uint3 GID : SV_GroupID, uint3 TID : SV_GroupThreadID )
{
	const int I1 = 0x10000;
	const int I2 = 0x20000;
	const uint indices[8] = { B_0000, B_0001, B_0010, B_0100, B_0011, B_0101, B_0110, B_0111 };
	uint idx = GID.x * CALLS + TID.x;
	uint gid = idx / (pxsize * pxsize);
	uint pix = idx % (pxsize * pxsize);
	uint2 local = uint2(pix % pxsize, pix / pxsize);
	uint2 code = DecodeMorton(gid);
	uint2 globa = code * pxsize;
	float2 div = local / (float)pxsize;
	float2 di2 = (local + uint2(1,1)) / (float)pxsize;
	for (uint i = 0; i < incount[gid]; ++i)
	{
		//Prepare
		int offset = i + size * gid;
		int startptr = intrace[offset].pointer;
		BFSDataINT _strace = intrace[offset];
		BFSDataINT strace;

		strace.ab = 
		int4(_strace.ab.x * (1.0f - div.x) + _strace.ab.z * div.x,
				_strace.ab.y * (1.0f - div.y) + _strace.ab.w * div.y,
				_strace.ab.x * (1.0f - di2.x) + _strace.ab.z * di2.x,
				_strace.ab.y * (1.0f - di2.y) + _strace.ab.w * di2.y);

		strace.dadb = 
		int4(_strace.dadb.x * (1.0f - div.x) + _strace.dadb.z * div.x,
				_strace.dadb.y * (1.0f - div.y) + _strace.dadb.w * div.y,
				_strace.dadb.x * (1.0f - di2.x) + _strace.dadb.z * di2.x,
				_strace.dadb.y * (1.0f - di2.y) + _strace.dadb.w * di2.y);

		strace.pointer = startptr;
		if (strace.ab.z - strace.ab.x > I2)
		{
			BufferOut[globa + local] = getColor(startptr);
			return;
		}
		else
		{
			//Start trace
			int depth = 0;
			uint _now[MAXDEPTH2 + 1]; //0...7
			BFSDataINT trace[MAXDEPTH2 + 1];
			bool haschildren[MAXDEPTH2 + 1];
			for (int j = 0; j < MAXDEPTH + 1; ++j)
			{
				_now[j] = 0;
			}
			trace[depth] = strace;
			haschildren[depth] = hasChildren(trace[depth].pointer);
			while (depth >= 0)
			{
				if(depth < MAXDEPTH2)
				{
					if (haschildren[depth])
					{
						uint itrace = indices[_now[depth]];
						BFSDataINT nexttrace;
						nexttrace.pointer = getChild(trace[depth].pointer, itrace);
						haschildren[depth + 1] = hasChildren(nexttrace.pointer);
						if(haschildren[depth + 1] || isFilled(nexttrace.pointer))
						{
							BFSDataINT nowtrace = trace[depth];
                                                        //compute next projection
							nexttrace.ab = (itrace & 0x1 ? nowtrace.ab : nowtrace.ab - strace.dadb) * 2
									+ int4((itrace & 0x2) ? -I1 : I1,
											(itrace & 0x4) ? -I1 : I1,
											(itrace & 0x2) ? -I1 : I1,
											(itrace & 0x4) ? -I1 : I1);
							//traverse
							++_now[depth];
                                                        //if octree is inside projection
							if (nexttrace.ab.z - nexttrace.ab.x > 0 && nexttrace.ab.z > -I1 && nexttrace.ab.x - strace.dadb.x * 2 <= I1
								&& nexttrace.ab.w > -I1 && nexttrace.ab.y - strace.dadb.y * 2 <= I1)
							{
								if (nexttrace.ab.z - nexttrace.ab.x <= I2)
								{
									nexttrace.dadb = int4(0, 0, 0, 0);
									//push stack
									_now[depth + 1] = 0;
									trace[depth + 1] = nexttrace;
									++depth;
								}
								else
								{
									BufferOut[globa + local] = getColor(nexttrace.pointer);
									return;
								}
							}
						}
						else
							++_now[depth];
					}
					else if (isFilled(trace[depth].pointer))
					{
						BufferOut[globa + local] = getColor(trace[depth].pointer);
						return;
					}
					else
					{
						//pop stack
						--depth;
					}
				}
				else
				{
					BufferOut[globa + local] = getColor(trace[depth].pointer);
					return;
				}
				while (_now[depth] == 8 && depth >= 0)
				{
					//end of traversion
					if (depth == 0)
					{
						depth = -1;
						break;
					}
					//pop stack
					--depth;
				}
			}
		}
	}
}

This is the fixed point version. Just replace I1 with 1.0f, and I2 with 2.0f and every int4 with float4 to get the floating point one. The main problem with every tree traversal algorithm is that one has to rely heavily on branching, which is painfully slow on the gpu, because of their SIMD architecture.

 

If there are questions regarding the code, just ask.




#5260030 Performance of an experimental SVO raycaster

Posted by on 01 November 2015 - 03:10 PM

I've implemented both solutions (by reducing the stack to 3 and drawing the whole quadtree node if an overflow occurs).

 

Here's a picture of the church without these at a resolution of 512x512:

 

20fps.PNG

Here's one with both active:

 

16fps.PNG

Both run at about 20 fps. One could get a better performance by reducing the size of the buffer, but this would greatly reduce the quality even more. The same is true for increasing the voxel density (it's now at 2^12 per octree side).

 

I also found out that a fixed point implementation of bcmpinc's (old) algorithm runs on the gpu at the same speed as a floating point one. I find it a bit strange as I thought the gpu is optimized for floating point operations.

I would also like to say that I'm not really a gpu wizard (like these people here: https://docs.nvidia.com/cuda/samples/6_Advanced/reduction/doc/reduction.pdf). I'm just doing this as a hobby. So it's quite possible that there're a dozen of optimizations one can make with the code. So if someone wants too see and/or optimize my code I could upload it here or somewhere else.

 

@bcmpinc Theres something strange about the point data of the model. The coordinates of them aren't normalized, there're all between 30800 and 34740. So an octree depth of 13 and greater doesn't make sense because of that precision loss.




#5253367 Performance of an experimental SVO raycaster

Posted by on 21 September 2015 - 05:24 PM

Hi folks
 
I've made a GPU-raycaster based of bcmpinc's algorithm: https://bcmpinc.wordpress.com/. In this pdf, he/she descibes how it works: https://app.box.com/s/rxvvymcz4nfygvs6fz7a.
 
 
I used HLSL and DirectCompute for the computation. So here is what I've come up with:
 

#define I1 1.0f

//trace order indices:
//the algorithm begins with the voxel 000
//than it looks at the voxels 010,001 or 100,001, depending on the direction of the ray
//etc
const uint start[5] = { 0, 1, 3, 5, 6 };
const uint indices1[6] = { B_0000, B_0100, B_0001, B_0101, B_0110, B_0111 };
const uint indices2[6] = { B_0000, B_0010, B_0001, B_0011, B_0110, B_0111 };

...

//_n is a stack of pointers to the visited voxels
//_x is a float2 stack, which stores the ray plane intersections. 
//the voxel planes are parallel to the current cubemap side
//_now is a stack of uint, that indicate which voxels are already traversed by the algorithm
//dir is a float2 that stores the direction of the ray
iter = 0;
while (depth != -1)
{
	++iter;
	//safety first
	if(iter == 200)
	{
		return float4(0.0f,0.0f,1.0f,1.0f);
	}
	if (isLeaf(_n[depth]))
	{
		return float4(iter/64.0f, 0, iter > 100 ? 1.0f : 0.0f, 1);
	}
        if (_now[depth] == 4)
	{
		//pop stack
		--depth;
	}
	else
	{
		//goto all next voxels
		bool found = false;
		for (uint i = start[_now[depth]]; i < start[_now[depth] + 1]; ++i)
		{
			//get index of the next voxel
			uint trace = _x[depth].x * dir.y < _x[depth].y * dir.x ? indices1[i] : indices2[i];						//get intersections with voxel
			_x[depth + 1] = ((trace & B_0001 ? _x[depth] : (_x[depth] - dir)) * 2.0f)
					+ float2((trace & B_0010) ? -I1 : I1, (trace & B_0100) ? -I1 : I1);
			//if ray intersects voxel
			if(_x[depth + 1].x >= -I1 && _x[depth + 1].y >= -I1 && _x[depth + 1].x - 2.0f * dir.x < I1 && _x[depth + 1].y - 2.0f * dir.y < I1)
			{
				//get pointer to next voxel
				uint ne = getChild(_n[depth], trace, flip, rot);
				if (!isNothing(ne))
				{
					//traverse to next set of voxels
					++(_now[depth]);
					//push stack
					++depth;
					_n[depth] = ne;
					//start at first voxel
					_now[depth] = 0;
					found = true;
					break;
				}
			}
		}
		if (!found)
		{
			//traverse to next set of voxels
			++(_now[depth]);
		}
	}
}
return float4(iter/64.0f, 0, iter > 100 ? 1.0f : 0.0f, 1);
 
This algorithm projects the octree on a cubemap. This cubemap will then get rendered to the screen.
 
I also use the original algorithm to raycast with lower resolution on the cpu and use the output as a starting point for the pixels on the GPU.
 
The problem is, even with a octree depth of 7, I get low framerates, especially if the camera is close to and facing the surface, because a lot of pixels are hitting voxels. Also pretracing with lower resolution doesn't really help. I get the same framerates with and without this acceleration structur. Is there some major optimization I forgotten? How do these guys https://www.youtube.com/watch?v=ca7xcP3-c18 https://www.youtube.com/watch?v=km0DpZUgvbg get such a good performance? In fact this raycaster should perform better as ordinary ones, as the algorithm I use only approximates cubes. From a far one shouldn't notice that.
 
Here are some pictures of the raycaster:
 
LHpEfoX.png
 
Here is a picture without optimization:
 
5QUSSqP.png
 
The amount of red is porportional to the number of iterations per pixel. If a pixel exceeds 100 iterations, it turn purple.
Here is a picture with the acceleration structur:
 
L4wrMWj.png



#5032566 CreatePixelShader access violation

Posted by on 15 February 2013 - 05:02 AM

I solved it. It was so dumb ^^
I forgot to set m_pixelShader to 0.
 




PARTNERS