If all you need to do is render a cube, then you don't need an IB or VB at all:
14 tristrip cube in the vertex shader.
b = 1 << i;
x = (0x287a & b) != 0
y = (0x02af & b) != 0
z = (0x31e3 & b) != 0
Where i is SV_VertexID (add a modulo operation to i to draw multiple cubes in the same draw).
Then set the position, rotation and scale by pulling the transform from a constant buffer or an SRV indexed by SV_VertexID / 14.
No index buffer, no vertex buffer, no GS, and no instancing. Only raw ALU and one SRV with each cube's transform.
The magic numbers above will connect separate cubes to each other when several cubes are drawn in one strip. In case someone needs it, I worked out the numbers that attach a degenerate triangle at the beginning and end of each cube, so that the part of the strip connecting two cubes is just a line. Here are the numbers:
16 tristrip cube in the vertex shader.
b = 1 << i;
x = (0xd0f4 & b) != 0
y = (0x055f & b) != 0
z = (0xe3c7 & b) != 0
However, when I benchmarked it, it was actually slightly slower than instancing!
I am so confused...
In case anyone is wondering, here is the shader code for both the instanced and the non-instanced version:
//=======================================================================
// This version uses instancing
//=======================================================================
#include "TSDFVolume.inl"
#include "TSDFVolume.hlsli"
#include "CalibData.inl"
// 3D integer texture bound to register t1. main() below reads one texel per
// instance and treats a non-zero value as "render block occupied".
Texture3D<int> tex_srvRenderBlockVol : register(t1);
// Vertex shader, instanced path: one instance per render block.
//
// uInstanceID - instance index; passed to MakeU3Idx (declared elsewhere,
//               presumably turns the linear index into a 3D block index).
// f4Pos       - cube vertex from the vertex buffer; 0.5 is added to xyz
//               before scaling (assumes a cube centred on the origin with
//               coordinates in [-0.5, 0.5] - TODO confirm against the VB).
// f4ProjPos   - output position; stays (0, 0, 0, 1) for unoccupied blocks,
//               so all of their vertices coincide.
// f2Depths    - output (length, -length) of the view-space vertex vector;
//               stays (0, 0) for unoccupied blocks.
void main(uint uInstanceID : SV_InstanceID, in float4 f4Pos : POSITION,
    out float4 f4ProjPos : SV_POSITION, out float2 f2Depths : NORMAL0)
{
    // Outputs for the "nothing to draw" case; overwritten below when the
    // block is occupied.
    f2Depths = float2(0.f, 0.f);
    f4ProjPos = float4(0.f, 0.f, 0.f, 1.f);

    const uint3 u3Block = MakeU3Idx(uInstanceID,
        vParam.u3VoxelReso / vParam.uVoxelRenderBlockRatio);

    // Skip unoccupied blocks.
    if (tex_srvRenderBlockVol[u3Block] == 0) {
        return;
    }

    // Position of the block's minimum corner: block index scaled to world
    // units, shifted by half of the volume extent.
    const float3 f3BlockCorner =
        u3Block * vParam.uVoxelRenderBlockRatio * vParam.fVoxelSize;
    const float3 f3HalfVolume =
        (vParam.u3VoxelReso >> 1) * vParam.fVoxelSize;
    const float3 f3BlockOrigin = f3BlockCorner - f3HalfVolume;

    // Scale the cube vertex to the block size and move it to the block.
    // The w component of the incoming vertex is kept as is.
    float4 f4WorldPos = f4Pos;
    f4WorldPos.xyz = (f4Pos.xyz + 0.5f) * vParam.fVoxelSize *
        vParam.uVoxelRenderBlockRatio + f3BlockOrigin;

#if FOR_VCAMERA
    f4ProjPos = mul(mProjView, f4WorldPos);
    float fRayLen = length(mul(mView, f4WorldPos).xyz);
#endif // FOR_VCAMERA
#if FOR_SENSOR
    // Manual projection: divide by -z, apply DEPTH_F / DEPTH_C (presumably
    // focal length and principal point from the calibration data), then
    // remap relative to half of the depth resolution.
    float4 f4SensorPos = mul(mDepthView, f4WorldPos);
    float fDepth = -f4SensorPos.z;
    float2 f2HalfRes = i2DepthReso >> 1;
    float2 f2Ndc = (f4SensorPos.xy / fDepth * DEPTH_F + DEPTH_C
        - f2HalfRes) / f2HalfRes;
    f4ProjPos = float4(f2Ndc, 1.f, 1.f);
    float fRayLen = length(f4SensorPos.xyz);
#endif // FOR_SENSOR
    f2Depths = float2(fRayLen, -fRayLen);
}
//=======================================================================
// This version does not use instancing (no VB/IB)
//=======================================================================
#include "TSDFVolume.inl"
#include "TSDFVolume.hlsli"
#include "CalibData.inl"
// Number of strip vertices generated per cube. The 16-vertex variant is the
// 14-vertex cube strip with its first and last vertex duplicated, which adds
// a degenerate triangle at each end of every cube.
#define TRISTRIPSIZE 16
// Corner lookup masks: bit i of MAGICFORX / MAGICFORY / MAGICFORZ is the
// x / y / z coordinate (0 or 1) of strip vertex i. main() selects the bit
// with 1 << (SV_VertexID % TRISTRIPSIZE).
#define MAGICFORX 0xd0f4
#define MAGICFORY 0x055f
#define MAGICFORZ 0xe3c7
// Alternative: plain 14-vertex cube strip without the duplicated end
// vertices (the 16-vertex masks above are these shifted left by one bit,
// with bit 0 and bit 15 repeating the first and last vertex).
//#define TRISTRIPSIZE 14
//#define MAGICFORX 0x287a
//#define MAGICFORY 0x02af
//#define MAGICFORZ 0x31e3
// 3D integer texture bound to register t1. main() below indexes it with the
// block index derived from SV_VertexID / TRISTRIPSIZE and treats a non-zero
// value as "render block occupied".
Texture3D<int> tex_srvRenderBlockVol : register(t1);
// Vertex shader, non-instanced path: cube corners are generated from
// SV_VertexID alone, TRISTRIPSIZE consecutive vertices per render block.
//
// uVertID   - vertex index; uVertID / TRISTRIPSIZE selects the block (via
//             MakeU3Idx, declared elsewhere) and uVertID % TRISTRIPSIZE
//             selects the strip vertex within the cube.
// f4ProjPos - output position; stays (0, 0, 0, 1) for unoccupied blocks,
//             so all of their vertices coincide.
// f2Depths  - output (length, -length) of the view-space vertex vector;
//             stays (0, 0) for unoccupied blocks.
void main(uint uVertID : SV_VertexID,
    out float4 f4ProjPos : SV_POSITION, out float2 f2Depths : NORMAL0)
{
    // Outputs for the "nothing to draw" case; overwritten below when the
    // block is occupied.
    f2Depths = float2(0.f, 0.f);
    f4ProjPos = float4(0.f, 0.f, 0.f, 1.f);

    const uint3 u3Block = MakeU3Idx(uVertID / TRISTRIPSIZE,
        vParam.u3VoxelReso / vParam.uVoxelRenderBlockRatio);

    // Skip unoccupied blocks.
    if (tex_srvRenderBlockVol[u3Block] == 0) {
        return;
    }

    // Decode the cube corner: bit (uVertID % TRISTRIPSIZE) of each magic
    // mask is that corner's coordinate (0 or 1) along x / y / z.
    const uint uStripVert = uVertID % TRISTRIPSIZE;
    const uint3 u3Corner =
        (uint3(MAGICFORX, MAGICFORY, MAGICFORZ) >> uStripVert) & 1u;
    const float3 f3Corner = u3Corner;

    // Position of the block's minimum corner: block index scaled to world
    // units, shifted by half of the volume extent.
    const float3 f3BlockCorner =
        u3Block * vParam.uVoxelRenderBlockRatio * vParam.fVoxelSize;
    const float3 f3HalfVolume =
        (vParam.u3VoxelReso >> 1) * vParam.fVoxelSize;
    const float3 f3BlockOrigin = f3BlockCorner - f3HalfVolume;

    // Scale the unit-cube corner to the block size and move it to the block.
    const float4 f4WorldPos = float4(
        f3Corner * vParam.fVoxelSize * vParam.uVoxelRenderBlockRatio +
        f3BlockOrigin, 1.f);

#if FOR_VCAMERA
    f4ProjPos = mul(mProjView, f4WorldPos);
    float fRayLen = length(mul(mView, f4WorldPos).xyz);
#endif // FOR_VCAMERA
#if FOR_SENSOR
    // Manual projection: divide by -z, apply DEPTH_F / DEPTH_C (presumably
    // focal length and principal point from the calibration data), then
    // remap relative to half of the depth resolution.
    float4 f4SensorPos = mul(mDepthView, f4WorldPos);
    float fDepth = -f4SensorPos.z;
    float2 f2HalfRes = i2DepthReso >> 1;
    float2 f2Ndc = (f4SensorPos.xy / fDepth * DEPTH_F + DEPTH_C
        - f2HalfRes) / f2HalfRes;
    f4ProjPos = float4(f2Ndc, 1.f, 1.f);
    float fRayLen = length(f4SensorPos.xyz);
#endif // FOR_SENSOR
    f2Depths = float2(fRayLen, -fRayLen);
}
I do get the following D3D12 debug layer warning, though:
D3D12 WARNING: ID3D12CommandList::DrawInstanced: Vertex Buffer at the input vertex slot 0 is not big enough for what the Draw*() call expects to traverse. This is OK, as reading off the end of the Buffer is defined to return 0. However the developer probably did not intend to make use of this behavior. [ EXECUTION WARNING #210: COMMAND_LIST_DRAW_VERTEX_BUFFER_TOO_SMALL]
But I think it shouldn't affect performance... Any idea why this is slightly slower than instancing? (My test machine uses a GTX 680M, and it is slower for any number of cubes.)