Sign in to follow this  

MS HLSL Shader Compiler bugs

This topic is 2546 days old which is more than the 365 day threshold we allow for new replies. Please post a new topic.

If you intended to correct an error in the post then please contact us.

Recommended Posts

Hello !

I have to report a few bugs I have found in the Microsoft HLSL Shader Compiler 9.29.952.3111 (latest DX SDK, Aug 2010).

I am trying to implement a GPU-based shader that preprocesses a height field for ConeStep mapping. The code is complex for a pixel shader, but it is possible and fast. See the attached file.

BUGS:
1. Line 347. There is an infinite loop
for ( ; true; )
that breaks out when the recursive job is done. The compiler uses the REP instruction, which has a maximum iteration count of 255. So the loop is not infinite and the generated code is wrong.
The alternative NVidia CGC compiler uses the LOOP instruction with StepInc == 0, so the loop stays infinite.

2. The MS compiler can compile this only with a low optimization switch; the O3 optimization switch ends with the message "can't unroll loop that marked with 'loop'". Why does it try to unroll anything at all?

3. The compilation time with the O3 switch is ~40 minutes on a Core2Quad 3 GHz (and it ends with the message above). For comparison, the CGC compiler takes less than a second.

4. I tried to work around the finite loop by using 2 nested loops (uncomment line 345). This fails even in O1 optimization mode with the message "error X4505: maximum temp register index exceeded". I do not understand why the second loop requires more registers. I also cannot understand why the number of extra registers required is GREATER than the total number of registers used (a factor > 2, because the shader uses only 15 registers).

Thank you.

Mikhail
l-mik@yandex.ru


// Support file for the Cone Step texture preparation pass.
//
// Build switches (texture-size preset, choose one):
//   TSIZE8 | TSIZE9 | TSIZE10 | TSIZE11 | TSIZE12
// Only the TSIZE8 preset is implemented in this file; selecting any other
// preset is rejected below with an explicit #error.
//
// restore pass : YES

#define TSIZE8 // use 1<<8 texture size (see the preset below for the values actually in effect)
#define MAXTAN 1 // initial / maximum tangent value; the shader output is divided by it
#define FLATTEN // define if use pyramid placed in one level of texture, else true mips will be used
#define STEP_INC 8 // num directions per state; added to a stack entry's z each time it is revisited

// Debug counter helpers. Both expand to a complete statement INCLUDING the
// trailing semicolon, so call sites are written without one.
#define DBG_INC(a, b) a += b;
#define DBG_SET(a, b) a = b;

#ifdef TSIZE8
// TSIZE   : texture edge length in texels (128 and 256 are left as commented-out alternatives)
// NUMMIPS : number of pyramid levels; level NUMMIPS-1 is treated as a single texel
// MAXDEEP : capacity of the traversal stack
#define TSIZE 32//128//256
#define NUMMIPS 6//8//9
#define MAXDEEP 8
#else
// Without a preset TSIZE / NUMMIPS / MAXDEEP would be undefined and the
// stack helpers would compile to no-ops, so fail early with a clear message.
#error Only the TSIZE8 preset is implemented: define TSIZE8
#endif

// Effect textures bound by the application.
texture txtCap; // source for capSampler
texture txtH;   // source for hSampler

// Height-cap sampler: reads txtCap (a pyramid of 2x2 maxima, per the original note).
// Point filtering everywhere, clamped addressing.
sampler capSampler = sampler_state
{
    Texture     = <txtCap>;

    MinFilter   = POINT;
    MagFilter   = POINT;
    MipFilter   = POINT;

    AddressU    = CLAMP; // BORDER was the alternative tried here
    AddressV    = CLAMP; // BORDER was the alternative tried here
    BorderColor = 0;     // retained for the BORDER alternative above
};

// Height sampler: reads txtH with point filtering; lookups outside the texture
// return the border colour 0.
sampler hSampler = sampler_state
{
    Texture     = <txtH>;

    MinFilter   = POINT;
    MagFilter   = POINT;
    MipFilter   = POINT;

    AddressU    = BORDER;
    AddressV    = BORDER;
    BorderColor = 0;
};

// Vertex shader input.
struct VS_INPUT
{
    float2 Origin : POSITION;  // vertex position (x, y)
    float2 uv     : TEXCOORD0; // material UV : external UV
};

// Vertex shader output / pixel shader input.
struct VS_OUTPUT
{
    float4 Position : POSITION;  // vertex position
    float2 uv       : TEXCOORD0; // texture coordinate passed through from VS_INPUT
};


// Pass-through vertex shader: the 2D input position is emitted unchanged with
// z = 0.5 and w = 1, and the UV is forwarded as-is.
VS_OUTPUT RenderVS( VS_INPUT Input )
{
    VS_OUTPUT result;

    result.uv       = Input.uv;
    result.Position = float4(Input.Origin, 0.5, 1);

    return result;
}

// Pixel shader output: a single colour target.
struct PS_OUTPUT
{
    float4 Color : COLOR;
};


// Stores `rect` in the slot of an 8-entry stack selected by `sp`, then
// increments `sp`.
//
// The slot is located with a binary tree of comparisons on `sp`, so every
// write to `stack` uses a literal index.
//
//   stack : 8-entry stack, updated in place
//   sp    : index of the first free slot on entry; one greater on return
//   rect  : value to store
void Push8(inout float4 stack[8], inout float sp, float4 rect)
{
    [branch]if (sp >= 4)
    {
        // upper half: slots 4..7
        [branch]if (sp >= 6)
        {
            // slots 6..7
            if (sp != 7)
            {
                stack[6] = rect;
            }
            else
            {
                stack[7] = rect;
            }
        }
        else
        {
            // slots 4..5
            if (sp != 5)
            {
                stack[4] = rect;
            }
            else
            {
                stack[5] = rect;
            }
        }
    }
    else
    {
        // lower half: slots 0..3
        [branch]if (sp >= 2)
        {
            // slots 2..3
            if (sp != 3)
            {
                stack[2] = rect;
            }
            else
            {
                stack[3] = rect;
            }
        }
        else
        {
            // slots 0..1
            if (sp != 1)
            {
                stack[0] = rect;
            }
            else
            {
                stack[1] = rect;
            }
        }
    }

    sp += 1;
}


// Size-independent push: forwards to the stack implementation that matches the
// active texture-size preset. Only the TSIZE8 preset has an implementation.
//
//   stack : traversal stack, updated in place
//   sp    : stack depth, incremented by the push
//   rect  : value to push
void Push(inout float4 stack[MAXDEEP], inout float sp, float4 rect)
{
#if defined(TSIZE8)
    Push8(stack, sp, rect);
#endif
}

// Returns a copy of the top entry of an 8-entry stack (slot sp-1) and then adds
// STEP_INC to the z component of the entry that stays on the stack. The
// returned copy still carries the z value from before the increment.
//
// As in Push8, the slot is found with a binary tree of comparisons on `sp`, so
// `stack` is only ever accessed with literal indices. `sp` itself is not
// modified here.
//
//   stack : 8-entry stack; the top entry's z is advanced in place
//   sp    : current stack depth (1..8)
float4 GetTopIncStage8(inout float4 stack[8], float sp)
{
    float4 top;

    [branch]if (sp >= 5)
    {
        // depth 5..8
        [branch]if (sp >= 7)
        {
            // depth 7..8
            if (sp != 8)
            {
                // depth 7
                top = stack[6];
                stack[6].z += STEP_INC;
            }
            else
            {
                // depth 8
                top = stack[7];
                stack[7].z += STEP_INC;
            }
        }
        else
        {
            // depth 5..6
            if (sp != 6)
            {
                // depth 5
                top = stack[4];
                stack[4].z += STEP_INC;
            }
            else
            {
                // depth 6
                top = stack[5];
                stack[5].z += STEP_INC;
            }
        }
    }
    else
    {
        // depth 1..4
        [branch]if (sp >= 3)
        {
            // depth 3..4
            if (sp != 4)
            {
                // depth 3
                top = stack[2];
                stack[2].z += STEP_INC;
            }
            else
            {
                // depth 4
                top = stack[3];
                stack[3].z += STEP_INC;
            }
        }
        else
        {
            // depth 1..2
            if (sp != 2)
            {
                // depth 1
                top = stack[0];
                stack[0].z += STEP_INC;
            }
            else
            {
                // depth 2
                top = stack[1];
                stack[1].z += STEP_INC;
            }
        }
    }

    return top;
}


// Takes the entry on top of the stack, picks the next child rect of it to
// visit, and returns that child as (x, y, lod).
//
// A stack entry is (x, y, selector + stage, lod). Each call advances the stage
// stored on the stack by STEP_INC (done inside GetTopIncStage8); the value read
// here is the one from before that increment:
//   z in [0, STEP_INC)            -> first remaining child, entry stays on the stack
//   z in [STEP_INC, 2*STEP_INC)   -> second remaining child, entry stays on the stack
//   z >= 2*STEP_INC               -> last remaining child, entry is removed (sp -= 1)
//
//   stack : traversal stack, top entry's stage is advanced in place
//   sp    : stack depth; decremented only when the entry is exhausted
float3 PopAndSubRect(inout float4 stack[MAXDEEP], inout float sp)
{
    float4 top;

    // read the top of the stack and move it on to its next stage
#ifdef TSIZE8
    top = GetTopIncStage8(stack, sp);
#endif

    // edge length of a child rect: half of the parent at lod top.w
    float childSize = pow(0.5, top.w + 1);

    float2 offset; // child origin relative to the parent origin
    float  sel;    // fractional selector derived from top.z

    [branch]if ( top.z >= STEP_INC*2 )
    {
        // STEP_INC*2 ... STEP_INC*2 + 7
        // child 3, nothing left to revisit
        sel = frac(top.z / 4);

        // sel*4 : x y
        // 0: + +
        // 1: 0 +
        // 2: + 0
        // 3: 0 0
        offset.x = (sel == 1.0/4 || sel == 3.0/4) ? 0 : childSize;
        offset.y = (sel <= 1.0/4) ? childSize : 0;

        sp -= 1;
    }
    else
    {
        // both remaining stages share the same selector folding: 4..7 map onto 3..0
        sel = frac(top.z / 8);
        sel = (sel > 3.0/8) ? 7.0/8 - sel : sel;

        [branch]if ( top.z >= STEP_INC )
        {
            // STEP_INC ... STEP_INC+7
            // children 2,3 remain: take 2, leave 3 for later

            // sel*8 : x y
            // 0: + 0
            // 1: 0 0
            // 2: + +
            // 3: 0 +
            offset.x = (sel == 1.0/8 || sel == 3.0/8) ? 0 : childSize;
            offset.y = (sel >= 2.0/8) ? childSize : 0;
        }
        else
        {
            // 0 ... 7
            // children 1,2,3 remain: take 1, leave 2,3 for later

            // sel*8 : x y
            // 0: 0 +
            // 1: + +
            // 2: 0 0
            // 3: + 0
            offset.x = (sel == 0.0/8 || sel == 2.0/8) ? 0 : childSize;
            offset.y = (sel >= 2.0/8) ? 0 : childSize;
        }
    }

    // child origin, and the child's lod is one below the parent's
    return float3(top.xy + offset, top.w + 1);
}

// Builds the float4 coordinate passed to tex2Dlod for one pyramid level.
//
// With FLATTEN defined, uv is rescaled into the sub-region of a single texture
// level that holds the requested pyramid level; z and w are 0 and `lod` is
// unused. Without FLATTEN, uv is passed through and `lod` goes into w.
//
//   uv      : coordinate inside the pyramid level
//   lodsize : relative size of that level (1 for the full-resolution level)
//   lod     : level index, only used when FLATTEN is not defined
float4 GetVirtualUVZW( float2 uv, float lodsize, float lod)
{
#ifdef FLATTEN
    return float4((lodsize + uv.x*lodsize) * 0.5, uv.y*lodsize, 0, 0);
#else
    return float4(uv, 0, lod);
#endif
}

// Cone Step preprocessing pixel shader.
//
// For the texel addressed by In.uv, walks the height field as a quad tree with
// an explicit stack and keeps the smallest tangent (horizontal distance divided
// by height difference) found against any texel higher than the cap height at
// the current texel. The result, divided by MAXTAN, is written to every
// channel of the output colour.
//
// Traversal state:
//   rect     : (x, y, lod, dh)  - rect being examined; .w doubles as the temp `dh`
//   nextrect : (x, y, lod, d)   - child chosen on subdivision; .w doubles as the temp `d`
//   stack    : entries are (x, y, selector + stage, lod), see Push / PopAndSubRect
//   needPop  : set to 1 when the current rect is finished and the next one must
//              come from the stack; the loop ends when that happens with sp == 0
//
// NOTE: the traversal relies on an unbounded `for ( ; true; )` loop. Per the
// original report, the MS compiler emits a REP instruction for it, which caps
// the iteration count at 255, and wrapping it in the commented-out outer loop
// fails to compile.
PS_OUTPUT RenderPS_ConeStep( VS_OUTPUT In)
{
    PS_OUTPUT Output;
    //////////////////

    float4 stack[MAXDEEP]; // [x, y, state]
    float4 rect = float4(0.5/TSIZE, 0.5/TSIZE, 0, 0); // x y lod dh
    float4 nextrect; // x y lod d
    float4 virtUV;
    float2 uv00;

    float sp, curtan, capH, needPop;

    // temp vars
    // dh and d alias otherwise unused vector components instead of being
    // separate variables; both macros are removed again after the function.
    //float dh, d;
    #define dh rect.w
    #define d nextrect.w

    // debug vars
    float dbg_numops = 0;
    float dbg_loopcnt = 0;

    sp = 0;
    curtan = MAXTAN;


    // snap uv to grid
    uv00 = In.uv * TSIZE;
    uv00 = uv00 - frac(uv00);

    // snap uv to texel center
    uv00 += float2(0.5, 0.5);
    uv00 = uv00 / TSIZE;


    // cap height at the current texel (full-resolution pyramid level)
    virtUV = GetVirtualUVZW(uv00, 1, 0);
    capH = tex2Dlod( capSampler, virtUV ).x;
    needPop = 0;

    DBG_INC(dbg_numops, 14)

    //[loop]for ( ; !needPop; ) // if uncomment, the compilation fails
    {
        [loop]for ( ; true; )
        {
            // have new rect == [x, y, level]
            DBG_INC(dbg_loopcnt , 1)

            [branch]if ( rect.z == NUMMIPS-1 )
            {
                // 1 texel
                // find real tan : use single point at rect.xy and 4 corners of the pixel

                virtUV = rect.xyxy * float4(1,1,0,0);
                dh = tex2Dlod(hSampler, virtUV).x;

                [branch]if ( dh > capH )
                {
                    // dh becomes 1 / (height difference)
                    dh = dh - capH;
                    dh = 1 / dh;

                    // virtUV.z stays 0

                    // 00
                    virtUV.xy = rect.xy - uv00;
                    d = dot(virtUV.xyz, virtUV.xyz); // d is sqr(distance)

                    [branch]if ( d >= 1.0/(TSIZE*TSIZE) )
                    {
                        d = sqrt(d);
                        d = d * dh; // tan
                        curtan = min(curtan, d);

                        DBG_INC(dbg_numops, 6)
                    }

                    // 10
                    virtUV.xy = rect.xy - uv00;
                    virtUV.xy -= float2(1.0/TSIZE, 0.0/TSIZE);
                    d = dot(virtUV.xyz, virtUV.xyz); // d is sqr(distance)

                    [branch]if ( d >= 1.0/(TSIZE*TSIZE) )
                    {
                        d = sqrt(d);
                        d = d * dh; // tan
                        curtan = min(curtan, d);

                        DBG_INC(dbg_numops, 6)
                    }

                    // 11
                    virtUV.xy = rect.xy - uv00;
                    virtUV.xy -= float2(1.0/TSIZE, 1.0/TSIZE);
                    d = dot(virtUV.xyz, virtUV.xyz); // d is sqr(distance)

                    [branch]if ( d >= 1.0/(TSIZE*TSIZE) )
                    {
                        d = sqrt(d);
                        d = d * dh; // tan
                        curtan = min(curtan, d);

                        DBG_INC(dbg_numops, 6)
                    }

                    // 01
                    // FIX: the offset used to be (0, 0), which repeated corner 00
                    // and never tested the (0, 1) corner.
                    virtUV.xy = rect.xy - uv00;
                    virtUV.xy -= float2(0.0/TSIZE, 1.0/TSIZE);
                    d = dot(virtUV.xyz, virtUV.xyz); // d is sqr(distance)

                    [branch]if ( d >= 1.0/(TSIZE*TSIZE) )
                    {
                        d = sqrt(d);
                        d = d * dh; // tan
                        curtan = min(curtan, d);

                        DBG_INC(dbg_numops, 6)
                    }


                    DBG_INC(dbg_numops, 12)
                }

                // pop rect from the stack
                needPop = 1;

                DBG_INC(dbg_numops, 5)
            }
            else // large rect
            {

                dh = pow(0.5, rect.z); // size of the current rect
                virtUV.xy = rect.xy + float2(dh, dh); // far corner

                // find distance for outside

                virtUV.xy = uv00 - virtUV.xy; // uv - farCorner
                virtUV.zw = rect.xy - uv00;
                virtUV.zw -= float2(1.0/TSIZE, 1.0/TSIZE);// nearCorner - farUV

                virtUV = max(virtUV, float4(0,0,0,0)); // x and z are exclusive, y and w are exclusive
                d = dot( virtUV, virtUV ); // d*d
                d = sqrt(d);

                // a zero distance is replaced by one texel
                d = d == 0 ? 1.0/TSIZE : d;

                virtUV.x = NUMMIPS-1 - rect.z; // lodR
                virtUV.y = pow(0.5, virtUV.x); // lodR size

                virtUV = GetVirtualUVZW(rect.xy, virtUV.y, virtUV.x);
                virtUV.x = tex2Dlod( capSampler, virtUV).x; // cap for rect

                virtUV.x = virtUV.x - capH;
                virtUV.x = 1/virtUV.x;
                d = d * virtUV.x; // up tan for rect
                d = d < 0 ? MAXTAN : d; // skip negative tan
                [branch]if ( d >= curtan )
                {
                    // skip this rect
                    // pop
                    needPop = 1;

                    DBG_INC(dbg_numops, 1)
                }
                else
                {
                    // explore this rect
                    // subdivide rect
                    virtUV.xy = rect.xy + float2(dh, dh) * 0.5; // mid
                    virtUV.zw = uv00 - virtUV.xy;
                    virtUV.zw = abs(virtUV.zw); // abs(uv00 - rect00)

                    // prepare next subrect
                    nextrect.z = rect.z + 1;
                    nextrect.xy = uv00.xy >= virtUV.xy ? virtUV.xy : rect.xy;

                    // push current rect
                    virtUV.z = virtUV.w >= virtUV.z ? 4 : 0;
                    virtUV.xy = uv00.xy >= virtUV.xy ? float2(1,2) : float2(0, 0);

                    virtUV.z = dot(virtUV.xyz, float3(1,1,1)); // direction selector with stage == 0
                    virtUV.xyw = rect.xyz; // x y lod

                    Push(stack, sp, virtUV); // x y selector lod

                    rect.xyz = nextrect.xyz;

                    // continue with the new rect

                    DBG_INC(dbg_numops, 22)
                }

                DBG_INC(dbg_numops, 22)
            }

            [branch]if ( needPop )
            {
                // pop
                if ( sp == 0 )
                {
                    // finished
                    // keep needPop == 1 to break from the outer loops
                    break;
                }
                rect.xyz = PopAndSubRect(stack, sp);
                needPop = 0;
                // continue with this rect

                DBG_INC(dbg_numops, 27)
            }
        }
    }

    curtan = curtan / MAXTAN;
    //curtan = sqrt(curtan);

    //Output.Color = float4(curtan, capH, 0, 0);
    //Output.Color = float4(dbg_loopcnt, dbg_numops, curtan, 0);
    Output.Color = curtan;

    //////////////////
    return Output;
}

// The temp aliases are only meaningful inside RenderPS_ConeStep; remove them so
// the identifiers `dh` and `d` are usable again in the rest of the file.
#undef dh
#undef d

// Cone Step preparation technique.
//   P0 : runs the cone-step shaders with depth testing and depth writes disabled.
//   P1 : sets no shaders; only switches the depth states to enabled / LESSEQUAL /
//        writable (the "restore pass" mentioned in the file header).
technique tConeStep
{
    pass P0
    {
        VertexShader = compile vs_3_0 RenderVS();
        PixelShader  = compile ps_3_0 RenderPS_ConeStep();

        ZEnable      = false;
        ZFunc        = ALWAYS;
        ZWriteEnable = false;
    }

    pass P1
    {
        ZEnable      = true;
        ZFunc        = LESSEQUAL;
        ZWriteEnable = true;
    }
}




Share this post


Link to post
Share on other sites

This topic is 2546 days old which is more than the 365 day threshold we allow for new replies. Please post a new topic.

If you intended to correct an error in the post then please contact us.

Create an account or sign in to comment

You need to be a member in order to leave a comment

Create an account

Sign up for a new account in our community. It's easy!

Register a new account

Sign in

Already have an account? Sign in here.

Sign In Now

Sign in to follow this