So then I started thinking about adding a depth prepass, but before going through the work of implementing it, I decided to run a simple test first to see whether it had potential. For the test, I simply cleared the depth buffer to 0 instead of 1. The idea is that every pixel in the scene would then fail the depth test and be rejected before the pixel shader (PS) runs, so I would see pretty much the same significant increase in performance as with the null shader above. However, I saw absolutely no speed increase at all.
Does this imply that my PS is being executed even if it's occluded by closer depth values? How is that possible?
Here's the PS and the functions it calls (see PSMain for the pixel shader):
// Buffer of SpecBuf_Params_s records. GetParams() reads one record from it,
// using the index returned by Spec_GetParamsIndex().
StructuredBuffer<SpecBuf_Params_s> SpecBuf_Params;
// Returns the SpecBuf_Params record for the given instance / sub-material pair.
// The pair is mapped to a buffer slot by Spec_GetParamsIndex().
SpecBuf_Params_s GetParams( in uint uInstIndex, in uint uSubMtlIndex ) {
    const uint uSlot = Spec_GetParamsIndex( uInstIndex, uSubMtlIndex );
    return SpecBuf_Params[ uSlot ];
}
// Evaluates a motif to a colour.
//
// The motif's table colour (g_avMotifColor[m_uMotifIndex]) is scaled by m_fScale,
// offset by m_fOffset and then modulated by m_vBaseColor. Two bits of m_uFlags
// bypass that result and pass the base colour through unchanged:
//   bit 0 (value 1): use m_vBaseColor.rgb for the RGB channels
//   bit 1 (value 2): use m_vBaseColor.a for the alpha channel
float4 Spec_Motif( SpecMotif_s Motif ) {
    const bool bRawBaseRGB = (Motif.m_uFlags & 1) != 0;
    const bool bRawBaseA = (Motif.m_uFlags & 2) != 0;

    const float4 vModulated = (g_avMotifColor[ Motif.m_uMotifIndex ] * Motif.m_fScale + Motif.m_fOffset) * Motif.m_vBaseColor;

    float4 vResult;
    vResult.rgb = bRawBaseRGB ? Motif.m_vBaseColor.rgb : vModulated.rgb;
    vResult.a = bRawBaseA ? Motif.m_vBaseColor.a : vModulated.a;
    return vResult;
}
// Converts an alpha value into an MSAA sample coverage mask.
//
// For SPEC_MSAA_COUNT == N (2 or 4) the alpha axis is cut at k/(N+1) for
// k = 1..N, and each successive band enables one more low-order bit of the mask
// (0, 1, 3, 7, 15). Alpha values below the first threshold give an empty mask;
// values at or above the last threshold give a mask with the low N bits set.
// For any other SPEC_MSAA_COUNT the alpha is ignored and every mask bit is set.
uint Spec_AlphaToCoverage( float fUnitAlpha ) {
#if SPEC_MSAA_COUNT == 2
    if( fUnitAlpha < (1.0f / 3.0f) ) return 0;
    if( fUnitAlpha < (2.0f / 3.0f) ) return 1;
    return 3;
#elif SPEC_MSAA_COUNT == 4
    if( fUnitAlpha < (1.0f / 5.0f) ) return 0;
    if( fUnitAlpha < (2.0f / 5.0f) ) return 1;
    if( fUnitAlpha < (3.0f / 5.0f) ) return 3;
    if( fUnitAlpha < (4.0f / 5.0f) ) return 7;
    return 15;
#else
    return 0xffffffff;
#endif
}
// Packs shading inputs into the raw G-buffer layout.
//
//   m_vTex0  : diffuse colour in rgb, 0 in a
//   m_vTex1  : emissive colour in rgb, specular intensity in a
//   m_vuTex2 : normal remapped from [-1,1] to [0,255] (truncated to uint) in xyz,
//              flags word in w
//
// Flags word layout (low to high):
//   bits 0-3 : m_fSpecUnitSharpness * 15, truncated and limited to the range 1..15
//   bit  4   : set when either component of m_vCentroidPosXY_SS has a fractional
//              part other than 0.5 (presumably meaning the centroid was moved off
//              the pixel centre on a partially covered pixel; confirm)
//   bit  5+  : m_uNoAO shifted left by 5
SpecRawGBuffer_s Spec_PackGBuffer( in SpecGBufferSource_s Source ) {
    // Flags word for Tex2...
    const uint uOffCenter = any( frac( Source.m_vCentroidPosXY_SS ) - 0.5f );
    const uint uSharpnessCapped = min( uint( Source.m_fSpecUnitSharpness * 15.0f ), 15 );
    const uint uSharpnessBits = max( uSharpnessCapped, 1 );
    const uint uPackedFlags = (Source.m_uNoAO << 5) | ((uOffCenter & 1) << 4) | uSharpnessBits;

    // Fill the packed G-buffer...
    const float fHalfRange = 255.0f / 2.0f;

    SpecRawGBuffer_s Packed;
    Packed.m_vTex0 = float4( Source.m_vDiffuseColor, 0 );
    Packed.m_vTex1 = float4( Source.m_vEmissiveColor, Source.m_fSpecUnitIntensity );
    Packed.m_vuTex2 = uint4( fHalfRange + fHalfRange * Source.m_vUnitNorm_WS, uPackedFlags );
    return Packed;
}
// Pixel shader entry point: shades one fragment and returns the packed G-buffer.
//
// Per-material parameters are fetched with GetParams() and their m_uFlags bits
// decide which inputs (vertex colour RGB / alpha, base texture alpha) feed tint,
// emissive, opacity and gloss. The stored diffuse colour is the saturated
// tint * base colour plus gloss * cube-map reflection; the stored emissive colour
// is that diffuse colour multiplied by the emissive factor. The final opacity is
// converted to a sample mask by Spec_AlphaToCoverage() and written to SV_Coverage.
//
// NOTE(review): this shader writes SV_Coverage. On some GPUs/drivers a
// pixel-shader coverage output can, like depth output or discard, delay depth
// testing until after the shader runs. That is a plausible cause of the missing
// early-Z speed-up; confirm against vendor documentation. The HLSL
// [earlydepthstencil] attribute forces early testing but changes how the coverage
// output interacts with depth writes, so it is not applied here.
SpecRawGBuffer_s PSMain( VS_Out Input, out uint uCoverage : SV_Coverage ) : SV_TARGET {
    SpecBuf_Params_s Params = GetParams( Input.uInstanceID, Input.uSubMtlIndex );

    // Decode the input-routing bits...
    const uint uMtlFlags = Params.m_uFlags;
    const bool bVtxRGB_Tint = (uMtlFlags & FLAG_VTX_RGB_TINT) != 0;
    const bool bVtxRGB_Emis = (uMtlFlags & FLAG_VTX_RGB_EMIS) != 0;
    const bool bVtxA_Tint = (uMtlFlags & FLAG_VTX_A_TINT) != 0;
    const bool bVtxA_Emis = (uMtlFlags & FLAG_VTX_A_EMIS) != 0;
    const bool bVtxA_Opac = (uMtlFlags & FLAG_VTX_A_OPAC) != 0;
    const bool bVtxA_Glos = (uMtlFlags & FLAG_VTX_A_GLOS) != 0;
    const bool bBaseA_Emis = (uMtlFlags & FLAG_BASE_A_EMIS) != 0;
    const bool bBaseA_Opac = (uMtlFlags & FLAG_BASE_A_OPAC) != 0;
    const bool bBaseA_Glos = (uMtlFlags & FLAG_BASE_A_GLOS) != 0;

    // Evaluate the material motifs...
    const float4 vTintOpacMotif = Spec_Motif( Params.m_MotifTintOpac );
    const float4 vEmisGlosMotif = Spec_Motif( Params.m_MotifEmisGlos );
    const float4 vCubeMotif = Spec_Motif( Params.m_MotifCube );

    // Vertex-colour contributions (neutral value used when a route is disabled)...
    const float3 vTintFromVtx = (bVtxRGB_Tint ? Input.vColorVtx.rgb : 1) * (bVtxA_Tint ? Input.vColorVtx.a : 1);
    const float3 vEmisFromVtx = Params.m_fAddEmis + (bVtxRGB_Emis ? Input.vColorVtx.rgb : 0) + (bVtxA_Emis ? Input.vColorVtx.a : 0);
    const float fOpacFromVtx = (bVtxA_Opac ? Input.vColorVtx.a : 1);
    const float fGlosFromVtx = (bVtxA_Glos ? Input.vColorVtx.a : 0);
    const float4 vTintOpac = float4( vTintFromVtx, fOpacFromVtx ) * vTintOpacMotif;

    // World-space shading normal...
    const float3 vNormal_WS = normalize( Input.vNormal_WS );

    // Base texture: alpha comes from its own slice and scaled coordinates, and can
    // also act as a detail multiplier (x2, blended in by m_Switch_fBaseA_Detl)...
    const float3 vCoordAlpha = float3( Params.m_fTexCoordScale_BaseA * Input.vTC_Base.zw, Params.m_uTexSliceIndexBaseA );
    const float3 vCoordRGB = float3( Input.vTC_Base.xy, Params.m_uTexSliceIndexBaseRGB );
    const float fBaseAlpha = TexBase.Sample( SamplerBase, vCoordAlpha ).a;
    const float fDetailScale = lerp( 1, 2 * fBaseAlpha, Params.m_Switch_fBaseA_Detl );
    const float3 vBaseRGB = TexBase.Sample( SamplerBase, vCoordRGB ).rgb * fDetailScale;

    // Cube-map reflection along the mirrored view vector...
    const float3 vToCamera_WS = normalize( Spec_GetCamPos() - Input.vPos_WS );
    const float3 vMirror_WS = reflect( -vToCamera_WS, vNormal_WS );
    const float3 vReflection = vCubeMotif.rgb * TexCube.Sample( SamplerCube, float4( vMirror_WS, Params.m_uTexSliceIndexCube ) ).rgb;

    // Combine everything into the final surface terms...
    const float fOpacity = Input.fUnitFadeAlpha * vTintOpac.a * (bBaseA_Opac ? fBaseAlpha : 1);
    const float fGloss = vEmisGlosMotif.a * (fGlosFromVtx + (bBaseA_Glos ? fBaseAlpha : 0));
    const float3 vDiffuse = saturate( vTintOpac.rgb * vBaseRGB ) + fGloss * vReflection;
    const float3 vEmisFactor = vEmisGlosMotif.rgb * (vEmisFromVtx + (bBaseA_Emis ? fBaseAlpha : 0));

    // Opacity is expressed through the MSAA sample mask...
    uCoverage = Spec_AlphaToCoverage( fOpacity );

    // Hand the results to the G-buffer packer...
    SpecGBufferSource_s Unpacked = Spec_GetDefaultGBufSource();
    Unpacked.m_vDiffuseColor = vDiffuse;
    Unpacked.m_vEmissiveColor = vDiffuse * vEmisFactor;
    Unpacked.m_vCentroidPosXY_SS = Input.vPos_HS.xy;
    Unpacked.m_vUnitNorm_WS = vNormal_WS;
    Unpacked.m_fSpecUnitSharpness = Params.m_fSpecUnitSharpness;
    Unpacked.m_fSpecUnitIntensity = fGloss;
    return Spec_PackGBuffer( Unpacked );
}
Edit: After some more testing, it really looks like early Z rejection just isn't working, though I'm not sure why yet. Do NVIDIA and ATI provide docs that describe the conditions which must be met to keep early Z rejection enabled?






