Hello.
I am trying to add Tiled Deferred Shading to my engine. I am at the beginning of it right now.however, I have some serious problems.
The first problem is the poor performance that I am getting right now. With my old Light Pre Pass renderer I get about 270-300 fps. But with this approach and only one point light I get about 105 fps ! Even it doesn't have any specular reflection. In my tests I figured out the bottle neck is when I dispatch the composite shader . It's the final shader that uses G-buffer and also information for each tile to shader individual pixels. This is the shader I am talking about:
cbuffer Globals
{
int GroupCountX;
float FarPlane;
float4 FarPlaneCorners[4];
int Width;
int Height;
int LightCount;
};
struct LightShadingInfo
{
float3 ViewSpacePos;
/// Light's direction in view space
float3 Direction;
/// used for SpotLights Only
float CosTheta;
/// 0=>Directional , 1=> PointLight , 2=> SpotLight
int LightType;
float Range;
};
struct LightIndicesStruct
{
int indices[MAXIMUM_PER_TILE];
};
Texture2D NormalSmoothness;
Texture2D DepthBuffer;//in view space unnormalized depth. int [NearPlane , FarPlane ] Range
Texture2D Albedo;
Texture2D SpecularColor;
StructuredBuffer<LightShadingInfo> LightShadingBuffer;
StructuredBuffer<LightIndicesStruct> LightIndicesBuffer;
RWTexture2D<float4> Output;//HDR output Accumulation Buffer
float3 ShadeDirectionalLight(LightShadingInfo LightInfo, float3 Normal, float3 DiffuseColor, float3 ReflectiveColor)
{
float NdotL = saturate(dot(Normal, LightInfo.Direction));
return NdotL*DiffuseColor;
}
float3 ShadePointLight(LightShadingInfo LightInfo, float3 ViewSpacePos, float3 Normal, float3 DiffuseColor, float3 ReflectiveColor)
{
float3 LightVector = LightInfo.ViewSpacePos - ViewSpacePos;
float atten = 1 - saturate(length(LightVector) / LightInfo.Range);
atten *= atten;
LightVector = normalize(LightVector);
float NdotL = saturate(dot(Normal, LightVector));
return NdotL*DiffuseColor*atten;
}
float3 GetViewSpacePos(int2 Position, float Depth)
{
/* Layout Of FarPlane Corners
[0] ---- [1]
----------
----------
[2] ---- [3]
*/
//Use Bilnear Filtering to Get Correct FarPlane Pos
float XLerp = ((float)(Position.x)) / Width;
float YLerp = ((float)(Position.y)) / Height;
float3 Upper = lerp(FarPlaneCorners[0].xyz, FarPlaneCorners[1].xyz, XLerp);
float3 Lower = lerp(FarPlaneCorners[2].xyz, FarPlaneCorners[3].xyz, XLerp);
float3 ToFarPlane = lerp(Upper, Lower, YLerp);
return (Depth / FarPlane)*ToFarPlane;
}
groupshared LightIndicesStruct indicesBuffer;
[numthreads(32, 32, 1)]
void main(int3 dispathThreadId:SV_DispatchThreadID, int3 groupId : SV_GroupID,int3 groupThreadId:SV_GroupThreadID)
{
if (groupThreadId.x == 0 && groupThreadId.y==0)
indicesBuffer = LightIndicesBuffer[groupId.y * GroupCountX + groupId.x];
GroupMemoryBarrierWithGroupSync();//wait for all threads
float depth = DepthBuffer[dispathThreadId.xy].r;
float4 normalSmoothness = NormalSmoothness[dispathThreadId.xy];
float3 albedo = Albedo[dispathThreadId.xy].rgb;
float3 specColor = SpecularColor[dispathThreadId.xy].rgb;
float3 ViewSpacePos = GetViewSpacePos(dispathThreadId.xy, depth);
// we have the initial variables we needed let's shade !
int index = 0;
//const LightIndicesStruct indicesBuffer = LightIndicesBuffer[groupId.y * GroupCountX + groupId.x];
int LightIndex = indicesBuffer.indices[index];
float3 color = float3(0,0,0);
while (LightIndex != -1 && index<LightCount)
{
const LightShadingInfo LightInfo = LightShadingBuffer[LightIndex];
[branch]
switch (LightInfo.LightType)
{
case 0:
color += ShadeDirectionalLight(LightInfo, normalSmoothness.xyz, albedo, specColor);
break;
case 1:
color += ShadePointLight(LightInfo, ViewSpacePos, normalSmoothness.xyz, albedo, specColor);
break;
}
index++;
LightIndex = indicesBuffer.indices[index];
}
Output[dispathThreadId.xy] = float4(color, 1.0f);
}
technique11 Tech0
{
pass P0
{
SetVertexShader(NULL);
SetPixelShader(NULL);
SetComputeShader(CompileShader(cs_5_0, main()));
}
}
Can you please tell me how can I further optimize this shader ?
Another Problem that I am facing is checking if a light has an effect on a tile or not. Currently I have point lights that have bounds.
I create AABB for it in WorldSpace and transform it to ViewSpace and ClipSpace to do my tests. It's working 80% of the time but When camera is near the light or maybe in the range of the light I sometimes get wrong results.
This is how I am doing the job:
public void GetBoundingInfo(Camera cam,out LightBoundInfo BoundInfo)
{
Vector3 Center = Owner.Position;
Vector3 min = new Vector3(Center.X - Range , Center.Y - Range , Center.Z - Range );
Vector3 max = new Vector3(Center.X + Range , Center.Y + Range , Center.Z + Range );
BoundingBox BoxInWorld = new BoundingBox(min,max);
BoundingBox BoxInViewSpace = Utility.MathUtility.TransformBox(BoxInWorld, cam.View);
BoundingBox BoxInClipSpace = Utility.MathUtility.TransformBox(BoxInWorld, cam.ViewProjection);
Vector2 MinMaxZ = new Vector2(BoxInViewSpace.Minimum.Z, BoxInViewSpace.Maximum.Z);
Vector2 MinClipSpace = new Vector2(Math.Max(BoxInClipSpace.Minimum.X, -1.0f), Math.Max(BoxInClipSpace.Minimum.Y,-1.0f));
Vector2 MaxClipSpace = new Vector2(Math.Min(BoxInClipSpace.Maximum.X, 1.0f), Math.Min(BoxInClipSpace.Maximum.Y,1.0f));
MinClipSpace.X = ((MinClipSpace.X / 2.0f) + 0.5f) * cam.TargetBuffer.width;
MaxClipSpace.X = ((MaxClipSpace.X / 2.0f) + 0.5f) * cam.TargetBuffer.width;
MinClipSpace.Y = (1.0f-((MinClipSpace.Y / 2.0f) + 0.5f)) * cam.TargetBuffer.height;
MaxClipSpace.Y = (1.0f-((MaxClipSpace.Y / 2.0f) + 0.5f)) * cam.TargetBuffer.height;
float temp = MinClipSpace.Y;
MinClipSpace.Y = MaxClipSpace.Y;
MaxClipSpace.Y = temp;
int width = (int)Math.Ceiling(MaxClipSpace.X - MinClipSpace.X);
int height = (int)Math.Ceiling(MaxClipSpace.Y - MinClipSpace.Y);
BoundInfo = new LightBoundInfo((int)MinClipSpace.X, width, (int)MinClipSpace.Y, height, MinMaxZ);
}
///
public static BoundingBox TransformBox(BoundingBox box,Matrix matrix)
{
Vector3[] CornerPoints = box.GetCorners();
Vector3 min = new Vector3(float.MaxValue);
Vector3 max = new Vector3(float.MinValue);
for (int i = 0; i < CornerPoints.Length; i++)
{
Vector4 transformed = Vector3.Transform(CornerPoints[i], matrix);
Vector3 Vec3 = new Vector3(transformed.X / transformed.W, transformed.Y / transformed.W, transformed.Z / transformed.W);
min = Vector3.Min(Vec3,min);
max = Vector3.Max(Vec3, max);
}
return new BoundingBox(min, max);
}
And the shader to do the tests :
cbuffer Globals
{
int LightCount;//number of lights that we currently have for this frame
int Width;
int Height;
int GroupCountX;
};
struct LightBoundInfo
{
float2 MinMaxZ;//Minimum and Maximum View Space Z
float4 BoundingRect; //(x,y,width,height)
bool AlwaysVisible;
};
struct LightIndicesStruct
{
int indices[MAXIMUM_PER_TILE]; // MAXIMUM_PER_TILE is a define
};
StructuredBuffer<LightBoundInfo> LightsBoundInfos;
Texture2D DepthBufferMinMaxZ;
RWStructuredBuffer<LightIndicesStruct> Output;
bool RectCheck(float4 rect1,float4 rect2)
{
return (rect1.x<=min(Width, rect2.x + rect2.z)) && (min(rect1.x + rect1.z, Width)>=rect2.x) && (rect1.y<=min(Height, rect2.y + rect2.w)) && (min(rect1.y + rect1.w, Height)>=rect2.y);
}
groupshared uint LastIndex = 0;
groupshared float2 GroupMinMaxZ;
void AddToIndices(int LightIndex,int2 groupId)
{
int index;
InterlockedAdd(LastIndex, 1, index);
Output[groupId.y*GroupCountX + groupId.x].indices[index] = LightIndex;
}
void EndIndices(int2 groupId)
{
int index;
InterlockedAdd(LastIndex, 1, index);
Output[groupId.y*GroupCountX + groupId.x].indices[index] = -1;
}
void ProcessLight(int LightIndex, float4 RectTile,int2 groupId)
{
if (LightIndex < LightCount)
{// we have to process this light , let's see if we need it in this tile
LightBoundInfo info = LightsBoundInfos[LightIndex];
if (info.AlwaysVisible)//it's directional light we need this
AddToIndices(LightIndex, groupId);
else
{//it's not directional
bool ZReject = (info.MinMaxZ.x>GroupMinMaxZ.y) || (info.MinMaxZ.y<GroupMinMaxZ.x);
if (!ZReject && RectCheck(RectTile, info.BoundingRect))
{// this light hass effect on this tile
AddToIndices(LightIndex, groupId);
}
}
}
}
[numthreads(1024,1,1)]
void main(int3 dispathThreadId:SV_DispatchThreadID,int3 groupID:SV_GroupID , int3 groupThreadId:SV_GroupThreadId) //supports 2048 lights at maximum now , executed per tile
{
//first thread reads MinMaxZ for group , preventing other threads to read from texture again
if (groupThreadId.x == 0)
GroupMinMaxZ = DepthBufferMinMaxZ[groupID.xy].xy;
GroupMemoryBarrierWithGroupSync();//wait for all threads
ProcessLight(groupThreadId.x, float4(groupID.xy * 32, 32, 32), groupID.xy);
ProcessLight(groupThreadId.x + 1024, float4(groupID.xy * 32, 32, 32), groupID.xy);
GroupMemoryBarrierWithGroupSync();//wait for all threads
if (groupThreadId.x == 0)//set -1 to LightIndices , it's a sign for end of lights
EndIndices(groupID.xy);
}
technique11 Tech0
{
pass P0
{
SetVertexShader(NULL);
SetPixelShader(NULL);
SetComputeShader(CompileShader(cs_5_0, main()));
}
}
Zreject is working properly as far as I see
Thanks for your help in advance.