I have this animation demo, which takes quite a few seconds to load. I initially thought that it was related to the model, but apparently the culprit is the skinning VS, and specifically the bones-matrices.
The VS looks like
cbuffer cbPerMesh : register(b1)
{
matrix gBones[256];
}
struct VS_IN
{
float4 PosL : POSITION;
float3 NormalL : NORMAL;
float2 TexC : TEXCOORD;
float4 BonesWeights[2] : BONE_WEIGHTS;
uint4 BonesIDs[2] : BONE_IDS;
};
struct VS_OUT
{
float4 svPos : SV_POSITION;
float2 TexC : TEXCOORD;
float3 NormalW : NORMAL;
};
float4x4 CalculateWorldMatrixFromBones(float4 BonesWeights[2], uint4 BonesIDs[2], float4x4 Bones[256])
{
float4x4 WorldMat = { float4(0, 0, 0, 0), float4(0, 0, 0, 0), float4(0, 0, 0, 0), float4(0, 0, 0, 0) };
for(int i = 0; i < 2; i++)
{
WorldMat += Bones[BonesIDs[i].x] * BonesWeights[i].x;
WorldMat += Bones[BonesIDs[i].y] * BonesWeights[i].y;
WorldMat += Bones[BonesIDs[i].z] * BonesWeights[i].z;
WorldMat += Bones[BonesIDs[i].w] * BonesWeights[i].w;
}
return WorldMat;
}
VS_OUT VS(VS_IN vIn)
{
VS_OUT vOut;
float4x4 World = CalculateWorldMatrixFromBones(vIn.BonesWeights, vIn.BonesIDs, gBones);
vOut.svPos = mul(mul(vIn.PosL, World), gVPMat);
vOut.TexC = vIn.TexC;
vOut.NormalW = mul(float4(vIn.NormalL, 0), World).xyz;
return vOut;
}
As you can see, this shader supports up to 256 bones per model. Compiling this shader takes around 5 seconds on my Core-i7 CPU.
If I reduce the number of supported bones to 16, it compiles almost immediately.
Funny thing is that the generated assembly is exactly the same (except for the CB declaration).
I find it weird - the code doesn't rely in any way on the matrices count.
Anyone has any idea why the performance degradation?