D3D9 pixel shaders have a major limitation: they can't dynamically index into shader constants. This means that the compiler can't use an actual loop construct in assembly to implement your for loop; instead, it has to unroll it and do something like this:

It's possible to use a loop with D3D9; take a look at this sample:

http://www.dhpoware.com/demos/d3d9NormalMappingWithManyLights.html

Have you looked at the generated assembly? It looks like this:

// Compiler-generated ps_3_0 pixel shader for a multi-light loop.
// The loop body runs inside rep/endrep; because constant registers are
// addressed only by fixed index here, each per-iteration constant is picked
// with a chain of eight cmp instructions (one per candidate register).
// The constants are laid out with a stride of 5 registers per entry:
// c0-c4, c5-c9, ..., c35-c39 (presumably one entry per light -- confirm
// against the HLSL source).
ps_3_0
// c47 = (0,1,2,3) and c46 = (-4,-5,-6,-7): offsets used to compare the loop
// counter against 0..7. c47.x / c47.y also serve as the literals 0 and 1.
def c46, -4, -5, -6, -7
def c47, 0, 1, 2, 3
dcl_texcoord v0.xyz
dcl_texcoord1 v1.xy
dcl_texcoord2 v2.xyz
dcl_texcoord3 v3.xyz
dcl_2d s0
// r0.xyz = normalize(v3); used below as the first operand of both dot
// products (presumably the surface normal -- confirm).
nrm r0.xyz, v3
// r0.w = 1 / length(v2), so v2 can be normalized later by a multiply.
dp3 r0.w, v2, v2
rsq r0.w, r0.w
// r1 = 0: accumulator summed over all loop iterations.
mov r1, c47.x
// r2.x = 0: loop counter i, kept in a float register.
mov r2.x, c47.x
// Repeat the block the number of times given by integer constant i0.
rep i0
// r3 = i - (0,1,2,3), r4 = i - (4,5,6,7). A component is exactly zero only
// when i equals that index, so -abs(component) >= 0 means "i == index".
add r3, r2.x, -c47
add r4, r2.x, c46
// r5.x = 0: default value for the first link of every select chain.
mov r5.x, c47.x
// Select chain #1: r2.yzw = c[5*i + 0].xyz.
// cmp picks the second source when the first is >= 0, otherwise the third,
// so each link either loads its constant (i matches) or keeps the previous
// result.
cmp r2.yzw, -r3_abs.x, c0.xxyz, r5.x
cmp r2.yzw, -r3_abs.y, c5.xxyz, r2
cmp r2.yzw, -r3_abs.z, c10.xxyz, r2
cmp r2.yzw, -r3_abs.w, c15.xxyz, r2
cmp r2.yzw, -r4_abs.x, c20.xxyz, r2
cmp r2.yzw, -r4_abs.y, c25.xxyz, r2
cmp r2.yzw, -r4_abs.z, c30.xxyz, r2
cmp r2.yzw, -r4_abs.w, c35.xxyz, r2
// r2.yzw = selected.xyz - v0.xyz (presumably light position minus surface
// position -- confirm).
add r2.yzw, r2, -v0.xxyz
// Select chain #2: r5.y = c[5*i + 4].x.
cmp r5.y, -r3_abs.x, c4.x, r5.x
cmp r5.y, -r3_abs.y, c9.x, r5.y
cmp r5.y, -r3_abs.z, c14.x, r5.y
cmp r5.y, -r3_abs.w, c19.x, r5.y
cmp r5.y, -r4_abs.x, c24.x, r5.y
cmp r5.y, -r4_abs.y, c29.x, r5.y
cmp r5.y, -r4_abs.z, c34.x, r5.y
cmp r5.y, -r4_abs.w, c39.x, r5.y
// Scale the difference vector by 1 / c[5*i + 4].x (presumably the light
// radius -- confirm).
rcp r5.y, r5.y
mul r2.yzw, r2, r5.y
// r6.x = max(1 - dot(d, d), 0), where d is the scaled difference vector.
// This factor multiplies every per-iteration term below.
dp3 r5.y, r2.yzww, r2.yzww
add r5.z, -r5.y, c47.y
max r6.x, r5.z, c47.x
// Normalize d in place: r2.yzw = d / length(d).
rsq r5.y, r5.y
mul r2.yzw, r2, r5.y
// r7.xyz = normalize(normalize(v2) + r2.yzw) (has the shape of a half-vector
// computation -- confirm against the HLSL source).
mad r5.yzw, v2.xxyz, r0.w, r2
nrm r7.xyz, r5.yzww
// r2.y = saturate(dot(r0, normalized d)); r2.z = saturate(dot(r0, r7)).
dp3_sat r2.y, r0, r2.yzww
dp3_sat r2.z, r0, r7
// r5.y = r2.z raised to the power c44.x.
pow r5.y, r2.z, c44.x
// Select chain #3: r7 = c[5*i + 1].
cmp r7, -r3_abs.x, c1, r5.x
cmp r7, -r3_abs.y, c6, r7
cmp r7, -r3_abs.z, c11, r7
cmp r7, -r3_abs.w, c16, r7
cmp r7, -r4_abs.x, c21, r7
cmp r7, -r4_abs.y, c26, r7
cmp r7, -r4_abs.z, c31, r7
cmp r7, -r4_abs.w, c36, r7
// r7 = r6.x * c[5*i + 1] + c45.
mad r7, r6.x, r7, c45
// Select chain #4: r8 = c[5*i + 2].
cmp r8, -r3_abs.x, c2, r5.x
cmp r8, -r3_abs.y, c7, r8
cmp r8, -r3_abs.z, c12, r8
cmp r8, -r3_abs.w, c17, r8
cmp r8, -r4_abs.x, c22, r8
cmp r8, -r4_abs.y, c27, r8
cmp r8, -r4_abs.z, c32, r8
cmp r8, -r4_abs.w, c37, r8
// r8 = c[5*i + 2] * c41 * r2.y * r6.x, then r7 = c40 * r7 + r8.
mul r8, r8, c41
mul r8, r2.y, r8
mul r8, r6.x, r8
mad r7, c40, r7, r8
// Select chain #5: c[5*i + 3]. The chain starts in r8 and switches its
// destination to r3 at the r3_abs.w link, which is the last read of the
// old r3 contents; the remaining links only read r4.
cmp r8, -r3_abs.x, c3, r5.x
cmp r8, -r3_abs.y, c8, r8
cmp r8, -r3_abs.z, c13, r8
cmp r3, -r3_abs.w, c18, r8
cmp r3, -r4_abs.x, c23, r3
cmp r3, -r4_abs.y, c28, r3
cmp r3, -r4_abs.z, c33, r3
cmp r3, -r4_abs.w, c38, r3
// r3 = c[5*i + 3] * c43 * r5.y (the pow result).
mul r3, r3, c43
mul r3, r5.y, r3
// r2.y is saturated, so -r2.y >= 0 only when r2.y == 0: in that case the
// term is forced to 0, otherwise it is kept.
cmp r3, -r2.y, c47.x, r3
// Add this iteration's contribution: r1 += r3 * r6.x + r7.
mad r3, r3, r6.x, r7
add r1, r1, r3
// i += 1.
add r2.x, r2.x, c47.y
endrep
// Sample s0 at v1 and modulate it by the accumulated result.
texld r0, v1, s0
mul oC0, r0, r1

Because of the constant-indexing limitation, the compiler has to do a compare-and-select for every single constant register. It's just a different variant of what I mentioned. Basically, it's like doing this:

// Pseudo-code illustration (not compilable: the "..." lines stand for the
// omitted cases 4 through 6). It shows the loop kept as a loop, with every
// Lights[i] access replaced by a chain of comparisons against the loop
// counter, one fixed-index read per possible light.
for(uint i = 0; i < NumLights; ++i)
{
// Start from element 0, then override it if i matches a later index.
float3 LightPos = Lights[0].Position;
if(i == 1)
LightPos = Lights[1].Position;
else if(i == 2)
LightPos = Lights[2].Position;
else if(i == 3)
LightPos = Lights[3].Position;
...
else if(i == 7)
LightPos = Lights[7].Position;
// The same selection is repeated in full for each member that is read.
float3 LightColor = Lights[0].Color;
if(i == 1)
LightColor = Lights[1].Color;
else if(i == 2)
LightColor = Lights[2].Color;
else if(i == 3)
LightColor = Lights[3].Color;
...
else if(i == 7)
LightColor = Lights[7].Color;
// and so on
}

**Edited by MJP, 17 July 2013 - 11:27 PM.**