That's really interesting, I seem to have had some misconceptions about this.

Here's the asm (with i = 3):

//
// Generated by Microsoft (R) HLSL Shader Compiler 9.29.952.3111
//
//
// Buffer Definitions:
//
// cbuffer $Globals
// {
//
// bool UseCubeBlending; // Offset: 0 Size: 4
// float4x4 WorldViewProjection; // Offset: 16 Size: 64 [unused]
// float4x4 WorldView; // Offset: 80 Size: 64 [unused]
// float4x4 World; // Offset: 144 Size: 64 [unused]
// float3 CameraPosition; // Offset: 208 Size: 12
// float FarPlane; // Offset: 220 Size: 4 [unused]
// float Radius; // Offset: 224 Size: 4
// float SlopeIntensity; // Offset: 228 Size: 4
// float BumpIntensity; // Offset: 232 Size: 4 [unused]
// float Temperature; // Offset: 236 Size: 4
//
// }
//
//
// Resource Bindings:
//
// Name Type Format Dim Slot Elements
// ------------------------------ ---------- ------- ----------- ---- --------
// AnisoSampler sampler NA NA 0 1
// AnisoClampSampler sampler NA NA 1 1
// DiffuseCube texture float4 cube 0 1
// NormalCube texture float4 cube 1 1
// DiffuseArray texture float4 2darray 2 1
// $Globals cbuffer NA NA 0 1
//
//
//
// Input signature:
//
// Name Index Mask Register SysValue Format Used
// -------------------- ----- ------ -------- -------- ------ ------
// SV_POSITION 0 xyzw 0 POS float
// TEXCOORD 0 xyz 1 NONE float xyz
// TEXCOORD 2 w 1 NONE float w
// TEXCOORD 1 xyz 2 NONE float xyz
// TEXCOORD 3 w 2 NONE float w
//
//
// Output signature:
//
// Name Index Mask Register SysValue Format Used
// -------------------- ----- ------ -------- -------- ------ ------
// SV_TARGET 0 xyzw 0 TARGET float xyzw
// SV_TARGET 1 xyzw 1 TARGET float xyzw
// SV_TARGET 2 xyzw 2 TARGET float xyzw
//
// Pixel shader, Shader Model 4.0. Blends three layers of the DiffuseArray
// texture array (t2), optionally mixes in the DiffuseCube/NormalCube cube maps,
// and writes three render targets (o0 colour, o1 normal, o2.x = TEXCOORD3).
//
// Constant-buffer layout, derived from the $Globals offsets listed above
// (byte offset / 16 = register, remainder selects the component):
//   cb0[0].x    UseCubeBlending
//   cb0[13].xyz CameraPosition
//   cb0[14].x   Radius
//   cb0[14].y   SlopeIntensity
//   cb0[14].w   Temperature
// Inputs: v1.xyz = TEXCOORD0, v1.w = TEXCOORD2, v2.xyz = TEXCOORD1, v2.w = TEXCOORD3.
ps_4_0
dcl_constantbuffer cb0[15], immediateIndexed
dcl_sampler s0, mode_default
dcl_sampler s1, mode_default
dcl_resource_texturecube (float,float,float,float) t0
dcl_resource_texturecube (float,float,float,float) t1
dcl_resource_texture2darray (float,float,float,float) t2
dcl_input_ps linear v1.xyz
dcl_input_ps linear v1.w
dcl_input_ps linear v2.xyz
dcl_input_ps linear v2.w
dcl_output o0.xyzw
dcl_output o1.xyzw
dcl_output o2.xyzw
dcl_temps 10
// r0.x = dot(v1.xyz, v1.xyz); r1.xyz = normalize(v1.xyz)
dp3 r0.x, v1.xyzx, v1.xyzx
rsq r0.y, r0.x
mul r1.xyz, r0.yyyy, v1.xyzx
// r0.yzw = normalize(v2.xyz)
dp3 r0.y, v2.xyzx, v2.xyzx
rsq r0.y, r0.y
mul r0.yzw, r0.yyyy, v2.xxyz
// Per-axis blend weights: r2.xyz = max((|r0.yzw| - 0.2) * 7, 0), then divided
// by the sum of the three components.
add r2.xyz, |r0.yzwy|, l(-0.200000, -0.200000, -0.200000, 0.000000)
mul r2.xyz, r2.xyzx, l(7.000000, 7.000000, 7.000000, 0.000000)
max r2.xyz, r2.xyzx, l(0.000000, 0.000000, 0.000000, 0.000000)
add r2.w, r2.y, r2.x
add r2.w, r2.z, r2.w
div r2.xyz, r2.xyzx, r2.wwww
// r3.xyz = 2 * v1.zxx
mul r3.xyz, v1.zxxz, l(2.000000, 2.000000, 2.000000, 0.000000)
// r4.xyz = sign(r0.yzw): the two integer compare masks are subtracted and the
// result converted to float.
lt r4.xyz, l(0.000000, 0.000000, 0.000000, 0.000000), r0.yzwy
lt r5.xyz, r0.yzwy, l(0.000000, 0.000000, 0.000000, 0.000000)
iadd r4.xyz, r5.xyzx, -r4.xyzx
itof r4.xyz, r4.xyzx
// U coordinates of the three projections, flipped by the sign of the normal:
// r5.x = r3.x * r4.x, r5.z = r3.y * r4.y, r3.x = -(r3.z * r4.z).
mul r5.xz, r3.xxyx, r4.xxyx
mul r3.x, r3.z, -r4.z
// r0.x = length(v1.xyz) / Radius - 1
sqrt r0.x, r0.x
div r0.x, r0.x, cb0[14].x
add r0.x, r0.x, l(-1.000000)
// Temperature-derived terms:
//   r2.w = Temperature * 4500 + 4500
//   r4.x = max(1000 - 1000 * |Temperature|, 40)
//   r4.y = max(Temperature, 0)
mad r2.w, cb0[14].w, l(4500.000000), l(4500.000000)
mad r4.x, -|cb0[14].w|, l(1000.000000), l(1000.000000)
max r4.x, r4.x, l(40.000000)
max r4.y, cb0[14].w, l(0.000000)
// r4.z = max(1 - pow(|dot(r0.yzw, r1.xyz)|, SlopeIntensity), 0); pow is
// expanded to log / mul / exp.
dp3 r4.z, r0.yzwy, r1.xyzx
log r4.z, |r4.z|
mul r4.z, r4.z, cb0[14].y
exp r4.z, r4.z
add r4.z, -r4.z, l(1.000000)
max r4.z, r4.z, l(0.000000)
// r4.z = slope * 3000, r4.w = slope * 500
mul r4.zw, r4.zzzz, l(0.000000, 0.000000, 3000.000000, 500.000000)
// r4.y = saturate((r0.x * 6371000 + slope * 3000 - (r4.y * 3000 + 2000)) * 0.0005)
// NOTE(review): 6371000 is presumably a planet radius in metres - confirm
// against the HLSL source.
mad r4.z, r0.x, l(6371000.000000), r4.z
mad r4.y, r4.y, l(3000.000000), l(2000.000000)
add r4.y, -r4.y, r4.z
mul_sat r4.y, r4.y, l(0.000500)
// r4.z = 1 - r4.y
add r4.z, -r4.y, l(1.000000)
// r0.x = saturate((r0.x * 6371000 - slope * 500 - lo) / (hi - lo)) with
// lo = r2.w - r4.x and hi = r2.w + r4.x (the Temperature terms above).
mad r0.x, r0.x, l(6371000.000000), -r4.w
add r4.w, r2.w, -r4.x
add r2.w, r2.w, r4.x
add r0.x, r0.x, -r4.w
add r2.w, -r4.w, r2.w
div_sat r0.x, r0.x, r2.w
// Layer weights: layer 0 = r2.w = (1 - r4.y) - r0.x, layer 1 = r4.x = r4.y - r0.x,
// layer 2 = r0.x.
add r2.w, -r0.x, r4.z
add r4.x, -r0.x, r4.y
// ---- Layer 0 (array slice 0) ----
// r4.y = (layer-0 weight > 0); it is only tested by the if_nz further down.
lt r4.y, l(0.000000), r2.w
mul r5.y, v1.y, l(-2.000000)
mov r5.w, l(0)
// Three samples of t2 at slice 0, one per projection. All three are issued
// before the if_nz, i.e. they are not guarded by the branch.
sample r6.xyzw, r5.xywx, t2.xyzw, s0
// r7.x = r5.z (second projection's U), r7.z = r5.w = slice 0
mov r7.xz, r5.zzwz
mul r7.y, v1.z, l(-2.000000)
sample r8.xyzw, r7.xyzx, t2.xyzw, s0
mul r3.y, v1.y, l(-2.000000)
// r3.z = 0 (slice for this sample), r3.w = 1 (slice used by the layer-1 sample)
mov r3.zw, l(0,0,0,1.000000)
sample r9.xyzw, r3.xyzx, t2.xyzw, s0
// Only the arithmetic is conditional:
// r4.yzw = (r6 * r2.x + r8 * r2.y + r9 * r2.z) * layer-0 weight, else 0.
if_nz r4.y
mul r4.yzw, r2.yyyy, r8.xxyz
mad r4.yzw, r6.xxyz, r2.xxxx, r4.yyzw
mad r4.yzw, r9.xxyz, r2.zzzz, r4.yyzw
mul r4.yzw, r2.wwww, r4.yyzw
else
mov r4.yzw, l(0,0,0,0)
endif
// ---- Layer 1 (array slice 1) ----
lt r2.w, l(0.000000), r4.x
// r5.z = 1 (slice for the next sample), r5.w = 2 (slice used by layer 2)
mov r5.zw, l(0,0,1.000000,2.000000)
sample r6.xyzw, r5.xyzx, t2.xyzw, s0
mov r7.w, l(1.000000)
sample r8.xyzw, r7.xywx, t2.xyzw, s0
sample r9.xyzw, r3.xywx, t2.xyzw, s0
// r4.yzw += layer-1 weight * (r6 * r2.x + r8 * r2.y + r9 * r2.z)
if_nz r2.w
mul r8.xyz, r2.yyyy, r8.xyzx
mad r6.xyz, r6.xyzx, r2.xxxx, r8.xyzx
mad r6.xyz, r9.xyzx, r2.zzzz, r6.xyzx
mad r4.yzw, r4.xxxx, r6.xxyz, r4.yyzw
endif
// ---- Layer 2 (array slice 2) ----
lt r2.w, l(0.000000), r0.x
sample r5.xyzw, r5.xywx, t2.xyzw, s0
mov r7.z, l(2.000000)
sample r6.xyzw, r7.xyzx, t2.xyzw, s0
mov r3.z, l(2.000000)
sample r3.xyzw, r3.xyzx, t2.xyzw, s0
// r4.yzw += layer-2 weight * (r5 * r2.x + r6 * r2.y + r3 * r2.z). The partial
// sum is kept in r2.xyw because r2.z is still read as a weight on the next line.
if_nz r2.w
mul r6.xyz, r2.yyyy, r6.xyzx
mad r2.xyw, r5.xyxz, r2.xxxx, r6.xyxz
mad r2.xyz, r3.xyzx, r2.zzzz, r2.xywx
mad r4.yzw, r0.xxxx, r2.xxyz, r4.yyzw
endif
// ---- Optional cube-map blend, taken when UseCubeBlending is non-zero ----
if_nz cb0[0].x
// Both cube maps are sampled along (r1.x, r1.y, -r1.z) with sampler s1:
// r2 = NormalCube sample, r1 = DiffuseCube sample (overwrites the direction).
mov r1.w, -r1.z
sample r2.xyzw, r1.xywx, t1.xyzw, s1
sample r1.xyzw, r1.xywx, t0.xyzw, s1
mad r2.xyz, r2.xyzx, l(2.000000, 2.000000, 2.000000, 0.000000), -r0.yzwy
// r0.x = saturate((length(CameraPosition - v1.xyz) / Radius - 0.015) * 19.985)
add r3.xyz, -v1.xyzx, cb0[13].xyzx
dp3 r0.x, r3.xyzx, r3.xyzx
sqrt r0.x, r0.x
div r0.x, r0.x, cb0[14].x
add r0.x, r0.x, l(-0.015000)
mul_sat r0.x, r0.x, l(19.985001)
// r0.yzw = normalize(lerp(r0.yzw, 2 * NormalCube - 1, r0.x))
add r2.xyz, r2.xyzx, l(-1.000000, -1.000000, -1.000000, 0.000000)
mad r2.xyz, r0.xxxx, r2.xyzx, r0.yzwy
dp3 r1.w, r2.xyzx, r2.xyzx
rsq r1.w, r1.w
mul r0.yzw, r1.wwww, r2.xxyz
// r4.yzw = lerp(r4.yzw, DiffuseCube sample, r0.x)
add r1.xyz, -r4.yzwy, r1.xyzx
mad r4.yzw, r0.xxxx, r1.xxyz, r4.yyzw
endif
// o0.xyz = lerp((0, 0, 1), r4.yzw, saturate((v1.w + 0.001) * 1000)); o0.w = 1
add r0.x, v1.w, l(0.001000)
mul_sat r0.x, r0.x, l(1000.000000)
add r1.xyz, r4.yzwy, l(-0.000000, -0.000000, -1.000000, 0.000000)
mad o0.xyz, r0.xxxx, r1.xyzx, l(0.000000, 0.000000, 1.000000, 0.000000)
mov o0.w, l(1.000000)
// o1 = (r0.yzw, 1); o2 = (v2.w, 0, 0, 1)
mov o1.xyz, r0.yzwy
mov o1.w, l(1.000000)
mov o2.x, v2.w
mov o2.yzw, l(0,0,0,1.000000)
ret
// Approximately 117 instruction slots used

I'm not really sure how to interpret this.

t2 is the texture array in question, so I guess this part is responsible for sampling:

// Excerpt: the three layers read from the texture array t2. For every layer
// the three sample instructions come before the if_nz that tests the layer's
// weight; only the multiply-adds that combine the results are inside the branch.
// r2.xyz are the per-axis weights and r5.x / r5.z / r3.x the U coordinates, all
// computed before this excerpt.
//
// Layer 0: the slice index is 0 (r5.w, r7.z and r3.z are all set to 0).
lt r4.y, l(0.000000), r2.w
mul r5.y, v1.y, l(-2.000000)
mov r5.w, l(0)
sample r6.xyzw, r5.xywx, t2.xyzw, s0
mov r7.xz, r5.zzwz
mul r7.y, v1.z, l(-2.000000)
sample r8.xyzw, r7.xyzx, t2.xyzw, s0
mul r3.y, v1.y, l(-2.000000)
mov r3.zw, l(0,0,0,1.000000)
sample r9.xyzw, r3.xyzx, t2.xyzw, s0
// r4.yzw = (r6 * r2.x + r8 * r2.y + r9 * r2.z) * r2.w, or 0 when the test fails
if_nz r4.y
mul r4.yzw, r2.yyyy, r8.xxyz
mad r4.yzw, r6.xxyz, r2.xxxx, r4.yyzw
mad r4.yzw, r9.xxyz, r2.zzzz, r4.yyzw
mul r4.yzw, r2.wwww, r4.yyzw
else
mov r4.yzw, l(0,0,0,0)
endif
// Layer 1: the slice index is 1 (r5.z, r7.w, and r3.w set earlier); weight is r4.x.
lt r2.w, l(0.000000), r4.x
mov r5.zw, l(0,0,1.000000,2.000000)
sample r6.xyzw, r5.xyzx, t2.xyzw, s0
mov r7.w, l(1.000000)
sample r8.xyzw, r7.xywx, t2.xyzw, s0
sample r9.xyzw, r3.xywx, t2.xyzw, s0
// r4.yzw += r4.x * (r6 * r2.x + r8 * r2.y + r9 * r2.z)
if_nz r2.w
mul r8.xyz, r2.yyyy, r8.xyzx
mad r6.xyz, r6.xyzx, r2.xxxx, r8.xyzx
mad r6.xyz, r9.xyzx, r2.zzzz, r6.xyzx
mad r4.yzw, r4.xxxx, r6.xxyz, r4.yyzw
endif
// Layer 2: the slice index is 2 (r5.w set above, r7.z and r3.z set here); weight is r0.x.
lt r2.w, l(0.000000), r0.x
sample r5.xyzw, r5.xywx, t2.xyzw, s0
mov r7.z, l(2.000000)
sample r6.xyzw, r7.xyzx, t2.xyzw, s0
mov r3.z, l(2.000000)
sample r3.xyzw, r3.xyzx, t2.xyzw, s0
// r4.yzw += r0.x * (r5 * r2.x + r6 * r2.y + r3 * r2.z)
if_nz r2.w
mul r6.xyz, r2.yyyy, r6.xyzx
mad r2.xyw, r5.xyxz, r2.xxxx, r6.xyxz
mad r2.xyz, r3.xyzx, r2.zzzz, r2.xywx
mad r4.yzw, r0.xxxx, r2.xxyz, r4.yyzw
endif

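Indeed, the compiler doesn't seem to have moved the sample instructions inside the branches: in this bytecode each group of three samples is issued before its if_nz, so all nine samples of t2 are outside the conditionals.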

I know that I'm bottlenecking on texture sampling (bandwidth) though, which isn't a big surprise because I do way too much texture sampling.

When I add the if statements, the performance increases massively, so I'm assuming the GPU is somehow able to skip the sample instructions if I add these conditions?