The zero case is kind of degenerate, anyway. Assuming we assign the index variable to the matching array slot, and print a random value from the array after the loop to prevent it being optimised away, the results are even stranger. Of course, a contrived case can be set up to prove any point, so it is important to do an equivalent comparison.

At -O2 optimisation, both loops get unrolled, and converted to SSE. However, the up case weighs in at 120 lines of ASM, whereas the down case is a whopping 164.

Now, my SSE isn't up to scratch enough to tell which of these is more efficient, but it's fairly obvious which one is simpler.

// up.c
#include <stdio.h>
#include <stdlib.h>

/*
 * Fill a 10000-byte buffer by counting upwards, then print one
 * pseudo-randomly chosen element so the compiler cannot optimise
 * the whole loop away.  (rand() is deliberately unseeded; the
 * experiment only needs *some* live read of buf.)
 */
int main()
{
    char buf[10000];
    const unsigned int limit = 10000;

    unsigned int idx = 0;
    while (idx != limit) {
        /* narrowing to char is intentional: value pattern, not data */
        buf[idx] = idx;
        ++idx;
    }

    printf("%d\n", buf[rand() % 10000]);
}

// down.c
#include <stdio.h>
#include <stdlib.h>

/*
 * Fill a 10000-byte buffer by counting downwards, then print one
 * pseudo-randomly chosen element so the compiler cannot optimise
 * the whole loop away.  Mirrors up.c exactly except for the loop
 * direction: indices run limit-1 down to 0, same as for(i = limit; i--;).
 */
int main()
{
    char buf[10000];
    const unsigned int limit = 10000;

    unsigned int idx = limit;
    while (idx != 0) {
        --idx;
        /* narrowing to char is intentional: value pattern, not data */
        buf[idx] = idx;
    }

    printf("%d\n", buf[rand() % 10000]);
}

// up.s .section __TEXT,__text,regular,pure_instructions .section __TEXT,__const .align 4 LCPI0_0: .byte 0 ## 0x0 .byte 1 ## 0x1 .byte 2 ## 0x2 .byte 3 ## 0x3 .byte 4 ## 0x4 .byte 5 ## 0x5 .byte 6 ## 0x6 .byte 7 ## 0x7 .byte 8 ## 0x8 .byte 9 ## 0x9 .byte 10 ## 0xa .byte 11 ## 0xb .byte 12 ## 0xc .byte 13 ## 0xd .byte 14 ## 0xe .byte 15 ## 0xf LCPI0_1: .byte 16 ## 0x10 .byte 17 ## 0x11 .byte 18 ## 0x12 .byte 19 ## 0x13 .byte 20 ## 0x14 .byte 21 ## 0x15 .byte 22 ## 0x16 .byte 23 ## 0x17 .byte 24 ## 0x18 .byte 25 ## 0x19 .byte 26 ## 0x1a .byte 27 ## 0x1b .byte 28 ## 0x1c .byte 29 ## 0x1d .byte 30 ## 0x1e .byte 31 ## 0x1f .section __TEXT,__text,regular,pure_instructions .globl _main .align 4, 0x90 _main: ## @main .cfi_startproc ## BB#0: ## %vector.ph pushq %rbp Ltmp3: .cfi_def_cfa_offset 16 Ltmp4: .cfi_offset %rbp, -16 movq %rsp, %rbp Ltmp5: .cfi_def_cfa_register %rbp pushq %rbx subq $10008, %rsp ## imm = 0x2718 Ltmp6: .cfi_offset %rbx, -24 movq ___stack_chk_guard@GOTPCREL(%rip), %rbx movq (%rbx), %rax movq %rax, -16(%rbp) xorl %eax, %eax movdqa LCPI0_0(%rip), %xmm0 movdqa LCPI0_1(%rip), %xmm1 .align 4, 0x90 LBB0_1: ## %vector.body ## =>This Inner Loop Header: Depth=1 movd %eax, %xmm2 punpcklbw %xmm2, %xmm2 ## xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] punpcklbw %xmm2, %xmm2 ## xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] pshufd $0, %xmm2, %xmm2 ## xmm2 = xmm2[0,0,0,0] movdqa %xmm2, %xmm3 paddb %xmm0, %xmm3 paddb %xmm1, %xmm2 movdqa %xmm3, -10016(%rbp,%rax) movdqa %xmm2, -10000(%rbp,%rax) addq $32, %rax cmpq $9984, %rax ## imm = 0x2700 jne LBB0_1 ## BB#2: ## %scalar.ph.preheader xorl %eax, %eax .align 4, 0x90 LBB0_3: ## %scalar.ph ## =>This Inner Loop Header: Depth=1 movb %al, -32(%rbp,%rax) incq %rax cmpq $16, %rax jne LBB0_3 ## BB#4: callq _rand cltq imulq $1759218605, %rax, %rcx ## imm = 0x68DB8BAD movq %rcx, %rdx shrq $63, %rdx sarq $44, %rcx addl %edx, %ecx imull $10000, %ecx, %ecx ## imm = 0x2710 subl %ecx, %eax cltq movsbl -10016(%rbp,%rax), %esi leaq 
L_.str(%rip), %rdi xorl %eax, %eax callq _printf movq (%rbx), %rax cmpq -16(%rbp), %rax jne LBB0_6 ## BB#5: xorl %eax, %eax addq $10008, %rsp ## imm = 0x2718 popq %rbx popq %rbp ret LBB0_6: callq ___stack_chk_fail .cfi_endproc .section __TEXT,__cstring,cstring_literals L_.str: ## @.str .asciz "%d\n" .subsections_via_symbols

// down.s .section __TEXT,__text,regular,pure_instructions .section __TEXT,__const .align 4 LCPI0_0: .quad -8 ## 0xfffffffffffffff8 .quad -9 ## 0xfffffffffffffff7 LCPI0_1: .quad -4 ## 0xfffffffffffffffc .quad -5 ## 0xfffffffffffffffb LCPI0_2: .quad -12 ## 0xfffffffffffffff4 .quad -13 ## 0xfffffffffffffff3 LCPI0_3: .quad -2 ## 0xfffffffffffffffe .quad -3 ## 0xfffffffffffffffd LCPI0_4: .quad -10 ## 0xfffffffffffffff6 .quad -11 ## 0xfffffffffffffff5 LCPI0_5: .quad -6 ## 0xfffffffffffffffa .quad -7 ## 0xfffffffffffffff9 LCPI0_6: .quad -14 ## 0xfffffffffffffff2 .quad -15 ## 0xfffffffffffffff1 LCPI0_7: .byte 15 ## 0xf .byte 14 ## 0xe .byte 13 ## 0xd .byte 12 ## 0xc .byte 11 ## 0xb .byte 10 ## 0xa .byte 9 ## 0x9 .byte 8 ## 0x8 .byte 7 ## 0x7 .byte 6 ## 0x6 .byte 5 ## 0x5 .byte 4 ## 0x4 .byte 3 ## 0x3 .byte 2 ## 0x2 .byte 1 ## 0x1 .byte 0 ## 0x0 .section __TEXT,__text,regular,pure_instructions .globl _main .align 4, 0x90 _main: ## @main .cfi_startproc ## BB#0: ## %vector.ph pushq %rbp Ltmp3: .cfi_def_cfa_offset 16 Ltmp4: .cfi_offset %rbp, -16 movq %rsp, %rbp Ltmp5: .cfi_def_cfa_register %rbp pushq %rbx subq $10136, %rsp ## imm = 0x2798 Ltmp6: .cfi_offset %rbx, -24 movq ___stack_chk_guard@GOTPCREL(%rip), %rbx movq (%rbx), %rax movq %rax, -16(%rbp) movl $10000, %eax ## imm = 0x2710 movq $-1, %rcx movd %rcx, %xmm0 pslldq $8, %xmm0 movdqa LCPI0_1(%rip), %xmm9 movdqa LCPI0_2(%rip), %xmm10 movdqa LCPI0_3(%rip), %xmm11 movdqa LCPI0_4(%rip), %xmm12 movdqa LCPI0_5(%rip), %xmm13 movdqa LCPI0_6(%rip), %xmm14 movdqa LCPI0_7(%rip), %xmm15 .align 4, 0x90 LBB0_1: ## %vector.body ## =>This Inner Loop Header: Depth=1 leaq -1(%rax), %rcx movd %rcx, %xmm2 movlhps %xmm2, %xmm2 ## xmm2 = xmm2[0,0] movaps %xmm2, %xmm3 movaps %xmm2, %xmm4 movaps %xmm2, %xmm5 movaps %xmm2, %xmm6 movaps %xmm2, %xmm7 movaps %xmm2, %xmm1 movaps %xmm2, %xmm8 paddq %xmm13, %xmm8 paddq %xmm14, %xmm2 movdqa %xmm2, -10144(%rbp) movdqa %xmm8, -10080(%rbp) punpcklbw %xmm2, %xmm8 ## xmm8 = 
xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3],xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] paddq %xmm11, %xmm7 paddq %xmm12, %xmm1 movdqa %xmm1, -10112(%rbp) movdqa %xmm7, -10048(%rbp) punpcklbw %xmm1, %xmm7 ## xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3],xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] punpcklbw %xmm8, %xmm7 ## xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] paddq %xmm0, %xmm3 paddq LCPI0_0(%rip), %xmm4 paddq %xmm9, %xmm5 paddq %xmm10, %xmm6 movdqa %xmm6, -10128(%rbp) movdqa %xmm5, -10064(%rbp) movdqa %xmm4, -10096(%rbp) movdqa %xmm3, -10032(%rbp) punpcklbw %xmm6, %xmm5 ## xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] punpcklbw %xmm4, %xmm3 ## xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] punpcklbw %xmm5, %xmm3 ## xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] punpcklbw %xmm7, %xmm3 ## xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] movd -10136(%rbp), %xmm1 movd -10072(%rbp), %xmm2 punpcklbw %xmm1, %xmm2 ## xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] movd -10104(%rbp), %xmm1 movd -10040(%rbp), %xmm4 punpcklbw %xmm1, %xmm4 ## xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] punpcklbw %xmm2, %xmm4 ## xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] movd -10120(%rbp), %xmm1 movd 
-10056(%rbp), %xmm2 punpcklbw %xmm1, %xmm2 ## xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] movd -10088(%rbp), %xmm1 movd -10024(%rbp), %xmm5 punpcklbw %xmm1, %xmm5 ## xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] punpcklbw %xmm2, %xmm5 ## xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] punpcklbw %xmm4, %xmm5 ## xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] punpcklbw %xmm5, %xmm3 ## xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] pshufb %xmm15, %xmm3 movdqu %xmm3, -10032(%rbp,%rax) addq $-16, %rax jne LBB0_1 ## BB#2: ## %middle.block callq _rand cltq imulq $1759218605, %rax, %rcx ## imm = 0x68DB8BAD movq %rcx, %rdx shrq $63, %rdx sarq $44, %rcx addl %edx, %ecx imull $10000, %ecx, %ecx ## imm = 0x2710 subl %ecx, %eax cltq movsbl -10016(%rbp,%rax), %esi leaq L_.str(%rip), %rdi xorl %eax, %eax callq _printf movq (%rbx), %rax cmpq -16(%rbp), %rax jne LBB0_4 ## BB#3: ## %middle.block xorl %eax, %eax addq $10136, %rsp ## imm = 0x2798 popq %rbx popq %rbp ret LBB0_4: ## %middle.block callq ___stack_chk_fail .cfi_endproc .section __TEXT,__cstring,cstring_literals L_.str: ## @.str .asciz "%d\n" .subsections_via_symbols