• Create Account

## loop break

54 replies to this topic

### #41swiftcoder  Senior Moderators

17826
Like
2Likes
Like

Posted 01 May 2014 - 07:49 AM

Of course a contrived case can be set up to prove any point, so it is important to do an equivalent comparison.

The zero case is kind of degenerate, anyways. Assuming we assign the index variable to the matching array slot, and print a random value from the array after the loop to prevent it being optimised away, the results are even stranger...

At -O2 optimisation, both loops get unrolled, and converted to SSE. However, the up case weighs in at 120 lines of ASM, whereas the down case is a whopping 164.

Now, my SSE isn't up to scratch enough to tell which of these is more efficient, but it's fairly obvious which one is simpler

// up.c
#include <stdio.h>
#include <stdlib.h>

// Fills a 10000-byte buffer with an ascending index pattern using an
// UP-counting loop, then prints one randomly chosen element so the compiler
// cannot optimise the whole loop (and the array) away.
int main() {
char buf[10000];
const unsigned int limit = 10000;

// i runs 0, 1, ..., limit-1. Stores the low byte of i into each slot
// (conversion of values > CHAR_MAX to a signed char is implementation-defined).
for(unsigned int i = 0; i != limit; ++i){
buf[i] = i;
}

// Consuming a random element makes the array contents externally observable.
printf("%d\n", buf[rand() % 10000]);
}
// down.c
#include <stdio.h>
#include <stdlib.h>

// Same buffer-fill as up.c, but with a DOWN-counting loop, for comparing the
// assembly the optimiser emits for each direction.
int main() {
char buf[10000];
const unsigned int limit = 10000;

// i starts at limit; the condition `i--` tests the old value and then
// decrements, so the body sees i = limit-1, ..., 1, 0 — identical coverage
// to the up-counting version.
for(unsigned int i = limit; i--;){
buf[i] = i;
}

// Consuming a random element makes the array contents externally observable.
printf("%d\n", buf[rand() % 10000]);
}
// up.s
.section	__TEXT,__text,regular,pure_instructions
.section	__TEXT,__const
.align	4
LCPI0_0:
.byte	0                       ## 0x0
.byte	1                       ## 0x1
.byte	2                       ## 0x2
.byte	3                       ## 0x3
.byte	4                       ## 0x4
.byte	5                       ## 0x5
.byte	6                       ## 0x6
.byte	7                       ## 0x7
.byte	8                       ## 0x8
.byte	9                       ## 0x9
.byte	10                      ## 0xa
.byte	11                      ## 0xb
.byte	12                      ## 0xc
.byte	13                      ## 0xd
.byte	14                      ## 0xe
.byte	15                      ## 0xf
LCPI0_1:
.byte	16                      ## 0x10
.byte	17                      ## 0x11
.byte	18                      ## 0x12
.byte	19                      ## 0x13
.byte	20                      ## 0x14
.byte	21                      ## 0x15
.byte	22                      ## 0x16
.byte	23                      ## 0x17
.byte	24                      ## 0x18
.byte	25                      ## 0x19
.byte	26                      ## 0x1a
.byte	27                      ## 0x1b
.byte	28                      ## 0x1c
.byte	29                      ## 0x1d
.byte	30                      ## 0x1e
.byte	31                      ## 0x1f
.section	__TEXT,__text,regular,pure_instructions
.globl	_main
.align	4, 0x90
_main:                                  ## @main
.cfi_startproc
## BB#0:                                ## %vector.ph
pushq	%rbp
Ltmp3:
.cfi_def_cfa_offset 16
Ltmp4:
.cfi_offset %rbp, -16
movq	%rsp, %rbp
Ltmp5:
.cfi_def_cfa_register %rbp
pushq	%rbx
subq	$10008, %rsp            ## imm = 0x2718
Ltmp6:
.cfi_offset %rbx, -24
movq	___stack_chk_guard@GOTPCREL(%rip), %rbx
movq	(%rbx), %rax
movq	%rax, -16(%rbp)
xorl	%eax, %eax
movdqa	LCPI0_0(%rip), %xmm0
movdqa	LCPI0_1(%rip), %xmm1
.align	4, 0x90
LBB0_1:                                 ## %vector.body
                                        ## =>This Inner Loop Header: Depth=1
movd	%eax, %xmm2
punpcklbw	%xmm2, %xmm2    ## xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
punpcklbw	%xmm2, %xmm2    ## xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
pshufd	$0, %xmm2, %xmm2        ## xmm2 = xmm2[0,0,0,0]
movdqa	%xmm2, %xmm3
movdqa	%xmm3, -10016(%rbp,%rax)
movdqa	%xmm2, -10000(%rbp,%rax)
addq	$32, %rax
cmpq	$9984, %rax             ## imm = 0x2700
jne	LBB0_1
xorl	%eax, %eax
.align	4, 0x90
LBB0_3:                                 ## %scalar.ph
## =>This Inner Loop Header: Depth=1
movb	%al, -32(%rbp,%rax)
incq	%rax
cmpq	$16, %rax
jne	LBB0_3
## BB#4:
callq	_rand
cltq
imulq	$1759218605, %rax, %rcx ## imm = 0x68DB8BAD
movq	%rcx, %rdx
shrq	$63, %rdx
sarq	$44, %rcx
imull	$10000, %ecx, %ecx      ## imm = 0x2710
subl	%ecx, %eax
cltq
movsbl	-10016(%rbp,%rax), %esi
leaq	L_.str(%rip), %rdi
xorl	%eax, %eax
callq	_printf
movq	(%rbx), %rax
cmpq	-16(%rbp), %rax
jne	LBB0_6
## BB#5:
xorl	%eax, %eax
addq	$10008, %rsp            ## imm = 0x2718
popq	%rbx
popq	%rbp
ret
LBB0_6:
callq	___stack_chk_fail
.cfi_endproc

.section	__TEXT,__cstring,cstring_literals
L_.str:                                 ## @.str
.asciz	"%d\n"

.subsections_via_symbols
// down.s
.section	__TEXT,__text,regular,pure_instructions
.section	__TEXT,__const
.align	4
LCPI0_0:
LCPI0_1:
LCPI0_2:
LCPI0_3:
LCPI0_4:
LCPI0_5:
LCPI0_6:
LCPI0_7:
.byte	15                      ## 0xf
.byte	14                      ## 0xe
.byte	13                      ## 0xd
.byte	12                      ## 0xc
.byte	11                      ## 0xb
.byte	10                      ## 0xa
.byte	9                       ## 0x9
.byte	8                       ## 0x8
.byte	7                       ## 0x7
.byte	6                       ## 0x6
.byte	5                       ## 0x5
.byte	4                       ## 0x4
.byte	3                       ## 0x3
.byte	2                       ## 0x2
.byte	1                       ## 0x1
.byte	0                       ## 0x0
.section	__TEXT,__text,regular,pure_instructions
.globl	_main
.align	4, 0x90
_main:                                  ## @main
.cfi_startproc
## BB#0:                                ## %vector.ph
pushq	%rbp
Ltmp3:
.cfi_def_cfa_offset 16
Ltmp4:
.cfi_offset %rbp, -16
movq	%rsp, %rbp
Ltmp5:
.cfi_def_cfa_register %rbp
pushq	%rbx
subq	$10136, %rsp            ## imm = 0x2798
Ltmp6:
.cfi_offset %rbx, -24
movq	___stack_chk_guard@GOTPCREL(%rip), %rbx
movq	(%rbx), %rax
movq	%rax, -16(%rbp)
movl	$10000, %eax            ## imm = 0x2710
movq	$-1, %rcx
movd	%rcx, %xmm0
pslldq	$8, %xmm0
movdqa	LCPI0_1(%rip), %xmm9
movdqa	LCPI0_2(%rip), %xmm10
movdqa	LCPI0_3(%rip), %xmm11
movdqa	LCPI0_4(%rip), %xmm12
movdqa	LCPI0_5(%rip), %xmm13
movdqa	LCPI0_6(%rip), %xmm14
movdqa	LCPI0_7(%rip), %xmm15
.align	4, 0x90
LBB0_1:                                 ## %vector.body
## =>This Inner Loop Header: Depth=1
leaq	-1(%rax), %rcx
movd	%rcx, %xmm2
movlhps	%xmm2, %xmm2            ## xmm2 = xmm2[0,0]
movaps	%xmm2, %xmm3
movaps	%xmm2, %xmm4
movaps	%xmm2, %xmm5
movaps	%xmm2, %xmm6
movaps	%xmm2, %xmm7
movaps	%xmm2, %xmm1
movaps	%xmm2, %xmm8
movdqa	%xmm2, -10144(%rbp)
movdqa	%xmm8, -10080(%rbp)
punpcklbw	%xmm2, %xmm8    ## xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3],xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7]
movdqa	%xmm1, -10112(%rbp)
movdqa	%xmm7, -10048(%rbp)
punpcklbw	%xmm1, %xmm7    ## xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3],xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7]
punpcklbw	%xmm8, %xmm7    ## xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
movdqa	%xmm6, -10128(%rbp)
movdqa	%xmm5, -10064(%rbp)
movdqa	%xmm4, -10096(%rbp)
movdqa	%xmm3, -10032(%rbp)
punpcklbw	%xmm6, %xmm5    ## xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
punpcklbw	%xmm4, %xmm3    ## xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
punpcklbw	%xmm5, %xmm3    ## xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
punpcklbw	%xmm7, %xmm3    ## xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
movd	-10136(%rbp), %xmm1
movd	-10072(%rbp), %xmm2
punpcklbw	%xmm1, %xmm2    ## xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
movd	-10104(%rbp), %xmm1
movd	-10040(%rbp), %xmm4
punpcklbw	%xmm1, %xmm4    ## xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
punpcklbw	%xmm2, %xmm4    ## xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
movd	-10120(%rbp), %xmm1
movd	-10056(%rbp), %xmm2
punpcklbw	%xmm1, %xmm2    ## xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
movd	-10088(%rbp), %xmm1
movd	-10024(%rbp), %xmm5
punpcklbw	%xmm1, %xmm5    ## xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
punpcklbw	%xmm2, %xmm5    ## xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
punpcklbw	%xmm4, %xmm5    ## xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
punpcklbw	%xmm5, %xmm3    ## xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
pshufb	%xmm15, %xmm3
movdqu	%xmm3, -10032(%rbp,%rax)
addq	$-16, %rax
jne	LBB0_1
## BB#2:                                ## %middle.block
callq	_rand
cltq
imulq	$1759218605, %rax, %rcx ## imm = 0x68DB8BAD
movq	%rcx, %rdx
shrq	$63, %rdx
sarq	$44, %rcx
imull	$10000, %ecx, %ecx      ## imm = 0x2710
subl	%ecx, %eax
cltq
movsbl	-10016(%rbp,%rax), %esi
leaq	L_.str(%rip), %rdi
xorl	%eax, %eax
callq	_printf
movq	(%rbx), %rax
cmpq	-16(%rbp), %rax
jne	LBB0_4
## BB#3:                                ## %middle.block
xorl	%eax, %eax
addq	$10136, %rsp            ## imm = 0x2798
popq	%rbx
popq	%rbp
ret
LBB0_4:                                 ## %middle.block
callq	___stack_chk_fail
.cfi_endproc

.section	__TEXT,__cstring,cstring_literals
L_.str:                                 ## @.str
.asciz	"%d\n"

.subsections_via_symbols

Tristam MacDonald - Software Engineer @ Amazon - [swiftcoding] [GitHub]

### #42mhagain  Members

12439
Like
0Likes
Like

Posted 03 May 2014 - 06:03 PM

At this stage I feel a new rule of the internet coming on.

"There is no coding standard so reprehensible, so obviously wrong-looking, that somebody somewhere won't try to defend it."  (See also comma-first.)

It appears that the gentleman thought C++ was extremely difficult and he was overjoyed that the machine was absorbing it; he understood that good C++ is difficult but the best C++ is well-nigh unintelligible.

### #43Ectara  Members

3097
Like
0Likes
Like

Posted 03 May 2014 - 06:08 PM

It's actually really cute what happens when you remove the volatile from each case:

Heck, may as well write

;


for both, and watch the compiler optimize away.

### #44Ectara  Members

3097
Like
0Likes
Like

Posted 03 May 2014 - 06:32 PM

Amazing what happens when you actually compare like-with-like, isn't it?

That's not what that is comparing. That's pretty much saying x*0 vs. x - x, where the observer can clearly see that the calculation is useless, and thus do anything they want. If I turn the optimization higher, without volatile, it emits an empty program.

Of course a contrived case can be set up to prove any point, so it is important to do an equivalent comparison.

How is a case where the compiler is permitted to do nothing at all not contrived, again? It feels like you're agreeing with anything that faintly sounds against my argument.

It appears that the optimiser does not know that the two loops are identical, as the more normal upwards loop is optimised to a call to bzero(), whereas the downwards case actually generates a loop.

I acknowledge that the compiler is intelligent enough to convert this to a bzero() call (how it doesn't discard the array entirely is interesting). However, writing zeroes is an example, as the contents of the loop aren't the important part. The fact that the compiler can cheat the test and remove the loop if the loop looks like a certain pattern is not an argument for either loop preference.

To be honest, this doesn't feel like a valid debate anymore. I'm content with its current course of analyzing optimizer behavior.

### #45Ectara  Members

3097
Like
0Likes
Like

Posted 03 May 2014 - 06:35 PM

At -O2 optimisation, both loops get unrolled, and converted to SSE. However, the up case weighs in at 120 lines of ASM, whereas the down case is a whopping 164.

That's scary, where an optimizer can take nearly equal code, and make the one that performs better in theory perform worse in practice. However, this does assert a monopoly of the registers, again.

### #46mhagain  Members

12439
Like
0Likes
Like

Posted 04 May 2014 - 05:16 AM

At -O2 optimisation, both loops get unrolled, and converted to SSE. However, the up case weighs in at 120 lines of ASM, whereas the down case is a whopping 164.

That's scary, where an optimizer can take nearly equal code, and make the one that performs better in theory perform worse in practice. However, this does assert a monopoly of the registers, again.

At this stage I think you really need to read this: http://blogs.msdn.com/b/oldnewthing/archive/2004/12/16/317157.aspx

The technical specifics in it are of course out of date, but the lesson remains valid, and it's instructive to follow him walking through an optimization scenario where he produces code that - going by the disassembly - should be faster in theory, but turns out to be slower in practice because it ends up totally blowing the return address predictor stack.

This is the same Raymond Chen who has credits on early releases of the Linux kernel, by the way.

It appears that the gentleman thought C++ was extremely difficult and he was overjoyed that the machine was absorbing it; he understood that good C++ is difficult but the best C++ is well-nigh unintelligible.

### #47samoth  Members

8965
Like
1Likes
Like

Posted 04 May 2014 - 05:51 AM

It's actually really cute what happens when you remove the volatile from each case:

Heck, may as well write

;


for both, and watch the compiler optimize away.

printf("%d\n", buf[rand() % 10000]);

That is the proper way (or one proper way) of preventing the compiler from optimizing out the loop. Which, contrary to using volatile on the loop counter, does not include some very different (and wrong) assumptions about the variable and its contents and does not screw up the optimizer.

The difference is that Swiftcoder's program is non-trivial (well it is of course kind of "trivial", but it's not trivial insofar as it actually does something by consuming a value of the array) whereas your solution with volatile is a trivial program (one that basically does nothing at all) that inhibits optimization of that variable.

It would probably be entirely legal for a very aggressively optimizing compiler to optimize out the array alltogether, and only count the volatile loop counter up (or down). Because, in the end, whether or not there is an array at all makes no externally observable difference, it's not consumed anywhere, and the program ends right after initializing it.

Edited by samoth, 04 May 2014 - 05:51 AM.

### #48Ectara  Members

3097
Like
0Likes
Like

Posted 04 May 2014 - 07:36 AM

This is the same Raymond Chen who has credits on early releases of the Linux kernel, by the way.

I know who Raymond Chen is, and I've read that article years ago.

That is the proper way (or one proper way) of preventing the compiler from optimizing out the loop. Which, contrary to using volatile on the loop counter, does not include some very different (and wrong) assumptions about the variable and its contents and does not screw up the optimizer.

Okay, his program is more proper. However, that still ignores the fact that the compiler is optimizing his limit away to a constant (cheating the test), which my volatile limit avoided.

### #49rip-off  Moderators

10730
Like
0Likes
Like

Posted 04 May 2014 - 08:10 AM

However, that still ignores the fact that the compiler is optimizing his limit away to a constant (cheating the test), which my volatile limit avoided.

Again, easily avoided:
// up.c
#include <stdio.h>
#include <stdlib.h>

// Variant of up.c where the loop bound is read at runtime, so the compiler
// cannot fold `limit` to a constant and must emit a genuine bounded loop.
int main() {
char buf[10000];
unsigned int limit;
printf("enter a number up to 10000: ");
// NOTE(review): return value unchecked (gcc warns with -Wunused-result);
// if input fails, limit is indeterminate. Also nothing enforces the
// "up to 10000" bound, so larger input would overflow buf — acceptable
// here since this is a codegen comparison, not production code.
scanf("%u", &limit);

for(unsigned int i = 0; i != limit; ++i){
buf[i] = i;
}

printf("%d\n", buf[rand() % 10000]);
}

And the same of the other program.

On my system:
$ gcc -S -O2 *.c -std=c99
down.c: In function ‘main’:
down.c:9:7: warning: ignoring return value of ‘scanf’, declared with attribute warn_unused_result [-Wunused-result]
up.c: In function ‘main’:
up.c:9:7: warning: ignoring return value of ‘scanf’, declared with attribute warn_unused_result [-Wunused-result]
$ diff {up,down}.s
1c1
< 	.file	"up.c"
---
> 	.file	"down.c"
33,34c33,34
< 	movl	24(%esp), %eax
< 	testl	%eax, %eax
---
> 	movl	24(%esp), %edx
> 	testl	%edx, %edx
36c36
< 	subl	$1, %eax
---
> 	xorl	%eax, %eax
41,42c41,42
< 	subl	$1, %eax
< 	cmpl	$-1, %eax
---
> 	addl	$1, %eax
> 	cmpl	%edx, %eax


### #50Ectara  Members

3097
Like
0Likes
Like

Posted 04 May 2014 - 08:30 AM

On my system:

Those two look equal in length, and time complexity, so I have no issues with that.

### #51catslap  Members

39
Like
0Likes
Like

Posted 04 May 2014 - 09:59 AM

Nice to see a simple but interesting question get discussed in detail tbh.

Edited by catslap, 05 May 2014 - 02:23 AM.

### #52Ectara  Members

3097
Like
-1Likes
Like

Posted 04 May 2014 - 10:08 AM

I'm going to be honest with all of you. This seems like a pointless and impossible argument. I knew coming in to it that playing devil's advocate wouldn't make me very popular here.

However, if you're on a platform where you actually care about the code size in terms of free bytes, then unrolling loops and using extra registers for simple bookkeeping is counter-productive.

Things I am arguing:

1. If you're counting bytes, small loops are important.
2. If you're on a low-powered platform, you may not even have that good of an optimizing compiler.
3. If you're on a slow, small-cache/cacheless CPU, that extra register matters.

Things I am NOT arguing:

1. There isn't better code for modern machines and processors.
2. If you [modify the scenario in some way that suits some particular argument], it won't optimize to nothing or some large, but fast, control structure.
3. This is the preferred coding method.
4. Counting down is easier to read.
5. Optimizers are bad at optimizing.

I feel this is turning into a flood of "Look, when I set the limit to zero, the counting up version converts to an AVX loop that plays 'Moonlight Sonata' out of my PC speaker. Take THAT!" and everyone high-fives and up-votes. To be honest, this is all missing the point. Most of the arguments here are throwing out my reasons why I offered a use case, and just attack the flimsiest part of the argument, by offering solutions that make code that is astronomically large for a simple loop, use 2 or more registers just for the loop counting (which, if you only have 4 registers, is a waste), and even do things as bizarre as to make it into a function call.

I suggested that where it matters, it can make a difference. Showing that on a high-speed, giant three-tiered cache, 16 64bit general purpose register processor it doesn't matter is great. However, interpreting these results as being relevant to the original point that I made is misleading.

EDIT: As evidenced by the downvote.

Edited by Ectara, 04 May 2014 - 05:37 PM.

### #53mhagain  Members

12439
Like
0Likes
Like

Posted 04 May 2014 - 10:40 AM

I'm going to be honest with all of you. This seems like a pointless and impossible argument. I knew coming in to it that playing devil's advocate wouldn't make me very popular here.

However, if you're on a platform where you actually care about the code size in terms of free bytes, then unrolling loops and using extra registers for simple bookkeeping is counter-productive.

Things I am arguing:

1. If you're counting bytes, small loops are important.
2. If you're on a low-powered platform, you may not even have that good of an optimizing compiler.
3. If you're on a slow, small-cache/cacheless CPU, that extra register matters.

Things I am NOT arguing:

1. There isn't better code for modern machines and processors.
2. If you [modify the scenario in some way that suits some particular argument], it won't optimize to nothing or some large, but fast, control structure.
3. This is the preferred coding method.
4. Counting down is easier to read.
5. Optimizers are bad at optimizing.

I feel this is turning into a flood of "Look, when I set the limit to zero, the counting up version converts to an AVX loop that plays 'Moonlight Sonata' out of my PC speaker. Take THAT!" and everyone high-fives and up-votes. To be honest, this is all missing the point. Most of the arguments here are throwing out my reasons why I offered a use case, and just attack the flimsiest part of the argument, by offering solutions that make code that is astronomically large for a simple loop, use 2 or more registers just for the loop counting (which, if you only have 4 registers, is a waste), and even do things as bizarre as to make it into a function call.

I suggested that where it matters, it can make a difference. Showing that on a high-speed, giant three-tiered cache, 16 64bit general purpose register processor it doesn't matter is great. However, interpreting these results as being relevant to the original point that I made is misleading.

I have to say that if you'd qualified your original posts with these points things may have gone differently; the ensuing discussion may have even been interesting rather than tedious!

From my point of view, your earlier posts in this thread came across as though you were someone who was claiming that "for (i = something; i--;)" was faster than "for (i = something; i > something_else; i--)" and that you were providing a disingenuous contrived example that wasn't an apples-to-apples comparison in order to prove a point.  I'm not saying that's what you were doing, I'm saying that's how you came across (I need to stress this because you've been guilty of doing what you claim others have done to you too; i.e misrepresenting their positions and arguments).

"Where it matters it can make a difference" is something that's true of anything: one could hypothesise a processor that's several orders of magnitude slower at subtraction than it is at addition and in that use case - even for asm instructions and register usage - you'd probably still want to count up.  I think that hypothetical (and, I admit, contrived) example defeats your final statement (with specific applicability to counting down vs counting up, not in general terms), but I've no arguments with the thinking behind it, which seems to me to essentially boil down to: "choose your optimization strategies according to what's good and what actually gives results on your target platform".

But surely we all already knew that?

It appears that the gentleman thought C++ was extremely difficult and he was overjoyed that the machine was absorbing it; he understood that good C++ is difficult but the best C++ is well-nigh unintelligible.

### #54rip-off  Moderators

10730
Like
0Likes
Like

Posted 04 May 2014 - 11:31 AM

Trying it on Visual C++, counting up:
; 10   : 	for(unsigned int i = 0; i != limit; ++i){

mov	ecx, DWORD PTR _limit$[ebp]
add	esp, 12					; 0000000cH
xor	eax, eax
test	ecx, ecx
je	SHORT $LN1@up
$LL3@up:

; 11   : 		buf[i] = i;

mov	BYTE PTR _buf$[ebp+eax], al
inc	eax
cmp	eax, ecx
jne	SHORT $LL3@up
$LN1@up:

; 12   : 	}

Down:
; 23   : 	for(unsigned int i = limit; i-- ; ){

mov	eax, DWORD PTR _limit$[ebp]
add	esp, 12					; 0000000cH
test	eax, eax
je	SHORT $LN6@down
$LL2@down:
dec	eax

; 24   : 		buf[i] = i;

mov	BYTE PTR _buf$[ebp+eax], al
jne	SHORT $LL2@down
$LN6@down:

; 25   : 	}

We do appear to have saved an additional compare instruction when counting down.

I'm not familiar with the Visual C++ command line, and the project I used is a bit of a dumping ground for helping people here and elsewhere, so it is possible I've some odd options enabled, but the basic optimisation ones appear to be on.

### #55Ectara  Members

3097
Like
0Likes
Like

Posted 04 May 2014 - 12:35 PM

But surely we all already knew that?

Not everyone; there's a For Beginner's section of the site, and a lot of people may interpret this subforum as a "what not to do" of programming. It's entirely possible for someone to read this, and start automatically condemning every usage of it without bothering to ask why.

I'd prefer that they learn to write well-reading code if they are just learning, but there's no reason to ban parts of their toolbox if it is the tool for the job, unless there's red tape.