loop break


#41 swiftcoder   Senior Moderators   -  Reputation: 9637


Posted 01 May 2014 - 07:49 AM

Of course a contrived case can be set up to prove any point, so it is important to do an equivalent comparison.

The zero case is kind of degenerate, anyway. Assuming we assign the index variable to the matching array slot, and print a random value from the array after the loop to prevent it being optimised away, the results are even stranger...

At -O2 optimisation, both loops get unrolled and converted to SSE. However, the up case weighs in at 120 lines of ASM, whereas the down case is a whopping 164.

Now, my SSE isn't quite up to scratch, so I can't tell which of these is more efficient, but it's fairly obvious which one is simpler. :)
 
// up.c
#include <stdio.h>
#include <stdlib.h>

int main() {
	char buf[10000];
	const unsigned int limit = 10000;
	
	for(unsigned int i = 0; i != limit; ++i){
		buf[i] = i;
	}
	
	printf("%d\n", buf[rand() % 10000]);
}
// down.c
#include <stdio.h>
#include <stdlib.h>

int main() {
	char buf[10000];
	const unsigned int limit = 10000;
	
	for(unsigned int i = limit; i--;){
		buf[i] = i;
	}
	
	printf("%d\n", buf[rand() % 10000]);
}
// up.s
	.section	__TEXT,__text,regular,pure_instructions
	.section	__TEXT,__const
	.align	4
LCPI0_0:
	.byte	0                       ## 0x0
	.byte	1                       ## 0x1
	.byte	2                       ## 0x2
	.byte	3                       ## 0x3
	.byte	4                       ## 0x4
	.byte	5                       ## 0x5
	.byte	6                       ## 0x6
	.byte	7                       ## 0x7
	.byte	8                       ## 0x8
	.byte	9                       ## 0x9
	.byte	10                      ## 0xa
	.byte	11                      ## 0xb
	.byte	12                      ## 0xc
	.byte	13                      ## 0xd
	.byte	14                      ## 0xe
	.byte	15                      ## 0xf
LCPI0_1:
	.byte	16                      ## 0x10
	.byte	17                      ## 0x11
	.byte	18                      ## 0x12
	.byte	19                      ## 0x13
	.byte	20                      ## 0x14
	.byte	21                      ## 0x15
	.byte	22                      ## 0x16
	.byte	23                      ## 0x17
	.byte	24                      ## 0x18
	.byte	25                      ## 0x19
	.byte	26                      ## 0x1a
	.byte	27                      ## 0x1b
	.byte	28                      ## 0x1c
	.byte	29                      ## 0x1d
	.byte	30                      ## 0x1e
	.byte	31                      ## 0x1f
	.section	__TEXT,__text,regular,pure_instructions
	.globl	_main
	.align	4, 0x90
_main:                                  ## @main
	.cfi_startproc
## BB#0:                                ## %vector.ph
	pushq	%rbp
Ltmp3:
	.cfi_def_cfa_offset 16
Ltmp4:
	.cfi_offset %rbp, -16
	movq	%rsp, %rbp
Ltmp5:
	.cfi_def_cfa_register %rbp
	pushq	%rbx
	subq	$10008, %rsp            ## imm = 0x2718
Ltmp6:
	.cfi_offset %rbx, -24
	movq	___stack_chk_guard@GOTPCREL(%rip), %rbx
	movq	(%rbx), %rax
	movq	%rax, -16(%rbp)
	xorl	%eax, %eax
	movdqa	LCPI0_0(%rip), %xmm0
	movdqa	LCPI0_1(%rip), %xmm1
	.align	4, 0x90
LBB0_1:                                 ## %vector.body
                                        ## =>This Inner Loop Header: Depth=1
	movd	%eax, %xmm2
	punpcklbw	%xmm2, %xmm2    ## xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
	punpcklbw	%xmm2, %xmm2    ## xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
	pshufd	$0, %xmm2, %xmm2        ## xmm2 = xmm2[0,0,0,0]
	movdqa	%xmm2, %xmm3
	paddb	%xmm0, %xmm3
	paddb	%xmm1, %xmm2
	movdqa	%xmm3, -10016(%rbp,%rax)
	movdqa	%xmm2, -10000(%rbp,%rax)
	addq	$32, %rax
	cmpq	$9984, %rax             ## imm = 0x2700
	jne	LBB0_1
## BB#2:                                ## %scalar.ph.preheader
	xorl	%eax, %eax
	.align	4, 0x90
LBB0_3:                                 ## %scalar.ph
                                        ## =>This Inner Loop Header: Depth=1
	movb	%al, -32(%rbp,%rax)
	incq	%rax
	cmpq	$16, %rax
	jne	LBB0_3
## BB#4:
	callq	_rand
	cltq
	imulq	$1759218605, %rax, %rcx ## imm = 0x68DB8BAD
	movq	%rcx, %rdx
	shrq	$63, %rdx
	sarq	$44, %rcx
	addl	%edx, %ecx
	imull	$10000, %ecx, %ecx      ## imm = 0x2710
	subl	%ecx, %eax
	cltq
	movsbl	-10016(%rbp,%rax), %esi
	leaq	L_.str(%rip), %rdi
	xorl	%eax, %eax
	callq	_printf
	movq	(%rbx), %rax
	cmpq	-16(%rbp), %rax
	jne	LBB0_6
## BB#5:
	xorl	%eax, %eax
	addq	$10008, %rsp            ## imm = 0x2718
	popq	%rbx
	popq	%rbp
	ret
LBB0_6:
	callq	___stack_chk_fail
	.cfi_endproc

	.section	__TEXT,__cstring,cstring_literals
L_.str:                                 ## @.str
	.asciz	"%d\n"


.subsections_via_symbols
// down.s
	.section	__TEXT,__text,regular,pure_instructions
	.section	__TEXT,__const
	.align	4
LCPI0_0:
	.quad	-8                      ## 0xfffffffffffffff8
	.quad	-9                      ## 0xfffffffffffffff7
LCPI0_1:
	.quad	-4                      ## 0xfffffffffffffffc
	.quad	-5                      ## 0xfffffffffffffffb
LCPI0_2:
	.quad	-12                     ## 0xfffffffffffffff4
	.quad	-13                     ## 0xfffffffffffffff3
LCPI0_3:
	.quad	-2                      ## 0xfffffffffffffffe
	.quad	-3                      ## 0xfffffffffffffffd
LCPI0_4:
	.quad	-10                     ## 0xfffffffffffffff6
	.quad	-11                     ## 0xfffffffffffffff5
LCPI0_5:
	.quad	-6                      ## 0xfffffffffffffffa
	.quad	-7                      ## 0xfffffffffffffff9
LCPI0_6:
	.quad	-14                     ## 0xfffffffffffffff2
	.quad	-15                     ## 0xfffffffffffffff1
LCPI0_7:
	.byte	15                      ## 0xf
	.byte	14                      ## 0xe
	.byte	13                      ## 0xd
	.byte	12                      ## 0xc
	.byte	11                      ## 0xb
	.byte	10                      ## 0xa
	.byte	9                       ## 0x9
	.byte	8                       ## 0x8
	.byte	7                       ## 0x7
	.byte	6                       ## 0x6
	.byte	5                       ## 0x5
	.byte	4                       ## 0x4
	.byte	3                       ## 0x3
	.byte	2                       ## 0x2
	.byte	1                       ## 0x1
	.byte	0                       ## 0x0
	.section	__TEXT,__text,regular,pure_instructions
	.globl	_main
	.align	4, 0x90
_main:                                  ## @main
	.cfi_startproc
## BB#0:                                ## %vector.ph
	pushq	%rbp
Ltmp3:
	.cfi_def_cfa_offset 16
Ltmp4:
	.cfi_offset %rbp, -16
	movq	%rsp, %rbp
Ltmp5:
	.cfi_def_cfa_register %rbp
	pushq	%rbx
	subq	$10136, %rsp            ## imm = 0x2798
Ltmp6:
	.cfi_offset %rbx, -24
	movq	___stack_chk_guard@GOTPCREL(%rip), %rbx
	movq	(%rbx), %rax
	movq	%rax, -16(%rbp)
	movl	$10000, %eax            ## imm = 0x2710
	movq	$-1, %rcx
	movd	%rcx, %xmm0
	pslldq	$8, %xmm0
	movdqa	LCPI0_1(%rip), %xmm9
	movdqa	LCPI0_2(%rip), %xmm10
	movdqa	LCPI0_3(%rip), %xmm11
	movdqa	LCPI0_4(%rip), %xmm12
	movdqa	LCPI0_5(%rip), %xmm13
	movdqa	LCPI0_6(%rip), %xmm14
	movdqa	LCPI0_7(%rip), %xmm15
	.align	4, 0x90
LBB0_1:                                 ## %vector.body
                                        ## =>This Inner Loop Header: Depth=1
	leaq	-1(%rax), %rcx
	movd	%rcx, %xmm2
	movlhps	%xmm2, %xmm2            ## xmm2 = xmm2[0,0]
	movaps	%xmm2, %xmm3
	movaps	%xmm2, %xmm4
	movaps	%xmm2, %xmm5
	movaps	%xmm2, %xmm6
	movaps	%xmm2, %xmm7
	movaps	%xmm2, %xmm1
	movaps	%xmm2, %xmm8
	paddq	%xmm13, %xmm8
	paddq	%xmm14, %xmm2
	movdqa	%xmm2, -10144(%rbp)
	movdqa	%xmm8, -10080(%rbp)
	punpcklbw	%xmm2, %xmm8    ## xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3],xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7]
	paddq	%xmm11, %xmm7
	paddq	%xmm12, %xmm1
	movdqa	%xmm1, -10112(%rbp)
	movdqa	%xmm7, -10048(%rbp)
	punpcklbw	%xmm1, %xmm7    ## xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3],xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7]
	punpcklbw	%xmm8, %xmm7    ## xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
	paddq	%xmm0, %xmm3
	paddq	LCPI0_0(%rip), %xmm4
	paddq	%xmm9, %xmm5
	paddq	%xmm10, %xmm6
	movdqa	%xmm6, -10128(%rbp)
	movdqa	%xmm5, -10064(%rbp)
	movdqa	%xmm4, -10096(%rbp)
	movdqa	%xmm3, -10032(%rbp)
	punpcklbw	%xmm6, %xmm5    ## xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
	punpcklbw	%xmm4, %xmm3    ## xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
	punpcklbw	%xmm5, %xmm3    ## xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
	punpcklbw	%xmm7, %xmm3    ## xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
	movd	-10136(%rbp), %xmm1
	movd	-10072(%rbp), %xmm2
	punpcklbw	%xmm1, %xmm2    ## xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
	movd	-10104(%rbp), %xmm1
	movd	-10040(%rbp), %xmm4
	punpcklbw	%xmm1, %xmm4    ## xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
	punpcklbw	%xmm2, %xmm4    ## xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
	movd	-10120(%rbp), %xmm1
	movd	-10056(%rbp), %xmm2
	punpcklbw	%xmm1, %xmm2    ## xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
	movd	-10088(%rbp), %xmm1
	movd	-10024(%rbp), %xmm5
	punpcklbw	%xmm1, %xmm5    ## xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
	punpcklbw	%xmm2, %xmm5    ## xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
	punpcklbw	%xmm4, %xmm5    ## xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
	punpcklbw	%xmm5, %xmm3    ## xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
	pshufb	%xmm15, %xmm3
	movdqu	%xmm3, -10032(%rbp,%rax)
	addq	$-16, %rax
	jne	LBB0_1
## BB#2:                                ## %middle.block
	callq	_rand
	cltq
	imulq	$1759218605, %rax, %rcx ## imm = 0x68DB8BAD
	movq	%rcx, %rdx
	shrq	$63, %rdx
	sarq	$44, %rcx
	addl	%edx, %ecx
	imull	$10000, %ecx, %ecx      ## imm = 0x2710
	subl	%ecx, %eax
	cltq
	movsbl	-10016(%rbp,%rax), %esi
	leaq	L_.str(%rip), %rdi
	xorl	%eax, %eax
	callq	_printf
	movq	(%rbx), %rax
	cmpq	-16(%rbp), %rax
	jne	LBB0_4
## BB#3:                                ## %middle.block
	xorl	%eax, %eax
	addq	$10136, %rsp            ## imm = 0x2798
	popq	%rbx
	popq	%rbp
	ret
LBB0_4:                                 ## %middle.block
	callq	___stack_chk_fail
	.cfi_endproc

	.section	__TEXT,__cstring,cstring_literals
L_.str:                                 ## @.str
	.asciz	"%d\n"


.subsections_via_symbols

Tristam MacDonald - Software Engineer @Amazon - [swiftcoding]



#42 mhagain   Crossbones+   -  Reputation: 7461


Posted 03 May 2014 - 06:03 PM

At this stage I feel a new rule of the internet coming on.

 

"There is no coding standard so reprehensible, so obviously wrong-looking, that somebody somewhere won't try to defend it."  (See also comma-first.)


It appears that the gentleman thought C++ was extremely difficult and he was overjoyed that the machine was absorbing it; he understood that good C++ is difficult but the best C++ is well-nigh unintelligible.


#43 Ectara   Crossbones+   -  Reputation: 2824


Posted 03 May 2014 - 06:08 PM


It's actually really cute what happens when you remove the volatile from each case:

Heck, we may as well write

;

for both, and watch the compiler optimize it all away.



#44 Ectara   Crossbones+   -  Reputation: 2824


Posted 03 May 2014 - 06:32 PM


Amazing what happens when you actually compare like-with-like, isn't it?

That's not what that is comparing. It's pretty much comparing x*0 with x - x, where the compiler can plainly see that the calculation is useless and is therefore free to do anything it wants. If I turn the optimization up higher, without volatile, it emits an empty program.
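To illustrate what I mean, here is a minimal sketch (my own construction, not code from earlier in the thread) of the kind of loop the compiler is free to throw away entirely, because nothing observable depends on it:

// dead_loop.c -- hypothetical example: nothing reads buf afterwards, so at
// higher optimisation levels the whole loop (and the array) may be removed.
#include <stdio.h>

int main() {
	char buf[10000];
	const unsigned int limit = 10000;

	for(unsigned int i = 0; i != limit; ++i){
		buf[i] = i;
	}

	/* buf is never read: under the "as-if" rule the compiler may delete
	   everything above and emit little more than this one call. */
	puts("done");
}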

 


Of course a contrived case can be set up to prove any point, so it is important to do an equivalent comparison.

How is a case where the compiler is permitted to do nothing at all not contrived, again? It feels like you're agreeing with anything that faintly sounds against my argument.

 


It appears that the optimiser does not know that the two loops are identical, as the more normal upwards loop is optimised to a call to bzero(), whereas the downwards case actually generates a loop.

I acknowledge that the compiler is intelligent enough to convert this to a bzero() call (how it doesn't discard the array entirely is interesting). However, writing zeroes is just an example; the contents of the loop aren't the important part. The fact that the compiler can cheat the test and replace the loop when it matches a certain pattern is not an argument for either loop direction.
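For reference, a rough sketch of the zero-filling pattern being discussed (an assumption about the earlier example, not a quote of it); an optimiser that recognises the idiom may lower the forward loop to a single memset()/bzero() call:

// zero_loop.c -- hypothetical sketch: many compilers pattern-match this
// forward zero-filling loop and emit a memset()/bzero() call instead.
#include <stdio.h>
#include <stdlib.h>

int main() {
	char buf[10000];
	const unsigned int limit = 10000;

	for(unsigned int i = 0; i != limit; ++i){
		buf[i] = 0;	/* loop-invariant store: the memset() idiom */
	}

	/* consume one element so the array itself is not discarded */
	printf("%d\n", buf[rand() % 10000]);
}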

 

To be honest, this doesn't feel like a valid debate anymore. I'm content with its current course of analyzing optimizer behavior.



#45 Ectara   Crossbones+   -  Reputation: 2824


Posted 03 May 2014 - 06:35 PM


At -O2 optimisation, both loops get unrolled and converted to SSE. However, the up case weighs in at 120 lines of ASM, whereas the down case is a whopping 164.

That's scary: an optimizer can take nearly equal code and make the version that should perform better in theory perform worse in practice. However, this again assumes a monopoly on the registers.



#46 mhagain   Crossbones+   -  Reputation: 7461


Posted 04 May 2014 - 05:16 AM

 


At -O2 optimisation, both loops get unrolled and converted to SSE. However, the up case weighs in at 120 lines of ASM, whereas the down case is a whopping 164.

That's scary: an optimizer can take nearly equal code and make the version that should perform better in theory perform worse in practice. However, this again assumes a monopoly on the registers.

 

 

At this stage I think you really need to read this: http://blogs.msdn.com/b/oldnewthing/archive/2004/12/16/317157.aspx

 

The technical specifics in it are of course out of date, but the lesson remains valid, and it's instructive to follow him walking through an optimization scenario where he produces code that - going by the disassembly - should be faster in theory, but turns out to be slower in practice because it ends up totally blowing the return address predictor stack.

 

This is the same Raymond Chen who has credits on early releases of the Linux kernel, by the way.




#47 samoth   Crossbones+   -  Reputation: 4522


Posted 04 May 2014 - 05:51 AM

 


It's actually really cute what happens when you remove the volatile from each case:

Heck, may as well write

;

for both, and watch the compiler optimize away.

 

No. Please note this line:

printf("%d\n", buf[rand() % 10000]);

That is the proper way (or one proper way) of preventing the compiler from optimizing out the loop. Unlike putting volatile on the loop counter, it does not introduce some very different (and wrong) assumptions about the variable and its contents, and it does not screw up the optimizer.

 

The difference is that Swiftcoder's program is non-trivial (well, it is of course still fairly "trivial", but it's not trivial insofar as it actually does something by consuming a value from the array), whereas your solution with volatile is a trivial program (one that basically does nothing at all) that merely inhibits optimization of that variable.

 

It would probably be entirely legal for a very aggressively optimizing compiler to optimize out the array altogether and only count the volatile loop counter up (or down), because, in the end, whether or not there is an array makes no externally observable difference: it's not consumed anywhere, and the program ends right after initializing it.
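As a rough illustration of that point (my own sketch, assuming a volatile counter rather than quoting anyone's earlier code): the array writes below are dead stores that an aggressive compiler could drop, while every access to the volatile counter must still happen.

// volatile_counter.c -- hypothetical sketch: only the volatile counter has
// observable accesses, so the array and its stores could legally vanish.
int main() {
	char buf[10000];
	volatile unsigned int i;	/* every read and write of i must occur */

	for(i = 0; i != 10000; ++i){
		buf[i] = i;	/* never read afterwards: a dead store */
	}

	return 0;	/* buf is not consumed, so it need not exist at all */
}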


Edited by samoth, 04 May 2014 - 05:51 AM.


#48 Ectara   Crossbones+   -  Reputation: 2824


Posted 04 May 2014 - 07:36 AM




This is the same Raymond Chen who has credits on early releases of the Linux kernel, by the way.

I know who Raymond Chen is, and I read that article years ago.

 


That is the proper way (or one proper way) of preventing the compiler from optimizing out the loop. Which, contrary to using volatile on the loop counter, does not include some very different (and wrong) assumptions about the variable and its contents and does not screw up the optimizer.

Okay, his program is more proper. However, that still ignores the fact that the compiler is optimizing his limit away to a constant (cheating the test), which my volatile limit avoided.
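For context, a minimal sketch of what a volatile limit looks like (an assumption on my part, not the exact earlier code); the compiler must reload limit for every comparison, so it cannot fold the bound into the constant 10000:

// volatile_limit.c -- hypothetical sketch: a volatile limit forces a reload
// on each comparison instead of constant-folding the bound to 10000.
#include <stdio.h>
#include <stdlib.h>

int main() {
	char buf[10000];
	volatile unsigned int limit = 10000;

	for(unsigned int i = 0; i != limit; ++i){	/* limit re-read each pass */
		buf[i] = i;
	}

	printf("%d\n", buf[rand() % 10000]);
}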



#49 rip-off   Moderators   -  Reputation: 7696


Posted 04 May 2014 - 08:10 AM

However, that still ignores the fact that the compiler is optimizing his limit away to a constant (cheating the test), which my volatile limit avoided.

Again, easily avoided:
// up.c
#include <stdio.h>
#include <stdlib.h>

int main() {
	char buf[10000];
	unsigned int limit;
	printf("enter a number up to 10000: ");
	scanf("%u", &limit);
	
	for(unsigned int i = 0; i != limit; ++i){
		buf[i] = i;
	}
	
	printf("%d\n", buf[rand() % 10000]);
}
And the same for the other program.

On my system:
$ gcc -S -O2 *.c -std=c99
down.c: In function ‘main’:
down.c:9:7: warning: ignoring return value of ‘scanf’, declared with attribute warn_unused_result [-Wunused-result]
up.c: In function ‘main’:
up.c:9:7: warning: ignoring return value of ‘scanf’, declared with attribute warn_unused_result [-Wunused-result]

$ diff {up,down}.s
1c1
< 	.file	"up.c"
---
> 	.file	"down.c"
33,34c33,34
< 	movl	24(%esp), %eax
< 	testl	%eax, %eax
---
> 	movl	24(%esp), %edx
> 	testl	%edx, %edx
36c36
< 	subl	$1, %eax
---
> 	xorl	%eax, %eax
41,42c41,42
< 	subl	$1, %eax
< 	cmpl	$-1, %eax
---
> 	addl	$1, %eax
> 	cmpl	%edx, %eax


#50 Ectara   Crossbones+   -  Reputation: 2824


Posted 04 May 2014 - 08:30 AM



On my system:

Those two look equal in length and time complexity, so I have no issues with that.



#51 catslap   Members   -  Reputation: 35


Posted 04 May 2014 - 09:59 AM

Nice to see a simple but interesting question get discussed in detail tbh.


Edited by catslap, 05 May 2014 - 02:23 AM.


#52 Ectara   Crossbones+   -  Reputation: 2824


Posted 04 May 2014 - 10:08 AM

I'm going to be honest with all of you. This seems like a pointless and impossible argument. I knew coming in to it that playing devil's advocate wouldn't make me very popular here.

 

However, if you're on a platform where you actually care about the code size in terms of free bytes, then unrolling loops and using extra registers for simple bookkeeping is counter-productive.

 

Things I am arguing:

  1. If you're counting bytes, small loops are important.
  2. If you're on a low-powered platform, you may not even have that good of an optimizing compiler.
  3. If you're on a slow, small-cache/cacheless CPU, that extra register matters.

Things I am NOT arguing:

  1. There isn't better code for modern machines and processors.
  2. If you [modify the scenario in some way that suits some particular argument], it won't optimize to nothing or some large, but fast, control structure.
  3. This is the preferred coding method.
  4. Counting down is easier to read.
  5. Optimizers are bad at optimizing.

I feel this is turning into a flood of "Look, when I set the limit to zero, the counting-up version converts to an AVX loop that plays 'Moonlight Sonata' out of my PC speaker. Take THAT!" and everyone high-fives and up-votes. To be honest, this is all missing the point. Most of the arguments here throw out the reasons why I offered a use case and just attack the flimsiest part of the argument, by offering solutions that produce code that is astronomically large for a simple loop, use two or more registers just for the loop counting (which, if you only have four registers, is a waste), and even do things as bizarre as turning it into a function call.

 

I suggested that where it matters, it can make a difference. Showing that it doesn't matter on a high-speed processor with a giant three-tiered cache and sixteen 64-bit general-purpose registers is great. However, interpreting these results as relevant to the original point that I made is misleading.

 

EDIT: As evidenced by the downvote.


Edited by Ectara, 04 May 2014 - 05:37 PM.


#53 mhagain   Crossbones+   -  Reputation: 7461


Posted 04 May 2014 - 10:40 AM

I'm going to be honest with all of you. This seems like a pointless and impossible argument. I knew coming in to it that playing devil's advocate wouldn't make me very popular here.

 

However, if you're on a platform where you actually care about the code size in terms of free bytes, then unrolling loops and using extra registers for simple bookkeeping is counter-productive.

 

Things I am arguing:

  1. If you're counting bytes, small loops are important.
  2. If you're on a low-powered platform, you may not even have that good of an optimizing compiler.
  3. If you're on a slow, small-cache/cacheless CPU, that extra register matters.

Things I am NOT arguing:

  1. There isn't better code for modern machines and processors.
  2. If you [modify the scenario in some way that suits some particular argument], it won't optimize to nothing or some large, but fast, control structure.
  3. This is the preferred coding method.
  4. Counting down is easier to read.
  5. Optimizers are bad at optimizing.

I feel this is turning into a flood of "Look, when I set the limit to zero, the counting-up version converts to an AVX loop that plays 'Moonlight Sonata' out of my PC speaker. Take THAT!" and everyone high-fives and up-votes. To be honest, this is all missing the point. Most of the arguments here throw out the reasons why I offered a use case and just attack the flimsiest part of the argument, by offering solutions that produce code that is astronomically large for a simple loop, use two or more registers just for the loop counting (which, if you only have four registers, is a waste), and even do things as bizarre as turning it into a function call.

 

I suggested that where it matters, it can make a difference. Showing that it doesn't matter on a high-speed processor with a giant three-tiered cache and sixteen 64-bit general-purpose registers is great. However, interpreting these results as relevant to the original point that I made is misleading.

 

I have to say that if you'd qualified your original posts with these points, things might have gone differently; the ensuing discussion might even have been interesting rather than tedious!

 

From my point of view, your earlier posts in this thread came across as though you were claiming that "for (i = something; i--;)" was faster than "for (i = something; i > something_else; i--)", and that you were providing a disingenuous, contrived example that wasn't an apples-to-apples comparison in order to prove a point. I'm not saying that's what you were doing; I'm saying that's how you came across (I need to stress this because you've been guilty of doing what you claim others have done to you, i.e. misrepresenting their positions and arguments).

 

"Where it matters it can make a difference" is something that's true of anything: one could hypothesise a processor that's several orders of magnitude slower at subtraction than it is at addition and in that use case - even for asm instructions and register usage - you'd probably still want to count up.  I think that hypothetical (and, I admit, contrived) example defeats your final statement (with specific applicability to counting down vs counting up, not in general terms), but I've no arguments with the thinking behind it, which seems to me to essentially boil down to: "choose your optimization strategies according to what's good and what actually gives results on your target platform".

 

But surely we all already knew that?




#54 rip-off   Moderators   -  Reputation: 7696


Posted 04 May 2014 - 11:31 AM

Trying it on Visual C++, counting up:
; 10   : 	for(unsigned int i = 0; i != limit; ++i){

	mov	ecx, DWORD PTR _limit$[ebp]
	add	esp, 12					; 0000000cH
	xor	eax, eax
	test	ecx, ecx
	je	SHORT $LN1@up
$LL3@up:

; 11   : 		buf[i] = i;

	mov	BYTE PTR _buf$[ebp+eax], al
	inc	eax
	cmp	eax, ecx
	jne	SHORT $LL3@up
$LN1@up:

; 12   : 	}
Down:
; 23   : 	for(unsigned int i = limit; i-- ; ){

	mov	eax, DWORD PTR _limit$[ebp]
	add	esp, 12					; 0000000cH
	test	eax, eax
	je	SHORT $LN6@down
$LL2@down:
	dec	eax

; 24   : 		buf[i] = i;

	mov	BYTE PTR _buf$[ebp+eax], al
	jne	SHORT $LL2@down
$LN6@down:

; 25   : 	}
We do appear to have saved an additional compare instruction when counting down.

I'm not familiar with the Visual C++ command line, and the project I used is a bit of a dumping ground for helping people here and elsewhere, so it is possible I have some odd options enabled, but the basic optimisation ones appear to be on.

#55 Ectara   Crossbones+   -  Reputation: 2824


Posted 04 May 2014 - 12:35 PM


But surely we all already knew that?

Not everyone; there's a For Beginners section of the site, and a lot of people may interpret this subforum as a "what not to do" of programming. It's entirely possible for someone to read this and start automatically condemning every usage of it without bothering to ask why.

 

I'd prefer that they learn to write readable code if they are just learning, but there's no reason to ban parts of their toolbox when one of them is the right tool for the job, unless there's red tape.






