Back to General and Gameplay Programming

my c++ d c# benchmark!

Kambiz · 2006-10-13T08:18:40

Today I saw the D's pi computing sample (\dmd\samples\d\pi.d) and thought that porting it to c++ and c# for making a little benchmark should be easy: d: import std.c.stdio; import std.c.stdlib; import std.c.time; const int LONG_TIME=4000; byte[] p; byte[] t; int q; int main(char[][] args) { int startime, endtime; int i; if (args.length == 2) { sscanf(&args[1][0],"%d",&q); } else { printf("Usage: pi [precision]\n"); exit(55); } if (q < 0) { printf("Precision was too low, running with precision of 0.\n"); q = 0; } if (q > LONG_TIME) { printf("Be prepared to wait a while...\n"); } // Compute one more digit than we display to compensate for rounding q++; p.length = q + 1; t.length = q + 1; /* compute pi */ std.c.time.time(&startime); arctan(2); arctan(3); mul4(); std.c.time.time(&endtime); // Return to the number of digits we want to display q--; /* print pi */ printf("pi = %d.",cast(int)(p[0])); for (i = 1; i <= q; i++) printf("%d",cast(int)(p)); printf("\n"); printf("%ld seconds to compute pi with a precision of %d digits.\n",endtime-startime,q); return 0; } void arctan(int s) { int n; t[0] = 1; div(s); /* t[] = 1/s */ add(); n = 1; do { mul(n); div(s * s); div(n += 2); if (((n-1) / 2) % 2 == 0) add(); else sub(); } while (!tiszero()); } void add() { int j; for (j = q; j >= 0; j--) { if (t[j] + p[j] > 9) { p[j] += t[j] - 10; p[j-1] += 1; } else p[j] += t[j]; } } void sub() { int j; for (j = q; j >= 0; j--) if (p[j] < t[j]) { p[j] -= t[j] - 10; p[j-1] -= 1; } else p[j] -= t[j]; } void mul(int multiplier) { int b; int i; int carry = 0, digit = 0; for (i = q; i >= 0; i--) { b = (t * multiplier + carry); digit = b % 10; carry = b / 10; t = digit; } } /* t[] /= l */ void div(int divisor) { int i, b; int quotient, remainder = 0; for (i = 0; i <= q; i++) { b = (10 * remainder + t); quotient = b / divisor; remainder = b % divisor; t = quotient; } } void div4() { int i, c, d = 0; for (i = 0; i <= q; i++) { c = (10 * d + p) / 4; d = (10 * d + p) % 4; p = c; } } void mul4() { int i, c, d; d = c = 0; for (i = q; i >= 0; i--) { d = (p * 4 + c) % 10; c = (p * 4 + c) / 10; p = d; } } int tiszero() { int k; for (k = 0; k <= q; k++) if (t[k] != 0) return false; return true; } (I compiled pi.d with dmd -O -release pi.d) c++: #include <stdio.h> #include <stdlib.h> #include <time.h> #define LONG_TIME 4000 __int8* p; __int8* t; int q; void arctan(int s); void add(); void sub(); void mul(int); void div(int); void mul4(); void div4(); int tiszero(); int main(int argv,char** args) { time_t startime, endtime; int i; if (argv == 2) { sscanf(&args[1][0],"%d",&q); } else { printf("Usage: pi [precision]\n"); exit(55); } if (q < 0) { printf("Precision was too low, running with precision of 0.\n"); q = 0; } if (q > LONG_TIME) { printf("Be prepared to wait a while...\n"); } // Compute one more digit than we display to compensate for rounding q++; p = new __int8[ q + 1 ]; t = new __int8[ q + 1 ]; for(int tt=0;tt<=q;tt++) { p=0; t=0; } /* compute pi */ time(&startime); arctan(2); arctan(3); mul4(); time(&endtime); // Return to the number of digits we want to display q--; /* print pi */ printf("pi = %d.",(int)(p[0])); for (i = 1; i <= q; i++) printf("%d",(int)(p)); printf("\n"); printf("%ld seconds to compute pi with a precision of %d digits.\n",(long)endtime-(long)startime,q); delete [] p; delete [] t; return 0; } void arctan(int s) { int n; t[0] = 1; div(s); /* t[] = 1/s */ add(); n = 1; do { mul(n); div(s * s); div(n += 2); if (((n-1) / 2) % 2 == 0) add(); else sub(); } while (!tiszero()); } void add() { int j; for (j = q; j >= 0; j--) { if (t[j] + p[j] > 9) { p[j] += t[j] - 10; p[j-1] += 1; } else p[j] += t[j]; } } void sub() { int j; for (j = q; j >= 0; j--) if (p[j] < t[j]) { p[j] -= t[j] - 10; p[j-1] -= 1; } else p[j] -= t[j]; } void mul(int multiplier) { int b; int i; int carry = 0, digit = 0; for (i = q; i >= 0; i--) { b = (t * multiplier + carry); digit = b % 10; carry = b / 10; t = digit; } } /* t[] /= l */ void div(int divisor) { int i, b; int quotient, remainder = 0; for (i = 0; i <= q; i++) { b = (10 * remainder + t); quotient = b / divisor; remainder = b % divisor; t = quotient; } } void div4() { int i, c, d = 0; for (i = 0; i <= q; i++) { c = (10 * d + p) / 4; d = (10 * d + p) % 4; p = c; } } void mul4() { int i, c, d; d = c = 0; for (i = q; i >= 0; i--) { d = (p * 4 + c) % 10; c = (p * 4 + c) / 10; p = d; } } int tiszero() { int k; for (k = 0; k <= q; k++) if (t[k] != 0) return false; return true; } c#: using System; using System.Diagnostics; class Pi { static void Main(string[] args) { Pi pi = new Pi(); pi.run(args); } private const int LONG_TIME = 4000; sbyte[] p; sbyte[] t; int q; void run(string[] args) { Stopwatch timer = new Stopwatch(); int i; if (args.Length == 1) { q = int.Parse(args[0]); } else { Console.WriteLine("Usage: pi [precision]"); return; } if (q < 0) { Console.WriteLine("Precision was too low, running with precision of 0."); q = 0; } if (q > LONG_TIME) { Console.WriteLine("Be prepared to wait a while..."); } // Compute one more digit than we display to compensate for rounding q++; p = new sbyte[q + 1]; t = new sbyte[q + 1]; /* compute pi */ timer.Start(); arctan(2); arctan(3); mul4(); timer.Stop(); // Return to the number of digits we want to display q--; /* print pi */ Console.Write("pi = {0}.", p[0]); for (i = 1; i <= q; i++) Console.Write(p); Console.WriteLine(); Console.WriteLine("{0} seconds to compute pi with a precision of {1} digits.", timer.ElapsedMilliseconds / 1000.0, q); return; } void arctan(int s) { int n; t[0] = 1; div(s); /* t[] = 1/s */ add(); n = 1; do { mul(n); div(s * s); div(n += 2); if (((n - 1) / 2) % 2 == 0) add(); else sub(); } while (!tiszero()); } void add() { int j; for (j = q; j >= 0; j--) { if (t[j] + p[j] > 9) { p[j] += (sbyte)(t[j] - 10); p[j - 1] += 1; } else p[j] += t[j]; } } void sub() { int j; for (j = q; j >= 0; j--) if (p[j] < t[j]) { p[j] -= (sbyte)(t[j] - 10); p[j - 1] -= 1; } else p[j] -= t[j]; } void mul(int multiplier) { int b; int i; int carry = 0, digit = 0; for (i = q; i >= 0; i--) { b = (t * multiplier + carry); digit = b % 10; carry = b / 10; t = (sbyte)digit; } } /* t[] /= l */ void div(int divisor) { int i, b; int quotient, remainder = 0; for (i = 0; i <= q; i++) { b = (10 * remainder + t); quotient = b / divisor; remainder = b % divisor; t = (sbyte)quotient; } } void div4() { int i, c, d = 0; for (i = 0; i <= q; i++) { c = (10 * d + p) / 4; d = (10 * d + p) % 4; p = (sbyte)c; } } void mul4() { int i, c, d; d = c = 0; for (i = q; i >= 0; i--) { d = (p * 4 + c) % 10; c = (p * 4 + c) / 10; p = (sbyte)d; } } bool tiszero() { int k; for (k = 0; k <= q; k++) if (t[k] != 0) return false; return true; } } I used the release builds for the benchmark, here are the results: d : 26 seconds to compute pi with a precision of 10000 digits. c# : 34,745(34.745) seconds to compute pi with a precision of 10000 digits. c++ : 15 seconds to compute pi with a precision of 10000 digits. I do not want to start a language war... we had enough such wars in the past days and I have just made this benchmark because I saw the “What do you think of the D language?“ thread and was testing d. I'm just surprised a little: I thought that D shouldn't be much slower than c++ and I thought that c# would be much faster. Maybe there is some optimization option I have not used(?) What do you think about the results? (maybe some one can test java) (I really like the invariant and unittest features of D, are there equivalents for c#?) -I can't reply until tomorrow- [Edited by - Kambiz on October 7, 2006 2:02:45 AM]

General and Gameplay Programming Programming Unity

Started by Kambiz October 06, 2006 06:40 PM

70 comments, last by Raghar 17 years, 6 months ago

h3r3tic

228

October 09, 2006 08:05 PM

Quote:Original post by dbzprogrammer
C/C++ is the best tool for intense calculations.

D is not, along with C#.

Errr... right. And I haven't shown that D gets as fast as C++ when using the GCC-based compiler...

MaulingMonkey

1,729

October 10, 2006 04:57 AM

Quote:Original post by Stachel
Quote:Original post by MaulingMonkey
For all you know, this is significantly faster than the equivilant C/C++ due to optimizaitons based on input data which absolutely could not be made in the equivilant staticly compiled program (because, as that first article mentions, Java is able to make profile guided assumptions, even when those assumptions may later prove invalid for the general case!). For all you know, it's been running faster than the static equivilant for all those hours!
I read about the profile guided assumptions, and the theory that JITs can therefore produce faster code. But the results always seem to be missing in action.

Here's one set of benchmarks comparing Java with C++: http://shootout.alioth.debian.org/gp4/benchmark.php?test=all〈=java&lang2=gpp

Score 1 out of 17 for Java.

Note: HTML Translation $@(^%*(s up your link.

Fixed: http://shootout.alioth.debian.org/gp4/benchmark.php?test=all〈=java&lang2=gpp

Final comment: 1 out of 17 is similarly enough to contradict your earlier generalization, espeically considering most of the tests listed there only run a few seconds (only 4 out of 17 take longer than 5 seconds in the C++ version), which I already knew Java was at a disadvantage at (Just look at the Java vs C++ startup benchmark to see what I mean - almost a 43x increase just to run hello world - nevermind JIT compilation/optimization). We were talking about the scope of a few hours, a 3600x in timescale. Presuming we need the program to start performing faster than the C++ version after a mere few minutes in order to come out ahead for the hours scale, that's still a 60x increase.

"Only 1 out of 17 in a few seconds [where Java >= C++]" I can believe at face value.
"Only 0 out of N in a few hours [where Java merely 'approaches' C++]" does not follow that at all - which again, is how your original overgeneralization interprets.

DaveJF

118

October 10, 2006 07:30 AM

Quote:Original post by dbzprogrammer
C/C++ is the best tool for intense calculations.

D is not, along with C#.

Urmmm..

D is built with speed in mind (D 1st class array types, built-in strings, etc.).

Overall Shootout Benchmarks

D and C++

10 of 15 (C++ is missing 2) in favor of D, and D is not quite at version 1.0.

Aldacron

4,545

October 11, 2006 02:35 AM

Quote:Original post by StachelIt's not about writing a benchmark that favors the way Java works. It's about writing a benchmark that functionally resembles a real application.

I'm not sure what your point is. For a real-world application, a Java programmer would code in a manner that favors the way Java works. If a benchmark doesn't reflect that, then it isn't accurate.

--- Official D Blog | Learning D | The One With D | D Bits

Stachel

100

October 11, 2006 04:10 AM

Quote:Original post by Aldacron
Quote:Original post by StachelIt's not about writing a benchmark that favors the way Java works. It's about writing a benchmark that functionally resembles a real application.
I'm not sure what your point is. For a real-world application, a Java programmer would code in a manner that favors the way Java works. If a benchmark doesn't reflect that, then it isn't accurate.

Think of it like a test in school. There's teaching the test so the students can pass it, then there's teaching knowledge so passing the test falls out naturally.

For a benchmark, consider that D supports inline assembler. That means one could write the Pi benchmark entirely in hand-optimized assembler, and that would technically be writing it in D. Nothing could beat that for speed. But is that a reasonable benchmark for D? I'd sure cry foul. I didn't see any special tweaking in the D or C versions of Pi.

For the Java benchmark, it's been tweaked to replace (x / 10) with (x * .1f). That's faster on some CPUs, but slower on others. Doesn't that mean the Java JIT should do this transformation on its own because after all, isn't the great strength of the JIT being able to adapt to the particular CPU? If I have to bend my otherwise straightforward Java code around shortcomings in its JIT, that doesn't reflect well on the Java implementation, and it's fair for a benchmark to point that out. If the Java implementation is written to do a good job with straightforward Java source code, then doing well on a benchmark will fall out naturally.

I'm not interested in benchmarks custom carefully designed to avoid weaknesses in an implementation, and only show its strength. It's like posting a picture of the good side of your car for sale on ebay, and neglecting to mention that the other side is bashed in and the car won't drive straight. You'll also find that if you do tweaks like (x * .1f), supposedly portable Java source is going to do very badly on some CPUs, and those tweaks may actually sabotage performance if the Java implementation improves.

Write benchmarks in a manner that's straightforward to the language being used. Straightforward code is what the language implementors work hardest at improving performance for, so you're not likely to be left in a backwater with oddities like (x * .1f). "Optimizations" like that were once popular with C, and they did work, but with modern compilers such things perform worse than the original unoptimized code.

Anonymous

October 11, 2006 04:52 AM

Quote:Original post by Stachel
For the Java benchmark, it's been tweaked to replace (x / 10) with (x * .1f). That's faster on some CPUs, but slower on others. Doesn't that mean the Java JIT should do this transformation on its own because after all, isn't the great strength of the JIT being able to adapt to the particular CPU?

Actually that particular optimization was not an optimization any compiler could do. It could be said being a (minor) algorithmic change because it changes the input/output mapping of the div and mul functions. It just "happens" to work in this application because it's not so important how the numbers are distributed in the array that represents large numbers, only that the array's "real" value is correct. (e.g. 4*10^2+1*10 is same as 3*10^2+11*10)

The problem was really that neither Java nor C# did div/mod with just one instruction so another way had to be used to get same speed as those combined. But it's not that complex or time-wasting optimization that it couldn't be done in a JIT compiler. It just shows the immaturity of the .NET and Java JIT compilers as compared to C++ compilers. This information is relevant today, but it doesn't prove anything against "managed" languages' compilation model.

Aldacron

4,545

October 11, 2006 01:20 PM

Quote:Original post by Stachel
For a benchmark, consider that D supports inline assembler. That means one could write the Pi benchmark entirely in hand-optimized assembler, and that would technically be writing it in D. Nothing could beat that for speed. But is that a reasonable benchmark for D? I'd sure cry foul. I didn't see any special tweaking in the D or C versions of Pi.

Writing in inline assembler is not writing in D. I'd cry foul, too.

Quote:
For the Java benchmark, it's been tweaked to replace (x / 10) with (x * .1f).

...

I'm not interested in benchmarks custom carefully designed to avoid weaknesses in an implementation, and only show its strength.

...

Write benchmarks in a manner that's straightforward to the language being used.

Indeed. So what you were on about is optimizing the benchmark and not about tailoring the benchmark to the language. I was referring to the latter.

The problem is that most of these multi-language benchmarks that we see online are written by people who have most of their experience in only one of the languages. So when it comes to porting the benchmark from that language to the others, they carry with them the same idioms -- which may not apply to the other languages being benchmarked.

For example, in a benchmark that makes use of a large number of objects the C++ version might preallocate the objects upfront in an array. In Java these days, object pools are rarely used except for resource-intensive objects (such as Threads) because of advances made in garbage collection, so doing the same thing for the Java benchmark would not be a reflection of real-world code. If what you were benchmarking is array access, then that's one thing. But if the storage and allocation of objects are peripheral to the benchmark they should be done in a manner that reflects real-world usage.

There's a difference between optimizing a benchmark to get better results and "coding to the language". For many benchmarks it isn't going to matter, but it's always important to be aware of the common idioms of each language you are benchmarking so that you create a benchmark that is accurate (well, whatever that means in the world of microbenchmarks).

--- Official D Blog | Learning D | The One With D | D Bits

Raghar

October 12, 2006 10:52 AM

Quote:Original post by Stachel
For a benchmark, consider that D supports inline assembler. That means one could write the Pi benchmark entirely in hand-optimized assembler, and that would technically be writing it in D. Nothing could beat that for speed. But is that a reasonable benchmark for D? I'd sure cry foul. I didn't see any special tweaking in the D or C versions of Pi.

I wrote things in a "hand" optimized assembler. What about you? Is compiler able to convert that hand optimized assembly into 64 bit code? Could you use that compiled code on any CPU in natively optimized form? You could also write hand optimized ASM in Java and use JNI. Then you could attempt to run that ASM in error condition, and say. Java application didn't crashed because of that ASM code, the other did and blasted out OS. Should this be in benchmark? Obviously it should.

However this benchmark is a simple one. It's just coincidence that C++ code was so similar there was no need for modifications. In fact you can find also code that is so nicely written you could just copy and past it into Java, or C# and change method naming, and it works.

Quote:For the Java benchmark, it's been tweaked to replace (x / 10) with (x * .1f). That's faster on some CPUs, but slower on others. Doesn't that mean the Java JIT should do this transformation on its own because after all, isn't the great strength of the JIT being able to adapt to the particular CPU?

Are you saing that there are CPUs where division isn't 80 cycles monstrosity, and multiplication 1 - 1.5 cycle sweety operation?

If "a" is a double precision floating point number writen in a IEEE standard and
"a" modulo 10 != 0 then
"a"/10 != "a" * 0.1D

If a compiler would replace it without your permission, it could decrease precision of computation considerably.

BTW I was bitten by a * b / c != a * (b / c)
It created a hiccup in the middle of screen.

Stachel

100

October 12, 2006 01:10 PM

Quote:Original post by Raghar
Are you saing that there are CPUs where division isn't 80 cycles monstrosity, and multiplication 1 - 1.5 cycle sweety operation?

Yes. The 386/387 CPU combination and the 486 have integer division faster than float multiply according to clock cycle counts. Not to mention any processor that doesn't have hardware floating point (seen in embedded processors). It isn't just the multiply, there's the conversion to and from float to add in, plus resetting the rounding mode.

I tried this change on my machine. Despite CPU instruction timings saying that the floating multiply should be faster, the actual benchmark timing shows it to be slower. I can't explain this other than the fact that for modern CPUs the cycle counts aren't the whole story. It probably has to do with some internal pipelining/scheduling issue.

deathkrush

350

October 13, 2006 01:30 AM

Quote:Original post by Kevlar-X
Quote:Original post by deathkrush
The fastest way to calculate PI in C is of course:

*** Source Snippet Removed ***

Hey, I loved the snippet!
Though, it wouldnt compile as given (F_00() is used before defined) so I rearranged the code blocks.

Then it crunches out the number '0.250'.

So I might add: it may be a great way to calculate pi, compact and cool looking, but it lacks one extra nicety we would like in a program to calculate pi. Determining the correct result.
On all other accounts its cool though! :)

- Jacob

It's from the International Obfuscated C Contest. It probably worked fine with old compilers and produced a correct result. Does anybody know what's wrong with the code, I get a wrong result too. The author says that you can get more precision by using a bigger ASCII picture though. :-)

deathkrushPS3/Xbox360 Graphics Programmer, Mass Media.Completed Projects: Stuntman Ignition (PS3), Saints Row 2 (PS3), Darksiders(PS3, 360)

my c++ d c# benchmark!

This topic is closed to new replies.

Popular Topics

Recommended Tutorials

my c++ d c# benchmark!

This topic is closed to new replies.

Popular Topics

Recommended Tutorials

Reticulating splines