Jump to content
  • Advertisement

Archived

This topic is now archived and is closed to further replies.

cnstrnd

Nearest power of 2

This topic is 5591 days old which is more than the 365 day threshold we allow for new replies. Please post a new topic.

If you intended to correct an error in the post then please contact us.

Recommended Posts

@neoztar

Very good point. I did not remember this algo, though I have probably seen it a long timle ago. For those who don't get it, after x--, starting at the upper bit, it is propagated on the right exponentially, this way :
1??????
11?????
1111???
1111111

The x++ restores a power of two.

It might compete with the IEEE integer representation trick. It's a succession of low latency integer instructions. The only problem is the dependency chain with variable x, there is one single "thread". The CC won't be able to parallelize. I wonder if the algo could be modified to enhance parallelism , two // threads would be all right. Currently I suppose it's done in around 10 cycles.


[edited by - Charles B on June 7, 2004 9:20:23 AM]

Share this post


Link to post
Share on other sites
Advertisement
Nothing critical, as you guessed, i use this function to convert the resolution of my display to fit in a 2^n texture ...

I''ll check your lib.

Share this post


Link to post
Share on other sites
Talking about assembly language, i found/tweaked this one :

__declspec( naked )
unsigned __fastcall NextPow2( unsigned n ) { __asm
{
// ecx = n

dec ecx
mov eax,1
bsr ecx,ecx
inc ecx
shl eax,cl
ret
}}

Share this post


Link to post
Share on other sites
Yep I nearly forgot this cabled log2 (bit scan reverse) instruction. It was very slow. So on the first Pentium the IEEE trick was the fastest way to do. I wonder how fast it is now (say Pentium 4). What''s the latency ?

Share this post


Link to post
Share on other sites
I made small test application to empirically test the different methods.. obviously this is pretty irrelevant, sice you will almost never be calling this in a tight loop.

anyway:


#include "stdafx.h"
#include "math.h"
#include "stdlib.h"
#include "assert.h"
#include "time.h"
#include <windows.h>
#include <IOSTREAM>


__int64 freq;

// compare

int NearestPowerOf2(int n)
{
if (!n) return n; //(0 == 2^0)


int x = 1;
while(x < n)
{
x <<= 1;
}
return x;
}

// saturation

int NearestPowerOf2_1( int x )
{
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return ++x;
}


// logarithmic

int NearestPowerOf2_2( int n )
{
return (unsigned) pow( 2, ceil( log( n ) / log( 2 ) ) );
}

// recursion

int NearestPowerOf2_3(int i)
{
int x = ((i - 1) & i);
return x ? NearestPowerOf2_3(x) : i << 1;
}


// floating point representation

int NearestPowerOf2_4( int n )
{
const int MantissaMask = (1<<23) - 1;
(*(float*)&n) = (float) n;
n = n + MantissaMask & ~MantissaMask;
n = (int) (*(float*)&n);
return n;
}

// always 5 if statements (unrolled compare)

int NearestPowerOf2_5( int n )
{
if(n > 1<<16)
{
if(n > 1<<24)
{
if(n > 1<<28)
{
if(n > 1<<30)
{
if(n > 1<<31)
{
return 1<<32;
}
else
{
return 1<<31;
}
}
else
{
if(n > 1<<29)
{
return 1<<30;
}
else
{
return 1<<29;
}
}
}
else
{
if(n > 1<<26)
{
if(n > 1<<27)
{
return 1<<28;
}
else
{
return 1<<27;
}
}
else
{
if(n > 1<<25)
{
return 1<<26;
}
else
{
return 1<<25;
}
}
}
}
else
{
if(n > 1<<20)
{
if(n > 1<<22)
{
if(n > 1<<23)
{
return 1<<24;
}
else
{
return 1<<23;
}
}
else
{
if(n > 1<<21)
{
return 1<<22;
}
else
{
return 1<<21;
}
}
}
else
{
if(n > 1<<18)
{
if(n > 1<<19)
{
return 1<<20;
}
else
{
return 1<<19;
}
}
else
{
if(n > 1<<17)
{
return 1<<18;
}
else
{
return 1<<17;
}
}
}
}
}
else
{
if(n > 1<<8)
{
if(n > 1<<12)
{
if(n > 1<<14)
{
if(n > 1<<15)
{
return 1<<16;
}
else
{
return 1<<15;
}
}
else
{
if(n > 1<<13)
{
return 1<<14;
}
else
{
return 1<<13;
}
}
}
else
{
if(n > 1<<10)
{
if(n > 1<<11)
{
return 1<<12;
}
else
{
return 1<<11;
}
}
else
{
if(n > 1<<9)
{
return 1<<10;
}
else
{
return 1<<9;
}
}
}
}
else
{
if(n > 1<<4)
{
if(n > 1<<6)
{
if(n > 1<<7)
{
return 1<<8;
}
else
{
return 1<<7;
}
}
else
{
if(n > 1<<5)
{
return 1<<6;
}
else
{
return 1<<5;
}
}
}
else
{
if(n > 1<<2)
{
if(n > 1<<3)
{
return 1<<4;
}
else
{
return 1<<3;
}
}
else
{
if(n > 1<<1)
{
return 1<<2;
}
else
{
return 1<<1;
}
}
}
}
}
}


// asm trickery, assume N = 244 (11110100)

__declspec( naked ) int NearestPowerOf2_6( int n )
{
__asm
{
// ecx = n

dec ecx // n-=1; n= 243 (111100110)

mov eax,1 // store 1 in eax

bsr ecx,ecx // get the number of the first bit that is set to 1, scanning from MSB to LSB (eg. 000111100110 would give you 8)

inc ecx // ecx = 9

shl eax, cl // shift left 1 by 9 gives you 256

ret
}
}

// floating point representation

int NearestPowerOf2_7( int n )
{
return 0;
}

typedef int (*fun)(int);

int test(fun f)
{

__int64 before;
__int64 after;


srand(345678);

QueryPerformanceCounter ((LARGE_INTEGER*) &before);

for(int i=0; i<10000000; i++)
{
f(rand());
}

QueryPerformanceCounter ((LARGE_INTEGER*) &after);

__int64 diff = after - before;
__int64 milliseconds = diff / freq;
return (int) milliseconds;
}


int main(int argc, char* argv[])
{
srand(345678);

QueryPerformanceFrequency ((LARGE_INTEGER*) &freq);
freq = freq / 1000;

for(int i=0; i<100000000; i++)
{
int n = rand();
assert(! (NearestPowerOf2(n) == NearestPowerOf2_1(n) == NearestPowerOf2_2(n) == NearestPowerOf2_3(n) == NearestPowerOf2_4(n) == NearestPowerOf2_5(n) == NearestPowerOf2_6(n) ));
}

int k = test(NearestPowerOf2);
int k1 = test(NearestPowerOf2_1);
int k2 = test(NearestPowerOf2_2);
int k3 = test(NearestPowerOf2_3);
int k4 = test(NearestPowerOf2_4);
int k5 = test(NearestPowerOf2_5);
int k6 = test(NearestPowerOf2_6);
int k7 = test(NearestPowerOf2_7);

std::cout << k << std::endl;
std::cout << k1 << std::endl;
std::cout << k2 << std::endl;
std::cout << k3 << std::endl;
std::cout << k4 << std::endl;
std::cout << k5 << std::endl;
std::cout << k6 << std::endl;
std::cout << k7 << std::endl;

char c;
std::cin >> c;

return 0;
}



Quick synopsis: on my machine (dual xeon 2.8 ghz) the asm version is the quickest.

the floating point trick is only fourth..


Willem

[edited by - thezensunni on June 10, 2004 8:31:11 AM]

Share this post


Link to post
Share on other sites
am I the only one worried over the abuse of bsr?
from the instruction set reference:

"If the contents source operand are 0, the contents of the destination operand is undefined"

as it seems right now it simply doesn't change the destination in that case but really we can't just assume that now can we?

but if we can, here's a slightly faster version:


int __declspec( naked) __fastcall NearestPow2(int n)
{
_asm
{
dec ecx
mov eax, 2
bsr ecx, ecx
rol eax, cl
ret
}
}


edit: added trimmed version

[edited by - DigitalDelusion on June 10, 2004 9:21:21 AM]

Share this post


Link to post
Share on other sites
EDIT: Added DigitalDelusion asm bsr.

Nice you wrapped everything together.

#include <windows.h>
#include <iostream>
#include <cmath>
#include <cassert>

const unsigned times = 100000000;
__int64 freq;

//-----------------------------------------------------------------------------

// compare


unsigned __fastcall NearestPowerOf2(unsigned n)
{
if (!n) return n; // (0 == 2^0)


unsigned x = 1;
while(x < n)
{
x <<= 1;
}
return x;
}

//-----------------------------------------------------------------------------

// saturation


unsigned __fastcall NearestPowerOf2_1( unsigned x )
{
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return ++x;
}

//-----------------------------------------------------------------------------

// logarithmic


unsigned __fastcall NearestPowerOf2_2( unsigned n )
{
return (unsigned) pow( (float) 2, ceil( log( (float) n ) / log( (float) 2 ) ) );
}

//-----------------------------------------------------------------------------

// recursion


unsigned __fastcall NearestPowerOf2_3(unsigned i)
{
unsigned x = ((i - 1) & i);
return x ? NearestPowerOf2_3(x) : i << 1;
}

//-----------------------------------------------------------------------------

// floating pounsigned representation


unsigned __fastcall NearestPowerOf2_4( unsigned n )
{
const unsigned MantissaBitdepth = 23;
const unsigned MantissaMask = (1<<MantissaBitdepth) - 1;

(*(float*)&n) = (float) n;

n += MantissaMask;
n >>= 23;
n -= 127;

return 1 << n;
}

//-----------------------------------------------------------------------------

// always 5 if statements (unrolled compare)


unsigned __fastcall NearestPowerOf2_5( unsigned n )
{
if(n > 1<<16)
{
if(n > 1<<24)
{
if(n > 1<<28)
{
if(n > 1<<30)
{
if(n > 1<<31)
{
return 1<<32;
}
else
{
return 1<<31;
}
}
else
{
if(n > 1<<29)
{
return 1<<30;
}
else
{
return 1<<29;
}
}
}
else
{
if(n > 1<<26)
{
if(n > 1<<27)
{
return 1<<28;
}
else
{
return 1<<27;
}
}
else
{
if(n > 1<<25)
{
return 1<<26;
}
else
{
return 1<<25;
}
}
}
}
else
{
if(n > 1<<20)
{
if(n > 1<<22)
{
if(n > 1<<23)
{
return 1<<24;
}
else
{
return 1<<23;
}
}
else
{
if(n > 1<<21)
{
return 1<<22;
}
else
{
return 1<<21;
}
}
}
else
{
if(n > 1<<18)
{
if(n > 1<<19)
{
return 1<<20;
}
else
{
return 1<<19;
}
}
else
{
if(n > 1<<17)
{
return 1<<18;
}
else
{
return 1<<17;
}
}
}
}
}
else
{
if(n > 1<<8)
{
if(n > 1<<12)
{
if(n > 1<<14)
{
if(n > 1<<15)
{
return 1<<16;
}
else
{
return 1<<15;
}
}
else
{
if(n > 1<<13)
{
return 1<<14;
}
else
{
return 1<<13;
}
}
}
else
{
if(n > 1<<10)
{
if(n > 1<<11)
{
return 1<<12;
}
else
{
return 1<<11;
}
}
else
{
if(n > 1<<9)
{
return 1<<10;
}
else
{
return 1<<9;
}
}
}
}
else
{
if(n > 1<<4)
{
if(n > 1<<6)
{
if(n > 1<<7)
{
return 1<<8;
}
else
{
return 1<<7;
}
}
else
{
if(n > 1<<5)
{
return 1<<6;
}
else
{
return 1<<5;
}
}
}
else
{
if(n > 1<<2)
{
if(n > 1<<3)
{
return 1<<4;
}
else
{
return 1<<3;
}
}
else
{
if(n > 1<<1)
{
return 1<<2;
}
else
{
return 1<<1;
}
}
}
}
}
}

//-----------------------------------------------------------------------------

// asm trickery, assume N = 244 (11110100)


__declspec( naked )
unsigned __fastcall NearestPowerOf2_6( unsigned n )
{
// DigitalDelusion version

_asm
{
dec ecx
mov eax, 2
bsr ecx, ecx
rol eax, cl
ret
}
/*
__asm
{
// ecx = n
dec ecx // n-=1; n= 243 (111100110)
mov eax,1 // store 1 in eax
bsr ecx,ecx // get the number of the first bit that is set to 1, scanning from MSB to LSB (eg. 000111100110 would give you 8)
inc ecx // ecx = 9
shl eax, cl // shift left 1 by 9 gives you 256
ret
}
*/

}

//-----------------------------------------------------------------------------

// floating point representation


unsigned __fastcall NearestPowerOf2_7( unsigned n )
{
return 0;
}

//-----------------------------------------------------------------------------


typedef unsigned (__fastcall * fun)(unsigned);

unsigned test(fun f)
{
__int64 before;
__int64 after;

srand(345678);

QueryPerformanceCounter ((LARGE_INTEGER*) &before);

for(int i=0; i<times; i++)
{
f(rand());
}

QueryPerformanceCounter ((LARGE_INTEGER*) &after);

__int64 diff = after - before;
__int64 milliseconds = diff * 1000 / freq;
return (unsigned) milliseconds;
}

//-----------------------------------------------------------------------------


int main(int argc, char* argv[])
{
srand(345678);

QueryPerformanceFrequency ((LARGE_INTEGER*) &freq);

for(int i=0; i<123456; i++)
{
unsigned n = rand();
assert(! (NearestPowerOf2(n) == NearestPowerOf2_1(n) == NearestPowerOf2_2(n) == NearestPowerOf2_3(n) == NearestPowerOf2_4(n) == NearestPowerOf2_5(n) == NearestPowerOf2_6(n) ));
}

unsigned k = test(NearestPowerOf2);
unsigned k1 = test(NearestPowerOf2_1);
unsigned k2 = test(NearestPowerOf2_2);
unsigned k3 = test(NearestPowerOf2_3);
unsigned k4 = test(NearestPowerOf2_4);
unsigned k5 = test(NearestPowerOf2_5);
unsigned k6 = test(NearestPowerOf2_6);
// unsigned k7 = test(NearestPowerOf2_7);


std::cout << "compare " << k << std::endl;
std::cout << "saturation " << k1 << std::endl;
std::cout << "fpu log2 " << k2 << std::endl;
std::cout << "recursion " << k3 << std::endl;
std::cout << "float trick " << k4 << std::endl;
std::cout << "unrolled if " << k5 << std::endl;
std::cout << "asm bsr " << k6 << std::endl;
// std::cout << " " << k7 << std::endl;

/*
char c;
std::cin >> c;
*/

return 0;
}


Here's my results on an AthlonXP 1700+ :

compare     8235
saturation 4149
fpu log2 52911
recursion 6759
float trick 4642
unrolled if 4295
asm bsr 3951


PS: Very small function like 'float trick' and 'asm bsr' could run faster when inlined because of the ~dissolve~ effect of the optimizer.

[edited by - cnstrnd on June 10, 2004 10:07:10 AM]

Share this post


Link to post
Share on other sites
results on dual Xeon 2.8 ghz:

compare 3698
saturation 1446
fpu log2 38376
recursion 3394
float trick 3912
unrolled if 2530
asm bsr 1027
empty ref 871

The last one is a sort of baseline number, just calling an empty function X amount of times.. because it is through a function pointer, the compiler doesn''t optize that away.

Share this post


Link to post
Share on other sites
I just wanted say that I used something similar in an innerloop today, so I thought it might be interesting to what it can be used for.

But instead I needed to find the index of the first set bit for my huffman decoder.
Canonical huffman codes consist of a prefix of zeros followed by at most eight bits of ''data''. I used the first bit''s index to extract that data and could directly lookup the symbol in an array indexed by the bit and the code word.
eg.


while(Count--)
{
Pattern = BitStream->Get();

Index = s_FindFirstBit(Pattern);
Pattern = (Pattern >> Index) & 0xff;
Symbol = CodeTable[Index][Pattern];

BitStream->Advance(CodeList[Symbol].m_Size);

*Output++ = Symbol;
}

Share this post


Link to post
Share on other sites
doynax I've always loved compression and now how hard it is getting decent speed from huffies, so that's quite intresting to hear, any good links to info on canocial huffman coders?

Anyhow, if I understand what you're doing correctly you rist find the lowest set bit in "Pattern" call that "Index" and transmogrifies "Pattern" by a simple shift & and.

I hacked togheter this routine that (should) do the equvivalent
of

Index = s_FindFirstBit(Pattern);
Pattern = (Pattern >> Index) & 0xff;



//returns first set bit and stores the new pattern @ pOut

//returns -1 if Pattern is 0

int __declspec( naked) __fastcall GetIndexPattern(int Pattern, int *pOut)
{
static int error = -1;
_asm
{
bsf eax, ecx
cmovz eax, error
mov ebx, ecx
mov cl, al
shr ebx, cl
movzx ebx, bl
mov [edx], ebx
ret
}
}


simply call it like:
Index = GetIndexPattern( Pattern, &Pattern);

EDIT: added commments

[edited by - DigitalDelusion on June 11, 2004 5:45:44 AM]

Share this post


Link to post
Share on other sites

This topic is 5591 days old which is more than the 365 day threshold we allow for new replies. Please post a new topic.

If you intended to correct an error in the post then please contact us.

Guest
This topic is now closed to further replies.

  • Advertisement
×

Important Information

By using GameDev.net, you agree to our community Guidelines, Terms of Use, and Privacy Policy.

GameDev.net is your game development community. Create an account for your GameDev Portfolio and participate in the largest developer community in the games industry.

Sign me up!