• Create Account

### #Actual — c-up

Posted 09 May 2013 - 07:00 AM

Hodgman mentioning radix sort got me wondering so I've knocked up an implementation.

On my 2 core laptop (AMD E-450 APU 1.65GHz), sorting 1024*1024 items takes:

1 core = 678ms

2 core = 368ms (1.84x speedup)

On my 4 core desktop (AMD Athlon II X4 630 2.8GHz), sorting the same number of items takes:

1 core = 333ms

2 core = 190ms (1.75x)

3 core = 136ms (2.45x)

4 core = 106ms (3.14x)

Pretty reasonable speedups.

Anyway, here's the radix sort code. It recursively subdivides the problem to generate the parallelism as described here:

It doesn't do parallel scatter as Hodgman suggested for reasons commented on in the code. I'd be interested to know if there's a better approach than I took.

As well as it being parallel internally you can also run multiple (non-overlapping) sorts in parallel if you need to.

// parallel radix sort
// in-place stable sort of items array into ascending key order
// type to be sorted must have RadixSortKey uint property
// you can pass in temporary workspace otherwise it will allocate one internally on the stack
// NOTE(review): this listing is C-UP source, not C# — 'parallel', 'local' and
// 'foreach (T& x; arr)' are C-UP constructs; verify semantics against the C-UP docs.
// NOTE(review): several statement bodies appear to have been lost when this post
// was scraped (empty if/else branches and bodiless foreach loops below).
public parallel void RadixSort<T>(T[] local items, T[] local itemsTemp = null)
{
// workspace required for double buffering
T[] local itemsAlt = itemsTemp;
if (!itemsAlt)
itemsAlt = new local T[items.Length];// = void;			// = void prevents zeroing the allocated memory (in theory - currently crashes compiler)

// NOTE(review): both branch bodies are missing below — presumably the large-array
// case dispatched to RadixSortP and the small-array case to RadixSortNP; confirm
// against the original post.
if (items.Length > 4096)
else
}

private parallel void RadixSortP<T>(T[] local items, T[] local itemsAlt, int shift)
{
// calculate prefix values
// (256-entry histogram of the current 8-bit digit)
int[] local prefixes = new local int[256];

// if lots of keys then calculate prefixes in parallel
if (items.Length >= 32768)
{
int split = items.Length / 4;
int[] local [4] prefixesPar;
for (int i = 0; i < 4; i++)
prefixesPar[i] = new local int[256];
parallel
{
int s = 0;
// NOTE(review): only 3 of the 4 chunks are counted in this loop; the code handling
// the remainder (4th chunk plus any items.Length % 4 leftover) is missing from the scrape.
for (int i = 0; i < 3; i++, s += split)
RadixSortCountPrefixes<T>(prefixesPar[i], items[s .. s + split], shift);
// remainder
}

// sum the prefixes using simd add (stack allocated arrays are at least 16 byte aligned)
// NOTE(review): int8 is presumably an 8-lane SIMD integer vector type in C-UP — confirm.
foreach (int[] local pfxPar; prefixesPar)
{
int8[] local pfxSrc8 = cast(int8[] local)pfxPar;
int8[] local pfxDst8 = cast(int8[] local)prefixes;
foreach (int8& pfx, i; pfxSrc8)
pfxDst8[i] += *pfx;
}
}
// if not a large number of prefixes do them in serial
else
{
// NOTE(review): empty — the serial counting call (presumably RadixSortCountPrefixes
// over the whole array) appears to have been lost in the scrape.
}

// prefix is sum of all previous values
// (in-place exclusive prefix scan: prefixes[i] becomes the start offset of bucket i)
int sum = 0;
foreach (int& prefix, i; prefixes)
{
int newSum = sum + *prefix;
*prefix = sum;
sum = newSum;
}

// scatter values out to destination array
// can be done in parallel if shared memory is used but:
//	- you'd need to use atomic increments which would kill performance if you have lots of the same key
//	- would make the sort unstable
// (maybe there's another way to partition the work I'm not seeing that avoids these issues)
// digit = top byte of (key << shift); with shift = 0 this is the most significant
// byte, so this is an MSD radix sort over a 32-bit key in four 8-bit passes
foreach (T& item; items)
{
int o = prefixes[(item.RadixSortKey << shift) >> 24]++;
itemsAlt[o] = *item;
}

// done?
// (after the scatter, prefixes[i] holds the END offset of bucket i)
if (shift < 24)
{
// now sort the sub-arrays in parallel if they're reasonably large or non-parallel if they're not
// note items/itemsAlt are swapped in the recursive calls, so data lands back in 'items'
parallel
{
int s = 0;
for (int i = 0; i < 256; i++)
{
int e = prefixes[i];

if ((e - s) > 4096)
RadixSortP<T>(itemsAlt[s .. e], items[s .. e], shift + 8);
else if ((e - s) > 1)
RadixSortNP<T>(itemsAlt[s .. e], items[s .. e], shift + 8);
else if ((e - s) > 0)
items[s] = itemsAlt[s];

s = e;
}
}
}
}

// non-parallel version called by the master version above when arrays are sufficiently small
private void RadixSortNP<T>(T[] local items, T[] local itemsAlt, int shift) : parallel
{
// calculate prefix values
// NOTE(review): no counting loop is visible between this declaration and the
// prefix-sum below — the histogram pass was presumably lost when the post was scraped.
int[256] prefixes;

// prefix is sum of all previous values
int sum = 0;
foreach (int& prefix, i; prefixes)
{
int newSum = sum + *prefix;
*prefix = sum;
sum = newSum;
}

// scatter values out to destination array
foreach (T& item; items)
{
int o = prefixes[(item.RadixSortKey << shift) >> 24]++;
itemsAlt[o] = *item;
}

// if not done sort the sub-arrays
if (shift < 24)
{
int s = 0;
for (int i = 0; i < 256; i++)
{
int e = prefixes[i];

if ((e - s) > 1)
RadixSortNP<T>(itemsAlt[s .. e], items[s .. e], shift + 8);
else if ((e - s) > 0)
items[s] = itemsAlt[s];

s = e;
}
}
}

// counts how many keys fall into each of the 256 buckets for the digit selected by 'shift'
parallel void RadixSortCountPrefixes<T>(int[] local prefixes, const(T)[] local items, int shift)
{
// NOTE(review): loop body missing from the scrape — presumably
// prefixes[(item.RadixSortKey << shift) >> 24]++;
foreach (const(T)& item; items)
}


And here's the little testbed I used to generate the results at the top:

import System.IO.Console;

// global PRNG state, reseeded via SeedRand for repeatable runs
uint randSeed = 0;

// reset the generator to a known state so timing runs are reproducible
public void SeedRand(uint seed)
{
randSeed = seed;
}

// linear congruential generator (glibc-style constants 1103515245 / 12345),
// masked to 31 bits so results are non-negative
public uint Rand()
{
randSeed = (randSeed * 1103515245 + 12345) & int.MaxValue;
return randSeed;
}

// NOTE(review): the header of this type declaration (its 'class'/'struct' keyword
// and name) was lost when the post was scraped — this is the item type being sorted,
// exposing the RadixSortKey property that RadixSort requires.
{
private uint key;
private int value;

// constructor fills the item with pseudo-random key/value data
public this()
{
key = Rand();
value = Rand();
}

// sort key accessor required by RadixSort<T>
public uint RadixSortKey() const : parallel
{
return key;
}
}

public int Main(string[] args)
{
SeedRand(12345);

static const int numItems = 1024 * 1024;
static const int numRuns = 10;

// uncomment this line to set number of threads job queue can use (it clamps internally to 1 .. number of physical cores on machine)
// NOTE(review): the commented-out line itself is missing from the scrape.

// run radix sort performance test
// NOTE(review): C-UP appears to pass varargs by address (&numItems) — confirm against docs.
Console.WriteLine("Timing radix sort of %d items", &numItems);
ulong totalCycles = 0;

// 10 runs
for (int r = 0; r < numRuns; r++)
{
// NOTE(review): the item-array allocation/initialisation and the RadixSort call
// (between the two cycle-counter reads) were lost when the post was scraped.

ulong startCycle = GetProcessorCycleCount();

ulong endCycle = GetProcessorCycleCount();
totalCycles += endCycle - startCycle;

// check sorted
// NOTE(review): loop body (the ordering assertion) is missing from the scrape.
for (int i = 1; i < items.Length; i++)
}

// report average time taken
float totalMs = cast(float)totalCycles / cast(float)GetProcessorCyclesPerSecond() / cast(float)numRuns * 1000.0f;
Console.WriteLine("Average time taken over %d runs = %.2fms", &numRuns, &totalMs);

return 0;
}


### #3 — c-up

Posted 09 May 2013 - 06:59 AM

Hodgman mentioning radix sort got me wondering so I've knocked up an implementation.

On my 2 core laptop (AMD E-450 APU 1.65GHz), sorting 1024*1024 items takes:

1 core = 678ms

2 core = 368ms (1.84x speedup)

On my 4 core desktop (AMD Athlon II X4 630 2.8GHz), sorting the same number of items takes:

1 core = 333ms

2 core = 190ms (1.75x)

3 core = 136ms (2.45x)

4 core = 106ms (3.14x)

Pretty reasonable speedups.

Anyway, here's the radix sort code. It recursively subdivides the problem to generate the parallelism as described here:

It doesn't do parallel scatter as Hodgman suggested for reasons commented on in the code. I'd be interested to know if there's a better approach than I took.

As well as it being parallel internally you can also run multiple (non-overlapping) sorts in parallel if you need to.

// parallel radix sort
// in-place stable sort of items array into ascending key order
// type to be sorted must have RadixSortKey uint property
// you can pass in temporary workspace otherwise it will allocate one internally on the stack
// NOTE(review): this listing is C-UP source, not C# — 'parallel', 'local' and
// 'foreach (T& x; arr)' are C-UP constructs; verify semantics against the C-UP docs.
// NOTE(review): several statement bodies appear to have been lost when this post
// was scraped (empty if/else branches and bodiless foreach loops below).
// NOTE(review): 'int shift' follows a defaulted parameter here, which would force
// callers to pass itemsTemp explicitly; the later '#Actual' revision drops 'shift'.
public parallel void RadixSort<T>(T[] local items, T[] local itemsTemp = null, int shift)
{
// workspace required for double buffering
T[] local itemsAlt = itemsTemp;
if (!itemsAlt)
itemsAlt = new local T[items.Length];// = void;			// = void prevents zeroing the allocated memory (in theory - currently crashes compiler)

// NOTE(review): both branch bodies are missing below — presumably the large-array
// case dispatched to RadixSortP and the small-array case to RadixSortNP; confirm
// against the original post.
if (items.Length > 4096)
else
}

private parallel void RadixSortP<T>(T[] local items, T[] local itemsAlt, int shift)
{
// calculate prefix values
// (256-entry histogram of the current 8-bit digit)
int[] local prefixes = new local int[256];

// if lots of keys then calculate prefixes in parallel
if (items.Length >= 32768)
{
int split = items.Length / 4;
int[] local [4] prefixesPar;
for (int i = 0; i < 4; i++)
prefixesPar[i] = new local int[256];
parallel
{
int s = 0;
// NOTE(review): only 3 of the 4 chunks are counted in this loop; the code handling
// the remainder (4th chunk plus any items.Length % 4 leftover) is missing from the scrape.
for (int i = 0; i < 3; i++, s += split)
RadixSortCountPrefixes<T>(prefixesPar[i], items[s .. s + split], shift);
// remainder
}

// sum the prefixes using simd add (stack allocated arrays are at least 16 byte aligned)
// NOTE(review): int8 is presumably an 8-lane SIMD integer vector type in C-UP — confirm.
foreach (int[] local pfxPar; prefixesPar)
{
int8[] local pfxSrc8 = cast(int8[] local)pfxPar;
int8[] local pfxDst8 = cast(int8[] local)prefixes;
foreach (int8& pfx, i; pfxSrc8)
pfxDst8[i] += *pfx;
}
}
// if not a large number of prefixes do them in serial
else
{
// NOTE(review): empty — the serial counting call (presumably RadixSortCountPrefixes
// over the whole array) appears to have been lost in the scrape.
}

// prefix is sum of all previous values
// (in-place exclusive prefix scan: prefixes[i] becomes the start offset of bucket i)
int sum = 0;
foreach (int& prefix, i; prefixes)
{
int newSum = sum + *prefix;
*prefix = sum;
sum = newSum;
}

// scatter values out to destination array
// can be done in parallel if shared memory is used but:
//	- you'd need to use atomic increments which would kill performance if you have lots of the same key
//	- would make the sort unstable
// (maybe there's another way to partition the work I'm not seeing that avoids these issues)
// digit = top byte of (key << shift); with shift = 0 this is the most significant
// byte, so this is an MSD radix sort over a 32-bit key in four 8-bit passes
foreach (T& item; items)
{
int o = prefixes[(item.RadixSortKey << shift) >> 24]++;
itemsAlt[o] = *item;
}

// done?
// (after the scatter, prefixes[i] holds the END offset of bucket i)
if (shift < 24)
{
// now sort the sub-arrays in parallel if they're reasonably large or non-parallel if they're not
// note items/itemsAlt are swapped in the recursive calls, so data lands back in 'items'
parallel
{
int s = 0;
for (int i = 0; i < 256; i++)
{
int e = prefixes[i];

if ((e - s) > 4096)
RadixSortP<T>(itemsAlt[s .. e], items[s .. e], shift + 8);
else if ((e - s) > 1)
RadixSortNP<T>(itemsAlt[s .. e], items[s .. e], shift + 8);
else if ((e - s) > 0)
items[s] = itemsAlt[s];

s = e;
}
}
}
}

// non-parallel version called by the master version above when arrays are sufficiently small
private void RadixSortNP<T>(T[] local items, T[] local itemsAlt, int shift) : parallel
{
// calculate prefix values
// NOTE(review): no counting loop is visible between this declaration and the
// prefix-sum below — the histogram pass was presumably lost when the post was scraped.
int[256] prefixes;

// prefix is sum of all previous values
int sum = 0;
foreach (int& prefix, i; prefixes)
{
int newSum = sum + *prefix;
*prefix = sum;
sum = newSum;
}

// scatter values out to destination array
foreach (T& item; items)
{
int o = prefixes[(item.RadixSortKey << shift) >> 24]++;
itemsAlt[o] = *item;
}

// if not done sort the sub-arrays
if (shift < 24)
{
int s = 0;
for (int i = 0; i < 256; i++)
{
int e = prefixes[i];

if ((e - s) > 1)
RadixSortNP<T>(itemsAlt[s .. e], items[s .. e], shift + 8);
else if ((e - s) > 0)
items[s] = itemsAlt[s];

s = e;
}
}
}

// counts how many keys fall into each of the 256 buckets for the digit selected by 'shift'
parallel void RadixSortCountPrefixes<T>(int[] local prefixes, const(T)[] local items, int shift)
{
// NOTE(review): loop body missing from the scrape — presumably
// prefixes[(item.RadixSortKey << shift) >> 24]++;
foreach (const(T)& item; items)
}


And here's the little testbed I used to generate the results at the top:

import System.IO.Console;

// global PRNG state, reseeded via SeedRand for repeatable runs
uint randSeed = 0;

// reset the generator to a known state so timing runs are reproducible
public void SeedRand(uint seed)
{
randSeed = seed;
}

// linear congruential generator (glibc-style constants 1103515245 / 12345),
// masked to 31 bits so results are non-negative
public uint Rand()
{
randSeed = (randSeed * 1103515245 + 12345) & int.MaxValue;
return randSeed;
}

// NOTE(review): the header of this type declaration (its 'class'/'struct' keyword
// and name) was lost when the post was scraped — this is the item type being sorted,
// exposing the RadixSortKey property that RadixSort requires.
{
private uint key;
private int value;

// constructor fills the item with pseudo-random key/value data
public this()
{
key = Rand();
value = Rand();
}

// sort key accessor required by RadixSort<T>
public uint RadixSortKey() const : parallel
{
return key;
}
}

public int Main(string[] args)
{
SeedRand(12345);

static const int numItems = 1024 * 1024;
static const int numRuns = 10;

// uncomment this line to set number of threads job queue can use (it clamps internally to 1 .. number of physical cores on machine)
// NOTE(review): the commented-out line itself is missing from the scrape.

// run radix sort performance test
// NOTE(review): C-UP appears to pass varargs by address (&numItems) — confirm against docs.
Console.WriteLine("Timing radix sort of %d items", &numItems);
ulong totalCycles = 0;

// 10 runs
for (int r = 0; r < numRuns; r++)
{
// NOTE(review): the item-array allocation/initialisation and the RadixSort call
// (between the two cycle-counter reads) were lost when the post was scraped.

ulong startCycle = GetProcessorCycleCount();

ulong endCycle = GetProcessorCycleCount();
totalCycles += endCycle - startCycle;

// check sorted
// NOTE(review): loop body (the ordering assertion) is missing from the scrape.
for (int i = 1; i < items.Length; i++)
}

// report average time taken
float totalMs = cast(float)totalCycles / cast(float)GetProcessorCyclesPerSecond() / cast(float)numRuns * 1000.0f;
Console.WriteLine("Average time taken over %d runs = %.2fms", &numRuns, &totalMs);

return 0;
}


### #2 — c-up

Posted 09 May 2013 - 01:40 AM

Hodgman mentioning radix sort got me wondering so I've knocked up an implementation.

On my 2 core laptop (AMD E-450 APU 1.65GHz), sorting 1024*1024 items takes:

1 core = 718ms

2 core = 368ms (1.95x speedup)

On my 4 core desktop (AMD Athlon II X4 630 2.8GHz), sorting the same number of items takes:

1 core = 333ms

2 core = 190ms (1.75x)

3 core = 136ms (2.45x)

4 core = 106ms (3.14x)

Pretty reasonable speedups. I don't understand how I'm getting such a good speedup on the laptop actually ... probably made a mistake somewhere but those results seem repeatable.

Anyway, here's the radix sort code. It recursively subdivides the problem to generate the parallelism as described here:

It doesn't do parallel scatter as Hodgman suggested for reasons commented on in the code. I'd be interested to know if there's a better approach than I took.

As well as it being parallel internally you can also run multiple (non-overlapping) sorts in parallel if you need to.

// parallel radix sort
// in-place stable sort of items array into ascending key order
// type to be sorted must have RadixSortKey uint property
// you can pass in temporary workspace otherwise it will allocate one internally on the stack
// NOTE(review): this listing is C-UP source, not C# — 'parallel', 'local' and
// 'foreach (T& x; arr)' are C-UP constructs; verify semantics against the C-UP docs.
// NOTE(review): several statement bodies appear to have been lost when this post
// was scraped (empty if/else branches and bodiless foreach loops below).
// NOTE(review): 'int shift' follows a defaulted parameter here, which would force
// callers to pass itemsTemp explicitly; the later '#Actual' revision drops 'shift'.
public parallel void RadixSort<T>(T[] local items, T[] local itemsTemp = null, int shift)
{
// workspace required for double buffering
T[] local itemsAlt = itemsTemp;
if (!itemsAlt)
itemsAlt = new local T[items.Length];// = void;			// = void prevents zeroing the allocated memory (in theory - currently crashes compiler)

// NOTE(review): both branch bodies are missing below — presumably the large-array
// case dispatched to RadixSortP and the small-array case to RadixSortNP; confirm
// against the original post.
if (items.Length > 4096)
else
}

private parallel void RadixSortP<T>(T[] local items, T[] local itemsAlt, int shift)
{
// calculate prefix values
// (256-entry histogram of the current 8-bit digit)
int[] local prefixes = new local int[256];

// if lots of keys then calculate prefixes in parallel
if (items.Length >= 32768)
{
int split = items.Length / 4;
int[] local [4] prefixesPar;
for (int i = 0; i < 4; i++)
prefixesPar[i] = new local int[256];
parallel
{
int s = 0;
// NOTE(review): only 3 of the 4 chunks are counted in this loop; the code handling
// the remainder (4th chunk plus any items.Length % 4 leftover) is missing from the scrape.
for (int i = 0; i < 3; i++, s += split)
RadixSortCountPrefixes<T>(prefixesPar[i], items[s .. s + split], shift);
// remainder
}

// sum the prefixes using simd add (stack allocated arrays are at least 16 byte aligned)
// NOTE(review): int8 is presumably an 8-lane SIMD integer vector type in C-UP — confirm.
foreach (int[] local pfxPar; prefixesPar)
{
int8[] local pfxSrc8 = cast(int8[] local)pfxPar;
int8[] local pfxDst8 = cast(int8[] local)prefixes;
foreach (int8& pfx, i; pfxSrc8)
pfxDst8[i] += *pfx;
}
}
// if not a large number of prefixes do them in serial
else
{
// NOTE(review): empty — the serial counting call (presumably RadixSortCountPrefixes
// over the whole array) appears to have been lost in the scrape.
}

// prefix is sum of all previous values
// (in-place exclusive prefix scan: prefixes[i] becomes the start offset of bucket i)
int sum = 0;
foreach (int& prefix, i; prefixes)
{
int newSum = sum + *prefix;
*prefix = sum;
sum = newSum;
}

// scatter values out to destination array
// can be done in parallel if shared memory is used but:
//	- you'd need to use atomic increments which would kill performance if you have lots of the same key
//	- would make the sort unstable
// (maybe there's another way to partition the work I'm not seeing that avoids these issues)
// digit = top byte of (key << shift); with shift = 0 this is the most significant
// byte, so this is an MSD radix sort over a 32-bit key in four 8-bit passes
foreach (T& item; items)
{
int o = prefixes[(item.RadixSortKey << shift) >> 24]++;
itemsAlt[o] = *item;
}

// done?
// (after the scatter, prefixes[i] holds the END offset of bucket i)
if (shift < 24)
{
// now sort the sub-arrays in parallel if they're reasonably large or non-parallel if they're not
// note items/itemsAlt are swapped in the recursive calls, so data lands back in 'items'
parallel
{
int s = 0;
for (int i = 0; i < 256; i++)
{
int e = prefixes[i];

if ((e - s) > 4096)
RadixSortP<T>(itemsAlt[s .. e], items[s .. e], shift + 8);
else if ((e - s) > 1)
RadixSortNP<T>(itemsAlt[s .. e], items[s .. e], shift + 8);
else if ((e - s) > 0)
items[s] = itemsAlt[s];

s = e;
}
}
}
}

// non-parallel version called by the master version above when arrays are sufficiently small
private void RadixSortNP<T>(T[] local items, T[] local itemsAlt, int shift) : parallel
{
// calculate prefix values
// NOTE(review): no counting loop is visible between this declaration and the
// prefix-sum below — the histogram pass was presumably lost when the post was scraped.
int[256] prefixes;

// prefix is sum of all previous values
int sum = 0;
foreach (int& prefix, i; prefixes)
{
int newSum = sum + *prefix;
*prefix = sum;
sum = newSum;
}

// scatter values out to destination array
foreach (T& item; items)
{
int o = prefixes[(item.RadixSortKey << shift) >> 24]++;
itemsAlt[o] = *item;
}

// if not done sort the sub-arrays
if (shift < 24)
{
int s = 0;
for (int i = 0; i < 256; i++)
{
int e = prefixes[i];

if ((e - s) > 1)
RadixSortNP<T>(itemsAlt[s .. e], items[s .. e], shift + 8);
else if ((e - s) > 0)
items[s] = itemsAlt[s];

s = e;
}
}
}

// counts how many keys fall into each of the 256 buckets for the digit selected by 'shift'
parallel void RadixSortCountPrefixes<T>(int[] local prefixes, const(T)[] local items, int shift)
{
// NOTE(review): loop body missing from the scrape — presumably
// prefixes[(item.RadixSortKey << shift) >> 24]++;
foreach (const(T)& item; items)
}


And here's the little testbed I used to generate the results at the top:

import System.IO.Console;

// global PRNG state, reseeded via SeedRand for repeatable runs
uint randSeed = 0;

// reset the generator to a known state so timing runs are reproducible
public void SeedRand(uint seed)
{
randSeed = seed;
}

// linear congruential generator (glibc-style constants 1103515245 / 12345),
// masked to 31 bits so results are non-negative
public uint Rand()
{
randSeed = (randSeed * 1103515245 + 12345) & int.MaxValue;
return randSeed;
}

// NOTE(review): the header of this type declaration (its 'class'/'struct' keyword
// and name) was lost when the post was scraped — this is the item type being sorted,
// exposing the RadixSortKey property that RadixSort requires.
{
private uint key;
private int value;

// constructor fills the item with pseudo-random key/value data
public this()
{
key = Rand();
value = Rand();
}

// sort key accessor required by RadixSort<T>
public uint RadixSortKey() const : parallel
{
return key;
}
}

public int Main(string[] args)
{
SeedRand(12345);

static const int numItems = 1024 * 1024;
static const int numRuns = 10;

// uncomment this line to set number of threads job queue can use (it clamps internally to 1 .. number of physical cores on machine)
// NOTE(review): the commented-out line itself is missing from the scrape.

// run radix sort performance test
// NOTE(review): C-UP appears to pass varargs by address (&numItems) — confirm against docs.
Console.WriteLine("Timing radix sort of %d items", &numItems);
ulong totalCycles = 0;

// 10 runs
for (int r = 0; r < numRuns; r++)
{
// NOTE(review): the item-array allocation/initialisation and the RadixSort call
// (between the two cycle-counter reads) were lost when the post was scraped.

ulong startCycle = GetProcessorCycleCount();

ulong endCycle = GetProcessorCycleCount();
totalCycles += endCycle - startCycle;

// check sorted
// NOTE(review): loop body (the ordering assertion) is missing from the scrape.
for (int i = 1; i < items.Length; i++)
}

// report average time taken
float totalMs = cast(float)totalCycles / cast(float)GetProcessorCyclesPerSecond() / cast(float)numRuns * 1000.0f;
Console.WriteLine("Average time taken over %d runs = %.2fms", &numRuns, &totalMs);

return 0;
}


### #1 — c-up

Posted 08 May 2013 - 03:55 PM

Hodgman mentioning radix sort got me wondering so I've knocked up an implementation.

On my 2 core laptop (AMD E-450 APU 1.65GHz), sorting 1024*1024 items takes:

1 core = 718ms

2 core = 368ms (1.95x speedup)

On my 4 core desktop (AMD Athlon II X4 630 2.8GHz), sorting the same number of items takes:

1 core = 333ms

2 core = 190ms (1.75x)

3 core = 136ms (2.45x)

4 core = 106ms (3.14x)

Pretty reasonable speedups. I don't understand how I'm getting such a good speedup on the laptop actually ... probably made a mistake somewhere but those results seem repeatable.

Anyway, here's the radix sort code. It recursively subdivides the problem to generate the parallelism as described here:

It doesn't do parallel scatter as Hodgman suggested for reasons commented on in the code. I'd be interested to know if there's a better approach than I took.

As well as it being parallel internally you can also run multiple (non-overlapping) sorts in parallel if you need to.

// parallel radix sort
// in-place stable sort of items array into ascending key order
// type to be sorted must have RadixSortKey uint property
// you can pass in temporary workspace otherwise it will allocate one internally on the stack
// NOTE(review): this listing is C-UP source, not C# — 'parallel', 'local', 'shared' and
// 'foreach (T& x; arr)' are C-UP constructs; verify semantics against the C-UP docs.
// NOTE(review): several statement bodies appear to have been lost when this post
// was scraped (empty if/else branches and bodiless foreach loops below).
// NOTE(review): 'int shift' follows a defaulted parameter here, which would force
// callers to pass itemsTemp explicitly; the later '#Actual' revision drops 'shift'.
public parallel void RadixSort<T>(T[] local items, T[] local itemsTemp = null, int shift)
{
// workspace required for double buffering
T[] local itemsAlt = itemsTemp;
if (!itemsAlt)
itemsAlt = new local T[items.Length];// = void;			// = void prevents zeroing the allocated memory (in theory - currently crashes compiler)

// NOTE(review): both branch bodies are missing below — presumably the large-array
// case dispatched to RadixSortP and the small-array case to RadixSortNP; confirm
// against the original post.
if (items.Length > 4096)
else
}

// this is implemented as a private function so shared items array isn't exposed as a public interface because
// this would be very bad if multiple instances of this function were invoked on a single array in parallel
// NOTE(review): this earliest revision marks the array parameters 'shared'; later
// revisions of the post drop the qualifier.
private parallel void RadixSortP<T>(shared T[] local items, shared T[] local itemsAlt, int shift)
{
// calculate prefix values
// (256-entry histogram of the current 8-bit digit)
int[] local prefixes = new local int[256];

// if lots of keys then calculate prefixes in parallel
if (items.Length >= 32768)
{
int split = items.Length / 4;
int[] local [4] prefixesPar;
for (int i = 0; i < 4; i++)
prefixesPar[i] = new local int[256];
parallel
{
int s = 0;
// NOTE(review): only 3 of the 4 chunks are counted in this loop; the code handling
// the remainder (4th chunk plus any items.Length % 4 leftover) is missing from the scrape.
for (int i = 0; i < 3; i++, s += split)
RadixSortCountPrefixes<T>(prefixesPar[i], items[s .. s + split], shift);
// remainder
}

// sum the prefixes using simd add (stack allocated arrays are at least 16 byte aligned)
// NOTE(review): int8 is presumably an 8-lane SIMD integer vector type in C-UP — confirm.
foreach (int[] local pfxPar; prefixesPar)
{
int8[] local pfxSrc8 = cast(int8[] local)pfxPar;
int8[] local pfxDst8 = cast(int8[] local)prefixes;
foreach (int8& pfx, i; pfxSrc8)
pfxDst8[i] += *pfx;
}
}
// if not a large number of prefixes do them in serial
else
{
// NOTE(review): empty — the serial counting call (presumably RadixSortCountPrefixes
// over the whole array) appears to have been lost in the scrape.
}

// prefix is sum of all previous values
// (in-place exclusive prefix scan: prefixes[i] becomes the start offset of bucket i)
int sum = 0;
foreach (int& prefix, i; prefixes)
{
int newSum = sum + *prefix;
*prefix = sum;
sum = newSum;
}

// scatter values out to destination array
// can be done in parallel if shared memory is used but:
//	- you'd need to use atomic increments which would kill performance if you have lots of the same key
//	- would make the sort unstable
// (maybe there's another way to partition the work I'm not seeing that avoids these issues)
// digit = top byte of (key << shift); with shift = 0 this is the most significant
// byte, so this is an MSD radix sort over a 32-bit key in four 8-bit passes
foreach (T& item; items)
{
int o = prefixes[(item.RadixSortKey << shift) >> 24]++;
itemsAlt[o] = *item;
}

// done?
// (after the scatter, prefixes[i] holds the END offset of bucket i)
if (shift < 24)
{
// now sort the sub-arrays in parallel if they're reasonably large or non-parallel if they're not
// note items/itemsAlt are swapped in the recursive calls, so data lands back in 'items'
parallel
{
int s = 0;
for (int i = 0; i < 256; i++)
{
int e = prefixes[i];

if ((e - s) > 4096)
RadixSortP<T>(itemsAlt[s .. e], items[s .. e], shift + 8);
else if ((e - s) > 1)
RadixSortNP<T>(itemsAlt[s .. e], items[s .. e], shift + 8);
else if ((e - s) > 0)
items[s] = itemsAlt[s];

s = e;
}
}
}
}

// non-parallel version called by the master version above when arrays are sufficiently small
private void RadixSortNP<T>(T[] local items, T[] local itemsAlt, int shift) : parallel
{
// calculate prefix values
// NOTE(review): no counting loop is visible between this declaration and the
// prefix-sum below — the histogram pass was presumably lost when the post was scraped.
int[256] prefixes;

// prefix is sum of all previous values
int sum = 0;
foreach (int& prefix, i; prefixes)
{
int newSum = sum + *prefix;
*prefix = sum;
sum = newSum;
}

// scatter values out to destination array
foreach (T& item; items)
{
int o = prefixes[(item.RadixSortKey << shift) >> 24]++;
itemsAlt[o] = *item;
}

// if not done sort the sub-arrays
if (shift < 24)
{
int s = 0;
for (int i = 0; i < 256; i++)
{
int e = prefixes[i];

if ((e - s) > 1)
RadixSortNP<T>(itemsAlt[s .. e], items[s .. e], shift + 8);
else if ((e - s) > 0)
items[s] = itemsAlt[s];

s = e;
}
}
}

// counts how many keys fall into each of the 256 buckets for the digit selected by 'shift'
parallel void RadixSortCountPrefixes<T>(int[] local prefixes, const(T)[] local items, int shift)
{
// NOTE(review): loop body missing from the scrape — presumably
// prefixes[(item.RadixSortKey << shift) >> 24]++;
foreach (const(T)& item; items)
}


And here's the little testbed I used to generate the results at the top:

import System.IO.Console;

// global PRNG state, reseeded via SeedRand for repeatable runs
uint randSeed = 0;

// reset the generator to a known state so timing runs are reproducible
public void SeedRand(uint seed)
{
randSeed = seed;
}

// linear congruential generator (glibc-style constants 1103515245 / 12345),
// masked to 31 bits so results are non-negative
public uint Rand()
{
randSeed = (randSeed * 1103515245 + 12345) & int.MaxValue;
return randSeed;
}

// NOTE(review): the header of this type declaration (its 'class'/'struct' keyword
// and name) was lost when the post was scraped — this is the item type being sorted,
// exposing the RadixSortKey property that RadixSort requires.
{
private uint key;
private int value;

// constructor fills the item with pseudo-random key/value data
public this()
{
key = Rand();
value = Rand();
}

// sort key accessor required by RadixSort<T>
public uint RadixSortKey() const : parallel
{
return key;
}
}

public int Main(string[] args)
{
SeedRand(12345);

static const int numItems = 1024 * 1024;
static const int numRuns = 10;

// uncomment this line to set number of threads job queue can use (it clamps internally to 1 .. number of physical cores on machine)
// NOTE(review): the commented-out line itself is missing from the scrape.

// run radix sort performance test
// NOTE(review): C-UP appears to pass varargs by address (&numItems) — confirm against docs.
Console.WriteLine("Timing radix sort of %d items", &numItems);
ulong totalCycles = 0;

// 10 runs
for (int r = 0; r < numRuns; r++)
{
// NOTE(review): the item-array allocation/initialisation and the RadixSort call
// (between the two cycle-counter reads) were lost when the post was scraped.

ulong startCycle = GetProcessorCycleCount();

ulong endCycle = GetProcessorCycleCount();
totalCycles += endCycle - startCycle;

// check sorted
// NOTE(review): loop body (the ordering assertion) is missing from the scrape.
for (int i = 1; i < items.Length; i++)
}

// report average time taken
float totalMs = cast(float)totalCycles / cast(float)GetProcessorCyclesPerSecond() / cast(float)numRuns * 1000.0f;
Console.WriteLine("Average time taken over %d runs = %.2fms", &numRuns, &totalMs);

return 0;
}


PARTNERS