Hodgman mentioning radix sort got me wondering, so I've knocked up an implementation.
On my 2 core laptop (AMD E-450 APU 1.65GHz), sorting 1024*1024 items takes:
1 core = 678ms
2 core = 368ms (1.84x speedup)
On my 4 core desktop (AMD Athlon II X4 630 2.8GHz), sorting the same number of items takes:
1 core = 333ms
2 core = 190ms (1.75x)
3 core = 136ms (2.45x)
4 core = 106ms (3.14x)
Pretty reasonable speedups.
Anyway, here's the radix sort code. It recursively subdivides the problem to generate the parallelism as described here:
http://software.intel.com/sites/default/files/m/b/3/6/3/8/22396-ParallelRadixSort_andreyryabov.pdf
It doesn't do the parallel scatter Hodgman suggested, for reasons commented on in the code. I'd be interested to know if there's a better approach than the one I took.
As well as being parallel internally, you can also run multiple (non-overlapping) sorts in parallel if you need to.
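For example, something like this should work (itemsA and itemsB being hypothetical, non-overlapping arrays - untested):
// two independent sorts running concurrently
parallel
{
RadixSort<RadixItem>(itemsA);
RadixSort<RadixItem>(itemsB);
}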
// parallel radix sort
// in-place stable sort of items array into ascending key order
// type to be sorted must have RadixSortKey uint property
// you can pass in a temporary workspace, otherwise one is allocated internally on the stack
public parallel void RadixSort<T>(T[] local items, T[] local itemsTemp = null)
{
// workspace required for double buffering
T[] local itemsAlt = itemsTemp;
if (!itemsAlt)
itemsAlt = new local T[items.Length];// = void; // = void prevents zeroing the allocated memory (in theory - currently crashes compiler)
if (items.Length > 4096)
RadixSortP<T>(items, itemsAlt, 0);
else
RadixSortNP<T>(items, itemsAlt, 0);
}
private parallel void RadixSortP<T>(T[] local items, T[] local itemsAlt, int shift)
{
// calculate prefix values
int[] local prefixes = new local int[256];
// if lots of keys then calculate prefixes in parallel
if (items.Length >= 32768)
{
int split = items.Length / 4;
int[] local [4] prefixesPar;
for (int i = 0; i < 4; i++)
prefixesPar[i] = new local int[256];
parallel
{
int s = 0;
for (int i = 0; i < 3; i++, s += split)
RadixSortCountPrefixes<T>(prefixesPar[i], items[s .. s + split], shift);
// remainder
RadixSortCountPrefixes<T>(prefixesPar[3], items[s .. items.Length], shift);
}
// sum the prefixes using simd add (stack allocated arrays are at least 16 byte aligned)
foreach (int[] local pfxPar; prefixesPar)
{
int8[] local pfxSrc8 = cast(int8[] local)pfxPar;
int8[] local pfxDst8 = cast(int8[] local)prefixes;
foreach (int8& pfx, i; pfxSrc8)
pfxDst8[i] += *pfx;
}
}
// if not a large number of keys, count the prefixes serially
else
{
RadixSortCountPrefixes<T>(prefixes, items, shift);
}
// prefix is sum of all previous values
int sum = 0;
foreach (int& prefix, i; prefixes)
{
int newSum = sum + *prefix;
*prefix = sum;
sum = newSum;
}
// scatter values out to destination array
// can be done in parallel if shared memory is used but:
// - you'd need to use atomic increments which would kill performance if you have lots of the same key
// - would make the sort unstable
// (maybe there's another way to partition the work I'm not seeing that avoids these issues - one candidate is sketched just after the code)
foreach (T& item; items)
{
int o = prefixes[(item.RadixSortKey << shift) >> 24]++;
itemsAlt[o] = *item;
}
// more key bytes to sort?
if (shift < 24)
{
// now sort the sub-arrays in parallel if they're reasonably large or non-parallel if they're not
parallel
{
int s = 0;
for (int i = 0; i < 256; i++)
{
int e = prefixes[i];
if ((e - s) > 4096)
RadixSortP<T>(itemsAlt[s .. e], items[s .. e], shift + 8);
else if ((e - s) > 1)
RadixSortNP<T>(itemsAlt[s .. e], items[s .. e], shift + 8);
else if ((e - s) > 0)
items[s] = itemsAlt[s];
s = e;
}
}
}
}
// non-parallel version called by the master version above when arrays are sufficiently small
private void RadixSortNP<T>(T[] local items, T[] local itemsAlt, int shift) : parallel
{
// calculate prefix values
int[256] prefixes;
RadixSortCountPrefixes<T>(&prefixes, items, shift);
// prefix is sum of all previous values
int sum = 0;
foreach (int& prefix, i; prefixes)
{
int newSum = sum + *prefix;
*prefix = sum;
sum = newSum;
}
// scatter values out to destination array
foreach (T& item; items)
{
int o = prefixes[(item.RadixSortKey << shift) >> 24]++;
itemsAlt[o] = *item;
}
// if not done sort the sub-arrays
if (shift < 24)
{
int s = 0;
for (int i = 0; i < 256; i++)
{
int e = prefixes[i];
if ((e - s) > 1)
RadixSortNP<T>(itemsAlt[s .. e], items[s .. e], shift + 8);
else if ((e - s) > 0)
items[s] = itemsAlt[s];
s = e;
}
}
}
// radix sort helper
parallel void RadixSortCountPrefixes<T>(int[] local prefixes, const(T)[] local items, int shift)
{
foreach (const(T)& item; items)
prefixes[(item.RadixSortKey << shift) >> 24]++;
}
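On the scatter question above: one partitioning that might avoid both the atomics and the stability problem is to give each worker its own set of write offsets, derived from per-worker digit counts like prefixesPar. Each worker then scatters its own contiguous chunk into disjoint, in-order output ranges. A rough, untested sketch of the offset computation (the scatter itself would still need itemsAlt to be in shared rather than local memory):
// hypothetical: per-worker scatter offsets from per-worker digit counts
int[] local [4] offsets;
for (int w = 0; w < 4; w++)
offsets[w] = new local int[256];
int base = 0;
for (int d = 0; d < 256; d++)
for (int w = 0; w < 4; w++)
{
offsets[w][d] = base;
base += prefixesPar[w][d];
}
// worker w now scatters its chunk with itemsAlt[offsets[w][digit]++] = item;
// each (worker, digit) range is disjoint and in chunk order, so no atomics
// are needed and items with equal keys keep their relative order (stable)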
And here's the little testbed I used to generate the results at the top:
import System.IO.Console;
uint randSeed = 0;
public void SeedRand(uint seed)
{
randSeed = seed;
}
public uint Rand()
{
randSeed = (randSeed * 1103515245 + 12345) & int.MaxValue;
return randSeed;
}
public struct RadixItem
{
private uint key;
private int value;
public this()
{
key = Rand();
value = Rand();
}
public uint RadixSortKey() const : parallel
{
return key;
}
}
public int Main(string[] args)
{
SeedRand(12345);
static const int numItems = 1024 * 1024;
static const int numRuns = 10;
RadixItem[] items = new RadixItem[numItems];
RadixItem[] itemsTmp = new RadixItem[numItems];
// uncomment this line to set number of threads job queue can use (it clamps internally to 1 .. number of physical cores on machine)
//System.Profile.JobQueue.SetNumThreads(2);
// run radix sort performance test
Console.WriteLine("Timing radix sort of %d items", &numItems);
ulong totalCycles = 0;
// 10 runs
for (int r = 0; r < numRuns; r++)
{
foreach (RadixItem& item; items)
*item = RadixItem();
ulong startCycle = GetProcessorCycleCount();
RadixSort<RadixItem>(items, itemsTmp);
ulong endCycle = GetProcessorCycleCount();
totalCycles += endCycle - startCycle;
// check sorted
for (int i = 1; i < items.Length; i++)
Assert(items[i].RadixSortKey >= items[i - 1].RadixSortKey);
}
// report average time taken
float totalMs = cast(float)totalCycles / cast(float)GetProcessorCyclesPerSecond() / cast(float)numRuns * 1000.0f;
Console.WriteLine("Average time taken over %d runs = %.2fms", &numRuns, &totalMs);
return 0;
}