Jump to content

  • Log In with Google      Sign In   
  • Create Account


#Actual — c-up

Posted 09 May 2013 - 07:00 AM

Hodgman mentioning radix sort got me wondering so I've knocked up an implementation.

 

On my 2 core laptop (AMD E-450 APU 1.65GHz), sorting 1024*1024 items takes:

1 core = 678ms

2 core = 368ms (1.84x speedup)

 

On my 4 core desktop (AMD Athlon II X4 630 2.8GHz), sorting the same number of items takes:

1 core = 333ms

2 core = 190ms (1.75x)

3 core = 136ms (2.45x)

4 core = 106ms (3.14x)

 

Pretty reasonable speedups.

 

Anyway, here's the radix sort code. It recursively subdivides the problem to generate the parallelism as described here:

http://software.intel.com/sites/default/files/m/b/3/6/3/8/22396-ParallelRadixSort_andreyryabov.pdf

 

It doesn't do parallel scatter as Hodgman suggested for reasons commented on in the code. I'd be interested to know if there's a better approach than I took.

 

As well as it being parallel internally you can also run multiple (non-overlapping) sorts in parallel if you need to.

 

// parallel radix sort
// in-place stable sort of items array into ascending key order
// keys are 32-bit uints, processed 8 bits per pass, most significant digit first
// type to be sorted must have RadixSortKey uint property
// you can pass in temporary workspace otherwise it will allocate one internally on the stack
public parallel void RadixSort<T>(T[] local items, T[] local itemsTemp = null)
{
	// workspace required for double buffering (each pass scatters into the alternate buffer)
	T[] local itemsAlt = itemsTemp;
	if (!itemsAlt)
		itemsAlt = new local T[items.Length];// = void;			// = void prevents zeroing the allocated memory (in theory - currently crashes compiler)

	// only large inputs are worth the overhead of the parallel path
	if (items.Length > 4096)
		RadixSortP<T>(items, itemsAlt, 0);
	else
		RadixSortNP<T>(items, itemsAlt, 0);
}

// one MSD radix pass, parallel flavour:
//  1. count digit frequencies (split four ways when the input is large)
//  2. turn counts into exclusive prefix sums
//  3. scatter items into itemsAlt by digit (serial - see comment below)
//  4. recurse on each of the 256 buckets for the next digit, in parallel
// items and itemsAlt swap roles at each recursion level (double buffering)
private parallel void RadixSortP<T>(T[] local items, T[] local itemsAlt, int shift)
{
	// calculate prefix values
	int[] local prefixes = new local int[256];

	// if lots of keys then calculate prefixes in parallel
	if (items.Length >= 32768)
	{
		// four independent histograms, one per quarter of the input
		int split = items.Length / 4;
		int[] local [4] prefixesPar;
		for (int i = 0; i < 4; i++)
			prefixesPar[i] = new local int[256];
		parallel
		{
			int s = 0;
			for (int i = 0; i < 3; i++, s += split)
				RadixSortCountPrefixes<T>(prefixesPar[i], items[s .. s + split], shift);
			// remainder
			RadixSortCountPrefixes<T>(prefixesPar[3], items[s .. items.Length], shift);
		}

		// sum the prefixes using simd add (stack allocated arrays are at least 16 byte aligned)
		foreach (int[] local pfxPar; prefixesPar)
		{
			int8[] local pfxSrc8 = cast(int8[] local)pfxPar;
			int8[] local pfxDst8 = cast(int8[] local)prefixes;
			foreach (int8& pfx, i; pfxSrc8)
				pfxDst8[i] += *pfx;
		}
	}
	// if not a large number of prefixes do them in serial
	else
	{
		RadixSortCountPrefixes<T>(prefixes, items, shift);
	}

	// prefix is sum of all previous values (exclusive prefix sum -> start offset of each bucket)
	int sum = 0;
	foreach (int& prefix, i; prefixes)
	{
		int newSum = sum + *prefix;
		*prefix = sum;
		sum = newSum;
	}

	// scatter values out to destination array
	// can be done in parallel if shared memory is used but:
	//	- you'd need to use atomic increments which would kill performance if you have lots of the same key
	//	- would make the sort unstable
	// (maybe there's another way to partition the work I'm not seeing that avoids these issues)
	foreach (T& item; items)
	{
		// (key << shift) >> 24 extracts the current 8-bit digit, MSD first
		int o = prefixes[(item.RadixSortKey << shift) >> 24]++;
		itemsAlt[o] = *item;
	}

	// done?
	if (shift < 24)
	{
		// now sort the sub-arrays in parallel if they're reasonably large or non-parallel if they're not
		// note the buffers swap: each recursion sorts itemsAlt's bucket using items as scratch
		parallel
		{
			int s = 0;
			for (int i = 0; i < 256; i++)
			{
				// after the increments above prefixes[i] is the end offset of bucket i
				int e = prefixes[i];
		
				if ((e - s) > 4096)
					RadixSortP<T>(itemsAlt[s .. e], items[s .. e], shift + 8);
				else if ((e - s) > 1)
					RadixSortNP<T>(itemsAlt[s .. e], items[s .. e], shift + 8);
				else if ((e - s) > 0)
					items[s] = itemsAlt[s];

				s = e;
			}
		}
	}
}

// non-parallel version called by the master version above when arrays are sufficiently small
// same count -> prefix-sum -> scatter -> recurse structure as RadixSortP, but entirely serial
// and with the 256-entry count table on the stack instead of allocated
private void RadixSortNP<T>(T[] local items, T[] local itemsAlt, int shift) : parallel
{
	// calculate prefix values
	int[256] prefixes;

	RadixSortCountPrefixes<T>(&prefixes, items, shift);

	// prefix is sum of all previous values (exclusive prefix sum -> start offset of each bucket)
	int sum = 0;
	foreach (int& prefix, i; prefixes)
	{
		int newSum = sum + *prefix;
		*prefix = sum;
		sum = newSum;
	}

	// scatter values out to destination array
	foreach (T& item; items)
	{
		// (key << shift) >> 24 extracts the current 8-bit digit, MSD first
		int o = prefixes[(item.RadixSortKey << shift) >> 24]++;
		itemsAlt[o] = *item;
	}

	// if not done sort the sub-arrays (buffers swap roles each level)
	if (shift < 24)
	{
		int s = 0;
		for (int i = 0; i < 256; i++)
		{
			// after the increments above prefixes[i] is the end offset of bucket i
			int e = prefixes[i];

			if ((e - s) > 1)
				RadixSortNP<T>(itemsAlt[s .. e], items[s .. e], shift + 8);
			else if ((e - s) > 0)
				items[s] = itemsAlt[s];

			s = e;
		}
	}
}

// radix sort helper: histogram pass
// counts how many items fall into each of the 256 values of the digit selected
// by shift; (key << shift) >> 24 extracts that 8-bit digit
// prefixes must be a 256-entry array (caller-zeroed or freshly allocated)
parallel void RadixSortCountPrefixes<T>(int[] local prefixes, const(T)[] local items, int shift)
{
	foreach (const(T)& item; items)
		prefixes[(item.RadixSortKey << shift) >> 24]++;
}

 

And here's the little testbed I used to generate the results at the top:

 

import System.IO.Console;

uint randSeed = 0;

// reset the global PRNG state so a run is reproducible for a given seed
public void SeedRand(uint seed)
{
	randSeed = seed;
}

// linear congruential generator (same multiplier/increment as the classic C rand);
// the mask with int.MaxValue keeps the state in the low 31 bits, so returned
// values are in [0, 2^31)
public uint Rand()
{
	randSeed = (randSeed * 1103515245 + 12345) & int.MaxValue;
	return randSeed;
}

// test payload for the sort: a random sort key plus an unrelated value field
// (the value field makes each element bigger than a bare key, like real data)
public struct RadixItem
{
	private uint key;
	private int value;

	// fills both fields from the global PRNG
	public this()
	{
		key = Rand();
		value = Rand();
	}

	// key accessor required by RadixSort<T>; the ": parallel" marking presumably
	// allows it to be called from parallel code - confirm against the language rules
	public uint RadixSortKey() const : parallel
	{
		return key;
	}
}

// benchmark driver: fills 1M items with random keys, sorts them numRuns times,
// verifies ascending key order after each run, and reports the average time
// NOTE(review): WriteLine arguments are passed by address (&numItems etc.) -
// this appears to be the language's varargs convention; confirm
public int Main(string[] args)
{
	SeedRand(12345);

	static const int numItems = 1024 * 1024;
	static const int numRuns = 10;

	RadixItem[] items = new RadixItem[numItems];
	RadixItem[] itemsTmp = new RadixItem[numItems];

	// uncomment this line to set number of threads job queue can use (it clamps internally to 1 .. number of physical cores on machine)
	//System.Profile.JobQueue.SetNumThreads(2);

	// run radix sort performance test
	Console.WriteLine("Timing radix sort of %d items", &numItems);
	ulong totalCycles = 0;

	// 10 runs
	for (int r = 0; r < numRuns; r++)
	{
		// re-randomise the array each run so every iteration sorts fresh data
		foreach (RadixItem& item; items)
			*item = RadixItem();

		ulong startCycle = GetProcessorCycleCount();

		RadixSort<RadixItem>(items, itemsTmp);

		ulong endCycle = GetProcessorCycleCount();
		totalCycles += endCycle - startCycle;

		// check sorted (>= rather than > because duplicate keys are allowed)
		for (int i = 1; i < items.Length; i++)
			Assert(items[i].RadixSortKey >= items[i - 1].RadixSortKey);
	}

	// report average time taken (cycles -> seconds -> ms, averaged over the runs)
	float totalMs = cast(float)totalCycles / cast(float)GetProcessorCyclesPerSecond() / cast(float)numRuns * 1000.0f;
	Console.WriteLine("Average time taken over %d runs = %.2fms", &numRuns, &totalMs);

	return 0;
}

#3 — c-up

Posted 09 May 2013 - 06:59 AM

Hodgman mentioning radix sort got me wondering so I've knocked up an implementation.

 

On my 2 core laptop (AMD E-450 APU 1.65GHz), sorting 1024*1024 items takes:

1 core = 678ms

2 core = 368ms (1.84x speedup)

 

On my 4 core desktop (AMD Athlon II X4 630 2.8GHz), sorting the same number of items takes:

1 core = 333ms

2 core = 190ms (1.75x)

3 core = 136ms (2.45x)

4 core = 106ms (3.14x)

 

Pretty reasonable speedups.

 

Anyway, here's the radix sort code. It recursively subdivides the problem to generate the parallelism as described here:

http://software.intel.com/sites/default/files/m/b/3/6/3/8/22396-ParallelRadixSort_andreyryabov.pdf

 

It doesn't do parallel scatter as Hodgman suggested for reasons commented on in the code. I'd be interested to know if there's a better approach than I took.

 

As well as it being parallel internally you can also run multiple (non-overlapping) sorts in parallel if you need to.

 

// parallel radix sort
// in-place stable sort of items array into ascending key order
// keys are 32-bit uints, processed 8 bits per pass, most significant digit first
// type to be sorted must have RadixSortKey uint property
// you can pass in temporary workspace otherwise it will allocate one internally on the stack
// FIX: removed the trailing "int shift" parameter - it was declared after the
// defaulted itemsTemp (a required parameter may not follow a defaulted one),
// was never read in the body, and the call site passes only two arguments
public parallel void RadixSort<T>(T[] local items, T[] local itemsTemp = null)
{
	// workspace required for double buffering (each pass scatters into the alternate buffer)
	T[] local itemsAlt = itemsTemp;
	if (!itemsAlt)
		itemsAlt = new local T[items.Length];// = void;			// = void prevents zeroing the allocated memory (in theory - currently crashes compiler)

	// only large inputs are worth the overhead of the parallel path
	if (items.Length > 4096)
		RadixSortP<T>(items, itemsAlt, 0);
	else
		RadixSortNP<T>(items, itemsAlt, 0);
}

private parallel void RadixSortP<T>(T[] local items, T[] local itemsAlt, int shift)
{
	// calculate prefix values
	int[] local prefixes = new local int[256];

	// if lots of keys then calculate prefixes in parallel
	if (items.Length >= 32768)
	{
		int split = items.Length / 4;
		int[] local [4] prefixesPar;
		for (int i = 0; i < 4; i++)
			prefixesPar[i] = new local int[256];
		parallel
		{
			int s = 0;
			for (int i = 0; i < 3; i++, s += split)
				RadixSortCountPrefixes<T>(prefixesPar[i], items[s .. s + split], shift);
			// remainder
			RadixSortCountPrefixes<T>(prefixesPar[3], items[s .. items.Length], shift);
		}

		// sum the prefixes using simd add (stack allocated arrays are at least 16 byte aligned)
		foreach (int[] local pfxPar; prefixesPar)
		{
			int8[] local pfxSrc8 = cast(int8[] local)pfxPar;
			int8[] local pfxDst8 = cast(int8[] local)prefixes;
			foreach (int8& pfx, i; pfxSrc8)
				pfxDst8[i] += *pfx;
		}
	}
	// if not a large number of prefixes do them in serial
	else
	{
		RadixSortCountPrefixes<T>(prefixes, items, shift);
	}

	// prefix is sum of all previous values
	int sum = 0;
	foreach (int& prefix, i; prefixes)
	{
		int newSum = sum + *prefix;
		*prefix = sum;
		sum = newSum;
	}

	// scatter values out to destination array
	// can be done in parallel if shared memory is used but:
	//	- you'd need to use atomic increments which would kill performance if you have lots of the same key
	//	- would make the sort unstable
	// (maybe there's another way to partition the work I'm not seeing that avoids these issues)
	foreach (T& item; items)
	{
		int o = prefixes[(item.RadixSortKey << shift) >> 24]++;
		itemsAlt[o] = *item;
	}

	// done?
	if (shift < 24)
	{
		// now sort the sub-arrays in parallel if they're reasonably large or non-parallel if they're not
		parallel
		{
			int s = 0;
			for (int i = 0; i < 256; i++)
			{
				int e = prefixes[i];
		
				if ((e - s) > 4096)
					RadixSortP<T>(itemsAlt[s .. e], items[s .. e], shift + 8);
				else if ((e - s) > 1)
					RadixSortNP<T>(itemsAlt[s .. e], items[s .. e], shift + 8);
				else if ((e - s) > 0)
					items[s] = itemsAlt[s];

				s = e;
			}
		}
	}
}

// non-parallel version called by the master version above when arrays are sufficiently small
private void RadixSortNP<T>(T[] local items, T[] local itemsAlt, int shift) : parallel
{
	// calculate prefix values
	int[256] prefixes;

	RadixSortCountPrefixes<T>(&prefixes, items, shift);

	// prefix is sum of all previous values
	int sum = 0;
	foreach (int& prefix, i; prefixes)
	{
		int newSum = sum + *prefix;
		*prefix = sum;
		sum = newSum;
	}

	// scatter values out to destination array
	foreach (T& item; items)
	{
		int o = prefixes[(item.RadixSortKey << shift) >> 24]++;
		itemsAlt[o] = *item;
	}

	// if not done sort the sub-arrays
	if (shift < 24)
	{
		int s = 0;
		for (int i = 0; i < 256; i++)
		{
			int e = prefixes[i];

			if ((e - s) > 1)
				RadixSortNP<T>(itemsAlt[s .. e], items[s .. e], shift + 8);
			else if ((e - s) > 0)
				items[s] = itemsAlt[s];

			s = e;
		}
	}
}

// radix sort helper
parallel void RadixSortCountPrefixes<T>(int[] local prefixes, const(T)[] local items, int shift)
{
	foreach (const(T)& item; items)
		prefixes[(item.RadixSortKey << shift) >> 24]++;
}

 

And here's the little testbed I used to generate the results at the top:

 

import System.IO.Console;

uint randSeed = 0;

public void SeedRand(uint seed)
{
	randSeed = seed;
}

public uint Rand()
{
	randSeed = (randSeed * 1103515245 + 12345) & int.MaxValue;
	return randSeed;
}

public struct RadixItem
{
	private uint key;
	private int value;

	public this()
	{
		key = Rand();
		value = Rand();
	}

	public uint RadixSortKey() const : parallel
	{
		return key;
	}
}

public int Main(string[] args)
{
	SeedRand(12345);

	static const int numItems = 1024 * 1024;
	static const int numRuns = 10;

	RadixItem[] items = new RadixItem[numItems];
	RadixItem[] itemsTmp = new RadixItem[numItems];

	// uncomment this line to set number of threads job queue can use (it clamps internally to 1 .. number of physical cores on machine)
	//System.Profile.JobQueue.SetNumThreads(2);

	// run radix sort performance test
	Console.WriteLine("Timing radix sort of %d items", &numItems);
	ulong totalCycles = 0;

	// 10 runs
	for (int r = 0; r < numRuns; r++)
	{
		foreach (RadixItem& item; items)
			*item = RadixItem();

		ulong startCycle = GetProcessorCycleCount();

		RadixSort<RadixItem>(items, itemsTmp);

		ulong endCycle = GetProcessorCycleCount();
		totalCycles += endCycle - startCycle;

		// check sorted
		for (int i = 1; i < items.Length; i++)
			Assert(items[i].RadixSortKey >= items[i - 1].RadixSortKey);
	}

	// report average time taken
	float totalMs = cast(float)totalCycles / cast(float)GetProcessorCyclesPerSecond() / cast(float)numRuns * 1000.0f;
	Console.WriteLine("Average time taken over %d runs = %.2fms", &numRuns, &totalMs);

	return 0;
}

#2 — c-up

Posted 09 May 2013 - 01:40 AM

Hodgman mentioning radix sort got me wondering so I've knocked up an implementation.

 

On my 2 core laptop (AMD E-450 APU 1.65GHz), sorting 1024*1024 items takes:

1 core = 718ms

2 core = 368ms (1.95x speedup)

 

On my 4 core desktop (AMD Athlon II X4 630 2.8GHz), sorting the same number of items takes:

1 core = 333ms

2 core = 190ms (1.75x)

3 core = 136ms (2.45x)

4 core = 106ms (3.14x)

 

Pretty reasonable speedups. I don't understand how I'm getting such a good speedup on the laptop actually ... probably made a mistake somewhere but those results seem repeatable.

 

Anyway, here's the radix sort code. It recursively subdivides the problem to generate the parallelism as described here:

http://software.intel.com/sites/default/files/m/b/3/6/3/8/22396-ParallelRadixSort_andreyryabov.pdf

 

It doesn't do parallel scatter as Hodgman suggested for reasons commented on in the code. I'd be interested to know if there's a better approach than I took.

 

As well as it being parallel internally you can also run multiple (non-overlapping) sorts in parallel if you need to.

 

// parallel radix sort
// in-place stable sort of items array into ascending key order
// keys are 32-bit uints, processed 8 bits per pass, most significant digit first
// type to be sorted must have RadixSortKey uint property
// you can pass in temporary workspace otherwise it will allocate one internally on the stack
// FIX: removed the trailing "int shift" parameter - it was declared after the
// defaulted itemsTemp (a required parameter may not follow a defaulted one),
// was never read in the body, and the call site passes only two arguments
public parallel void RadixSort<T>(T[] local items, T[] local itemsTemp = null)
{
	// workspace required for double buffering (each pass scatters into the alternate buffer)
	T[] local itemsAlt = itemsTemp;
	if (!itemsAlt)
		itemsAlt = new local T[items.Length];// = void;			// = void prevents zeroing the allocated memory (in theory - currently crashes compiler)

	// only large inputs are worth the overhead of the parallel path
	if (items.Length > 4096)
		RadixSortP<T>(items, itemsAlt, 0);
	else
		RadixSortNP<T>(items, itemsAlt, 0);
}

private parallel void RadixSortP<T>(T[] local items, T[] local itemsAlt, int shift)
{
	// calculate prefix values
	int[] local prefixes = new local int[256];

	// if lots of keys then calculate prefixes in parallel
	if (items.Length >= 32768)
	{
		int split = items.Length / 4;
		int[] local [4] prefixesPar;
		for (int i = 0; i < 4; i++)
			prefixesPar[i] = new local int[256];
		parallel
		{
			int s = 0;
			for (int i = 0; i < 3; i++, s += split)
				RadixSortCountPrefixes<T>(prefixesPar[i], items[s .. s + split], shift);
			// remainder
			RadixSortCountPrefixes<T>(prefixesPar[3], items[s .. items.Length], shift);
		}

		// sum the prefixes using simd add (stack allocated arrays are at least 16 byte aligned)
		foreach (int[] local pfxPar; prefixesPar)
		{
			int8[] local pfxSrc8 = cast(int8[] local)pfxPar;
			int8[] local pfxDst8 = cast(int8[] local)prefixes;
			foreach (int8& pfx, i; pfxSrc8)
				pfxDst8[i] += *pfx;
		}
	}
	// if not a large number of prefixes do them in serial
	else
	{
		RadixSortCountPrefixes<T>(prefixes, items, shift);
	}

	// prefix is sum of all previous values
	int sum = 0;
	foreach (int& prefix, i; prefixes)
	{
		int newSum = sum + *prefix;
		*prefix = sum;
		sum = newSum;
	}

	// scatter values out to destination array
	// can be done in parallel if shared memory is used but:
	//	- you'd need to use atomic increments which would kill performance if you have lots of the same key
	//	- would make the sort unstable
	// (maybe there's another way to partition the work I'm not seeing that avoids these issues)
	foreach (T& item; items)
	{
		int o = prefixes[(item.RadixSortKey << shift) >> 24]++;
		itemsAlt[o] = *item;
	}

	// done?
	if (shift < 24)
	{
		// now sort the sub-arrays in parallel if they're reasonably large or non-parallel if they're not
		parallel
		{
			int s = 0;
			for (int i = 0; i < 256; i++)
			{
				int e = prefixes[i];
		
				if ((e - s) > 4096)
					RadixSortP<T>(itemsAlt[s .. e], items[s .. e], shift + 8);
				else if ((e - s) > 1)
					RadixSortNP<T>(itemsAlt[s .. e], items[s .. e], shift + 8);
				else if ((e - s) > 0)
					items[s] = itemsAlt[s];

				s = e;
			}
		}
	}
}

// non-parallel version called by the master version above when arrays are sufficiently small
private void RadixSortNP<T>(T[] local items, T[] local itemsAlt, int shift) : parallel
{
	// calculate prefix values
	int[256] prefixes;

	RadixSortCountPrefixes<T>(&prefixes, items, shift);

	// prefix is sum of all previous values
	int sum = 0;
	foreach (int& prefix, i; prefixes)
	{
		int newSum = sum + *prefix;
		*prefix = sum;
		sum = newSum;
	}

	// scatter values out to destination array
	foreach (T& item; items)
	{
		int o = prefixes[(item.RadixSortKey << shift) >> 24]++;
		itemsAlt[o] = *item;
	}

	// if not done sort the sub-arrays
	if (shift < 24)
	{
		int s = 0;
		for (int i = 0; i < 256; i++)
		{
			int e = prefixes[i];

			if ((e - s) > 1)
				RadixSortNP<T>(itemsAlt[s .. e], items[s .. e], shift + 8);
			else if ((e - s) > 0)
				items[s] = itemsAlt[s];

			s = e;
		}
	}
}

// radix sort helper
parallel void RadixSortCountPrefixes<T>(int[] local prefixes, const(T)[] local items, int shift)
{
	foreach (const(T)& item; items)
		prefixes[(item.RadixSortKey << shift) >> 24]++;
}

 

And here's the little testbed I used to generate the results at the top:

 

import System.IO.Console;

uint randSeed = 0;

public void SeedRand(uint seed)
{
	randSeed = seed;
}

public uint Rand()
{
	randSeed = (randSeed * 1103515245 + 12345) & int.MaxValue;
	return randSeed;
}

public struct RadixItem
{
	private uint key;
	private int value;

	public this()
	{
		key = Rand();
		value = Rand();
	}

	public uint RadixSortKey() const : parallel
	{
		return key;
	}
}

public int Main(string[] args)
{
	SeedRand(12345);

	static const int numItems = 1024 * 1024;
	static const int numRuns = 10;

	RadixItem[] items = new RadixItem[numItems];
	RadixItem[] itemsTmp = new RadixItem[numItems];

	// uncomment this line to set number of threads job queue can use (it clamps internally to 1 .. number of physical cores on machine)
	//System.Profile.JobQueue.SetNumThreads(2);

	// run radix sort performance test
	Console.WriteLine("Timing radix sort of %d items", &numItems);
	ulong totalCycles = 0;

	// 10 runs
	for (int r = 0; r < numRuns; r++)
	{
		foreach (RadixItem& item; items)
			*item = RadixItem();

		ulong startCycle = GetProcessorCycleCount();

		RadixSort<RadixItem>(items, itemsTmp);

		ulong endCycle = GetProcessorCycleCount();
		totalCycles += endCycle - startCycle;

		// check sorted
		for (int i = 1; i < items.Length; i++)
			Assert(items[i].RadixSortKey >= items[i - 1].RadixSortKey);
	}

	// report average time taken
	float totalMs = cast(float)totalCycles / cast(float)GetProcessorCyclesPerSecond() / cast(float)numRuns * 1000.0f;
	Console.WriteLine("Average time taken over %d runs = %.2fms", &numRuns, &totalMs);

	return 0;
}

#1 — c-up

Posted 08 May 2013 - 03:55 PM

Hodgman mentioning radix sort got me wondering so I've knocked up an implementation.

 

On my 2 core laptop (AMD E-450 APU 1.65GHz), sorting 1024*1024 items takes:

1 core = 718ms

2 core = 368ms (1.95x speedup)

 

On my 4 core desktop (AMD Athlon II X4 630 2.8GHz), sorting the same number of items takes:

1 core = 333ms

2 core = 190ms (1.75x)

3 core = 136ms (2.45x)

4 core = 106ms (3.14x)

 

Pretty reasonable speedups. I don't understand how I'm getting such a good speedup on the laptop actually ... probably made a mistake somewhere but those results seem repeatable.

 

Anyway, here's the radix sort code. It recursively subdivides the problem to generate the parallelism as described here:

http://software.intel.com/sites/default/files/m/b/3/6/3/8/22396-ParallelRadixSort_andreyryabov.pdf

 

It doesn't do parallel scatter as Hodgman suggested for reasons commented on in the code. I'd be interested to know if there's a better approach than I took.

 

As well as it being parallel internally you can also run multiple (non-overlapping) sorts in parallel if you need to.

 

// parallel radix sort
// in-place stable sort of items array into ascending key order
// keys are 32-bit uints, processed 8 bits per pass, most significant digit first
// type to be sorted must have RadixSortKey uint property
// you can pass in temporary workspace otherwise it will allocate one internally on the stack
// FIX: removed the trailing "int shift" parameter - it was declared after the
// defaulted itemsTemp (a required parameter may not follow a defaulted one),
// was never read in the body, and the call site passes only two arguments
public parallel void RadixSort<T>(T[] local items, T[] local itemsTemp = null)
{
	// workspace required for double buffering (each pass scatters into the alternate buffer)
	T[] local itemsAlt = itemsTemp;
	if (!itemsAlt)
		itemsAlt = new local T[items.Length];// = void;			// = void prevents zeroing the allocated memory (in theory - currently crashes compiler)

	// only large inputs are worth the overhead of the parallel path
	if (items.Length > 4096)
		RadixSortP<T>(items, itemsAlt, 0);
	else
		RadixSortNP<T>(items, itemsAlt, 0);
}

// this is implemented as a private function so shared items array isn't exposed as a public interface because
// this would be very bad if multiple instances of this function were invoked on a single array in parallel
private parallel void RadixSortP<T>(shared T[] local items, shared T[] local itemsAlt, int shift)
{
	// calculate prefix values
	int[] local prefixes = new local int[256];

	// if lots of keys then calculate prefixes in parallel
	if (items.Length >= 32768)
	{
		int split = items.Length / 4;
		int[] local [4] prefixesPar;
		for (int i = 0; i < 4; i++)
			prefixesPar[i] = new local int[256];
		parallel
		{
			int s = 0;
			for (int i = 0; i < 3; i++, s += split)
				RadixSortCountPrefixes<T>(prefixesPar[i], items[s .. s + split], shift);
			// remainder
			RadixSortCountPrefixes<T>(prefixesPar[3], items[s .. items.Length], shift);
		}

		// sum the prefixes using simd add (stack allocated arrays are at least 16 byte aligned)
		foreach (int[] local pfxPar; prefixesPar)
		{
			int8[] local pfxSrc8 = cast(int8[] local)pfxPar;
			int8[] local pfxDst8 = cast(int8[] local)prefixes;
			foreach (int8& pfx, i; pfxSrc8)
				pfxDst8[i] += *pfx;
		}
	}
	// if not a large number of prefixes do them in serial
	else
	{
		RadixSortCountPrefixes<T>(prefixes, items, shift);
	}

	// prefix is sum of all previous values
	int sum = 0;
	foreach (int& prefix, i; prefixes)
	{
		int newSum = sum + *prefix;
		*prefix = sum;
		sum = newSum;
	}

	// scatter values out to destination array
	// can be done in parallel if shared memory is used but:
	//	- you'd need to use atomic increments which would kill performance if you have lots of the same key
	//	- would make the sort unstable
	// (maybe there's another way to partition the work I'm not seeing that avoids these issues)
	foreach (T& item; items)
	{
		int o = prefixes[(item.RadixSortKey << shift) >> 24]++;
		itemsAlt[o] = *item;
	}

	// done?
	if (shift < 24)
	{
		// now sort the sub-arrays in parallel if they're reasonably large or non-parallel if they're not
		parallel
		{
			int s = 0;
			for (int i = 0; i < 256; i++)
			{
				int e = prefixes[i];
		
				if ((e - s) > 4096)
					RadixSortP<T>(itemsAlt[s .. e], items[s .. e], shift + 8);
				else if ((e - s) > 1)
					RadixSortNP<T>(itemsAlt[s .. e], items[s .. e], shift + 8);
				else if ((e - s) > 0)
					items[s] = itemsAlt[s];

				s = e;
			}
		}
	}
}

// non-parallel version called by the master version above when arrays are sufficiently small
private void RadixSortNP<T>(T[] local items, T[] local itemsAlt, int shift) : parallel
{
	// calculate prefix values
	int[256] prefixes;

	RadixSortCountPrefixes<T>(&prefixes, items, shift);

	// prefix is sum of all previous values
	int sum = 0;
	foreach (int& prefix, i; prefixes)
	{
		int newSum = sum + *prefix;
		*prefix = sum;
		sum = newSum;
	}

	// scatter values out to destination array
	foreach (T& item; items)
	{
		int o = prefixes[(item.RadixSortKey << shift) >> 24]++;
		itemsAlt[o] = *item;
	}

	// if not done sort the sub-arrays
	if (shift < 24)
	{
		int s = 0;
		for (int i = 0; i < 256; i++)
		{
			int e = prefixes[i];

			if ((e - s) > 1)
				RadixSortNP<T>(itemsAlt[s .. e], items[s .. e], shift + 8);
			else if ((e - s) > 0)
				items[s] = itemsAlt[s];

			s = e;
		}
	}
}

// radix sort helper
parallel void RadixSortCountPrefixes<T>(int[] local prefixes, const(T)[] local items, int shift)
{
	foreach (const(T)& item; items)
		prefixes[(item.RadixSortKey << shift) >> 24]++;
}

 

And here's the little testbed I used to generate the results at the top:

 

import System.IO.Console;

uint randSeed = 0;

public void SeedRand(uint seed)
{
	randSeed = seed;
}

public uint Rand()
{
	randSeed = (randSeed * 1103515245 + 12345) & int.MaxValue;
	return randSeed;
}

public struct RadixItem
{
	private uint key;
	private int value;

	public this()
	{
		key = Rand();
		value = Rand();
	}

	public uint RadixSortKey() const : parallel
	{
		return key;
	}
}

public int Main(string[] args)
{
	SeedRand(12345);

	static const int numItems = 1024 * 1024;
	static const int numRuns = 10;

	RadixItem[] items = new RadixItem[numItems];
	RadixItem[] itemsTmp = new RadixItem[numItems];

	// uncomment this line to set number of threads job queue can use (it clamps internally to 1 .. number of physical cores on machine)
	//System.Profile.JobQueue.SetNumThreads(2);

	// run radix sort performance test
	Console.WriteLine("Timing radix sort of %d items", &numItems);
	ulong totalCycles = 0;

	// 10 runs
	for (int r = 0; r < numRuns; r++)
	{
		foreach (RadixItem& item; items)
			*item = RadixItem();

		ulong startCycle = GetProcessorCycleCount();

		RadixSort<RadixItem>(items, itemsTmp);

		ulong endCycle = GetProcessorCycleCount();
		totalCycles += endCycle - startCycle;

		// check sorted
		for (int i = 1; i < items.Length; i++)
			Assert(items[i].RadixSortKey >= items[i - 1].RadixSortKey);
	}

	// report average time taken
	float totalMs = cast(float)totalCycles / cast(float)GetProcessorCyclesPerSecond() / cast(float)numRuns * 1000.0f;
	Console.WriteLine("Average time taken over %d runs = %.2fms", &numRuns, &totalMs);

	return 0;
}

PARTNERS