I have a quite fast, excellent quality crypto-based PRNG that I always use for my OpenCL projects (it lends itself well to SIMD and MIMD architectures, even both at the same time). I might as well post it here (this is OpenCL code but should be readily convertible to whatever language you use, it uses vector types). The code below is rather barebones because it was designed to be used inside a GPU kernel, and is a bit redundant because the "wrapper" is unfinished:
#pragma once
/** @file prng.cl
* @brief Kernel PRNG implementation.
**/
/** This macro converts a 64-bit integer, to a [0..1) float. **/
#define TO_FLOAT(x) ((float)x / (ulong)(18446744073709551615UL))
/** This indicates how many rounds are to be used for the pseudorandom permutation
* function, higher means greater quality but at a higher computational cost,
* 3 recommended at a minimum, 9 is more than enough.
**/
#define ROUNDS 4
/* This is a 256-bit -> 256-bit keyed permutation function. */
/** Keyed permutation function.
* @param state The internal state.
* @param seed The PRNG's seed.
* @returns A 256-bit pseudorandom output.
**/
void renew(ulong4 *state, constant ulong4 *seed)
{
/* Retain the PRNG's state. */
ulong4 block = *state + *seed;
#pragma unroll
for (ulong t = 0; t < ROUNDS; t++)
{
/* Round 2t + 0 (×4 mix & permutation). */
block.lo += block.hi; block.hi = rotate(block.hi, (ulong2)(14, 16));
block.hi ^= block.lo; block = block.xywz;
block.lo += block.hi; block.hi = rotate(block.hi, (ulong2)(52, 57));
block.hi ^= block.lo; block = block.xywz;
block.lo += block.hi; block.hi = rotate(block.hi, (ulong2)(23, 40));
block.hi ^= block.lo; block = block.xywz;
block.lo += block.hi; block.hi = rotate(block.hi, (ulong2)( 5, 37));
block.hi ^= block.lo; block = block.xywz;
/* Key addition. */
block += *seed;
/* Round 2t + 1 (×4 mix & permutation). */
block.lo += block.hi; block.hi = rotate(block.hi, (ulong2)(25, 33));
block.hi ^= block.lo; block = block.xywz;
block.lo += block.hi; block.hi = rotate(block.hi, (ulong2)(46, 12));
block.hi ^= block.lo; block = block.xywz;
block.lo += block.hi; block.hi = rotate(block.hi, (ulong2)(58, 22));
block.hi ^= block.lo; block = block.xywz;
block.lo += block.hi; block.hi = rotate(block.hi, (ulong2)(32, 32));
block.hi ^= block.lo; block = block.xywz;
/* Key addition. */
block += *seed;
}
}
/** @struct PRNG
* @brief PRNG internal state.
*
* This structure contains an instance of PRNG, which is enough information to
* generate essentially infinitely many unbiased pseudorandom numbers.
**/
typedef struct PRNG
{
/** @brief The 256-bit internal state. **/
ulong4 state;
/** @brief A pointer to the PRNG's seed, common to all instances. **/
constant ulong4 *seed;
} PRNG;
/** This function creates a new PRNG instance, and initializes it to zero.
* @param ID The ID to create the PRNG instance with, must be unique.
* @param seed A pointer to the PRNG's seed.
* @returns The PRNG instance, ready for use.
**/
PRNG init(ulong ID, constant ulong4 *seed)
{
PRNG instance;
instance.state = (ulong4)(ID);
instance.seed = seed;
return instance;
}
/** This function returns a vector of uniform pseudorandom numbers in [0..1).
* @param prng A pointer to the PRNG instance to use.
* @returns An vector of unbiased uniform pseudorandom numbers between 0 and 1 exclusive.
**/
float4 rand(PRNG *prng)
{
renew(&prng->state, prng->seed);
return TO_FLOAT(prng->state);
}
The word size and vector width (here, 64 bits and 4) can be tweaked easily. I actually have a WIP document (including better code) on this that I may turn into a Gamedev article if I have time, but it's not ready yet. Two rounds is far better than an LCG and mighty fast, three rounds is pretty much all you'll need, four if you want to be certain (it depends a bit on the other parameters). Anything above that is overkill, really.
Do not use for cryptographic purposes. The design may be derived from a cipher but.. just don't.
It is also reversible (there is an inverse permutation). Period is expected to be 2^255 on average for this particular set of parameters, in general 2^(n - 1) where n is the internal state size in bits.
I think I was the not the first to use this particular cipher (Threefish) as a performance-oriented PRNG and I know people have been playing with Serpent and Twofish for a while, but I like having things I can rely on in any situation and that I can actually understand. This is one of them.