• Create Account

Need scary sound effects or creepy audio loops for your next horror-themed game? Check out Highscore Vol.3 - The Horror Edition in our marketplace. 50 sounds and 10 loops for only \$9.99!

### #ActualCryZe

Posted 11 March 2012 - 06:30 AM

After looking through NVidia's implementation of the FFT I noticed, that they were only using 128 threads per thread group. I realized that my graphics card might not have enough shader cores to run all threads in parallel (400). So everytime all threads would need to be synchronized the driver would have to reorganize all the workload the shader cores are doing so that all the 512 threads can be synchronized.
So I tried implementing the FFT a bit more iterative to only have 128 threads as well. And indeed, the shader is almost twice as fast. It only needs 8.6ms now.

All 4 times the shader gets dispatched results in 34.6ms now. If I'd reduce the resolution to 256x256, all 4 dispatches together would only last 7.68ms. But I'm still looking for ways to improve it before reducing the resolution...

[numthreads(ELEMENTS/ITERATION_FACTOR,1,1)]
void CSMain(
uint3 groupId : SV_GroupID,
uint groupIndex: SV_GroupIndex,
{
Complex4 tempComplexValues[ITERATION_FACTOR];

//Revert the indices and load the values into groupshared memory
[unroll] //Do some FFTs iterative instead of parallel
for (uint i = 0, iterationIndex = index; i < ITERATION_FACTOR; ++i, iterationIndex += ELEMENTS/ITERATION_FACTOR)
{
}

//Sync to ensure that all the values are loaded
GroupMemoryBarrierWithGroupSync();

uint d = 1;
uint dHalf = 0;

[unroll] //Iterate over all the FFTs
for (uint n = 1; n <= LOG_ELEMENTS; ++n)
{
//The size of the FFT
dHalf = d;
d <<= 1;

[unroll] //Do some FFTs iterative instead of parallel
for (uint i = 0, iterationIndex = index; i < ITERATION_FACTOR; ++i, iterationIndex += ELEMENTS/ITERATION_FACTOR)
{
//Compute temp values
uint k = iterationIndex & (d - 1);
uint expK = k & (dHalf - 1);
uint m = iterationIndex - k + expK;

//Compute w, the factor for the odd values
float exponent = getExponentFactor() * expK / d;
float2 w = 0;
sincos(exponent, w.y, w.x);
w *= (k < dHalf) ? 1.0f : -1.0f;

Complex4 even = values[m];
Complex4 odd = values[m + dHalf];

//Write results into temp values
//y = even + w * odd

tempComplexValues[i].Real = even.Real + w.x * odd.Real - w.y * odd.Imaginary;

tempComplexValues[i].Imaginary = even.Imaginary + w.x * odd.Imaginary + w.y * odd.Real;
}

//Sync to ensure that every value is read before writing the results
GroupMemoryBarrierWithGroupSync();

//Write the results into groupshared memory
[unroll] //Do some FFTs iterative instead of parallel
for (uint i = 0, iterationIndex = index; i < ITERATION_FACTOR; ++i, iterationIndex += ELEMENTS/ITERATION_FACTOR)
{
values[iterationIndex] = tempComplexValues[i];
}

//Sync to ensure that every value is written before calculating the other FFTs
GroupMemoryBarrierWithGroupSync();
}

//Write the results
[unroll] //Do some FFTs iterative instead of parallel
for (uint i = 0, iterationIndex = index; i < ITERATION_FACTOR; ++i, iterationIndex += ELEMENTS/ITERATION_FACTOR)
{
setValue(savePosition, getWeightedResult(values[iterationIndex]));
}
}

### #10CryZe

Posted 11 March 2012 - 06:26 AM

After looking through NVidia's implementation of the FFT I noticed, that they were only using 128 threads per thread group. I realized that my graphics card might not have enough shader cores to run all threads in parallel (400). So everytime all threads would need to be synchronized the driver would have to reorganize all the workload the shader cores are doing so that all the 512 threads can be synchronized.
So I tried implementing the FFT a bit more iterative to only have 128 threads as well. And indeed, the shader is almost twice as fast. It only needs 8.6ms now.

All 4 times the shader gets dispatched results in 34.6ms now. If I'd reduce the resolution to 256x256, all 4 dispatches together would only last 7.68ms. But I'm still looking for ways to improve it before reducing the resolution...

[numthreads(ELEMENTS/ITERATION_FACTOR,1,1)]
void CSMain(
uint3 groupId : SV_GroupID,
uint groupIndex: SV_GroupIndex,
{
Complex4 tempComplexValues[ITERATION_FACTOR];

//Revert the indices and load the values into groupshared memory
[unroll] //Do some FFTs iterative instead of parallel
for (uint i = 0, iterationIndex = index; i < ITERATION_FACTOR; ++i, iterationIndex += ELEMENTS/ITERATION_FACTOR)
{
}

//Sync to ensure that all the values are loaded
GroupMemoryBarrierWithGroupSync();

uint d = 1;
uint dHalf = 0;

[unroll] //Iterate over all the FFTs
for (uint n = 1; n <= LOG_ELEMENTS; ++n)
{
//The size of the FFT
dHalf = d;
d <<= 1;

[unroll] //Do some FFTs iterative instead of parallel
for (uint i = 0, iterationIndex = index; i < ITERATION_FACTOR; ++i, iterationIndex += ELEMENTS/ITERATION_FACTOR)
{
//Compute temp values
uint k = iterationIndex & (d - 1);
uint expK = k & (dHalf - 1);
uint m = iterationIndex - k + expK;

//Compute w, the factor for the odd values
float exponent = getExponentFactor() * expK / d;
float2 w = 0;
sincos(exponent, w.y, w.x);
w *= (k < dHalf) ? 1.0f : -1.0f;

Complex4 even = values[m];
Complex4 odd = values[m + dHalf];

//Write results into temp values
//y = even + w * odd

tempComplexValues[i].Real = even.Real + w.x * odd.Real - w.y * odd.Imaginary;

tempComplexValues[i].Imaginary = even.Imaginary + w.x * odd.Imaginary + w.y * odd.Real;
}

//Sync to ensure that every value is read before writing the results
GroupMemoryBarrierWithGroupSync();

//Write the results into groupshared memory
[unroll] //Do some FFTs iterative instead of parallel
for (uint i = 0, iterationIndex = index; i < ITERATION_FACTOR; ++i, iterationIndex += ELEMENTS/ITERATION_FACTOR)
{
values[iterationIndex] = tempComplexValues[i];
}

//Sync to ensure that every value is written before calculating the other FFTs
GroupMemoryBarrierWithGroupSync();
}

//Write the results
[unroll] //Do some FFTs iterative instead of parallel
for (uint i = 0, iterationIndex = index; i < ITERATION_FACTOR; ++i, iterationIndex += ELEMENTS/ITERATION_FACTOR)
{
setValue(savePosition, getWeightedResult(values[iterationIndex]));
}
}

And what I wonder about, is this line:

float exponent = getExponentFactor() * expK / d;

Why can't I just do?:
float exponent = getExponentFactor() * (expK >> n);

It should work since:
d = exp2(n);

But it just doesn't work for some reason.

### #9CryZe

Posted 11 March 2012 - 06:24 AM

After looking through NVidia's implementation of the FFT I noticed, that they were only using 128 threads per thread group. I realized that my graphics card might not have enough shader cores to run all threads in parallel (400). So everytime all threads would need to be synchronized the driver would have to reorganize all the workload the shader cores are doing so that all the 512 threads can be synchronized.
So I tried implementing the FFT a bit more iterative to only have 128 threads as well. And indeed, the shader is almost twice as fast. It only needs 8.6ms now.

All 4 times the shader gets dispatched results in 34.6ms now. If I'd reduce the resolution to 256x256, all 4 dispatches together would only last 7.68ms. But I'm still looking for ways to improve it before reducing the resolution...

[numthreads(ELEMENTS/ITERATION_FACTOR,1,1)]
void CSMain(
uint3 groupId : SV_GroupID,
uint groupIndex: SV_GroupIndex,
{
Complex4 tempComplexValues[ITERATION_FACTOR];

//Revert the indices and load the values into groupshared memory
[unroll] //Do some FFTs iterative instead of parallel
for (uint i = 0, iterationIndex = index; i < ITERATION_FACTOR; ++i, iterationIndex += ELEMENTS/ITERATION_FACTOR)
{
}

//Sync to ensure that all the values are loaded
GroupMemoryBarrierWithGroupSync();

uint d = 1;
uint dHalf = 0;

[unroll] //Iterate over all the FFTs
for (uint n = 1; n <= LOG_ELEMENTS; ++n)
{
//The size of the FFT
dHalf = d;
d <<= 1;

[unroll] //Do some FFTs iterative instead of parallel
for (uint i = 0, iterationIndex = index; i < ITERATION_FACTOR; ++i, iterationIndex += ELEMENTS/ITERATION_FACTOR)
{
//Compute temp values
uint k = iterationIndex & (d - 1);
uint expK = k & (dHalf - 1);
uint m = iterationIndex - k + expK;

//Compute w, the factor for the odd values
float exponent = getExponentFactor() * expK / d;
float2 w = 0;
sincos(exponent, w.y, w.x);
w *= (k < dHalf) ? 1.0f : -1.0f;

Complex4 even = values[m];
Complex4 odd = values[m + dHalf];

//Write results into temp values
//y = even + w * odd

tempComplexValues[i].Real = even.Real + w.x * odd.Real - w.y * odd.Imaginary;

tempComplexValues[i].Imaginary = even.Imaginary + w.x * odd.Imaginary + w.y * odd.Real;
}

//Sync to ensure that every value is read before writing the results
GroupMemoryBarrierWithGroupSync();

//Write the results into groupshared memory
[unroll] //Do some FFTs iterative instead of parallel
for (uint i = 0, iterationIndex = index; i < ITERATION_FACTOR; ++i, iterationIndex += ELEMENTS/ITERATION_FACTOR)
{
values[iterationIndex] = tempComplexValues[i];
}

//Sync to ensure that every value is written before calculating the other FFTs
GroupMemoryBarrierWithGroupSync();
}

//Write the results
[unroll] //Do some FFTs iterative instead of parallel
for (uint i = 0, iterationIndex = index; i < ITERATION_FACTOR; ++i, iterationIndex += ELEMENTS/ITERATION_FACTOR)
{
setValue(savePosition, getWeightedResult(values[iterationIndex]));
}
}

### #8CryZe

Posted 11 March 2012 - 06:23 AM

After looking through NVidia's implementation of the FFT I noticed, that they were only using 128 threads per thread group. I realized that my graphics card might not have enough shader cores to run all threads in parallel (400). So everytime all threads would need to be synchronized the driver would have to reorganize all the workload the shader cores are doing so that all the 512 threads can be synchronized.
So I tried implementing the FFT a bit more iterative to only have 128 threads as well. And indeed, the shader is almost twice as fast. It only needs 8.6ms now.

All 4 times the shader gets dispatched results in 34.6ms now. If I'd reduce the resolution to 256x256, all 4 dispatches together would only last 7.68ms. But I'm still looking for ways to improve it before reducing the resolution...

[numthreads(ELEMENTS/ITERATION_FACTOR,1,1)]
void CSMain(
uint3 groupId : SV_GroupID,
uint groupIndex: SV_GroupIndex,
{
Complex4 tempComplexValues[ITERATION_FACTOR];

//Revert the indices and load the values into groupshared memory
[unroll] //Do some FFTs iterative instead of parallel
for (uint i = 0, iterationIndex = index; i < ITERATION_FACTOR; ++i, iterationIndex += ELEMENTS/ITERATION_FACTOR)
{
}

//Sync to ensure that all the values are loaded
GroupMemoryBarrierWithGroupSync();

uint d = 1;
uint dHalf = 0;

[unroll] //Iterate over all the FFTs
for (uint n = 1; n <= LOG_ELEMENTS; ++n)
{
//The size of the FFT
dHalf = d;
d <<= 1;

[unroll] //Do some FFTs iterative instead of parallel
for (uint i = 0, iterationIndex = index; i < ITERATION_FACTOR; ++i, iterationIndex += ELEMENTS/ITERATION_FACTOR)
{
//Compute temp values
uint k = iterationIndex & (d - 1);
uint expK = k & (dHalf - 1);
uint m = iterationIndex - k + expK;

//Compute w, the factor for the odd values
float exponent = getExponentFactor() * expK / d;
float2 w = 0;
sincos(exponent, w.y, w.x);
w *= (k < dHalf) ? 1.0f : -1.0f;

Complex4 even = values[m];
Complex4 odd = values[m + dHalf];

//Write results into temp values
//y = even + w * odd

tempComplexValues[i].Real = even.Real + w.x * odd.Real - w.y * odd.Imaginary;

tempComplexValues[i].Imaginary = even.Imaginary + w.x * odd.Imaginary + w.y * odd.Real;
}

//Sync to ensure that every value is read before writing the results
GroupMemoryBarrierWithGroupSync();

//Write the results into groupshared memory
[unroll] //Do some FFTs iterative instead of parallel
for (uint i = 0, iterationIndex = index; i < ITERATION_FACTOR; ++i, iterationIndex += ELEMENTS/ITERATION_FACTOR)
{
values[iterationIndex] = tempComplexValues[i];
}

//Sync to ensure that every value is written before calculating the other FFTs
GroupMemoryBarrierWithGroupSync();
}

//Write the results
[unroll] //Do some FFTs iterative instead of parallel
for (uint i = 0, iterationIndex = index; i < ITERATION_FACTOR; ++i, iterationIndex += ELEMENTS/ITERATION_FACTOR)
{
setValue(savePosition, getWeightedResult(values[iterationIndex]));
}
}

### #7CryZe

Posted 11 March 2012 - 06:16 AM

After looking through NVidia's implementation of the FFT I noticed, that they were only using 128 threads per thread group. I realized that my graphics card might not have enough shader cores to run all threads in parallel (400). So everytime all threads would need to be synchronized the driver would have to reorganize all the workload the shader cores are doing so that all the 512 threads can be synchronized.
So I tried implementing the FFT a bit more iterative to only have 128 threads as well. And indeed, the shader is almost twice as fast. It only needs 8.6ms now.

All 4 times the shader gets dispatched results in 34.6ms now. If I'd reduce the resolution to 256x256, all 4 dispatches together would only last 7.68ms. But I'm still looking for ways to improve it before reducing the resolution...

[numthreads(ELEMENTS/ITERATION_FACTOR,1,1)]
void CSMain(
uint3 groupId : SV_GroupID,
uint groupIndex: SV_GroupIndex,
{
Complex4 tempComplexValues[ITERATION_FACTOR];

//Revert the indices and load the values into groupshared memory
[unroll] //Do some FFTs iterative instead of parallel
for (uint i = 0, iterationIndex = index; i < ITERATION_FACTOR; ++i, iterationIndex += ELEMENTS/ITERATION_FACTOR)
{
}

//Sync to ensure that all the values are loaded
GroupMemoryBarrierWithGroupSync();

uint d = 1;
uint dHalf = 0;

[unroll] //Iterate over all the FFTs
for (uint n = 1; n <= LOG_ELEMENTS; ++n)
{
//The size of the FFT
dHalf = d;
d <<= 1;

[unroll] //Do some FFTs iterative instead of parallel
for (uint i = 0, iterationIndex = index; i < ITERATION_FACTOR; ++i, iterationIndex += ELEMENTS/ITERATION_FACTOR)
{
//Compute temp values
uint k = iterationIndex & (d - 1);
uint expK = k & (dHalf - 1);
uint m = iterationIndex - k + expK;

//Compute w, the factor for the odd values
float exponent = getExponentFactor() * expK / d;
float2 w = 0;
sincos(exponent, w.y, w.x);
w *= (k < dHalf) ? 1.0f : -1.0f;

Complex4 even = values[m];
Complex4 odd = values[m + dHalf];

//Write results into temp values
//y = even + w * odd

tempComplexValues[i].Real = even.Real + w.x * odd.Real - w.y * odd.Imaginary;

tempComplexValues[i].Imaginary = even.Imaginary + w.x * odd.Imaginary + w.y * odd.Real;
}

//Sync to ensure that every value is read before writing the results
GroupMemoryBarrierWithGroupSync();

//Write the results into groupshared memory
[unroll] //Do some FFTs iterative instead of parallel
for (uint i = 0, iterationIndex = index; i < ITERATION_FACTOR; ++i, iterationIndex += ELEMENTS/ITERATION_FACTOR)
{
values[iterationIndex] = tempComplexValues[i];
}

//Sync to ensure that every value is written before calculating the other FFTs
GroupMemoryBarrierWithGroupSync();
}

//Write the results
[unroll] //Do some FFTs iterative instead of parallel
for (uint i = 0, iterationIndex = index; i < ITERATION_FACTOR; ++i, iterationIndex += ELEMENTS/ITERATION_FACTOR)
{
setValue(savePosition, getWeightedResult(values[iterationIndex]));
}
}

### #6CryZe

Posted 11 March 2012 - 06:11 AM

After looking through NVidia's implementation of the FFT I noticed, that they were only using 128 threads per thread group. I realized that my graphics card might not have enough shader cores to run all threads in parallel (400). So everytime all threads would need to be synchronized the driver would have to reorganize all the workload the shader cores are doing so that all the 512 threads can be synchronized.
So I tried implementing the FFT a bit more iterative to only have 128 threads as well. And indeed, the shader is almost twice as fast. It only needs 8.6ms now.

All 4 times the shader gets dispatched results in 34.6ms now. If I'd reduce the resolution to 256x256, all 4 dispatches together would only last 7.68ms. But I'm still looking for ways to improve it before reducing the resolution...

[numthreads(ELEMENTS/ITERATION_FACTOR,1,1)]
void CSMain(
uint3 groupId : SV_GroupID,
uint groupIndex: SV_GroupIndex,
{
Complex4 tempComplexValues[ITERATION_FACTOR];

//Revert the indices and load the values into groupshared memory
[unroll] //Do some FFTs iterative instead of parallel
for (uint i = 0, iterationIndex = index; i < ITERATION_FACTOR; ++i, iterationIndex += ELEMENTS/ITERATION_FACTOR)
{
}

//Sync to ensure that all the values are loaded
GroupMemoryBarrierWithGroupSync();

uint d = 1;
uint dHalf = 0;

[unroll] //Iterate over all the FFTs
for (uint n = 1; n <= LOG_ELEMENTS; ++n)
{
//The size of the FFT
dHalf = d;
d <<= 1;

[unroll] //Do some FFTs iterative instead of parallel
for (uint i = 0, iterationIndex = index; i < ITERATION_FACTOR; ++i, iterationIndex += ELEMENTS/ITERATION_FACTOR)
{
//Compute temp values
uint k = iterationIndex & (d - 1);
uint expK = k & (dHalf - 1);
uint m = iterationIndex - k + expK;

//Compute w, the factor for the odd values
float exponent = getExponentFactor() * expK / d;
float2 w = 0;
sincos(exponent, w.y, w.x);
w *= (k < dHalf) ? 1.0f : -1.0f;

Complex4 even = values[m];
Complex4 odd = values[m + dHalf];

//Write results into temp values
//y = even + w * odd

tempComplexValues[i].Real = even.Real + w.x * odd.Real - w.y * odd.Imaginary;

tempComplexValues[i].Imaginary = even.Imaginary + w.x * odd.Imaginary + w.y * odd.Real;
}

//Sync to ensure that every value is read before writing the results
GroupMemoryBarrierWithGroupSync();

//Write the results into groupshared memory
[unroll] //Do some FFTs iterative instead of parallel
for (uint i = 0, iterationIndex = index; i < ITERATION_FACTOR; ++i, iterationIndex += ELEMENTS/ITERATION_FACTOR)
{
values[iterationIndex] = tempComplexValues[i];
}

//Sync to ensure that every value is written before calculating the other FFTs
GroupMemoryBarrierWithGroupSync();
}

//Write the results
[unroll] //Do some FFTs iterative instead of parallel
for (uint i = 0, iterationIndex = index; i < ITERATION_FACTOR; ++i, iterationIndex += ELEMENTS/ITERATION_FACTOR)
{
}