DX11 8800GTX and only 150 Particles before lagging? Help. =)

Kurt-olsson
My method of rendering the particle system is:

Have a list of particles, each with:
position
velocity

Loop over the list and create two triangles for each particle in a vertex buffer (at this stage the created triangles already have the right position and rotation).
Copy the vertex buffer to the GPU every frame.

This gives me sooooo poor performance.

I have an 8800GTX card and can only render 150 particles... come on... 150 particles before it starts lagging. There must be some big problem with my code. Please let me know if you see something bad.

Another thing: how come the movement is "slow" when particles are visible? I scale everything with deltaTime, so shouldn't the movement/velocity of my player stay the same even if there is too much drawn on the scene?

Here is my Particle class:

[source lang="cpp"]#pragma once
#include <d3d11.h>
#include <d3dx11.h>
#include <d3dx10.h>
#include <vector>

class Particle {
public:
D3DXVECTOR3 position;
D3DXVECTOR3 velocity;
float time;
};

class ParticleSystem
{
private:
struct VERTEX {FLOAT X, Y, Z; D3DXVECTOR3 Normal; FLOAT U, V; D3DXCOLOR Color;};

D3D11_MAPPED_SUBRESOURCE ms;
ID3D11Buffer *m_vertexBuffer, *m_indexBuffer;
int m_vertexCount;
int m_indexCount;
int number_of_particles;
VERTEX* model_vertices;
DWORD* model_indicies;
std::vector<Particle> lstParticles;
int CurrentParticle;

public:

//This is just run once to create all particles.
void AddParticles() {

float width = 1.0f;
float height = 1.0f;

for (int i = 0; i < 1150;i++) {

/*float rx = (float)rand()/((float)RAND_MAX/0.01f);
float ry = (float)rand()/((float)RAND_MAX/0.001f);
float rz = (float)rand()/((float)RAND_MAX/0.01f);*/
Particle p;
p.position = D3DXVECTOR3(0,0,0);
p.velocity = D3DXVECTOR3(0,0,0);
lstParticles.push_back(p);
}
}


//Set new position and new Velocity.
void Reset(D3DXVECTOR3 start, D3DXVECTOR3 velocity) {

lstParticles[CurrentParticle].position = start;
lstParticles[CurrentParticle].velocity = velocity;
CurrentParticle++;
if (CurrentParticle >= (int)lstParticles.size())
CurrentParticle=0;

}

//This is run every frame; here is where I set the position and create
//two triangles from the position of a particle.
//This makes it easy to maintain a list of particles with one position instead of 6.

void UpdateParticles(D3DXVECTOR3 mPos,D3DXVECTOR3 mView) {
//float width = 1.0f;
//float height = 1.0f;

D3DXCOLOR particleColor(1.0f,1.0f,1.0f,0.5f);

for (int i = 0; i < (int)lstParticles.size(); i++) {
int v_index = i*6;
D3DXVECTOR3 particlePos = lstParticles[i].position;

D3DXVECTOR3 look = mView - mPos;
D3DXVec3Normalize(&look,&look);

//This I could move outside the loop because it is the same for every particle (the same goes for look, right and up)
D3DXVECTOR3 camUp(0,1,0);
D3DXVec3Normalize(&camUp,&camUp);

D3DXVECTOR3 right;
D3DXVec3Cross(&right,&camUp,&look);
D3DXVec3Normalize(&right,&right);

D3DXVECTOR3 up;
D3DXVec3Cross(&up,&look,&right);
D3DXVec3Normalize(&up,&up);

//up = up * height;
//right = right * width;

model_vertices[v_index].Color = particleColor;
model_vertices[v_index].U = 0;
model_vertices[v_index].V = 0;
model_vertices[v_index].X = particlePos.x - right.x * 0.5f + up.x;
model_vertices[v_index].Y = particlePos.y - right.y * 0.5f + up.y;
model_vertices[v_index].Z = particlePos.z - right.z * 0.5f + up.z;
v_index++;

model_vertices[v_index].Color = particleColor;
model_vertices[v_index].U = 0;
model_vertices[v_index].V = 1;
model_vertices[v_index].X = particlePos.x + right.x * 0.5f + up.x;
model_vertices[v_index].Y = particlePos.y + right.y * 0.5f + up.y;
model_vertices[v_index].Z = particlePos.z + right.z * 0.5f + up.z;
v_index++;

model_vertices[v_index].Color = particleColor;
model_vertices[v_index].U = 1;
model_vertices[v_index].V = 0;
model_vertices[v_index].X = particlePos.x - right.x * 0.5f;
model_vertices[v_index].Y = particlePos.y - right.y * 0.5f;
model_vertices[v_index].Z = particlePos.z - right.z * 0.5f;
v_index++;

//Second Triangle

model_vertices[v_index].Color = particleColor;
model_vertices[v_index].U = 1;
model_vertices[v_index].V = 0;
model_vertices[v_index].X = particlePos.x - right.x * 0.5f;
model_vertices[v_index].Y = particlePos.y - right.y * 0.5f;
model_vertices[v_index].Z = particlePos.z - right.z * 0.5f;
v_index++;

model_vertices[v_index].Color = particleColor;
model_vertices[v_index].U = 0;
model_vertices[v_index].V = 1;
model_vertices[v_index].X = particlePos.x + right.x * 0.5f + up.x;
model_vertices[v_index].Y = particlePos.y + right.y * 0.5f + up.y;
model_vertices[v_index].Z = particlePos.z + right.z * 0.5f + up.z;
v_index++;


model_vertices[v_index].Color = particleColor;
model_vertices[v_index].U = 1;
model_vertices[v_index].V = 1;
model_vertices[v_index].X = particlePos.x + right.x * 0.5f;
model_vertices[v_index].Y = particlePos.y + right.y * 0.5f;
model_vertices[v_index].Z = particlePos.z + right.z * 0.5f;
v_index++;

//update position with velocity
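//(note: the velocity is applied once per frame and is not scaled by deltaTime, so particle speed depends on frame rate)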
lstParticles[i].position+=lstParticles[i].velocity;
}

}

//Create the vertex buffer with (number of particles * 6) vertices, because we render two triangles per quad.
//This is because I don't know how to draw a TRIANGLE_STRIP at different positions; there is something like a strip restart, but I think
//it only works with shaders.
void Init(ID3D11Device* dev) {
CurrentParticle = 0;

number_of_particles = lstParticles.size();
m_vertexCount = (number_of_particles * 6);
m_indexCount = (number_of_particles * 6);

model_vertices = new VERTEX[m_vertexCount];
model_indicies = new DWORD[m_indexCount];

//This might be a problem? The indices never reuse a vertex, so the index buffer is as big as the vertex buffer.
for (int i = 0; i<(number_of_particles * 6);i++) {
model_indicies[i] = i;
}

// create the vertex buffer
D3D11_BUFFER_DESC bd;
ZeroMemory(&bd, sizeof(bd));

bd.Usage = D3D11_USAGE_DYNAMIC;
bd.ByteWidth = sizeof(VERTEX) * m_vertexCount;
bd.BindFlags = D3D11_BIND_VERTEX_BUFFER;
bd.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;

dev->CreateBuffer(&bd, NULL, &m_vertexBuffer);

// create the index buffer
bd.Usage = D3D11_USAGE_DYNAMIC;
bd.ByteWidth = sizeof(DWORD) * m_indexCount;
bd.BindFlags = D3D11_BIND_INDEX_BUFFER;
bd.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;
bd.MiscFlags = 0;

dev->CreateBuffer(&bd, NULL, &m_indexBuffer);
}

int GetIndexCount() {
return m_indexCount;
}

//This method is run EVERY frame; it takes the updated vertex data and copies it into the GPU buffers.
void CopyAndSetBuffers(ID3D11DeviceContext* devcon) {


// select which vertex buffer to display
UINT stride = sizeof(VERTEX);
UINT offset = 0;

// copy the vertices into the buffer
//THIS uses D3D11_MAP_WRITE_DISCARD so it should be ok for updating every frame, right?
devcon->Map(m_vertexBuffer, 0, D3D11_MAP_WRITE_DISCARD, 0, &ms); // map the buffer
memcpy(ms.pData, model_vertices, sizeof(VERTEX) * m_vertexCount); // copy the data
devcon->Unmap(m_vertexBuffer, 0);
// copy the indices into the buffer
//THIS uses D3D11_MAP_WRITE_DISCARD so it should be ok for updating every frame, right?
devcon->Map(m_indexBuffer, 0, D3D11_MAP_WRITE_DISCARD, 0, &ms); // map the buffer
memcpy(ms.pData, model_indicies, sizeof(DWORD) * m_indexCount); // copy the data
devcon->Unmap(m_indexBuffer, 0);
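//(note: the indices never change after Init(), so this per-frame re-upload could be skipped; the index buffer could be filled once at creation instead, e.g. D3D11_USAGE_IMMUTABLE with initial data)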

devcon->IASetVertexBuffers(0, 1, &m_vertexBuffer, &stride, &offset);
devcon->IASetIndexBuffer(m_indexBuffer, DXGI_FORMAT_R32_UINT, 0);
}

void Clean() {
m_indexBuffer->Release();
m_vertexBuffer->Release();
delete[] model_vertices;
delete[] model_indicies;
}


};[/source]

Erik Rufelt
First, it seems like you have 1150 particles, not 150. Still, that shouldn't be all too slow... how much is it lagging?

Make sure you compile in Release, not Debug, and move the things you commented on yourself outside the loop.
Then switch to creating only 4 vertices per quad instead of 6, but still use 6 indices. Indices can re-use vertices, so you only need 4 vertices and indices [0, 1, 2] and [0, 2, 3], for example, to make 2 triangles. This saves you some bandwidth.
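Roughly like this, as an untested sketch reusing the VERTEX struct, particleColor, and the right/up billboard vectors from your UpdateParticles (your buffers would then hold particles * 4 vertices and particles * 6 indices):

[source lang="cpp"]// Untested sketch: 4 shared vertices + 6 indices per particle quad.
for (int i = 0; i < (int)lstParticles.size(); i++) {
    int v = i * 4; // 4 vertices per quad now
    int n = i * 6; // still 6 indices per quad
    D3DXVECTOR3 p = lstParticles[i].position;

    // corner order: 0 = top-left, 1 = top-right, 2 = bottom-left, 3 = bottom-right
    D3DXVECTOR3 corners[4] = {
        p - right * 0.5f + up,
        p + right * 0.5f + up,
        p - right * 0.5f,
        p + right * 0.5f
    };
    float us[4] = { 0, 1, 0, 1 };
    float vs[4] = { 0, 0, 1, 1 };

    for (int c = 0; c < 4; c++) {
        model_vertices[v + c].X = corners[c].x;
        model_vertices[v + c].Y = corners[c].y;
        model_vertices[v + c].Z = corners[c].z;
        model_vertices[v + c].U = us[c];
        model_vertices[v + c].V = vs[c];
        model_vertices[v + c].Color = particleColor;
    }

    // two triangles sharing corners 1 and 2
    model_indicies[n + 0] = v + 0;
    model_indicies[n + 1] = v + 1;
    model_indicies[n + 2] = v + 2;
    model_indicies[n + 3] = v + 2;
    model_indicies[n + 4] = v + 1;
    model_indicies[n + 5] = v + 3;
}[/source]

GetIndexCount() still returns number_of_particles * 6, so the DrawIndexed call stays the same.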

If it's still not good enough, look into using a geometry shader, which can save you a lot of CPU time.

Kurt-olsson
HodgeMan:
Can you please define "lag"; do you mean that the time per frame increases?
Have you timed UpdateParticles to see how much CPU time it's consuming?

My lag is like this:
I move my camera with a velocity vector, let's say (0, 0, 0.001f * deltaTime).
Without particles it feels like I am moving "fast".
But with all particles I am moving "slow", even though the velocity vector is still the same.
I have not timed my particles; I don't know how.

Erik Rufelt:
1150 particles, correct, my mistake.
I also forgot to mention that I do a render-to-texture and use that texture to map a cube.
So I render everything twice, which should roughly halve my performance, but I still think it is too slow.
The only things I draw are a 1500-vertex model and my particles + the cube.

I think the indices performance upgrade is the next thing to look into, but I still think something is wrong.
My plan is to draw at least 10 more 1000-vertex models in my level.

Hm...
I will move the code as in my samples and try Release mode.

Erik Rufelt
Try displaying deltaTime on the screen, and measure the difference in milliseconds. If you compare drawing 1000 particles to not drawing anything at all, then of course it will be much slower: even something that is very fast is infinitely slower than something that takes zero time, and drawing nothing is close to zero.
If you aim for 60 frames per second, that gives you a max deltaTime of ~16.7 milliseconds, so compare the time taken to draw 1000 particles to that, and see what percentage of the target time is spent.
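If you want to time UpdateParticles itself, here is a minimal untested sketch using the Windows high-resolution timer (particleSystem / mPos / mView are just assumed names from your posted code):

[source lang="cpp"]#include <windows.h>

// Minimal sketch: measure one call to UpdateParticles in milliseconds.
LARGE_INTEGER freq, t0, t1;
QueryPerformanceFrequency(&freq);

QueryPerformanceCounter(&t0);
particleSystem.UpdateParticles(mPos, mView); // the code being measured
QueryPerformanceCounter(&t1);

double ms = 1000.0 * (double)(t1.QuadPart - t0.QuadPart) / (double)freq.QuadPart;
// display 'ms' on screen or OutputDebugString it each frame[/source]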

Ashaman73
[quote name='KurtO' timestamp='1352035146' post='4997157']
Without particles it feels like i am moving "fast".
[/quote]
We need some numbers. Get the free version of Fraps to display the FPS at least, or better, incorporate some kind of time measurement into your code.

Do you send the particles to the GPU in a single batch, or are you using a batch for each particle? The latter will most likely slow down your performance, even for only 1150 particles. Another issue would be painting 1150 large particles, which could result in a huge overdraw rate, another reason for a slowdown.

Best to provide some more data and a screenshot.

Kurt-olsson
FRAPS was a very good idea!

When I have 1500 particles at the start, all at the same place (0,0,0), and the player really close to them, my FPS is down to 14.
But when I shoot them away and they are far from the player I get around 250~400 FPS.
When the particles are far, far away I get as high as 550 FPS.

It feels like I can't draw my particles close together at the same place...

mhagain
That's normal enough - you're getting heavy overdraw and bottlenecking on fillrate here. You're probably covering a good-ish percentage of the entire screen area 1500 times, which will bring any GPU to its knees.

Kurt-olsson
Suddenly I have more respect for the game engines out there. It feels impossible to get the visuals they do from my hardware. =)

I will try to implement an indexed vertex buffer so that 2 of the 6 vertices of my two triangles are shared, as Erik said.
Maybe that will lift the performance a little bit.

Also, how do you get transparency for the color black?

If I have alpha blending on, the FPS drops even more...

papulko
I would recommend that you also make use of the geometry shader stage; that way you only need one vertex for each sprite. Here's a good article on how to do it:
[url="http://takinginitiative.net/2011/01/12/directx10-tutorial-9-the-geometry-shader/"]http://takinginitiative.net/2011/01/12/directx10-tutorial-9-the-geometry-shader/[/url]

Erik Rufelt
In your pixel shader, try something like:

if (color.a == 0)
    discard;

Whether it's faster or not is hard to say. As your problem is clearly fillrate, and your card is a few years old, there might not be too much you can do, other than making the particles smaller on the screen.

One technique you can try to reduce fillrate is to draw polygons that aren't squares or quads, so that your particles cover as little screen area as possible, as shown for example here: [url="http://www.humus.name/index.php?page=Comments&ID=266"]http://www.humus.name/index.php?page=Comments&ID=266[/url]

Kurt-olsson
papulko, using a geometry shader is clearly my next step. When the game is finished I might "upgrade" that part. It seems really nice to render all particles on the GPU.

Erik, the color.a == 0 check looks like a good way to sort this out.

I will definitely try to use only a triangle with texture coords so that my texture is in the middle; because of my transparency I really don't need a quad if my texture fits inside my triangle! This is really smart!

Correct me if I am wrong, but if I render all my triangles at different positions, I won't gain any performance from an index buffer because all my vertices will be in separate places, I guess? Right?

Kurt-olsson
By the way:
is it better to have a vertex buffer that contains ALL particles and only update their positions,

OR

to fill a vertex buffer with only the particles that are alive, and swap in that vertex buffer each frame?

Erik Rufelt
Probably only update the alive ones...
However, in your case this is most likely irrelevant. As you get high FPS when the particles are far away, your vertices are not limiting your performance, so both index buffers and geometry shaders will gain very little.

Using triangles instead of quads could be better or worse, and you probably want to use something like 8-corner polygons; look again at the page I linked. The only thing that matters for you is how many pixels are covered on the screen: if you use 10 vertices to cover 80% as many pixels, then that is a win (see the sketch below).
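For intuition: a tight octagon around a circular sprite covers about 17% fewer pixels than its bounding quad (circumscribed-octagon area is 8 * tan(22.5 deg) * r^2, roughly 3.31 r^2, versus 4 r^2 for the quad). A rough untested sketch of building such a billboard in the style of the posted code (BuildOctagonFan is a made-up helper; right and up are the unit billboard vectors from UpdateParticles):

[source lang="cpp"]#define _USE_MATH_DEFINES
#include <math.h>

// Untested sketch: 8 corner positions circumscribing a circular sprite of
// radius r; triangulate as a fan (0,1,2), (0,2,3), ... (0,6,7) = 6 triangles.
void BuildOctagonFan(const D3DXVECTOR3& center,
                     const D3DXVECTOR3& right, const D3DXVECTOR3& up,
                     float r, D3DXVECTOR3 outCorners[8])
{
    float R = r / cosf((float)M_PI / 8.0f); // corner radius so edges touch the circle
    for (int k = 0; k < 8; k++)
    {
        float a = (float)M_PI / 8.0f + k * (float)M_PI / 4.0f;
        outCorners[k] = center + right * (R * cosf(a)) + up * (R * sinf(a));
    }
}[/source]

(Texture coordinates would come from the same cos/sin values, remapped from [-1, 1] to [0, 1].)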

Your graphics card does two things for you:
1. Transform vertices
2. Fill pixels

As your performance is much worse when your particles are close, it means that step 1 is cheap for you and doesn't matter very much. Index buffers and geometry shaders improve step 1 to be even better. If you get 500 FPS when particles are far away and 14 FPS when particles are close, that gives approximately (1/500 s = 2 ms per frame, 1/14 s ≈ 71 ms per frame):
Step 1: 2 milliseconds
Step 2: ~70 milliseconds

That means that if you make step 1 twice as fast, your FPS when close will still be close to 14, so it does not matter much at all.
If you make step 2 twice as fast, that makes a much larger difference, even if step 1 gets slower from the increased vertex count. So choose vertices such that you cover the least number of pixels, if you want many particles covering a large part of the screen.

However, no matter what you do, it is likely impossible to get 1000 particles covering a large part of the screen on your graphics card; it's simply too many pixels. You have to make your particles a bit smaller, or draw fewer particles when they get close. If you have 1000 particles very close to the screen, most won't be visible, so you could maybe sort them and remove those behind others, or similar.

Kurt-olsson
Erik, thank you so much for your explanation and for taking the time to write your answer to me.
Now I finally understand that the screen pixel coverage is my problem.

My optimization will be smaller particles, and drawing fewer when close. That should do the trick!

Again, thank you very much.

Kurt-olsson
Holy shit!

You know what you are talking about!
Making the particles 0.05f width/height instead of 1.0f makes the particles SUPERFAST!
The fillrate is down and the speed is UP!

5000 particles at the same position ~ 200 FPS,
and spread all around the place = 450 FPS, hardly any drop at all!

COOOOL!

As you said, Erik, I have not optimized indices or quads etc.; just the size of the particles made it superfast!

Thanks again.

phil_t
Another fairly easy thing you can do: when particles get closer and take up large portions of the screen, automatically fade them out, up to the point where you don't draw them anymore. Of course, this decision has to be made in the vertex shader (or earlier) to avoid the pixel-shading cost. For example, something like the sketch below.
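A minimal untested sketch of that fade in HLSL (the fadeStart / fadeEnd distances are made-up tuning values; posVS is the particle's view-space position, which you'd compute in the vertex shader):

[source lang="hlsl"]// Untested sketch: fade out particles that get too close to the camera.
static const float fadeEnd   = 1.0f; // fully invisible at or below this distance
static const float fadeStart = 3.0f; // fully visible at or beyond this distance

float CloseFade(float3 posVS)
{
    float dist = length(posVS);
    return saturate((dist - fadeEnd) / (fadeStart - fadeEnd));
}

// in the vertex shader, after computing the view-space position:
//   output.color.a *= CloseFade(posVS);[/source]

Combined with the color.a == 0 discard mentioned earlier (or simply not submitting fully faded particles on the CPU side), very close particles then stop eating fillrate.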

Another, much more complicated optimization is to render the particles to a lower-resolution render target and composite them into the scene afterward: http://http.developer.nvidia.com/GPUGems3/gpugems3_ch23.html


