Sign in to follow this  
Yours3!f

textureGatherOffsets slower than texture?

Recommended Posts

Hi

 

I wanted to speed up rendering by using texture gather; however, I got 10 fps less than with traditional texture() calls (78 vs 88).

I expected the opposite...

 

here's the code I used:

#version 430 core

// Fragment shader from the post: averages the .y channel of `tex` over the
// 3x3 neighbourhood around `uv`. It contains two alternative implementations
// (nine texture() calls vs. textureGatherOffsets) that are being compared.

// Source texture; only its .y component is read below.
layout(binding=0) uniform sampler2D tex;

// Used only to derive `scale` (1.0 / screen_size) in main().
// NOTE(review): presumably the size of `tex` in pixels - confirm against the host code.
uniform vec2 screen_size;

// Texture coordinate of the current fragment.
in vec2 uv;

layout(location=0) out vec4 color;

void main()
{
    /*
    Offsets (x y) of the nine samples that are averaged, relative to uv:

    -1  1 | 0  1 | 1  1
    -1  0 | 0  0 | 1  0
    -1 -1 | 0 -1 | 1 -1
    */

    // Running sum of the sampled .y values.
    float res = 0;

    // Blur radius in texels: the loops below run from -size to +size, i.e. 3x3.
    const int size = 1;
    // UV step corresponding to one unit of offset (one texel if screen_size
    // matches the texture size - see the note on screen_size).
    vec2 scale = 1.0 / screen_size.xy;

    // Each implementation below sits between a pair of empty `/**/` comments.
    // Changing an opening `/**/` to `/*` comments that whole section out.
    // NOTE(review): as posted, BOTH sections are live, so res would hold 18
    // samples before the divide by 9. The two disassembly listings in the
    // post each contain only one path, so presumably one section was
    // disabled for each measurement.

    /**/
    //3x3 blurring
    // Reference path: one texture() fetch per offset, nine in total.
    for( int x = -size; x <= size; ++x )
        for( int y = -size; y <= size; ++y )
            res += texture( tex, uv + vec2(x, y) * scale ).y;
    /**/

    /**/
    //gather based approach
    //2x2 sample
    ivec2 offsets0[4] = { ivec2(-1, 0), ivec2(0, 0), ivec2(0, -1), ivec2(-1, -1) };
    //the rest from the sides
    ivec2 offsets1[4] = { ivec2(-1, 1), ivec2(0, 1), ivec2(1, 0), ivec2(1, -1) };
        
    //fetch the y component
    // Last argument (1) selects the component to gather, i.e. .y.
    // Each call returns one value per entry of its offsets array; in the
    // posted disassembly every entry becomes its own
    // SET_TEXTURE_OFFSETS + GATHER4_O pair, hence 8 gathers for these 2 calls.
    vec4 res0 = textureGatherOffsets( tex, uv, offsets0, 1 );
    vec4 res1 = textureGatherOffsets( tex, uv, offsets1, 1 );
    //the last one is done separately
    // Offset (1, 1) is the ninth position, not listed in offsets0/offsets1.
    float res2 = texture( tex, uv + vec2(1, 1) * scale ).y;

    res += res0.x + res0.y + res0.z + res0.w + res1.x + res1.y + res1.z + res1.w + res2;
    /**/

    // Divide the sum by the nine samples of the 3x3 window and broadcast the
    // result to all four output channels.
    color = vec4(res / 9.0);
}

here's the corresponding shader assembly for HD5770 from GPU ShaderAnalyzer:
 

; --------  Disassembly --------------------
00 ALU: ADDR(32) CNT(24) KCACHE0(CB0:0-15)
      0  t: RCP_e       R0.w,  KC0[0].x      
      1  t: RCP_e       R0.z,  KC0[0].y      
      2  x: INTERP_XY   R0.x,  R0.y,  Param0.x      VEC_210
         y: INTERP_XY   R0.y,  R0.x,  Param0.x      VEC_210
         z: INTERP_XY   ____,  R0.y,  Param0.x      VEC_210
         w: INTERP_XY   ____,  R0.x,  Param0.x      VEC_210
      3  x: MULADD      R1.x,  R0.w, -1.0f,  PV2.x      
         y: MULADD      R1.y,  R0.z, -1.0f,  PV2.y      
         z: MULADD      R1.z,  R0.w, -1.0f,  PV2.x      
         w: MULADD      R1.w,  R0.z,  0.0f,  PV2.y      
         t: MULADD      R2.x,  R0.w, -1.0f,  PV2.x      VEC_021
      4  x: MULADD      R3.x,  R0.w,  0.0f,  R0.x      
         y: MULADD      R2.y,  R0.z,  1.0f,  R0.y      
         z: MULADD      R3.z,  R0.z, -1.0f,  R0.y      
         w: MULADD      R3.w,  R0.w,  0.0f,  R0.x      
         t: MULADD      R3.y,  R0.z,  0.0f,  R0.y      VEC_021
      5  x: MULADD      R4.x,  R0.w,  0.0f,  R0.x      
         y: MULADD      R4.y,  R0.z,  1.0f,  R0.y      
         z: MULADD      R2.z,  R0.w,  1.0f,  R0.x      
         w: MULADD      R2.w,  R0.z, -1.0f,  R0.y      
         t: MULADD      R5.x,  R0.w,  1.0f,  R0.x      VEC_021
      6  x: MULADD      R0.x,  R0.w,  1.0f,  R0.x      
         y: MULADD      R5.y,  R0.z,  0.0f,  R0.y      
         z: MULADD      R0.z,  R0.z,  1.0f,  R0.y      
01 TEX: ADDR(80) CNT(9) VALID_PIX
      7  SAMPLE R0.___y, R1.xy0x, t0, s0
      8  SAMPLE R1.__y_, R1.zw0z, t0, s0
      9  SAMPLE R2._y__, R2.xy0x, t0, s0
     10  SAMPLE R3.y___, R3.xz0x, t0, s0
     11  SAMPLE R2.y___, R3.wy0w, t0, s0
     12  SAMPLE R4.y___, R4.xy0x, t0, s0
     13  SAMPLE R4._y__, R2.zw0z, t0, s0
     14  SAMPLE R5.y___, R5.xy0x, t0, s0
     15  SAMPLE R0.y___, R0.xz0x, t0, s0
02 ALU: ADDR(56) CNT(11)
     16  w: ADD         ____,  0.0f,  R0.w      
     17  z: ADD         ____,  PV16.w,  R1.z      
     18  w: ADD         ____,  PV17.z,  R2.y      
     19  z: ADD         ____,  PV18.w,  R3.x      
     20  w: ADD         ____,  PV19.z,  R2.x      
     21  z: ADD         ____,  PV20.w,  R4.x      
     22  w: ADD         ____,  PV21.z,  R4.y      
     23  z: ADD         ____,  PV22.w,  R5.x      
     24  y: ADD         ____,  PV23.z,  R0.x      
     25  x: MUL_e       R0.x,  PV24.y,  (0x3DE38E39, 0.1111111119f).x      
03 EXP_DONE: PIX0, R0.xxxx
END_OF_PROGRAM

and for the gather version:
 

; --------  Disassembly --------------------
00 ALU: ADDR(32) CNT(22) KCACHE0(CB0:0-15)
      0  x: MOV         R1.x,  -1      
         y: MOV         R1.y,  0.0f      
         z: MOV         R2.z,  0.0f      
         w: MOV         R0.w,  0.0f      
         t: MOV         R2.y,  -1      
      1  x: INTERP_XY   R5.x,  R0.y,  Param0.x      VEC_210
         y: INTERP_XY   R5.y,  R0.x,  Param0.x      VEC_210
         z: INTERP_XY   ____,  R0.y,  Param0.x      VEC_210
         w: INTERP_XY   ____,  R0.x,  Param0.x      VEC_210
      2  x: MOV         R0.x,  -1      
         y: MOV         R0.y,  1      
         z: MOV         R0.z,  -1      
         w: MOV         R3.w,  0.0f      
         t: MOV         R3.y,  1      
      3  x: MOV         R4.x,  1      
         y: MOV         R4.y,  0.0f      
         z: MOV         R1.z,  1      
         w: MOV         R1.w,  -1      
         t: RCP_e       ____,  KC0[0].x      
      4  x: ADD         R6.x,  PS3,  R5.x      
         t: RCP_e       ____,  KC0[0].y      
      5  y: ADD         R6.y,  PS4,  R5.y      
01 TEX: ADDR(80) CNT(16) VALID_PIX
      6  SET_TEXTURE_OFFSETS ____, R1.xyxx, t0, s0
      7  GATHER4_O_y R2.___w, R5.xy0x, t0, s0
      8  SET_TEXTURE_OFFSETS ____, R0.wwww, t0, s0
      9  GATHER4_O_y R3.__w_, R5.xy0x, t0, s0
     10  SET_TEXTURE_OFFSETS ____, R2.zyzz, t0, s0
     11  GATHER4_O_y R2._w__, R5.xy0x, t0, s0
     12  SET_TEXTURE_OFFSETS ____, R0.zzzz, t0, s0
     13  GATHER4_O_y R1.w___, R5.xy0x, t0, s0
     14  SET_TEXTURE_OFFSETS ____, R0.xyxx, t0, s0
     15  GATHER4_O_y R0.___w, R5.xy0x, t0, s0
     16  SET_TEXTURE_OFFSETS ____, R3.wyww, t0, s0
     17  GATHER4_O_y R0.__w_, R5.xy0x, t0, s0
     18  SET_TEXTURE_OFFSETS ____, R4.xyxx, t0, s0
     19  GATHER4_O_y R4._w__, R5.xy0x, t0, s0
     20  SET_TEXTURE_OFFSETS ____, R1.zwzz, t0, s0
     21  GATHER4_O_y R5.w___, R5.xy0x, t0, s0
02 TEX: ADDR(112) CNT(1) VALID_PIX
     22  SAMPLE R6._y__, R6.xy0x, t0, s0
03 ALU: ADDR(54) CNT(11)
     23  y: ADD         ____,  R2.w,  R3.z      
     24  x: ADD         ____,  R2.y,  PV23.y      
     25  w: ADD         ____,  R1.x,  PV24.x      
     26  z: ADD         ____,  R0.w,  PV25.w      
     27  y: ADD         ____,  R0.z,  PV26.z      
     28  x: ADD         ____,  R4.y,  PV27.y      
     29  w: ADD         ____,  R5.x,  PV28.x      
     30  z: ADD         ____,  R6.y,  PV29.w      
     31  y: ADD         ____,  PV30.z,  0.0f      
     32  x: MUL_e       R5.x,  PV31.y,  (0x3DE38E39, 0.1111111119f).x      
04 EXP_DONE: PIX0, R5.xxxx
END_OF_PROGRAM

You can clearly see that there are actually 8 gather operations as opposed to just two. Why is that?

 

can anyone explain this?

Edited by Yours3!f

Share this post


Link to post
Share on other sites

textureGatherOffsets performs a textureGatherOffset for each element in the offsets array, thus, you get 8 gathers for 2 4-element offset arrays.

Share this post


Link to post
Share on other sites

textureGatherOffsets performs a textureGatherOffset for each element in the offsets array, thus, you get 8 gathers for 2 4-element offset arrays.

 

So I can't tell it exactly where to sample from? That is rather bad. Well, I guess I can still get away with fewer texture calls. Thank you for clarifying this feature of GLSL.

 

so if I read the specs correctly this should work correctly, right?

#version 430 core

// 3x3 box blur of the .y channel of `tex`, built from two 2x2 gathers plus
// two single fetches (4 texture instructions instead of 9 texture() calls).
//
// The version is 430 rather than 330 because layout(binding=...) requires
// GLSL 4.20 and textureGather with a component argument / textureGatherOffset
// require GLSL 4.00.

// Source texture; only its .y component is read.
layout(binding=0) uniform sampler2D tex;

// Used to derive the per-texel UV step below.
// NOTE(review): assumed to equal the size of `tex` in pixels - confirm.
uniform vec2 screen_size;

// Texture coordinate of the current fragment.
in vec2 uv;

layout(location=0) out vec4 color;

void main()
{
	/*
	Offsets (x y) of the nine texels that are averaged, relative to uv:

	-1  1 | 0  1 | 1  1
	-1  0 | 0  0 | 1  0
	-1 -1 | 0 -1 | 1 -1
	*/

	float res = 0.0;

	// UV step of one texel. This declaration was missing from the posted
	// snippet, which used `scale` without defining it.
	vec2 scale = 1.0 / screen_size.xy;

	//gather based approach
	//fetch the y component (last argument = 1)
	//
	// A gather returns the 2x2 bilinear footprint as
	//   x = (i0, j1), y = (i1, j1), z = (i1, j0), w = (i0, j0).
	// Assuming the footprint of `uv` starts at the centre texel (i0 = 0, j0 = 0):
	//   res0 (offset -1,-1): x = (-1, 0), y = (0, 0), z = (0, -1), w = (-1, -1)
	//   res1 (no offset):    x = ( 0, 1), y = (1, 1), z = (1,  0), w = ( 0,  0)
	// NOTE(review): when uv lies exactly on a texel centre, which texels form
	// the footprint is decided by a floor() on an integer boundary and can
	// vary with precision/hardware - verify on the target GPU, or sample at
	// uv +/- 0.5 * scale (texel corners) to make the footprint unambiguous.
	vec4 res0 = textureGatherOffset( tex, uv, ivec2(-1, -1), 1 );
	vec4 res1 = textureGather( tex, uv, 1 );

	// The two corners that neither gather covers.
	float res2 = texture( tex, uv + vec2(-1, 1) * scale ).y;
	float res3 = texture( tex, uv + vec2(1, -1) * scale ).y;

	// res1.w is skipped on purpose: it is the centre texel, already counted
	// as res0.y.
	res += res0.x + res0.y + res0.z + res0.w + res1.x + res1.y + res1.z + res2 + res3;

	// Average of the nine samples, broadcast to all four output channels.
	color = vec4(res / 9.0);
}
Edited by Yours3!f

Share this post


Link to post
Share on other sites

Create an account or sign in to comment

You need to be a member in order to leave a comment

Create an account

Sign up for a new account in our community. It's easy!

Register a new account

Sign in

Already have an account? Sign in here.

Sign In Now

Sign in to follow this