Jump to content

  • Log In with Google      Sign In   
  • Create Account

#ActualYours3!f

Posted 27 September 2013 - 11:49 AM

Hi

 

So I wanted to speed up rendering by using texture gather; however, I got 10 fps less than with traditional texture fetches (78 vs 88).

I expected the opposite...

 

here's the code I used:

#version 430 core

layout(binding=0) uniform sampler2D tex;

uniform vec2 screen_size;

in vec2 uv;

layout(location=0) out vec4 color;

// Writes the average of the green (.y) channel over the 3x3 texel neighbourhood
// around `uv` to every component of `color`.
//
// Two interchangeable implementations are kept for benchmarking and exactly one
// of them is compiled: #define USE_GATHER (right after the #version line) to
// select the gather path, otherwise the plain texture() loop is used. Only one
// may be active, because both accumulate into `res` and the result is divided
// by 9 taps.
void main()
{
    /*
    texel offsets covered by the kernel:
    -1  1 | 0  1 | 1  1
    -1  0 | 0  0 | 1  0
    -1 -1 | 0 -1 | 1 -1
    */

    float res = 0.0;

    // step between neighbouring taps in normalized coordinates
    // (assumes screen_size equals the texture resolution -- TODO confirm)
    vec2 scale = 1.0 / screen_size.xy;

#ifndef USE_GATHER
    //3x3 blurring, one texture() fetch per tap
    const int size = 1;
    for( int x = -size; x <= size; ++x )
        for( int y = -size; y <= size; ++y )
            res += texture( tex, uv + vec2(x, y) * scale ).y;
#else
    //gather based approach
    //The offsets of textureGatherOffsets must be constant integral expressions,
    //so the arrays are declared const.
    //NOTE: every one of the four offsets selects its own 2x2 footprint and only
    //texel i0j0 of that footprint is returned, so one call is not a single
    //2x2 fetch.
    //2x2 sample
    const ivec2 offsets0[4] = ivec2[4]( ivec2(-1, 0), ivec2(0, 0), ivec2(0, -1), ivec2(-1, -1) );
    //the rest from the sides
    const ivec2 offsets1[4] = ivec2[4]( ivec2(-1, 1), ivec2(0, 1), ivec2(1, 0), ivec2(1, -1) );

    //fetch the y component (component index 1)
    vec4 res0 = textureGatherOffsets( tex, uv, offsets0, 1 );
    vec4 res1 = textureGatherOffsets( tex, uv, offsets1, 1 );
    //the last one (offset 1, 1) is done separately with a regular fetch
    float res2 = texture( tex, uv + vec2(1, 1) * scale ).y;

    res += res0.x + res0.y + res0.z + res0.w + res1.x + res1.y + res1.z + res1.w + res2;
#endif

    // normalize by the number of taps
    color = vec4(res / 9.0);
}

here's the corresponding shader assembly for HD5770 from GPU ShaderAnalyzer:
 

; --------  Disassembly --------------------
00 ALU: ADDR(32) CNT(24) KCACHE0(CB0:0-15)
      0  t: RCP_e       R0.w,  KC0[0].x      
      1  t: RCP_e       R0.z,  KC0[0].y      
      2  x: INTERP_XY   R0.x,  R0.y,  Param0.x      VEC_210
         y: INTERP_XY   R0.y,  R0.x,  Param0.x      VEC_210
         z: INTERP_XY   ____,  R0.y,  Param0.x      VEC_210
         w: INTERP_XY   ____,  R0.x,  Param0.x      VEC_210
      3  x: MULADD      R1.x,  R0.w, -1.0f,  PV2.x      
         y: MULADD      R1.y,  R0.z, -1.0f,  PV2.y      
         z: MULADD      R1.z,  R0.w, -1.0f,  PV2.x      
         w: MULADD      R1.w,  R0.z,  0.0f,  PV2.y      
         t: MULADD      R2.x,  R0.w, -1.0f,  PV2.x      VEC_021
      4  x: MULADD      R3.x,  R0.w,  0.0f,  R0.x      
         y: MULADD      R2.y,  R0.z,  1.0f,  R0.y      
         z: MULADD      R3.z,  R0.z, -1.0f,  R0.y      
         w: MULADD      R3.w,  R0.w,  0.0f,  R0.x      
         t: MULADD      R3.y,  R0.z,  0.0f,  R0.y      VEC_021
      5  x: MULADD      R4.x,  R0.w,  0.0f,  R0.x      
         y: MULADD      R4.y,  R0.z,  1.0f,  R0.y      
         z: MULADD      R2.z,  R0.w,  1.0f,  R0.x      
         w: MULADD      R2.w,  R0.z, -1.0f,  R0.y      
         t: MULADD      R5.x,  R0.w,  1.0f,  R0.x      VEC_021
      6  x: MULADD      R0.x,  R0.w,  1.0f,  R0.x      
         y: MULADD      R5.y,  R0.z,  0.0f,  R0.y      
         z: MULADD      R0.z,  R0.z,  1.0f,  R0.y      
01 TEX: ADDR(80) CNT(9) VALID_PIX
      7  SAMPLE R0.___y, R1.xy0x, t0, s0
      8  SAMPLE R1.__y_, R1.zw0z, t0, s0
      9  SAMPLE R2._y__, R2.xy0x, t0, s0
     10  SAMPLE R3.y___, R3.xz0x, t0, s0
     11  SAMPLE R2.y___, R3.wy0w, t0, s0
     12  SAMPLE R4.y___, R4.xy0x, t0, s0
     13  SAMPLE R4._y__, R2.zw0z, t0, s0
     14  SAMPLE R5.y___, R5.xy0x, t0, s0
     15  SAMPLE R0.y___, R0.xz0x, t0, s0
02 ALU: ADDR(56) CNT(11)
     16  w: ADD         ____,  0.0f,  R0.w      
     17  z: ADD         ____,  PV16.w,  R1.z      
     18  w: ADD         ____,  PV17.z,  R2.y      
     19  z: ADD         ____,  PV18.w,  R3.x      
     20  w: ADD         ____,  PV19.z,  R2.x      
     21  z: ADD         ____,  PV20.w,  R4.x      
     22  w: ADD         ____,  PV21.z,  R4.y      
     23  z: ADD         ____,  PV22.w,  R5.x      
     24  y: ADD         ____,  PV23.z,  R0.x      
     25  x: MUL_e       R0.x,  PV24.y,  (0x3DE38E39, 0.1111111119f).x      
03 EXP_DONE: PIX0, R0.xxxx
END_OF_PROGRAM

and for the gather version:
 

; --------  Disassembly --------------------
00 ALU: ADDR(32) CNT(22) KCACHE0(CB0:0-15)
      0  x: MOV         R1.x,  -1      
         y: MOV         R1.y,  0.0f      
         z: MOV         R2.z,  0.0f      
         w: MOV         R0.w,  0.0f      
         t: MOV         R2.y,  -1      
      1  x: INTERP_XY   R5.x,  R0.y,  Param0.x      VEC_210
         y: INTERP_XY   R5.y,  R0.x,  Param0.x      VEC_210
         z: INTERP_XY   ____,  R0.y,  Param0.x      VEC_210
         w: INTERP_XY   ____,  R0.x,  Param0.x      VEC_210
      2  x: MOV         R0.x,  -1      
         y: MOV         R0.y,  1      
         z: MOV         R0.z,  -1      
         w: MOV         R3.w,  0.0f      
         t: MOV         R3.y,  1      
      3  x: MOV         R4.x,  1      
         y: MOV         R4.y,  0.0f      
         z: MOV         R1.z,  1      
         w: MOV         R1.w,  -1      
         t: RCP_e       ____,  KC0[0].x      
      4  x: ADD         R6.x,  PS3,  R5.x      
         t: RCP_e       ____,  KC0[0].y      
      5  y: ADD         R6.y,  PS4,  R5.y      
01 TEX: ADDR(80) CNT(16) VALID_PIX
      6  SET_TEXTURE_OFFSETS ____, R1.xyxx, t0, s0
      7  GATHER4_O_y R2.___w, R5.xy0x, t0, s0
      8  SET_TEXTURE_OFFSETS ____, R0.wwww, t0, s0
      9  GATHER4_O_y R3.__w_, R5.xy0x, t0, s0
     10  SET_TEXTURE_OFFSETS ____, R2.zyzz, t0, s0
     11  GATHER4_O_y R2._w__, R5.xy0x, t0, s0
     12  SET_TEXTURE_OFFSETS ____, R0.zzzz, t0, s0
     13  GATHER4_O_y R1.w___, R5.xy0x, t0, s0
     14  SET_TEXTURE_OFFSETS ____, R0.xyxx, t0, s0
     15  GATHER4_O_y R0.___w, R5.xy0x, t0, s0
     16  SET_TEXTURE_OFFSETS ____, R3.wyww, t0, s0
     17  GATHER4_O_y R0.__w_, R5.xy0x, t0, s0
     18  SET_TEXTURE_OFFSETS ____, R4.xyxx, t0, s0
     19  GATHER4_O_y R4._w__, R5.xy0x, t0, s0
     20  SET_TEXTURE_OFFSETS ____, R1.zwzz, t0, s0
     21  GATHER4_O_y R5.w___, R5.xy0x, t0, s0
02 TEX: ADDR(112) CNT(1) VALID_PIX
     22  SAMPLE R6._y__, R6.xy0x, t0, s0
03 ALU: ADDR(54) CNT(11)
     23  y: ADD         ____,  R2.w,  R3.z      
     24  x: ADD         ____,  R2.y,  PV23.y      
     25  w: ADD         ____,  R1.x,  PV24.x      
     26  z: ADD         ____,  R0.w,  PV25.w      
     27  y: ADD         ____,  R0.z,  PV26.z      
     28  x: ADD         ____,  R4.y,  PV27.y      
     29  w: ADD         ____,  R5.x,  PV28.x      
     30  z: ADD         ____,  R6.y,  PV29.w      
     31  y: ADD         ____,  PV30.z,  0.0f      
     32  x: MUL_e       R5.x,  PV31.y,  (0x3DE38E39, 0.1111111119f).x      
04 EXP_DONE: PIX0, R5.xxxx
END_OF_PROGRAM

You can clearly see that there are actually 8 gather operations as opposed to just two. Why is that?

 

can anyone explain this?


#7Yours3!f

Posted 27 September 2013 - 11:48 AM

Hi

 

So I wanted to speed up rendering by using texture gather; however, I got 10 fps less than with traditional texture fetches (78 vs 88).

I expected the opposite...

 

here's the code I used:

#version 430 core

layout(binding=0) uniform sampler2D tex;

uniform vec2 screen_size;

in vec2 uv;

layout(location=0) out vec4 color;

// Writes the average of the green (.y) channel over the 3x3 texel neighbourhood
// around `uv` to every component of `color`.
//
// Two interchangeable implementations are kept for benchmarking and exactly one
// of them is compiled: #define USE_GATHER (right after the #version line) to
// select the gather path, otherwise the plain texture() loop is used. Only one
// may be active, because both accumulate into `res` and the result is divided
// by 9 taps.
void main()
{
    /*
    texel offsets covered by the kernel:
    -1  1 | 0  1 | 1  1
    -1  0 | 0  0 | 1  0
    -1 -1 | 0 -1 | 1 -1
    */

    float res = 0.0;

    // step between neighbouring taps in normalized coordinates
    // (assumes screen_size equals the texture resolution -- TODO confirm)
    vec2 scale = 1.0 / screen_size.xy;

#ifndef USE_GATHER
    //3x3 blurring, one texture() fetch per tap
    const int size = 1;
    for( int x = -size; x <= size; ++x )
        for( int y = -size; y <= size; ++y )
            res += texture( tex, uv + vec2(x, y) * scale ).y;
#else
    //gather based approach
    //The offsets of textureGatherOffsets must be constant integral expressions,
    //so the arrays are declared const.
    //NOTE: every one of the four offsets selects its own 2x2 footprint and only
    //texel i0j0 of that footprint is returned, so one call is not a single
    //2x2 fetch.
    //2x2 sample
    const ivec2 offsets0[4] = ivec2[4]( ivec2(-1, 0), ivec2(0, 0), ivec2(0, -1), ivec2(-1, -1) );
    //the rest from the sides
    const ivec2 offsets1[4] = ivec2[4]( ivec2(-1, 1), ivec2(0, 1), ivec2(1, 0), ivec2(1, -1) );

    //fetch the y component (component index 1)
    vec4 res0 = textureGatherOffsets( tex, uv, offsets0, 1 );
    vec4 res1 = textureGatherOffsets( tex, uv, offsets1, 1 );
    //the last one (offset 1, 1) is done separately with a regular fetch
    float res2 = texture( tex, uv + vec2(1, 1) * scale ).y;

    res += res0.x + res0.y + res0.z + res0.w + res1.x + res1.y + res1.z + res1.w + res2;
#endif

    // normalize by the number of taps
    color = vec4(res / 9.0);
}

here's the corresponding shader assembly for HD5770 from GPU ShaderAnalyzer:
 

; --------  Disassembly --------------------
00 ALU: ADDR(32) CNT(24) KCACHE0(CB0:0-15)
      0  t: RCP_e       R0.w,  KC0[0].x      
      1  t: RCP_e       R0.z,  KC0[0].y      
      2  x: INTERP_XY   R0.x,  R0.y,  Param0.x      VEC_210
         y: INTERP_XY   R0.y,  R0.x,  Param0.x      VEC_210
         z: INTERP_XY   ____,  R0.y,  Param0.x      VEC_210
         w: INTERP_XY   ____,  R0.x,  Param0.x      VEC_210
      3  x: MULADD      R1.x,  R0.w, -1.0f,  PV2.x      
         y: MULADD      R1.y,  R0.z, -1.0f,  PV2.y      
         z: MULADD      R1.z,  R0.w, -1.0f,  PV2.x      
         w: MULADD      R1.w,  R0.z,  0.0f,  PV2.y      
         t: MULADD      R2.x,  R0.w, -1.0f,  PV2.x      VEC_021
      4  x: MULADD      R3.x,  R0.w,  0.0f,  R0.x      
         y: MULADD      R2.y,  R0.z,  1.0f,  R0.y      
         z: MULADD      R3.z,  R0.z, -1.0f,  R0.y      
         w: MULADD      R3.w,  R0.w,  0.0f,  R0.x      
         t: MULADD      R3.y,  R0.z,  0.0f,  R0.y      VEC_021
      5  x: MULADD      R4.x,  R0.w,  0.0f,  R0.x      
         y: MULADD      R4.y,  R0.z,  1.0f,  R0.y      
         z: MULADD      R2.z,  R0.w,  1.0f,  R0.x      
         w: MULADD      R2.w,  R0.z, -1.0f,  R0.y      
         t: MULADD      R5.x,  R0.w,  1.0f,  R0.x      VEC_021
      6  x: MULADD      R0.x,  R0.w,  1.0f,  R0.x      
         y: MULADD      R5.y,  R0.z,  0.0f,  R0.y      
         z: MULADD      R0.z,  R0.z,  1.0f,  R0.y      
01 TEX: ADDR(80) CNT(9) VALID_PIX
      7  SAMPLE R0.___y, R1.xy0x, t0, s0
      8  SAMPLE R1.__y_, R1.zw0z, t0, s0
      9  SAMPLE R2._y__, R2.xy0x, t0, s0
     10  SAMPLE R3.y___, R3.xz0x, t0, s0
     11  SAMPLE R2.y___, R3.wy0w, t0, s0
     12  SAMPLE R4.y___, R4.xy0x, t0, s0
     13  SAMPLE R4._y__, R2.zw0z, t0, s0
     14  SAMPLE R5.y___, R5.xy0x, t0, s0
     15  SAMPLE R0.y___, R0.xz0x, t0, s0
02 ALU: ADDR(56) CNT(11)
     16  w: ADD         ____,  0.0f,  R0.w      
     17  z: ADD         ____,  PV16.w,  R1.z      
     18  w: ADD         ____,  PV17.z,  R2.y      
     19  z: ADD         ____,  PV18.w,  R3.x      
     20  w: ADD         ____,  PV19.z,  R2.x      
     21  z: ADD         ____,  PV20.w,  R4.x      
     22  w: ADD         ____,  PV21.z,  R4.y      
     23  z: ADD         ____,  PV22.w,  R5.x      
     24  y: ADD         ____,  PV23.z,  R0.x      
     25  x: MUL_e       R0.x,  PV24.y,  (0x3DE38E39, 0.1111111119f).x      
03 EXP_DONE: PIX0, R0.xxxx
END_OF_PROGRAM

and for the gather version:
 

; --------  Disassembly --------------------
00 ALU: ADDR(32) CNT(22) KCACHE0(CB0:0-15)
      0  x: MOV         R1.x,  -1      
         y: MOV         R1.y,  0.0f      
         z: MOV         R2.z,  0.0f      
         w: MOV         R0.w,  0.0f      
         t: MOV         R2.y,  -1      
      1  x: INTERP_XY   R5.x,  R0.y,  Param0.x      VEC_210
         y: INTERP_XY   R5.y,  R0.x,  Param0.x      VEC_210
         z: INTERP_XY   ____,  R0.y,  Param0.x      VEC_210
         w: INTERP_XY   ____,  R0.x,  Param0.x      VEC_210
      2  x: MOV         R0.x,  -1      
         y: MOV         R0.y,  1      
         z: MOV         R0.z,  -1      
         w: MOV         R3.w,  0.0f      
         t: MOV         R3.y,  1      
      3  x: MOV         R4.x,  1      
         y: MOV         R4.y,  0.0f      
         z: MOV         R1.z,  1      
         w: MOV         R1.w,  -1      
         t: RCP_e       ____,  KC0[0].x      
      4  x: ADD         R6.x,  PS3,  R5.x      
         t: RCP_e       ____,  KC0[0].y      
      5  y: ADD         R6.y,  PS4,  R5.y      
01 TEX: ADDR(80) CNT(16) VALID_PIX
      6  SET_TEXTURE_OFFSETS ____, R1.xyxx, t0, s0
      7  GATHER4_O_y R2.___w, R5.xy0x, t0, s0
      8  SET_TEXTURE_OFFSETS ____, R0.wwww, t0, s0
      9  GATHER4_O_y R3.__w_, R5.xy0x, t0, s0
     10  SET_TEXTURE_OFFSETS ____, R2.zyzz, t0, s0
     11  GATHER4_O_y R2._w__, R5.xy0x, t0, s0
     12  SET_TEXTURE_OFFSETS ____, R0.zzzz, t0, s0
     13  GATHER4_O_y R1.w___, R5.xy0x, t0, s0
     14  SET_TEXTURE_OFFSETS ____, R0.xyxx, t0, s0
     15  GATHER4_O_y R0.___w, R5.xy0x, t0, s0
     16  SET_TEXTURE_OFFSETS ____, R3.wyww, t0, s0
     17  GATHER4_O_y R0.__w_, R5.xy0x, t0, s0
     18  SET_TEXTURE_OFFSETS ____, R4.xyxx, t0, s0
     19  GATHER4_O_y R4._w__, R5.xy0x, t0, s0
     20  SET_TEXTURE_OFFSETS ____, R1.zwzz, t0, s0
     21  GATHER4_O_y R5.w___, R5.xy0x, t0, s0
02 TEX: ADDR(112) CNT(1) VALID_PIX
     22  SAMPLE R6._y__, R6.xy0x, t0, s0
03 ALU: ADDR(54) CNT(11)
     23  y: ADD         ____,  R2.w,  R3.z      
     24  x: ADD         ____,  R2.y,  PV23.y      
     25  w: ADD         ____,  R1.x,  PV24.x      
     26  z: ADD         ____,  R0.w,  PV25.w      
     27  y: ADD         ____,  R0.z,  PV26.z      
     28  x: ADD         ____,  R4.y,  PV27.y      
     29  w: ADD         ____,  R5.x,  PV28.x      
     30  z: ADD         ____,  R6.y,  PV29.w      
     31  y: ADD         ____,  PV30.z,  0.0f      
     32  x: MUL_e       R5.x,  PV31.y,  (0x3DE38E39, 0.1111111119f).x      
04 EXP_DONE: PIX0, R5.xxxx
END_OF_PROGRAM

You can clearly see that there are actually 8 gather operations (plus one regular sample) as opposed to just two. Why is that?

 

can anyone explain this?


#6Yours3!f

Posted 27 September 2013 - 11:45 AM

Hi

 

So I wanted to speed up rendering by using texture gather; however, I got 10 fps less than with traditional texture fetches (78 vs 88).

I expected the opposite...

 

here's the code I used:

#version 430 core

layout(binding=0) uniform sampler2D tex;

uniform vec2 screen_size;

in vec2 uv;

layout(location=0) out vec4 color;

// Writes the sum of the green (.y) channel over the 3x3 texel neighbourhood
// around `uv` to every component of `color`.
//
// Two interchangeable implementations are kept for benchmarking and exactly one
// of them is compiled: #define USE_GATHER (right after the #version line) to
// select the gather path, otherwise the plain texture() loop is used. Only one
// may be active, because both accumulate into the same `res`.
void main()
{
    /*
    texel offsets covered by the kernel:
    -1  1 | 0  1 | 1  1
    -1  0 | 0  0 | 1  0
    -1 -1 | 0 -1 | 1 -1
    */

    float res = 0.0;

    // step between neighbouring taps in normalized coordinates
    // (assumes screen_size equals the texture resolution -- TODO confirm)
    vec2 scale = 1.0 / screen_size.xy;

#ifndef USE_GATHER
    //3x3 blurring, one texture() fetch per tap
    const int size = 1;
    for( int x = -size; x <= size; ++x )
        for( int y = -size; y <= size; ++y )
            res += texture( tex, uv + vec2(x, y) * scale ).y;
#else
    //gather based approach
    //The offsets of textureGatherOffsets must be constant integral expressions,
    //so the arrays are declared const.
    //NOTE: every one of the four offsets selects its own 2x2 footprint and only
    //texel i0j0 of that footprint is returned, so one call is not a single
    //2x2 fetch.
    //2x2 sample
    const ivec2 offsets0[4] = ivec2[4]( ivec2(-1, 0), ivec2(0, 0), ivec2(0, -1), ivec2(-1, -1) );
    //the rest from the sides
    const ivec2 offsets1[4] = ivec2[4]( ivec2(-1, 1), ivec2(0, 1), ivec2(1, 0), ivec2(1, -1) );

    //fetch the y component (component index 1)
    vec4 res0 = textureGatherOffsets( tex, uv, offsets0, 1 );
    vec4 res1 = textureGatherOffsets( tex, uv, offsets1, 1 );
    //the last one (offset 1, 1) is done separately with a regular fetch
    float res2 = texture( tex, uv + vec2(1, 1) * scale ).y;

    res += res0.x + res0.y + res0.z + res0.w + res1.x + res1.y + res1.z + res1.w + res2;
#endif

    // NOTE: the sum is written unnormalized; divide by 9.0 for a true average
    color = vec4(res);
}

here's the corresponding shader assembly for HD5770 from GPU ShaderAnalyzer:
 

; --------  Disassembly --------------------
00 ALU: ADDR(32) CNT(24) KCACHE0(CB0:0-15) 
      0  t: RCP_e       R0.w,  KC0[0].x      
      1  t: RCP_e       R0.z,  KC0[0].y      
      2  x: INTERP_XY   R0.x,  R0.y,  Param0.x      VEC_210 
         y: INTERP_XY   R0.y,  R0.x,  Param0.x      VEC_210 
         z: INTERP_XY   ____,  R0.y,  Param0.x      VEC_210 
         w: INTERP_XY   ____,  R0.x,  Param0.x      VEC_210 
      3  x: MULADD      R1.x,  R0.w, -1.0f,  PV2.x      
         y: MULADD      R1.y,  R0.z, -1.0f,  PV2.y      
         z: MULADD      R1.z,  R0.w, -1.0f,  PV2.x      
         w: MULADD      R1.w,  R0.z,  0.0f,  PV2.y      
         t: MULADD      R2.x,  R0.w, -1.0f,  PV2.x      VEC_021 
      4  x: MULADD      R3.x,  R0.w,  0.0f,  R0.x      
         y: MULADD      R2.y,  R0.z,  1.0f,  R0.y      
         z: MULADD      R3.z,  R0.z, -1.0f,  R0.y      
         w: MULADD      R3.w,  R0.w,  0.0f,  R0.x      
         t: MULADD      R3.y,  R0.z,  0.0f,  R0.y      VEC_021 
      5  x: MULADD      R4.x,  R0.w,  0.0f,  R0.x      
         y: MULADD      R4.y,  R0.z,  1.0f,  R0.y      
         z: MULADD      R2.z,  R0.w,  1.0f,  R0.x      
         w: MULADD      R2.w,  R0.z, -1.0f,  R0.y      
         t: MULADD      R5.x,  R0.w,  1.0f,  R0.x      VEC_021 
      6  x: MULADD      R0.x,  R0.w,  1.0f,  R0.x      
         y: MULADD      R5.y,  R0.z,  0.0f,  R0.y      
         z: MULADD      R0.z,  R0.z,  1.0f,  R0.y      
01 TEX: ADDR(80) CNT(9) VALID_PIX 
      7  SAMPLE R0.___y, R1.xy0x, t0, s0
      8  SAMPLE R1.__y_, R1.zw0z, t0, s0
      9  SAMPLE R2._y__, R2.xy0x, t0, s0
     10  SAMPLE R3.y___, R3.xz0x, t0, s0
     11  SAMPLE R2.y___, R3.wy0w, t0, s0
     12  SAMPLE R4.y___, R4.xy0x, t0, s0
     13  SAMPLE R4._y__, R2.zw0z, t0, s0
     14  SAMPLE R5.y___, R5.xy0x, t0, s0
     15  SAMPLE R0.y___, R0.xz0x, t0, s0
02 ALU: ADDR(56) CNT(9) 
     16  w: ADD         ____,  0.0f,  R0.w      
     17  z: ADD         ____,  PV16.w,  R1.z      
     18  w: ADD         ____,  PV17.z,  R2.y      
     19  z: ADD         ____,  PV18.w,  R3.x      
     20  w: ADD         ____,  PV19.z,  R2.x      
     21  z: ADD         ____,  PV20.w,  R4.x      
     22  w: ADD         ____,  PV21.z,  R4.y      
     23  z: ADD         ____,  PV22.w,  R5.x      
     24  x: ADD         R0.x,  PV23.z,  R0.x      
03 EXP_DONE: PIX0, R0.xxxx
END_OF_PROGRAM

and for the gather version:
 

; --------  Disassembly --------------------
00 ALU: ADDR(32) CNT(22) KCACHE0(CB0:0-15) 
      0  x: MOV         R1.x,  -1      
         y: MOV         R1.y,  0.0f      
         z: MOV         R2.z,  0.0f      
         w: MOV         R0.w,  0.0f      
         t: MOV         R2.y,  -1      
      1  x: INTERP_XY   R5.x,  R0.y,  Param0.x      VEC_210 
         y: INTERP_XY   R5.y,  R0.x,  Param0.x      VEC_210 
         z: INTERP_XY   ____,  R0.y,  Param0.x      VEC_210 
         w: INTERP_XY   ____,  R0.x,  Param0.x      VEC_210 
      2  x: MOV         R0.x,  -1      
         y: MOV         R0.y,  1      
         z: MOV         R0.z,  -1      
         w: MOV         R3.w,  0.0f      
         t: MOV         R3.y,  1      
      3  x: MOV         R4.x,  1      
         y: MOV         R4.y,  0.0f      
         z: MOV         R1.z,  1      
         w: MOV         R1.w,  -1      
         t: RCP_e       ____,  KC0[0].x      
      4  x: ADD         R6.x,  PS3,  R5.x      
         t: RCP_e       ____,  KC0[0].y      
      5  y: ADD         R6.y,  PS4,  R5.y      
01 TEX: ADDR(64) CNT(16) VALID_PIX 
      6  SET_TEXTURE_OFFSETS ____, R1.xyxx, t0, s0
      7  GATHER4_O_y R2.___w, R5.xy0x, t0, s0
      8  SET_TEXTURE_OFFSETS ____, R0.wwww, t0, s0
      9  GATHER4_O_y R3.__w_, R5.xy0x, t0, s0
     10  SET_TEXTURE_OFFSETS ____, R2.zyzz, t0, s0
     11  GATHER4_O_y R2._w__, R5.xy0x, t0, s0
     12  SET_TEXTURE_OFFSETS ____, R0.zzzz, t0, s0
     13  GATHER4_O_y R1.w___, R5.xy0x, t0, s0
     14  SET_TEXTURE_OFFSETS ____, R0.xyxx, t0, s0
     15  GATHER4_O_y R0.___w, R5.xy0x, t0, s0
     16  SET_TEXTURE_OFFSETS ____, R3.wyww, t0, s0
     17  GATHER4_O_y R0.__w_, R5.xy0x, t0, s0
     18  SET_TEXTURE_OFFSETS ____, R4.xyxx, t0, s0
     19  GATHER4_O_y R4._w__, R5.xy0x, t0, s0
     20  SET_TEXTURE_OFFSETS ____, R1.zwzz, t0, s0
     21  GATHER4_O_y R5.w___, R5.xy0x, t0, s0
02 TEX: ADDR(96) CNT(1) VALID_PIX 
     22  SAMPLE R6._y__, R6.xy0x, t0, s0
03 ALU: ADDR(54) CNT(9) 
     23  x: ADD         ____,  R2.w,  R3.z      
     24  w: ADD         ____,  R2.y,  PV23.x      
     25  z: ADD         ____,  R1.x,  PV24.w      
     26  y: ADD         ____,  R0.w,  PV25.z      
     27  x: ADD         ____,  R0.z,  PV26.y      
     28  w: ADD         ____,  R4.y,  PV27.x      
     29  z: ADD         ____,  R5.x,  PV28.w      
     30  y: ADD         ____,  R6.y,  PV29.z      
     31  x: ADD         R5.x,  PV30.y,  0.0f      
04 EXP_DONE: PIX0, R5.xxxx
END_OF_PROGRAM


You can clearly see that there are actually 8 gather operations (plus one regular sample) as opposed to just two. Why is that?

 

can anyone explain this?


#5Yours3!f

Posted 27 September 2013 - 11:39 AM

Hi

 

So I wanted to speed up rendering by using texture gather; however, I got 10 fps less than with traditional texture fetches (78 vs 88).

I expected the opposite...

 

here's the code I used:

#version 430 core

layout(binding=0) uniform sampler2D tex;

uniform vec2 screen_size;

in vec2 uv;

layout(location=0) out vec4 color;

// Writes the sum of the green (.y) channel over the 3x3 texel neighbourhood
// around `uv` to every component of `color`.
//
// Two interchangeable implementations are kept for benchmarking and exactly one
// of them is compiled: #define USE_GATHER (right after the #version line) to
// select the gather path, otherwise the plain texture() loop is used. Only one
// may be active, because both accumulate into the same `res`.
void main()
{
    /*
    texel offsets covered by the kernel:
    -1  1 | 0  1 | 1  1
    -1  0 | 0  0 | 1  0
    -1 -1 | 0 -1 | 1 -1
    */

    float res = 0.0;

    // step between neighbouring taps in normalized coordinates
    // (assumes screen_size equals the texture resolution -- TODO confirm)
    vec2 scale = 1.0 / screen_size.xy;

#ifndef USE_GATHER
    //3x3 blurring, one texture() fetch per tap
    const int size = 1;
    for( int x = -size; x <= size; ++x )
        for( int y = -size; y <= size; ++y )
            res += texture( tex, uv + vec2(x, y) * scale ).y;
#else
    //gather based approach
    //The offsets of textureGatherOffsets must be constant integral expressions,
    //so the arrays are declared const.
    //NOTE: every one of the four offsets selects its own 2x2 footprint and only
    //texel i0j0 of that footprint is returned, so one call is not a single
    //2x2 fetch.
    //2x2 sample
    const ivec2 offsets0[4] = ivec2[4]( ivec2(-1, 0), ivec2(0, 0), ivec2(0, -1), ivec2(-1, -1) );
    //the rest from the sides
    const ivec2 offsets1[4] = ivec2[4]( ivec2(-1, 1), ivec2(0, 1), ivec2(1, 0), ivec2(1, -1) );

    //fetch the y component (component index 1)
    vec4 res0 = textureGatherOffsets( tex, uv, offsets0, 1 );
    vec4 res1 = textureGatherOffsets( tex, uv, offsets1, 1 );
    //the last one (offset 1, 1) is done separately with a regular fetch
    float res2 = texture( tex, uv + vec2(1, 1) * scale ).y;

    res += res0.x + res0.y + res0.z + res0.w + res1.x + res1.y + res1.z + res1.w + res2;
#endif

    // NOTE: the sum is written unnormalized; divide by 9.0 for a true average
    color = vec4(res);
}

can anyone explain this?


#4Yours3!f

Posted 27 September 2013 - 11:39 AM

Hi

 

So I wanted to speed up rendering by using texture gather; however, I got 10 fps less than with traditional texture fetches (78 vs 88).

I expected the opposite...

 

here's the code I used:

#version 430 core

layout(binding=0) uniform sampler2D tex;

uniform vec2 screen_size;

in vec2 uv;

layout(location=0) out vec4 color;

// Writes the sum of the green (.y) channel over the 3x3 texel neighbourhood
// around `uv` to every component of `color`.
//
// Two interchangeable implementations are kept for benchmarking and exactly one
// of them is compiled: #define USE_GATHER (right after the #version line) to
// select the gather path, otherwise the plain texture() loop is used. Only one
// may be active, because both accumulate into the same `res`.
void main()
{
    /*
    texel offsets covered by the kernel:
    -1  1 | 0  1 | 1  1
    -1  0 | 0  0 | 1  0
    -1 -1 | 0 -1 | 1 -1
    */

    float res = 0.0;

    // step between neighbouring taps in normalized coordinates
    // (assumes screen_size equals the texture resolution -- TODO confirm)
    vec2 scale = 1.0 / screen_size.xy;

#ifndef USE_GATHER
    //3x3 blurring, one texture() fetch per tap
    const int size = 1;
    for( int x = -size; x <= size; ++x )
        for( int y = -size; y <= size; ++y )
            res += texture( tex, uv + vec2(x, y) * scale ).y;
#else
    //gather based approach
    //The offsets of textureGatherOffsets must be constant integral expressions,
    //so the arrays are declared const.
    //NOTE: every one of the four offsets selects its own 2x2 footprint and only
    //texel i0j0 of that footprint is returned, so one call is not a single
    //2x2 fetch.
    //2x2 sample
    const ivec2 offsets0[4] = ivec2[4]( ivec2(-1, 0), ivec2(0, 0), ivec2(0, -1), ivec2(-1, -1) );
    //the rest from the sides
    const ivec2 offsets1[4] = ivec2[4]( ivec2(-1, 1), ivec2(0, 1), ivec2(1, 0), ivec2(1, -1) );

    //fetch the y component (component index 1)
    vec4 res0 = textureGatherOffsets( tex, uv, offsets0, 1 );
    vec4 res1 = textureGatherOffsets( tex, uv, offsets1, 1 );
    //the last one (offset 1, 1) is done separately with a regular fetch
    float res2 = texture( tex, uv + vec2(1, 1) * scale ).y;

    res += res0.x + res0.y + res0.z + res0.w + res1.x + res1.y + res1.z + res1.w + res2;
#endif

    // NOTE: the sum is written unnormalized; divide by 9.0 for a true average
    color = vec4(res);
}

can anyone explain this?


#3Yours3!f

Posted 27 September 2013 - 11:38 AM

Hi

 

So I wanted to speed up rendering by using texture gather; however, I got 10 fps less than with traditional texture fetches (78 vs 88).

I expected the opposite...

 

here's the code I used:

// layout(binding=...) needs GLSL 4.20 and textureGatherOffsets needs GLSL 4.00,
// so 330 core is not sufficient for this shader.
#version 430 core

// source texture; only its green (.y) channel is read
layout(binding=0) uniform sampler2D tex;

// used to derive the one-texel step (1.0 / screen_size)
uniform vec2 screen_size;

// interpolated texture coordinate of the current fragment
in vec2 uv;

layout(location=0) out vec4 color;

// Writes the sum of the green (.y) channel over the 3x3 texel neighbourhood
// around `uv` to every component of `color`.
//
// Two interchangeable implementations are kept for benchmarking and exactly one
// of them is compiled: #define USE_GATHER (right after the #version line) to
// select the gather path, otherwise the plain texture() loop is used. Only one
// may be active, because both accumulate into the same `res`.
void main()
{
    /*
    texel offsets covered by the kernel:
    -1  1 | 0  1 | 1  1
    -1  0 | 0  0 | 1  0
    -1 -1 | 0 -1 | 1 -1
    */

    float res = 0.0;

    // step between neighbouring taps in normalized coordinates
    // (assumes screen_size equals the texture resolution -- TODO confirm)
    vec2 scale = 1.0 / screen_size.xy;

#ifndef USE_GATHER
    //3x3 blurring, one texture() fetch per tap
    const int size = 1;
    for( int x = -size; x <= size; ++x )
        for( int y = -size; y <= size; ++y )
            res += texture( tex, uv + vec2(x, y) * scale ).y;
#else
    //gather based approach
    //The offsets of textureGatherOffsets must be constant integral expressions,
    //so the arrays are declared const.
    //NOTE: every one of the four offsets selects its own 2x2 footprint and only
    //texel i0j0 of that footprint is returned, so one call is not a single
    //2x2 fetch.
    //2x2 sample
    const ivec2 offsets0[4] = ivec2[4]( ivec2(-1, 0), ivec2(0, 0), ivec2(0, -1), ivec2(-1, -1) );
    //the rest from the sides
    const ivec2 offsets1[4] = ivec2[4]( ivec2(-1, 1), ivec2(0, 1), ivec2(1, 0), ivec2(1, -1) );

    //fetch the y component (component index 1)
    vec4 res0 = textureGatherOffsets( tex, uv, offsets0, 1 );
    vec4 res1 = textureGatherOffsets( tex, uv, offsets1, 1 );
    //the last one (offset 1, 1) is done separately with a regular fetch
    float res2 = texture( tex, uv + vec2(1, 1) * scale ).y;

    res += res0.x + res0.y + res0.z + res0.w + res1.x + res1.y + res1.z + res1.w + res2;
#endif

    // NOTE: the sum is written unnormalized; divide by 9.0 for a true average
    color = vec4(res);
}

can anyone explain this?


PARTNERS