Hello again

I think I solved my problem. I just wrote helper functions for converting from 2D UV to 3D UVW and back, and that did the trick.

This is my code. It's a compute shader, since I mostly work with CS only; it's faster than a vertex shader + pixel shader pass (rendering a full-screen quad, etc.).

// --- Inputs (shader resource views) ---
// Source image; read one texel per thread in main().
Texture2D<float4>   backbuffer    : register(t0);
// Colour lookup table addressed through To2D().
Texture2D<float4>   palette       : register(t1);

// --- Output (unordered access view) ---
// Destination image; main() writes one texel per thread.
RWTexture2D<float4> backbufferOut : register(u0);
// System-generated values handed to the compute shader entry point.
struct CSInput
{
    uint3 groupID  : SV_GroupID;          // index of the thread group within the dispatch
    uint3 threadID : SV_DispatchThreadID; // global thread index; main() uses .xy as the pixel coordinate
};
// Converts a 2D texel coordinate in the flattened lookup texture into a 3D
// lattice coordinate (red, green, blue).
//
// The horizontal axis is treated as a strip of 16-texel-wide slices:
//   blue  = which slice the texel is in      (uv.x / 16)
//   red   = horizontal position inside it    (uv.x % 16)
//   green = vertical position                (uv.y)
//
// This is the inverse of To2D() for red values in the range 0..15.
uint3 To3D( uint2 uv )
{
    // Stay in integer arithmetic: dividing by 16.0 and flooring forces a
    // uint -> float -> uint round trip that is slower and not exact once the
    // value no longer fits in a float mantissa.
    uint blue = uv.x / 16;
    uint red  = uv.x % 16;
    uint green = uv.y;
    return uint3( red, green, blue );
}
// Flattens a 3D lattice coordinate into a texel coordinate of the 2D lookup
// texture: the z component selects a 16-texel-wide slice along the horizontal
// axis, x is the offset inside that slice, and y maps straight to the row.
uint2 To2D( uint3 uvw )
{
    return uint2( uvw.z * 16 + uvw.x, uvw.y );
}
// Colour-grades one pixel of `backbuffer` through the 16x16x16 lookup table
// stored in `palette` and writes the result to `backbufferOut`.
//
// Each thread handles the pixel at input.threadID.xy. The source colour is
// quantised to 0..255 per channel, split into a lattice cell (value / 16) and
// a fraction inside that cell (value % 16), and the output is blended per
// channel between the table entry at the lower lattice corner and the entry
// at the upper lattice corner. Alpha is always written as 1.
//
// NOTE(review): only the two diagonal corners of the lattice cell are sampled
// (2 loads), not all 8 corners as full trilinear filtering would. That is
// cheaper but presumably only exact when each output channel of the table
// depends on its own input channel alone - confirm against the tables in use.
[numthreads(16, 16, 1)]
void main( CSInput input )
{
    const uint lutSize = 16;           // lattice points per axis
    const uint lutLast = lutSize - 1;  // highest valid lattice index

    const uint2  pixel    = input.threadID.xy;
    const float3 srcColor = backbuffer.Load( uint3( pixel, 0 ) ).xyz;

    // Quantise to 0..255.
    //  - saturate() keeps out-of-range input from producing lattice indices
    //    outside the table.
    //  - +0.5 rounds to nearest; a bare float->uint cast truncates, so a
    //    channel that should be level k could end up as k-1.
    const uint3 level = uint3( saturate( srcColor ) * 255.0 + 0.5 );

    // Lower corner: integer floor(level / 16), range 0..15.
    const uint3 cellLow = level / lutSize;

    // Upper corner: integer ceil(level / 16), clamped to the last lattice
    // point so the top cell (levels 240..255) never reads past the table.
    const uint3 cellHigh = min( ( level + lutLast ) / lutSize,
                                uint3( lutLast, lutLast, lutLast ) );

    // Fraction of the way from the lower to the upper corner. Multiplying by
    // (cellHigh - cellLow), which is 0 or 1 per channel, zeroes the weight for
    // channels whose upper corner was clamped or coincides with the lower one,
    // so those channels take the lower-corner value unchanged.
    const float3 weight = float3( level % lutSize ) / float( lutSize )
                        * float3( cellHigh - cellLow );

    const float3 lowColor  = palette.Load( uint3( To2D( cellLow ),  0 ) ).xyz;
    const float3 highColor = palette.Load( uint3( To2D( cellHigh ), 0 ) ).xyz;

    // Per-channel blend between the two taps.
    backbufferOut[pixel] = float4( lerp( lowColor, highColor, weight ), 1.0 );
}

If anyone has any ideas on how to optimize it (if that's possible), I'd be happy to hear them. Thanks.