Jump to content
  • Advertisement
Sign in to follow this  
thedoctor78

MMX to SSE3 update

This topic is 2774 days old, which is more than the 365-day threshold we allow for new replies. Please post a new topic.

If you intended to correct an error in the post then please contact us.

Recommended Posts

Hi guys,

I need to update some old texturing code, which was optimised for MMX, to use SSE3 instructions. It's just some blending procedures for the terrain texture. I am a bit rusty on the new instructions; could anyone help me out with where I should start?



for(int y=ey-sy; y; bip+=nYStep, dp+=ostep, y--)
{
int* ip = bip;
poke.XStart(sx);
const __m64 ddwZero = _m_from_int( 0 );
for(int x=width>>2; x; --x)
{
int t1 = *ip+((poke.DDA())&ALPHAMASK);
ip += nXStep;
poke.XInc();

DWORD dwCol1 = dp[0]; // VERY IMPORTANT: the alphas are actually 8-bit signed!!
int d1=(t1>>TEXTURE_BLEND_SHIFT)-((int)dwCol1>>TEXTURE_BLEND_SHIFT);

__m64 ddwOld1 = _m_from_int( dwCol1 );

if ( d1 > 0 )
{
if ( d1 >= 8 )
{
dp[0] = t1;
}
else
{
__m64 ddwNew1 = _m_from_int( t1 );
ddwOld1 = _mm_unpacklo_pi8( ddwOld1, ddwZero );
ddwNew1 = _mm_unpacklo_pi8( ddwNew1, ddwZero );
__m64 ddwMask = g_ddwMasks2[ d1 ];
ddwNew1 = _mm_sub_pi16( ddwNew1, ddwOld1 );
ddwNew1 = _mm_mullo_pi16( ddwNew1, ddwMask );
ddwNew1 = _mm_srai_pi16( ddwNew1, 3 );
ddwNew1 = _mm_adds_pi16( ddwNew1, ddwOld1 );
ddwNew1 = _m_packuswb( ddwNew1, ddwZero );
dp[0] = _m_to_int( ddwNew1 );
}
}
++dp;


t1 = *ip+((poke.DDA())&ALPHAMASK);
ip += nXStep;
poke.XInc();

dwCol1 = dp[0]; // VERY IMPORTANT: the alphas are actually 8-bit signed!!
d1=(t1>>TEXTURE_BLEND_SHIFT)-((int)dwCol1>>TEXTURE_BLEND_SHIFT);

ddwOld1 = _m_from_int( dwCol1 );

if ( d1 > 0 )
{
if ( d1 >= 8 )
{
dp[0] = t1;
}
else
{
__m64 ddwNew1 = _m_from_int( t1 );
ddwOld1 = _mm_unpacklo_pi8( ddwOld1, ddwZero );
ddwNew1 = _mm_unpacklo_pi8( ddwNew1, ddwZero );
__m64 ddwMask = g_ddwMasks2[ d1 ];
ddwNew1 = _mm_sub_pi16( ddwNew1, ddwOld1 );
ddwNew1 = _mm_mullo_pi16( ddwNew1, ddwMask );
ddwNew1 = _mm_srai_pi16( ddwNew1, 3 );
ddwNew1 = _mm_adds_pi16( ddwNew1, ddwOld1 );
ddwNew1 = _m_packuswb( ddwNew1, ddwZero );
dp[0] = _m_to_int( ddwNew1 );
}
}
++dp;


t1 = *ip+((poke.DDA())&ALPHAMASK);
ip += nXStep;
poke.XInc();

dwCol1 = dp[0]; // VERY IMPORTANT: the alphas are actually 8-bit signed!!
d1=(t1>>TEXTURE_BLEND_SHIFT)-((int)dwCol1>>TEXTURE_BLEND_SHIFT);

ddwOld1 = _m_from_int( dwCol1 );

if ( d1 > 0 )
{
if ( d1 >= 8 )
{
dp[0] = t1;
}
else
{
__m64 ddwNew1 = _m_from_int( t1 );
ddwOld1 = _mm_unpacklo_pi8( ddwOld1, ddwZero );
ddwNew1 = _mm_unpacklo_pi8( ddwNew1, ddwZero );
__m64 ddwMask = g_ddwMasks2[ d1 ];
ddwNew1 = _mm_sub_pi16( ddwNew1, ddwOld1 );
ddwNew1 = _mm_mullo_pi16( ddwNew1, ddwMask );
ddwNew1 = _mm_srai_pi16( ddwNew1, 3 );
ddwNew1 = _mm_adds_pi16( ddwNew1, ddwOld1 );
ddwNew1 = _m_packuswb( ddwNew1, ddwZero );
dp[0] = _m_to_int( ddwNew1 );
}
}
++dp;


t1 = *ip+((poke.DDA())&ALPHAMASK);
ip += nXStep;
poke.XInc();

dwCol1 = dp[0]; // VERY IMPORTANT: the alphas are actually 8-bit signed!!
d1=(t1>>TEXTURE_BLEND_SHIFT)-((int)dwCol1>>TEXTURE_BLEND_SHIFT);

ddwOld1 = _m_from_int( dwCol1 );

if ( d1 > 0 )
{
if ( d1 >= 8 )
{
dp[0] = t1;
}
else
{
__m64 ddwNew1 = _m_from_int( t1 );
ddwOld1 = _mm_unpacklo_pi8( ddwOld1, ddwZero );
ddwNew1 = _mm_unpacklo_pi8( ddwNew1, ddwZero );
__m64 ddwMask = g_ddwMasks2[ d1 ];
ddwNew1 = _mm_sub_pi16( ddwNew1, ddwOld1 );
ddwNew1 = _mm_mullo_pi16( ddwNew1, ddwMask );
ddwNew1 = _mm_srai_pi16( ddwNew1, 3 );
ddwNew1 = _mm_adds_pi16( ddwNew1, ddwOld1 );
ddwNew1 = _m_packuswb( ddwNew1, ddwZero );
dp[0] = _m_to_int( ddwNew1 );
}
}
++dp;

}
if ( width & 2 )
{
int t1 = *ip+((poke.DDA())&ALPHAMASK);
ip += nXStep;
poke.XInc();

DWORD dwCol1 = dp[0]; // VERY IMPORTANT: the alphas are actually 8-bit signed!!
int d1=(t1>>TEXTURE_BLEND_SHIFT)-((int)dwCol1>>TEXTURE_BLEND_SHIFT);

__m64 ddwOld1 = _m_from_int( dwCol1 );

if ( d1 > 0 )
{
if ( d1 >= 8 )
{
dp[0] = t1;
}
else
{
__m64 ddwNew1 = _m_from_int( t1 );
ddwOld1 = _mm_unpacklo_pi8( ddwOld1, ddwZero );
ddwNew1 = _mm_unpacklo_pi8( ddwNew1, ddwZero );
__m64 ddwMask = g_ddwMasks2[ d1 ];
ddwNew1 = _mm_sub_pi16( ddwNew1, ddwOld1 );
ddwNew1 = _mm_mullo_pi16( ddwNew1, ddwMask );
ddwNew1 = _mm_srai_pi16( ddwNew1, 3 );
ddwNew1 = _mm_adds_pi16( ddwNew1, ddwOld1 );
ddwNew1 = _m_packuswb( ddwNew1, ddwZero );
dp[0] = _m_to_int( ddwNew1 );
}
}
++dp;



t1 = *ip+((poke.DDA())&ALPHAMASK);
ip += nXStep;
poke.XInc();

dwCol1 = dp[0]; // VERY IMPORTANT: the alphas are actually 8-bit signed!!
d1=(t1>>TEXTURE_BLEND_SHIFT)-((int)dwCol1>>TEXTURE_BLEND_SHIFT);

ddwOld1 = _m_from_int( dwCol1 );

if ( d1 > 0 )
{
if ( d1 >= 8 )
{
dp[0] = t1;
}
else
{
__m64 ddwNew1 = _m_from_int( t1 );
ddwOld1 = _mm_unpacklo_pi8( ddwOld1, ddwZero );
ddwNew1 = _mm_unpacklo_pi8( ddwNew1, ddwZero );
__m64 ddwMask = g_ddwMasks2[ d1 ];
ddwNew1 = _mm_sub_pi16( ddwNew1, ddwOld1 );
ddwNew1 = _mm_mullo_pi16( ddwNew1, ddwMask );
ddwNew1 = _mm_srai_pi16( ddwNew1, 3 );
ddwNew1 = _mm_adds_pi16( ddwNew1, ddwOld1 );
ddwNew1 = _m_packuswb( ddwNew1, ddwZero );
dp[0] = _m_to_int( ddwNew1 );
}
}
++dp;
}
if ( width & 1 )
{
int t1 = *ip+((poke.DDA())&ALPHAMASK);
ip += nXStep;
poke.XInc();

DWORD dwCol1 = dp[0]; // VERY IMPORTANT: the alphas are actually 8-bit signed!!
int d1=(t1>>TEXTURE_BLEND_SHIFT)-((int)dwCol1>>TEXTURE_BLEND_SHIFT);

__m64 ddwOld1 = _m_from_int( dwCol1 );

if ( d1 > 0 )
{
if ( d1 >= 8 )
{
dp[0] = t1;
}
else
{
__m64 ddwNew1 = _m_from_int( t1 );
ddwOld1 = _mm_unpacklo_pi8( ddwOld1, ddwZero );
ddwNew1 = _mm_unpacklo_pi8( ddwNew1, ddwZero );
__m64 ddwMask = g_ddwMasks2[ d1 ];
ddwNew1 = _mm_sub_pi16( ddwNew1, ddwOld1 );
ddwNew1 = _mm_mullo_pi16( ddwNew1, ddwMask );
ddwNew1 = _mm_srai_pi16( ddwNew1, 3 );
ddwNew1 = _mm_adds_pi16( ddwNew1, ddwOld1 );
ddwNew1 = _m_packuswb( ddwNew1, ddwZero );
dp[0] = _m_to_int( ddwNew1 );
}
}
++dp;
}
poke.YInc();
}
}
dp = op;
_mm_empty();
// TIMER_END(MixTex)

// TIMER_START(Shade)
cDDA shade( ((int)LANDSCAPE.Shade(mapx,mapy))<<24,
((int)LANDSCAPE.Shade(mapx+1,mapy))<<24,
((int)LANDSCAPE.Shade(mapx,mapy+1))<<24,
((int)LANDSCAPE.Shade(mapx+1,mapy+1))<<24, mCellShift );
shade.YStart(sy);
for(int y=ey-sy; y; dp+=ostep, y--)
{
shade.XStart(sx);
const __m64 ddwZero = _m_from_int( 0 );
for(int x=width >> 2; x; --x)
{
__m64 rgba1 = *LTM.Amb2Sun(shade.DDA()>>24);
shade.XInc();
__m64 rgba2 = *LTM.Amb2Sun(shade.DDA()>>24);
shade.XInc();

__m64 ddwOld1 = _m_from_int( dp[0] );
__m64 ddwOld2 = _m_from_int( dp[1] );
ddwOld1 = _mm_unpacklo_pi8( ddwOld1, ddwZero );
ddwOld1 = _mm_slli_pi16( ddwOld1, 2 );
ddwOld1 = _mm_mulhi_pi16( ddwOld1, rgba1 );
ddwOld1 = _m_packuswb( ddwOld1, ddwZero );
ddwOld2 = _mm_unpacklo_pi8( ddwOld2, ddwZero );
ddwOld2 = _mm_slli_pi16( ddwOld2, 2 );
ddwOld2 = _mm_mulhi_pi16( ddwOld2, rgba2 );
ddwOld2 = _m_packuswb( ddwOld2, ddwZero );
dp[0] = _m_to_int( ddwOld1 );
dp[1] = _m_to_int( ddwOld2 );
dp += 2;


rgba1 = *LTM.Amb2Sun(shade.DDA()>>24);
shade.XInc();
rgba2 = *LTM.Amb2Sun(shade.DDA()>>24);
shade.XInc();

ddwOld1 = _m_from_int( dp[0] );
ddwOld2 = _m_from_int( dp[1] );
ddwOld1 = _mm_unpacklo_pi8( ddwOld1, ddwZero );
ddwOld2 = _mm_unpacklo_pi8( ddwOld2, ddwZero );
ddwOld1 = _mm_slli_pi16( ddwOld1, 2 );
ddwOld2 = _mm_slli_pi16( ddwOld2, 2 );
ddwOld1 = _mm_mulhi_pi16( ddwOld1, rgba1 );
ddwOld2 = _mm_mulhi_pi16( ddwOld2, rgba2 );
ddwOld1 = _m_packuswb( ddwOld1, ddwZero );
ddwOld2 = _m_packuswb( ddwOld2, ddwZero );
dp[0] = _m_to_int( ddwOld1 );
dp[1] = _m_to_int( ddwOld2 );
dp += 2;
}
if ( width & 2 )
{
__m64 rgba1 = *LTM.Amb2Sun(shade.DDA()>>24);
shade.XInc();
__m64 rgba2 = *LTM.Amb2Sun(shade.DDA()>>24);
shade.XInc();

__m64 ddwOld1 = _m_from_int( dp[0] );
__m64 ddwOld2 = _m_from_int( dp[1] );
ddwOld1 = _mm_unpacklo_pi8( ddwOld1, ddwZero );
ddwOld2 = _mm_unpacklo_pi8( ddwOld2, ddwZero );
ddwOld1 = _mm_slli_pi16( ddwOld1, 2 );
ddwOld2 = _mm_slli_pi16( ddwOld2, 2 );
ddwOld1 = _mm_mulhi_pi16( ddwOld1, rgba1 );
ddwOld2 = _mm_mulhi_pi16( ddwOld2, rgba2 );
ddwOld1 = _m_packuswb( ddwOld1, ddwZero );
ddwOld2 = _m_packuswb( ddwOld2, ddwZero );
dp[0] = _m_to_int( ddwOld1 );
dp[1] = _m_to_int( ddwOld2 );
dp += 2;
}
if ( width & 1 )
{
__m64 rgba = *LTM.Amb2Sun(shade.DDA()>>24);
shade.XInc();

__m64 ddwOld = _m_from_int( dp[0] );
ddwOld = _mm_unpacklo_pi8( ddwOld, ddwZero );
ddwOld = _mm_slli_pi16( ddwOld, 2 );
ddwOld = _mm_mulhi_pi16( ddwOld, rgba );
ddwOld = _m_packuswb( ddwOld, ddwZero );
dp[0] = _m_to_int( ddwOld );
++dp;
}
shade.YInc();
}
_mm_empty();
// TIMER_END(Shade)
TIMER_END(DrawCellFlexi_MMX)
}



Share this post


Link to post
Share on other sites
Advertisement
Sign in to follow this  

  • Advertisement
×

Important Information

By using GameDev.net, you agree to our community Guidelines, Terms of Use, and Privacy Policy.

Participate in the game development conversation and more when you create an account on GameDev.net!

Sign me up!