• Create Account

# A Wild switch() Appears!

11 replies to this topic

### #1BGB  Crossbones+   -  Reputation: 1566

Like
6Likes
Like

Posted 20 November 2013 - 11:46 PM

POPULAR

yeah, switches are nothing new, and can sometimes if left unchecked get pretty big and ugly.

recently, in a piece of code of mine, this particularly glorious example has appeared:

	/* BTIC1C primary block-decode loop (RPZA-derived format).
	   Reads command/data bytes from cs (source, end cse) and writes 8-byte
	   encoded blocks to ct (target, end cte), advancing by 'stride' per block.
	   'mode' selects how data commands are interpreted (0 = raw 16-bit colors,
	   1/2/3 = palette/pattern forms via ctx->pal256 / ctx->pal16 / ctx->pat256);
	   'nextmode' implements one-shot mode prefixes.
	   ret: 0 = keep decoding, 1 = normal end-of-stream, -1 = error
	   (block-copy source out of the valid [blks, cte) range). */
	ret=0; mode=0; nextmode=0;
while((cs<cse) && (ct<cte) && !ret)
{
i=*cs++;
/* NOTE(review): multi-byte operands below (cs[0..3], *cs++) are not
   bounds-checked against cse; a truncated stream could over-read. */
switch(i&0xE0)
{
case 0x80:
/* 0x80..0x9F: skip (i&31)+1 output blocks, leaving them unchanged. */
n1=(i&31)+1;
ct+=n1*stride;
break;
case 0xE0:
/* 0xE0..0xFF: command/escape opcodes. */
switch(i)
{
case 0xE0:
/* in a non-zero mode: reset to mode 0; in mode 0: end of stream. */
if(mode)
{ mode=0; nextmode=0; }
else
{ ret=1; }
break;
case 0xE1:
cs+=3; /* skip a 3-byte payload (contents unused here) */
break;
case 0xE8:
/* repeat the previous block once. */
bgbbtj_rpza_memcpy8(ct, ct-stride);
ct+=stride;
break;
case 0xE9:
/* repeat the previous block N+1 times (8-bit count). */
j=(*cs++)+1;
while(j--)
{
bgbbtj_rpza_memcpy8(ct, ct-stride);
ct+=stride;
}
break;
case 0xEA:
/* copy one block from j blocks back (8-bit distance). */
j=(*cs++)+1;
if((ct-j*stride)<blks) { ret=-1; break; }
bgbbtj_rpza_memcpy8(ct, ct-j*stride);
ct+=stride;
break;
case 0xEB:
/* copy one block from j blocks back (16-bit big-endian distance). */
j=(cs[0]<<8)+cs[1]+1;
cs+=2;
if((ct-j*stride)<blks) { ret=-1; break; }
bgbbtj_rpza_memcpy8(ct, ct-j*stride);
ct+=stride;
break;
case 0xEC:
/* copy a run: j+1 blocks, each taken from k blocks back (8-bit count and
   distance); bounds-checked on both source and destination. */
j=(*cs++)+1;
k=(*cs++)+1;
if((ct+j*stride)>cte) { ret=-1; break; }
if((ct-k*stride)<blks) { ret=-1; break; }
while(j--)
{
bgbbtj_rpza_memcpy8(ct, ct-k*stride);
ct+=stride;
}
break;
case 0xED:
/* copy a run: 8-bit count, 16-bit big-endian distance. */
j=(cs[0])+1;
k=(cs[1]<<8)+cs[2]+1;
cs+=3;

if((ct+j*stride)>cte) { ret=-1; break; }
if((ct-k*stride)<blks) { ret=-1; break; }

while(j--)
{
bgbbtj_rpza_memcpy8(ct, ct-k*stride);
ct+=stride;
}
break;
case 0xF0:
/* 0xF0 followed by 0xF0..0xF3: set a sticky decode mode (consumes the
   second byte); any other following byte: behave as a one-shot mode-0
   prefix for the next data command. */
switch(cs[0])
{
case 0xF0: cs++; mode=0; nextmode=0; break;
case 0xF1: cs++; mode=1; nextmode=1; break;
case 0xF2: cs++; mode=2; nextmode=2; break;
case 0xF3: cs++; mode=3; nextmode=3; break;
default: nextmode=mode; mode=0; break;
}
break;
/* 0xF1..0xF3: one-shot mode for the next data command only
   (restored from nextmode after that command decodes). */
case 0xF1: nextmode=mode; mode=1; break;
case 0xF2: nextmode=mode; mode=2; break;
case 0xF3: nextmode=mode; mode=3; break;

case 0xF8:
/* 0xF8 xx: palette/pattern table management ops. */
switch(*cs++)
{
case 0x81:
/* reset pal256/pal16/pat256 to the built-in default tables. */
for(j=0; j<256; j++)
{ ctx->pal256[j]=rpza_blkidxcolor[j]; }
for(j=0; j<16; j++)
{ ctx->pal16[j]=rpza_blkidxcolor[j]; }
for(j=0; j<256; j++)
{ ctx->pat256[j]=rpza_blkidx_pixpat[j]; }
break;
case 0x82:
/* load pal256[j..k-1] from 16-bit big-endian color values. */
j=*cs++;
k=(*cs++)+1;
for(; j<k; j++)
{
l=(cs[0]<<8)|cs[1];
ctx->pal256[j]=l;
}
break;
case 0x83:
/* load pal16 entries from pal256 indices; start/count packed in one
   nibble byte.
   NOTE(review): the header byte cs[0] is read for j/k but never
   skipped before the index reads below, so the first *cs++ re-reads
   the header -- looks like a missing cs++; confirm against spec. */
j=(cs[0]>>4)&15;
k=(cs[0]&15)+1;
for(; j<k; j++)
{ ctx->pal16[j]=ctx->pal256[*cs++]; }
break;
case 0x84:
/* load pal16 entries from explicit 16-bit big-endian values
   (same unconsumed-header-byte caveat as 0x83 -- confirm). */
j=(cs[0]>>4)&15;
k=(cs[0]&15)+1;
for(; j<k; j++)
{
l=(cs[0]<<8)|cs[1]; cs+=2;
ctx->pal16[j]=l;
}
break;
case 0x85:
/* load pat256[j..k-1] from 32-bit big-endian pixel patterns. */
j=*cs++;
k=(*cs++)+1;
for(; j<k; j++)
{
l=(cs[0]<<24)|(cs[1]<<16)|(cs[2]<<8)|cs[3];
cs+=4;
ctx->pat256[j]=l;
}
break;
}
/* NOTE(review): no break here -- 0xF8 falls through into 'default',
   which only breaks, so behavior is unaffected; an explicit break
   would be clearer. */
default:
break;
}
break;

default:
/* data commands: decode pixel block(s) according to the current mode. */
switch(mode)
{
case 0:
/* mode 0: raw 16-bit colors embedded in the stream. */
switch(i&0xE0)
{
case 0xA0:
/* solid-color run of (i&31)+1 blocks from one 16-bit big-endian color.
   ((j&0x7FE0)<<1)|(j&0x1F) appears to widen a 15-bit color to 16 bits
   by opening a gap bit -- TODO confirm exact pixel format.  The source
   high bit selects an all-0xFF vs all-0 selector half. */
j=(cs[0]<<8)|cs[1];
cs+=2;
l=j;
j=((j&0x7FE0)<<1)|(j&0x1F);

if(l&0x8000)
{
ctb[0]=j&0xFF;
ctb[1]=(j>>8)&0xFF;
ctb[2]=j&0xFF;
ctb[3]=(j>>8)&0xFF;
ctb[4]=0xFF; ctb[5]=0xFF;
ctb[6]=0xFF; ctb[7]=0xFF;
}else
{
ctb[0]=j&0xFF;
ctb[1]=(j>>8)&0xFF;
ctb[2]=j&0xFF;
ctb[3]=(j>>8)&0xFF;
ctb[4]=0; ctb[5]=0;
ctb[6]=0; ctb[7]=0;
}

n1=(i&31)+1;
for(i=0; i<n1; i++)
{
bgbbtj_rpza_memcpy8(ct, ctb);
ct+=stride;
}

break;
case 0xC0:
/* two-color run: two 16-bit colors plus 4 selector bytes per block;
   the color ordering (and l's high bit) picks which selector remap
   table (rpza_blkmap1/rpza_blkmap2) is applied. */
j=(cs[0]<<8)|cs[1];
k=(cs[2]<<8)|cs[3];
cs+=4;
l=k;
j=((j&0x7FE0)<<1)|(j&0x1F);
k=((k&0x7FE0)<<1)|(k&0x1F);

if(l&0x8000)
{
if(j<=k)
{
ctb[0]=j&0xFF;
ctb[1]=(j>>8)&0xFF;
ctb[2]=k&0xFF;
ctb[3]=(k>>8)&0xFF;
csm=rpza_blkmap1;
}else
{
ctb[0]=k&0xFF;
ctb[1]=(k>>8)&0xFF;
ctb[2]=j&0xFF;
ctb[3]=(j>>8)&0xFF;
csm=rpza_blkmap2;
}
}else
{
if(j>k)
{
ctb[0]=j&0xFF;
ctb[1]=(j>>8)&0xFF;
ctb[2]=k&0xFF;
ctb[3]=(k>>8)&0xFF;
csm=rpza_blkmap1;
}else
{
ctb[0]=k&0xFF;
ctb[1]=(k>>8)&0xFF;
ctb[2]=j&0xFF;
ctb[3]=(j>>8)&0xFF;
csm=rpza_blkmap2;
}
}

n1=(i&31)+1;
for(i=0; i<n1; i++)
{
ctb[4]=csm[cs[0]]; ctb[5]=csm[cs[1]];
ctb[6]=csm[cs[2]]; ctb[7]=csm[cs[3]];
cs+=4;
bgbbtj_rpza_memcpy8(ct, ctb);
ct+=stride;
}
break;

default:
/* single block: cs-- re-includes the command byte itself as the first
   color byte. */
if(cs[1]&0x80)
{
cs--;
j=(cs[0]<<8)|cs[1];
k=(cs[2]<<8)|cs[3];
cs+=4;
j=((j&0x7FE0)<<1)|(j&0x1F);
k=((k&0x7FE0)<<1)|(k&0x1F);

if(j>k)
{
ctb[0]=j&0xFF;
ctb[1]=(j>>8)&0xFF;
ctb[2]=k&0xFF;
ctb[3]=(k>>8)&0xFF;
csm=rpza_blkmap1;
}else
{
ctb[0]=k&0xFF;
ctb[1]=(k>>8)&0xFF;
ctb[2]=j&0xFF;
ctb[3]=(j>>8)&0xFF;
csm=rpza_blkmap2;
}

ctb[4]=csm[cs[0]]; ctb[5]=csm[cs[1]];
ctb[6]=csm[cs[2]]; ctb[7]=csm[cs[3]];
cs+=4;
bgbbtj_rpza_memcpy8(ct, ctb);
ct+=stride;
}else
{
/* unhandled form: emit a zero block and skip 31 further source bytes. */
memset(ctb, 0, 8);
//dummy...
cs+=31;
bgbbtj_rpza_memcpy8(ct, ctb); ct+=stride;
}
break;
}
break;
case 1:
/* mode 1: colors via the 8-bit palette (pal256); selector bits expanded
   through rpza_blkidxmap1. */
switch(i&0xE0)
{
case 0xA0:
/* solid-color run from one palette index. */
j=ctx->pal256[*cs++];
j=((j&0x7FE0)<<1)|(j&0x1F);

ctb[0]=j&0xFF;
ctb[1]=(j>>8)&0xFF;
ctb[2]=j&0xFF;
ctb[3]=(j>>8)&0xFF;
ctb[4]=0; ctb[5]=0;
ctb[6]=0; ctb[7]=0;

n1=(i&31)+1;
for(i=0; i<n1; i++)
{
bgbbtj_rpza_memcpy8(ct, ctb);
ct+=stride;
}
break;
case 0xC0:
/* two-color run: two palette indices, then one selector byte pair per
   block expanded via rpza_blkidxmap1. */
j=ctx->pal256[cs[0]];
k=ctx->pal256[cs[1]];
cs+=2;
j=((j&0x7FE0)<<1)|(j&0x1F);
k=((k&0x7FE0)<<1)|(k&0x1F);

ctb[0]=j&0xFF;
ctb[1]=(j>>8)&0xFF;
ctb[2]=k&0xFF;
ctb[3]=(k>>8)&0xFF;
csm=rpza_blkmap1;

n1=(i&31)+1;
for(i=0; i<n1; i++)
{
j=rpza_blkidxmap1[cs[0]];
k=rpza_blkidxmap1[cs[1]];
cs+=2;

ctb[4]=csm[(j>>8)&255]; ctb[5]=csm[j&255];
ctb[6]=csm[(k>>8)&255]; ctb[7]=csm[k&255];
bgbbtj_rpza_memcpy8(ct, ctb);
ct+=stride;
}
break;
default:
/* single block; cs-- re-includes the command byte as the first index. */
cs--;
j=ctx->pal256[cs[0]];
k=ctx->pal256[cs[1]];
cs+=2;
j=((j&0x7FE0)<<1)|(j&0x1F);
k=((k&0x7FE0)<<1)|(k&0x1F);

ctb[0]=j&0xFF;
ctb[1]=(j>>8)&0xFF;
ctb[2]=k&0xFF;
ctb[3]=(k>>8)&0xFF;
csm=rpza_blkmap1;

j=rpza_blkidxmap1[cs[0]];
k=rpza_blkidxmap1[cs[1]];
cs+=2;

ctb[4]=csm[(j>>8)&255]; ctb[5]=csm[j&255];
ctb[6]=csm[(k>>8)&255]; ctb[7]=csm[k&255];
bgbbtj_rpza_memcpy8(ct, ctb);
ct+=stride;

break;
}
break;
case 2:
/* mode 2: colors via the 4-bit palette (pal16), selectors via the
   pattern table (pat256) -- the most compact encoding. */
switch(i&0xE0)
{
case 0xA0:
/* solid-color run from one 4-bit palette index. */
j=ctx->pal16[(*cs++)&15];
j=((j&0x7FE0)<<1)|(j&0x1F);

ctb[0]=j&0xFF;
ctb[1]=(j>>8)&0xFF;
ctb[2]=j&0xFF;
ctb[3]=(j>>8)&0xFF;
ctb[4]=0; ctb[5]=0;
ctb[6]=0; ctb[7]=0;

n1=(i&31)+1;
for(i=0; i<n1; i++)
{
bgbbtj_rpza_memcpy8(ct, ctb);
ct+=stride;
}
break;
case 0xC0:
/* two-color run: both 4-bit indices packed in one byte, then one
   pattern index byte per block expanded via pat256. */
j=ctx->pal16[(cs[0]>>4)&15];
k=ctx->pal16[(cs[0]>>0)&15];
cs++;
j=((j&0x7FE0)<<1)|(j&0x1F);
k=((k&0x7FE0)<<1)|(k&0x1F);

ctb[0]=j&0xFF;
ctb[1]=(j>>8)&0xFF;
ctb[2]=k&0xFF;
ctb[3]=(k>>8)&0xFF;
csm=rpza_blkmap1;

n1=(i&31)+1;
for(i=0; i<n1; i++)
{
j=ctx->pat256[*cs++];
ctb[4]=csm[(j>>24)&255]; ctb[5]=csm[(j>>16)&255];
ctb[6]=csm[(j>> 8)&255]; ctb[7]=csm[(j    )&255];
bgbbtj_rpza_memcpy8(ct, ctb);
ct+=stride;
}
break;
default:
/* single block; cs-- re-includes the command byte as the index pair. */
cs--;
j=ctx->pal16[(cs[0]>>4)&15];
k=ctx->pal16[(cs[0]>>0)&15];
cs++;
j=((j&0x7FE0)<<1)|(j&0x1F);
k=((k&0x7FE0)<<1)|(k&0x1F);

ctb[0]=j&0xFF;
ctb[1]=(j>>8)&0xFF;
ctb[2]=k&0xFF;
ctb[3]=(k>>8)&0xFF;
csm=rpza_blkmap1;

j=ctx->pat256[*cs++];
ctb[4]=csm[(j>>24)&255]; ctb[5]=csm[(j>>16)&255];
ctb[6]=csm[(j>> 8)&255]; ctb[7]=csm[(j    )&255];
bgbbtj_rpza_memcpy8(ct, ctb);
ct+=stride;

break;
}
break;
case 3:
/* mode 3: colors via the 8-bit palette (pal256), selectors via the
   pattern table (pat256). */
switch(i&0xE0)
{
case 0xA0:
/* solid-color run from one palette index. */
j=ctx->pal256[*cs++];
j=((j&0x7FE0)<<1)|(j&0x1F);

ctb[0]=j&0xFF;
ctb[1]=(j>>8)&0xFF;
ctb[2]=j&0xFF;
ctb[3]=(j>>8)&0xFF;
ctb[4]=0; ctb[5]=0;
ctb[6]=0; ctb[7]=0;

n1=(i&31)+1;
for(i=0; i<n1; i++)
{
bgbbtj_rpza_memcpy8(ct, ctb);
ct+=stride;
}
break;
case 0xC0:
/* two-color run: two 8-bit palette indices, then one pattern index byte
   per block expanded via pat256. */
j=ctx->pal256[cs[0]];
k=ctx->pal256[cs[1]];
cs+=2;
j=((j&0x7FE0)<<1)|(j&0x1F);
k=((k&0x7FE0)<<1)|(k&0x1F);

ctb[0]=j&0xFF;
ctb[1]=(j>>8)&0xFF;
ctb[2]=k&0xFF;
ctb[3]=(k>>8)&0xFF;
csm=rpza_blkmap1;

n1=(i&31)+1;
for(i=0; i<n1; i++)
{
j=ctx->pat256[*cs++];
ctb[4]=csm[(j>>24)&255]; ctb[5]=csm[(j>>16)&255];
ctb[6]=csm[(j>> 8)&255]; ctb[7]=csm[(j    )&255];
bgbbtj_rpza_memcpy8(ct, ctb);
ct+=stride;
}
break;
default:
/* single block; cs-- re-includes the command byte as the first index. */
cs--;
j=ctx->pal256[cs[0]];
k=ctx->pal256[cs[1]];
cs+=2;
j=((j&0x7FE0)<<1)|(j&0x1F);
k=((k&0x7FE0)<<1)|(k&0x1F);

ctb[0]=j&0xFF;
ctb[1]=(j>>8)&0xFF;
ctb[2]=k&0xFF;
ctb[3]=(k>>8)&0xFF;
csm=rpza_blkmap1;

j=ctx->pat256[*cs++];
ctb[4]=csm[(j>>24)&255]; ctb[5]=csm[(j>>16)&255];
ctb[6]=csm[(j>> 8)&255]; ctb[7]=csm[(j    )&255];
bgbbtj_rpza_memcpy8(ct, ctb);
ct+=stride;

break;
}
break;

default:
break;
}
mode=nextmode; /* one-shot mode prefixes expire after a single data command */
break;
}
}



context:

this is the primary image decoding loop from one of my video codecs (BTIC1C).

it started out simpler (when the format had a few less features) but is getting a bit more hairy as time goes on.

part of the hair is due to the recent addition of modality in terms of the block-encoding (optional cheaper / lower-quality blocks).

side note: I actually have a small army of specialized codecs at this point, it is actually starting to get a bit silly...

each is slightly different though with different trade-offs.

OTOH, some big switches have appeared elsewhere, but have generally been big flat switches, rather than nested ones.

### #2Nypyren  Crossbones+   -  Reputation: 10520

Like
5Likes
Like

Posted 21 November 2013 - 01:37 AM

POPULAR

The largest mess of switch statements I've ever written was for my x86-64 OpcodeMapDisassembler.

Base level switch statement for the first byte with 255 cases.
-> If the first byte is 0x0F, another 255 case switch for 2-byte opcodes.
---> If the second byte is 0x38 or 0x3A, another pair of case switches (far fewer cases since these maps haven't been filled up by Intel/AMD yet) for 3-byte opcodes.
-> If the first byte is 0xD8 through 0xDF, another 8 sets of 8-case switches for FPU instructions.

Various special-case if/else/switch chains for MMX/SSE discrimination.

Scattered throughout the opcode map are special case instruction groups (16 of them so far) which typically each have switches with between 4 and 8 cases apiece.

Down in the deep bowels, you'll see functions like this to handle the last byte of the opcode (this is for opcodes 0F 70 through 0F 77, which is a very tiny percent of the overall instruction set):

        /// <summary>
        /// Decodes two-byte opcodes 0F 70 through 0F 77 (shuffle/compare/EMMS family).
        /// Dispatches on the second opcode byte (op2) and the active SIMD prefix
        /// (none/66/F3/F2) to pick the MMX, SSE, or VEX form.
        /// Returns true when a valid instruction was produced, false for blank
        /// (undefined) prefix combinations; throws for prefix states that should
        /// be unreachable.
        /// </summary>
        private bool FindOpcode0F_70to77()
{
switch (op2)
{
case 0x70:
/* PSHUFW/VPSHUFD/VPSHUFHW/VPSHUFLW, selected by SIMD prefix. */
if (simdNone)    return Valid(Opcode.PSHUFW, Pq, Qq, Ib); // MMX
else if (simd66) return ValidSV123(Opcode.VPSHUFD, Vdq, Wdq, Ib);
else if (simdF3) return ValidSV123(Opcode.VPSHUFHW, Vdq, Wdq, Ib);
else if (simdF2) return ValidSV123(Opcode.VPSHUFLW, Vdq, Wdq, Ib);
else throw new NotSupportedException();

/* 0F 71..73: opcode-extension groups (shift-immediate forms). */
case 0x71: return Grp12();
case 0x72: return Grp13();
case 0x73: return Grp14();

case 0x74:
if (simdNone)    return Valid(Opcode.PCMPEQB, Pq, Qq); // MMX
else if (simd66) return ValidSV13(Opcode.VPCMPEQB, Vdq, Hdq, Wdq);
else if (simdF3) return false; // blank
else if (simdF2) return false; // blank
else throw new NotSupportedException();

case 0x75:
if (simdNone)    return Valid(Opcode.PCMPEQW, Pq, Qq); // MMX
else if (simd66) return ValidSV13(Opcode.VPCMPEQW, Vdq, Hdq, Wdq);
else if (simdF3) return false; // blank
else if (simdF2) return false; // blank
else throw new NotSupportedException();

case 0x76:
if (simdNone)    return Valid(Opcode.PCMPEQD, Pq, Qq); // MMX
else if (simd66) return ValidSV13(Opcode.VPCMPEQD, Vdq, Hdq, Wdq);
else if (simdF3) return false; // blank
else if (simdF2) return false; // blank
else throw new NotSupportedException();

case 0x77:
/* 0F 77 means EMMS without VEX, VZEROUPPER/VZEROALL with VEX.128/256. */
if (simdNone)
{
if (!vexPresent)
return Valid(Opcode.EMMS);       // 0F 77              EMMS
else if (!vexL)
return Valid(Opcode.VZEROUPPER); // VEX.128.0F.WIG 77  VZEROUPPER
else
return Valid(Opcode.VZEROALL);   // VEX.256.0F.WIG 77  VZEROALL
}
else if (simd66) return false; // blank
else if (simdF3) return false; // blank
else if (simdF2) return false; // blank
else throw new NotSupportedException();
}

/* op2 outside 0x70..0x77: not this decoder's range. */
return false;
}

It's a truly horrifying amount of code.

Edited by Nypyren, 21 November 2013 - 01:40 AM.

### #3BGB  Crossbones+   -  Reputation: 1566

Like
0Likes
Like

Posted 21 November 2013 - 11:08 AM

@Nypyren:

when I had written an x86 disassembler, and later an interpreter/emulator (userland only), I had used the listing tables (basically, the same tables as used by my assembler). the listings were basically held in a text file which was using a notation derived from the Intel docs, which during the build process would be converted into C source files and matching headers via a tool (so parts of the assembler and disassembler/emulator were procedurally generated).

for the interpreter, for speeding up the decoding, I had used a set of index tables (IIRC, 2x 256), where one table would match single-byte instructions, and fall-back tables would match 2-byte forms, followed by table searches for anything that couldn't be matched in 2 bytes (it would have a list of possible matching instructions). this made the code a lot less hairy, but potentially lots of big switches could have been faster.

granted, to make the interpretation faster, I had ended up effectively implementing a CPU cache which would convert "traces" of x86 instructions into threaded code (sequences of calls via function pointers), which was later used by my script-language interpreter (and, my script VM JIT uses a hybrid strategy, mixing direct instruction sequences with calls back into C land, hence part of the reason for its weak performance).

decided to refrain from going too much into the nature and issues with trying to get my script VM to go faster (summary: most of the time anyway goes into the C->script calls, but there is no "good" way to really optimize this short of making the C side VM interface behave more like COM+ or similar, *, ...).

granted, decoding instructions for my script language does itself use some giant switch() blocks (quick check, the "main one" is about 2000 lines, or about 4x longer than the one from BTIC1C), which mostly just serves for unpacking the instruction into a structure and picking the correct function-pointer to execute it (however, I consider it "cleaner" and less "wild" mostly as it is better organized and is a single flat-list switch, vs some unwieldy mess of nested switches implementing a small state-machine).

most of the actual "execution logic" within the C side of the VM then mostly consists of short functions often doing a single operation (such as: adding two integers and storing the result, ...)

*: basically, ability to get C side interfaces for various script-language classes, and then make more direct calls into the compiled script-code, vs calling via variable-argument C functions which end up wasting clock-cycles doing things like copying function-arguments from a va_list based on a signature string, ... as-is, this actually outweighs the cost of the naive JIT output, as most of the script functions / methods are fairly short.

Edited by BGB, 21 November 2013 - 11:09 AM.

### #4Nypyren  Crossbones+   -  Reputation: 10520

Like
0Likes
Like

Posted 21 November 2013 - 12:49 PM

My previous disassembler worked like that - a large text file that I would parse into a data structure that was used to disassemble.  Unfortunately my existing data structure wasn't very well suited to dealing with SSE/VEX prefixes and was very difficult to debug.  So when I was updating my disassembler to support SSE/AVX instructions I decided to change to hardcoded switch statements.  It's much easier to debug overall, and I can use more clever rules:

return ValidSV13(Opcode.VPCMPEQB, Vdq, Hdq, Wdq);

The "ValidSV13" is a special way of dealing with SSE vs. AVX instructions which are almost identical. Most AVX instructions only differ from their SSE counterparts by having one extra operand (a nondestructive output register). However, there are a few different patterns:

(excuse my naming convention - I often called AVX "VEX" in my code instead due to dealing with the VEX prefixes too often)

        /// <summary>
        /// Finalizes a successfully decoded instruction: runs each operand
        /// decoder in order, captures the machine-code bytes consumed so far
        /// from the stream, and stores the result in <c>Instruction</c>.
        /// Always returns true (so call sites can <c>return Valid(...)</c>).
        /// </summary>
        private bool Valid(Opcode opcode, params Func<Operand>[] operandDecoders)
{
AccountForSIMD66(opcode); // If prefix 66 is present but the opcode is SIMD, revert the operandSize to default!

// Operand decoders may read further bytes from the stream; run them in order.
Operand[] operands = new Operand[operandDecoders.Length];
for (int i=0; i<operandDecoders.Length; ++i)
operands[i] = operandDecoders[i]();

// Snapshot the raw bytes consumed up to the current stream position.
var machineCode = stream.ToArray().Take((int)stream.Position).ToArray();

Instruction = new Instruction(addr, machineCode, opcode, operands);

return true;
}

/// <summary>
/// If the 66 prefix is present but the opcode uses it as a SIMD selector
/// (rather than as an operand-size override), clears the prefix (g3) and
/// recomputes the operand sizes back to their defaults.
/// </summary>
private void AccountForSIMD66(Opcode opcode)
{
if (g3 == 0x66 && simd66Opcodes.Contains(opcode))
{
g3 = 0;
ApplyOperandSizePrefixes();
}
}

/// <summary>
/// Valid only outside 64-bit mode; returns false (invalid) when decoding
/// 64-bit code, otherwise defers to <see cref="Valid"/>.
/// </summary>
private bool Valid_i64(Opcode opcode, params Func<Operand>[] operandDecoders)
{
    return !x64 && Valid(opcode, operandDecoders);
}

/// <summary>
/// Valid only in 64-bit mode; returns false (invalid) outside 64-bit mode,
/// otherwise defers to <see cref="Valid"/>.
/// </summary>
private bool Valid_o64(Opcode opcode, params Func<Operand>[] operandDecoders)
{
    return x64 && Valid(opcode, operandDecoders);
}

/// <summary>
/// Valid; in 64-bit mode a default 32-bit operand size is promoted to
/// 64 bits before decoding the operands.
/// </summary>
private bool Valid_d64(Opcode opcode, params Func<Operand>[] operandDecoders)
{
    if (x64)
    {
        // Promote only the 32-bit default; other sizes stay as-is.
        if (operandSize == 32)
            operandSize = 64;
    }

    return Valid(opcode, operandDecoders);
}

/// <summary>
/// Valid; in 64-bit mode the operand size is unconditionally forced to
/// 64 bits before decoding the operands.
/// </summary>
private bool Valid_f64(Opcode opcode, params Func<Operand>[] operandDecoders)
{
    if (x64)
    {
        operandSize = 64;
    }

    return Valid(opcode, operandDecoders);
}

// Naming convention for the following functions:
// SV###.  Each digit indicates the operand that the SSE version takes.  The VEX version takes all arguments.

// For example, SV13 means:
// SV = SSE/VEX, automapping the opcode from VEX opcode to SSE.
// 1 = argument 1 is the first SSE operand.
// 3 = argument 3 is the second SSE operand.

// Functions which take all the same arguments are provided for consistency.

/// <summary>
/// SSE or VEX form, both taking the same two operands.
/// Auto-maps the VEX opcode to its SSE counterpart when no VEX prefix
/// is present.
/// </summary>
private bool ValidSV12(Opcode vex, Func<Operand> v1, Func<Operand> v2)
{
    if (vexPresent)
        return Valid(vex, v1, v2);

    return Valid(VEXtoSSE[vex], v1, v2);
}

/// <summary>
/// SSE or VEX form, both taking the same three operands.
/// Auto-maps the VEX opcode to its SSE counterpart when no VEX prefix
/// is present.
/// </summary>
private bool ValidSV123(Opcode vex, Func<Operand> v1, Func<Operand> v2, Func<Operand> v3)
{
    if (vexPresent)
        return Valid(vex, v1, v2, v3);

    return Valid(VEXtoSSE[vex], v1, v2, v3);
}

/// <summary>
/// SSE form with operands 1 and 3, or VEX form with all three
/// (VEX adds the non-destructive middle operand).
/// Auto-maps the VEX opcode to its SSE counterpart when no VEX prefix
/// is present.
/// </summary>
private bool ValidSV13(Opcode vex, Func<Operand> v1, Func<Operand> v2, Func<Operand> v3)
{
    if (vexPresent)
        return Valid(vex, v1, v2, v3);

    return Valid(VEXtoSSE[vex], v1, v3);
}

/// <summary>
/// SSE form with operands 2 and 3, or VEX form with all three.
/// Auto-maps the VEX opcode to its SSE counterpart when no VEX prefix
/// is present.
/// </summary>
private bool ValidSV23(Opcode vex, Func<Operand> v1, Func<Operand> v2, Func<Operand> v3)
{
    if (vexPresent)
        return Valid(vex, v1, v2, v3);

    return Valid(VEXtoSSE[vex], v2, v3);
}

/// <summary>
/// SSE form with operands 1, 3 and 4, or VEX form with all four.
/// Auto-maps the VEX opcode to its SSE counterpart when no VEX prefix
/// is present.
/// </summary>
private bool ValidSV134(Opcode vex, Func<Operand> v1, Func<Operand> v2, Func<Operand> v3, Func<Operand> v4)
{
    if (vexPresent)
        return Valid(vex, v1, v2, v3, v4);

    return Valid(VEXtoSSE[vex], v1, v3, v4);
}


### #5Madhed  Crossbones+   -  Reputation: 4089

Like
0Likes
Like

Posted 21 November 2013 - 01:13 PM

Wow that looks very much like my first attempt at writing a c64 emulator.

### #6BGB  Crossbones+   -  Reputation: 1566

Like
0Likes
Like

Posted 21 November 2013 - 05:30 PM

@Nypyren:

I still use the listing-based strategy with SSE, XOP, and AVX stuff, but at one point did have to tweak things to allow opcodes with more than 3 operands (IIRC, I increased the limit to 7);

also had to extend the notation some to allow representing the rules for the added prefixes (there was a little hair WRT these prefixes, and neither Intel nor AMD stuck with a straightforward plain-ASCII notation, so another notation was devised). actually, the listing notation also includes special characters for where the various prefixes go, ...

my interpreter didn't use SSE, FWIW, but was limited to a more 486-like subset, namely basic integer operations and the x87 FPU (emulated mostly via 64-bit integer math).

FWIW: at this point, this is the only really free / unencumbered form of x86, as having MMX or SSE support or similar risks stepping on patents (so, for example, a CPU vendor could freely make a chip using a 486 like version of the ISA, but would have to license things from Intel and AMD if they wanted things like SSE and x86-64).

I think I had planned on adding emulated MMX and SSE, but didn't get to it though, and the interpreter turned out to be "nowhere near as useful as originally imagined". mostly at the time I had figured x86 could be used as an IL, but then I came to realize: x86 doesn't really make for a good IL.

### #7Nypyren  Crossbones+   -  Reputation: 10520

Like
0Likes
Like

Posted 21 November 2013 - 05:53 PM

...x86 doesn't really make for a good IL.

I wholeheartedly agree. The x86 encoding's main redeeming feature over common RISC encodings is that the flexible encoding length allows for much more space-efficient code (doing MOVs are typically 2 bytes, immediate operands don't require two separate hi/lo instructions, common stuff like push/pop are encoded in 1 byte). But for all other considerations it's WAY too complicated.

My decompiler uses an IL with 15 instructions (5 of which are for the SCAS/MOVS/etc instructions since those are almost always part of a strlen/strcpy/memcpy function that got inlined and are much easier to detect if I don't convert them to full loops in IL).

### #8BGB  Crossbones+   -  Reputation: 1566

Like
0Likes
Like

Posted 21 November 2013 - 08:56 PM

...x86 doesn't really make for a good IL.

I wholeheartedly agree. The x86 encoding's main redeeming feature over common RISC encodings is that the flexible encoding length allows for much more space-efficient code (doing MOVs are typically 2 bytes, immediate operands don't require two separate hi/lo instructions, common stuff like push/pop are encoded in 1 byte). But for all other considerations it's WAY too complicated.

My decompiler uses an IL with 15 instructions (5 of which are for the SCAS/MOVS/etc instructions since those are almost always part of a strlen/strcpy/memcpy function that got inlined and are much easier to detect if I don't convert them to full loops in IL).

the main problem I had with trying to use x86 as an IL (for an interpreter) was that it tends to get too many architecture details tied up with how the instructions work, so it is pretty much "nearly impossible" to effectively gloss over things like what the pointer-size is or whether registers are naturally 32 or 64 bits, ... while still keeping it binary compatible with native x86.

the result then ends up looking like a sub-program operating within a main program with needing contortions to get data or calls into or out-of the interpreter, which wasn't really worthwhile. it could work maybe, but compared poorly with just using a script language compiled to a bytecode (and leaving most other things to native code).

meanwhile, a bytecode then can be designed to gloss over a lot of architectural details (like how big the native-size pointer or integer is, how exactly many values are represented, ...).

most of my bytecode formats have generally been stack-based (for example, current script-VM bytecode is a statically-typed stack-machine). it has a lot of instructions, but is in some ways "rather CISC".

I have a few times considered trying to move over to 3AC (*), but the level of work involved (would need to re-implement much of the VM core) has thus far prevented me from doing so.

some early tests (with a small prototype interpreter loop and hand-crafted instruction sequences) had shown some rather promising looking performance statistics (I was getting within 9x of C just with just an interpreter). my estimates are that a JIT for it could likely produce C competitive performance.

besides the effort thus far killing off most efforts to make the move, there has been conflict over the best way to get from the ASTs to 3AC (whether to first compile to the existing bytecode, then convert this to 3AC and run it through an optimizer, or to write a new upper-end and compile directly from the ASTs to 3AC), ...

*: sort of like a RISC but with variable-length instructions.

most of the 3AC bytecode wouldn't be all that notable.

sort of gives the basic idea:

the logical representation of the instructions was in an ASM-like format (*2), but the actual instructions generally consisted of a prefix+opcode pair followed by any operands (the prefix would encode the value-type and operand layout, with the opcode saying which logical operation to perform).

*2:

opcode.type dest, srca, srcb

opcode.type dest, src, immed

...

### #9wodinoneeye  Members   -  Reputation: 1629

Like
0Likes
Like

Posted 09 December 2013 - 01:28 PM

OP

cant help much - irregular logic code is just irregular...

Possibly use a C macro for the stuffing of the   ctb[0]..[3] to reduce it to one line      and  another for ctb[4] ..[7]

--------------------------------------------Ratings are Opinion, not Fact

### #10BGB  Crossbones+   -  Reputation: 1566

Like
0Likes
Like

Posted 09 December 2013 - 03:27 PM

OP

cant help much - irregular logic code is just irregular...

Possibly use a C macro for the stuffing of the   ctb[0]..[3] to reduce it to one line      and  another for ctb[4] ..[7]

yeah, it is possible...

a lot of code is written fairly quickly and so tends to develop repeating patterns and some verbosity.

some things are left expanded out in case I see a way to make them faster, where putting things in macros can sometimes hide potential optimizations, or sometimes make some issues for looking at and stepping through stuff in the debugger.

though, for example, "*_memcpy8()" is actually a macro in this case (chooses a target-specific mechanism to copy 8 bytes).

### #11wodinoneeye  Members   -  Reputation: 1629

Like
0Likes
Like

Posted 10 December 2013 - 08:34 AM

OP

cant help much - irregular logic code is just irregular...

Possibly use a C macro for the stuffing of the   ctb[0]..[3] to reduce it to one line      and  another for ctb[4] ..[7]

yeah, it is possible...

a lot of code is written fairly quickly and so tends to develop repeating patterns and some verbosity.

some things are left expanded out in case I see a way to make them faster, where putting things in macros can sometimes hide potential optimizations, or sometimes make some issues for looking at and stepping through stuff in the debugger.

though, for example, "*_memcpy8()" is actually a macro in this case (chooses a target-specific mechanism to copy 8 bytes).

I looked at it for a while looking for processing commonality and that was the closest/easiest/clear thing just to shrink the long code length down  a bit.

Those 2 macros I mention seemed to be simple patterns reused more than a few times, and they can just stuff 4 given parameters (the assignment equations) into the 0,1,2,3 and 4,5,6,7 offsets of 4 consecutive ctb[] array slots - it could shrink the function length by a significant amount. (I usually use ALL uppercase for macro names to make them stand out clearly, but that's just my style.) I'd make the macro name short so you won't have the params running off the right side (I also indent only 3 blanks so I can have quite long 1-line macro param sets fit).

The other thing I notice (I think I've done something like this code at one time) is to document the mode data formats (words, bytes, bits) and command codes right in with the code at the spots that carry out each variation (also maybe name the encoding variations to make it more mnemonic).

Also, I'm not sure of the effects on flow clarity, but those 'modes' --- can they be broken out into a separate function per mode to get rid of some of the nesting?

Again, its quite irregular  and very few common processing variations to not just spell most of it out linear like that (I dont really see that going away)

--------------------------------------------Ratings are Opinion, not Fact

### #12BGB  Crossbones+   -  Reputation: 1566

Like
0Likes
Like

Posted 10 December 2013 - 02:41 PM

OP

cant help much - irregular logic code is just irregular...

Possibly use a C macro for the stuffing of the   ctb[0]..[3] to reduce it to one line      and  another for ctb[4] ..[7]

yeah, it is possible...

a lot of code is written fairly quickly and so tends to develop repeating patterns and some verbosity.

some things are left expanded out in case I see a way to make them faster, where putting things in macros can sometimes hide potential optimizations, or sometimes make some issues for looking at and stepping through stuff in the debugger.

though, for example, "*_memcpy8()" is actually a macro in this case (chooses a target-specific mechanism to copy 8 bytes).

I looked at it for a while looking for processing commonality and that was the closest/easiest/clear thing just to shrink the long code length down  a bit.

Those 2 macros I mention seemed to be simple patterns reused more than a few times, and they can just stuff 4 given parameters (the assignment equations) into the 0,1,2,3 and 4,5,6,7 offsets of 4 consecutive ctb[] array slots - it could shrink the function length by a significant amount. (I usually use ALL uppercase for macro names to make them stand out clearly, but that's just my style.) I'd make the macro name short so you won't have the params running off the right side (I also indent only 3 blanks so I can have quite long 1-line macro param sets fit).

it is possible, I may consider it.

many of them are for things like filling in word-values, which could be done in macros without obscuring the behavior.

macros are usually, but not always, uppercase. in this case it is lowercase because it directly replaced the use of memcpy calls, mostly as early on in tests these memcpy calls were eating a lot of time, and MSVC can be a little "naive" about things like this sometimes.

sometimes whether or not the macro is uppercase depends on whether or not it is intended to be treated mostly as if it were a normal function call, but making function names all lower-case is also special in my conventions, as normal function names are FirstLettersCapital.

The other thing I notice (I think I've done something like this code at one time) is to document the mode data formats (words, bytes, bits) and command codes right in with the code at the spots that carry out each variation (also maybe name the encoding variations to make it more mnemonic).

Also, I'm not sure of the effects on flow clarity, but those 'modes' --- can they be broken out into a separate function per mode to get rid of some of the nesting?

Again, its quite irregular  and very few common processing variations to not just spell most of it out linear like that (I dont really see that going away)

the basic format is documented on my wiki:

http://cr88192.dyndns.org:8080/wiki/index.php/BTIC1C

I may consider reworking the organization of the loop some anyways mostly as I had recently been thinking that some sort of motion compensation is needed, as basically a sort of skip+translate thing, which will require some modest changes.

what the modes basically do is allow for more compact block encodings to be used, at the cost of some color accuracy.

for example, we can have a mode where 8 and 4 bit palette based colors are used instead of RGB555, and where 1bpp and 0.5bpp interpolation is used (in the 0.5bpp interpolation mode, blocks are only represented "approximately" via tables of block-patterns).

effectively, they change the meaning and format of the commands which follow them, either for a single command, or until the next time the mode is changed.

I had considered breaking it up into multiple functions a few times, but had put it off previously as that would have made the code a little longer and also require moving some more of the state into the context structure or similar (as opposed to keeping it in local variables).

I am also idly thinking about something, options between:

implement a new codec (I am calling BTIC1E) to deal with BPTC (BC6H and BC7), which could offer high-speed and high quality, but would likely use a high bitrate;

switch to a more generic format for BC6H and BC7 support (using real-time encoding, possibly using BTIC2C, would cost in terms of performance and image quality);

newer (possibly hacky) special BTIC1C decoder loop to target them (probably a subset, though this is unlikely to offer a whole lot over DXT);

mostly not bothering with BC6H and BC7 for video (I have been using them some for textures, though my BPTC encoder currently mostly ignores block-modes which use partitions, *).

*: I am not sure if there are any good algorithms for dealing with partitioned block modes.

it appears as if it would be necessary to iterate through all of the possible partitions to find a good one for a given block, which is likely to be slow.

there have been a few ideas though for a more direct (partially DCT and lookup-table based) strategy, which could be faster, but is likely to still be a bit slow for use in real-time or load-time conversion.

Edited by BGB, 10 December 2013 - 03:00 PM.

PARTNERS