I just wrote an SSE alpha blending routine which blends 16 byte pairs in parallel (drawback: it uses the same alpha value for all 16 pairs), and I wanted to share it with you because its performance seems quite sweet to me and I would like to hear your opinions on it. I wrote a small single-threaded program that blends approximately 2 billion byte pairs per second on my 2.66 GHz Core 2 Duo (actually, the same 16 pairs * 125000000 times...). It uses the SSE PAVGB instruction. This is actually the first assembly routine I wrote that does something "useful" :P
-EDIT: see below for improved versions
The algorithm used for each byte is the following:
// Reference (scalar) algorithm for one byte lane: the running result is
// repeatedly averaged with one of two source bytes, selected by successive
// bits of the alpha value `a`, least-significant bit first.  Each step is a
// rounded average, (x + y + 1) >> 1 — exactly what the SSE PAVGB
// instruction computes per byte — so 8 chained PAVGBs implement an 8-bit
// alpha blend.
// NOTE(review): as posted, `a` and in[] are read uninitialized; this
// fragment is illustrative only, not runnable as-is.
uint8_t in[2];
uint16_t out = 0;
uint8_t a;
for(int32_t i = 0; i < 8; i++)
{
// out = (out + in[bit] + 1) >> 1  — rounded average with the selected byte
out += (uint16_t)in[(a >> i) & 1] + 1;
out >>= 1;
}
#include <iostream>
#include <ctime>
#include <cstdint>
// Two 16-byte blend operands back to back: the offset computed from each
// alpha bit (0 or 16) selects one half.  alignas(16) is required because
// PAVGB with a legacy-SSE memory operand ([in + r15]) demands a 16-byte
// aligned m128; a plain global array carries no such guarantee.
alignas(16) uint8_t in[32] = {1, 2, 4, 8, 16, 32, 64, 128, 0, 0, 0, 0, 0, 0, 0, 0,
                              0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
// Blend destination/accumulator, one byte per SIMD lane (loaded with movdqu,
// so alignment is optional here, but it costs nothing to provide it).
alignas(16) uint8_t out[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
uint64_t a = 255;  // 8-bit alpha value shared by all 16 lanes
// FXSAVE/FXRSTOR area: the ISA requires 512 bytes aligned to a 16-byte
// boundary — an unaligned buffer makes fxsave raise #GP.
alignas(16) uint8_t save[512];
int main()
{
    // Preserve r15 and the complete FP/SSE state around the hand-written
    // blend loop.  NOTE(review): these are basic asm statements with no
    // clobber list, so the compiler is never told r15/xmm0 change, and the
    // manual push assumes locals are rbp-addressed (true at -O0) — fragile,
    // but adequate for this benchmark.
    asm(".intel_syntax noprefix \n");
    asm(" push r15 \n");
    // fxsave requires a 512-byte area aligned to 16 bytes.
    asm(" fxsave save \n");
    asm(".att_syntax noprefix \n");
    for(long i = 0; i < 125000000; i++)
    {
        // One full 8-bit alpha blend of the 16 bytes in out[].  For each
        // alpha bit k (LSB first):
        //   r15 = ((a >> k) & 1) << 4      -- 0 or 16: selects a half of in[]
        //   xmm0 = pavgb(xmm0, [in + r15]) -- rounded average per byte lane
        asm(".intel_syntax noprefix \n");
        asm(" movdqu xmm0, out \n");
        asm(" mov r15, [a] \n");
        asm(" shr r15, 0 \n");             // bit 0 (no-op shift kept for symmetry)
        asm(" and r15, 1 \n");
        asm(" shl r15, 4 \n");
        asm(" pavgb xmm0, [in + r15] \n");
        asm(" mov r15, [a] \n");
        asm(" shr r15, 1 \n");             // bit 1
        asm(" and r15, 1 \n");
        asm(" shl r15, 4 \n");
        asm(" pavgb xmm0, [in + r15] \n");
        asm(" mov r15, [a] \n");
        asm(" shr r15, 2 \n");             // bit 2
        asm(" and r15, 1 \n");
        asm(" shl r15, 4 \n");
        asm(" pavgb xmm0, [in + r15] \n");
        asm(" mov r15, [a] \n");
        asm(" shr r15, 3 \n");             // bit 3
        asm(" and r15, 1 \n");
        asm(" shl r15, 4 \n");
        asm(" pavgb xmm0, [in + r15] \n");
        asm(" mov r15, [a] \n");
        asm(" shr r15, 4 \n");             // bit 4
        asm(" and r15, 1 \n");
        asm(" shl r15, 4 \n");
        asm(" pavgb xmm0, [in + r15] \n");
        asm(" mov r15, [a] \n");
        asm(" shr r15, 5 \n");             // bit 5
        asm(" and r15, 1 \n");
        asm(" shl r15, 4 \n");
        asm(" pavgb xmm0, [in + r15] \n");
        asm(" mov r15, [a] \n");
        asm(" shr r15, 6 \n");             // bit 6
        asm(" and r15, 1 \n");
        asm(" shl r15, 4 \n");
        asm(" pavgb xmm0, [in + r15] \n");
        asm(" mov r15, [a] \n");
        asm(" shr r15, 7 \n");             // bit 7 (MSB)
        asm(" and r15, 1 \n");
        asm(" shl r15, 4 \n");
        asm(" pavgb xmm0, [in + r15] \n");
        asm(" movdqu out, xmm0 \n");
        asm(".att_syntax noprefix \n");
    }
    asm(".intel_syntax noprefix \n");
    asm(" fxrstor save \n");
    asm(" pop r15 \n");
    asm(".att_syntax noprefix \n");
    // BUG FIX: the original wrote (int32_t)out — a pointer-to-int32 cast
    // (an error on LP64, and the array address rather than the data) — so
    // the result was never actually printed.  Print each lane instead.
    for(int i = 0; i < 16; i++)
        std::cout << (int32_t)out[i] << '\t';
    std::cout << std::endl;
    return 0;
}
It's basically untested, but it seems to work judging by some sample values I tried... It may be a shade off in some cases. Two possible optimizations I have thought of are:
1) precomputing the middle 3 instructions of each pavgb chunk in an 8 byte array, or even using the 8 upper gprs in x86-64
2) it might be possible to trade off accuracy for speed by using fewer alpha bits and reducing the number of pavgb chunks
The timing was done by just using the time command on the Linux terminal; I guess that's OK for measuring times as large as a second.
-EDIT:
I changed movdqu to movdqa and did the precompute thing, so the time got down to 0.649 secs (0.8 with unaligned loads, huge difference). This is the new version:
#include <iostream>
#include <ctime>
#include <cstdint>
// Two 16-byte blend operands back to back: the precomputed offset for each
// alpha bit (0 or 16) selects one half.  alignas(16) is mandatory here:
// both movdqa and PAVGB with a legacy-SSE memory operand fault (#GP) on an
// unaligned 16-byte access, and a plain global array has no alignment
// guarantee.
alignas(16) uint8_t in[32] = {1, 2, 4, 8, 16, 32, 64, 128, 0, 0, 0, 0, 0, 0, 0, 0,
                              0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
// Blend destination/accumulator — loaded/stored with movdqa, so it must be
// 16-byte aligned.
alignas(16) uint8_t out[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
uint64_t a = 255;  // 8-bit alpha value shared by all 16 lanes
// FXSAVE/FXRSTOR area: the ISA requires 512 bytes on a 16-byte boundary.
alignas(16) uint8_t save[512];
// Per-alpha-bit table offsets (0 or 16), filled in by main() and hoisted
// into r8-r15 before the hot loop.
uint64_t pc[8] = {0, 0, 0, 0, 0, 0, 0, 0};
int main()
{
    // BUG FIX: the original read "pc = ((a >> i) & 1) << 4;", which assigns
    // to an array and does not compile — the "[i]" was evidently lost when
    // the code was posted.  pc[i] is the byte offset (0 or 16) of the half
    // of in[] selected by alpha bit i.
    for(long i = 0; i < 8; i++)
        pc[i] = ((a >> i) & 1) << 4;
    // Preserve r8-r15 and the complete FP/SSE state.  NOTE(review): basic
    // asm with no clobber list — the compiler is never told these registers
    // or xmm0 change; this relies on -O0-style rbp-addressed locals.
    asm(".intel_syntax noprefix \n");
    asm(" push r15 \n");
    asm(" push r14 \n");
    asm(" push r13 \n");
    asm(" push r12 \n");
    asm(" push r11 \n");
    asm(" push r10 \n");
    asm(" push r9 \n");
    asm(" push r8 \n");
    // fxsave requires a 512-byte area aligned to 16 bytes.
    asm(" fxsave save \n");
    // Hoist the eight precomputed offsets into r8-r15 once, so the hot loop
    // is just an aligned load, eight PAVGBs, and an aligned store.
    asm("mov r8, [pc] \n");
    asm("mov r9, [pc + 8] \n");
    asm("mov r10, [pc + 16]\n");
    asm("mov r11, [pc + 24]\n");
    asm("mov r12, [pc + 32]\n");
    asm("mov r13, [pc + 40]\n");
    asm("mov r14, [pc + 48]\n");
    asm("mov r15, [pc + 56]\n");
    asm(".att_syntax noprefix \n");
    for(long i = 0; i < 125000000; i++)
    {
        // One full 8-bit alpha blend: each PAVGB folds in one alpha bit via
        // a rounded per-byte average with the selected half of in[].
        asm(" .intel_syntax noprefix \n");
        asm(" movdqa xmm0, out \n");
        asm(" pavgb xmm0, [in + r8] \n");
        asm(" pavgb xmm0, [in + r9] \n");
        asm(" pavgb xmm0, [in + r10] \n");
        asm(" pavgb xmm0, [in + r11] \n");
        asm(" pavgb xmm0, [in + r12] \n");
        asm(" pavgb xmm0, [in + r13] \n");
        asm(" pavgb xmm0, [in + r14] \n");
        asm(" pavgb xmm0, [in + r15] \n");
        asm(" movdqa out, xmm0 \n");
        asm(" .att_syntax noprefix \n");
    }
    asm(".intel_syntax noprefix \n");
    asm(" fxrstor save \n");
    // Pops mirror the pushes in reverse (r8 was pushed last, so it comes
    // off first).
    asm(" pop r8 \n");
    asm(" pop r9 \n");
    asm(" pop r10 \n");
    asm(" pop r11 \n");
    asm(" pop r12 \n");
    asm(" pop r13 \n");
    asm(" pop r14 \n");
    asm(" pop r15 \n");
    asm(".att_syntax noprefix \n");
    // BUG FIX: print each lane; the original "(int32_t)out" was a
    // pointer-to-int32 cast (an error on LP64) of the array address.
    for(int i = 0; i < 16; i++)
        std::cout << (int32_t)out[i] << '\t';
    std::cout << std::endl;
    return 0;
}
[Edited by - D_Tr on May 31, 2009 2:11:18 AM]