If you can think of any way to speed this up without using MMX (I'm sure there are ways with MMX, but not all of us have it) I'd be glad to hear from you. My email is email@example.com. I'm hoping this goes into a code section, and if you don't understand any Assembler, PLEASE don't ask me about it. You can find nice tutorials on Assembler from many places, and it isn't as complex as it looks. Just low level.
Well, have a nice day, and I really hope this helps some of you still using slow C++ methods of blending. You will almost certainly be putting this into C++, since the code refers to C++ variables and so probably won't build with a stand-alone assembler; in that case, enclose it in an __asm block like so:
__asm {
    // code goes here
}
// Saturating additive blend of a 16-bit RGB565 source surface onto a
// destination surface. Written as VC++ 6 inline assembler (place it inside
// an __asm block) and mostly designed for a DirectDraw locked surface.
//
// Inputs (C++ variables provided by the enclosing function):
//   ddsdSource.lpSurface  pointer to the source surface
//   ddsdDest.lpSurface    pointer to the destination surface
//   xwidth, xheight       size of the blended area, in pixels
//   ipitchsrc             pitch DirectDraw gave you in Lock() minus twice the
//                         source width (bytes to skip at the end of each row)
//   ipitchdest            the same quantity for the destination surface
//
// Registers: eax = channel sums / result pixel
//            ebx = snapshot of the raw sum, used only for the carry tests
//            ecx = pixels left in the current row (rows left is parked on
//                  the stack while a row is processed)
//            edx = destination pixel
//            esi = source pointer, edi = destination pointer
// Clobbers:  eax, ebx, ecx, edx, esi, edi, flags.
//
// Masks:     0xF7DE (63454) clears the lowest bit of red, green and blue in
//            both pixels, so each channel can carry into that freed bit
//            without corrupting its neighbour. After the add the carries are
//            bit 16 (red), bit 11 (green) and bit 5 (blue).
//            0xF800 / 0x07E0 / 0x001F are the full red / green / blue fields.
// Note:      the lowest bit of every unsaturated channel is 0 in the result.

        mov     ecx, xwidth
        test    ecx, ecx
        jz      doneLoop16bit565        // zero width: dec/jnz below would wrap around
        mov     ecx, xheight
        test    ecx, ecx
        jz      doneLoop16bit565        // zero height: same reason
        mov     esi, ddsdSource.lpSurface
        mov     edi, ddsdDest.lpSurface

heightLoop16bit565:
        push    ecx                     // park rows-left; ecx is reused for the row
        mov     ecx, xwidth

widthLoop16bit565:
        movzx   eax, word ptr [esi]     // source pixel, zero-extended to 32 bits
        test    eax, eax
        jz      exLoop16bit565          // pure black is transparent: leave dest alone.
                                        // Use cmp eax, value if black isn't transparent
                                        // for you; drop the test/jz pair if you don't
                                        // need a transparent additive effect.

        movzx   edx, word ptr [edi]     // destination pixel
        and     eax, 0xF7DE             // free the lowest bit of each channel
        and     edx, 0xF7DE
        add     eax, edx                // all three channels summed at once
        mov     ebx, eax                // keep the raw sum: the carries are read from
                                        // here, because 0xF800 and 0x07E0 contain
                                        // bits 11 and 5 and would fake a carry for
                                        // the next channel if eax were tested instead
        and     eax, 0xF7DE             // drop the carry bits (16, 11, 5) from the result

        test    ebx, 0x10000            // red overflowed past its maximum?
        jz      noRedLoop16bitAAB565
        or      eax, 0xF800             // saturate red
noRedLoop16bitAAB565:
        test    ebx, 0x0800             // green overflowed?
        jz      noGreenLoop16bitAAB565
        or      eax, 0x07E0             // saturate green
noGreenLoop16bitAAB565:
        test    ebx, 0x0020             // blue overflowed?
        jz      noBlueLoop16bitAAB565
        or      eax, 0x001F             // saturate blue
noBlueLoop16bitAAB565:
        mov     [edi], ax               // store the low 16 bits as the blended pixel

exLoop16bit565:
        add     esi, 2                  // next source pixel (2 bytes per pixel)
        add     edi, 2                  // next destination pixel
        dec     ecx
        jnz     widthLoop16bit565       // more pixels left in this row

        pop     ecx                     // rows-left back from the stack
        add     esi, ipitchsrc          // skip the padding to the start of the next row
        add     edi, ipitchdest
        dec     ecx
        jnz     heightLoop16bit565      // more rows left

doneLoop16bit565: