Jump to content
  • Advertisement

Archived

This topic is now archived and is closed to further replies.

thuned

how do i do a fast memcpy?

This topic is 5950 days old which is more than the 365 day threshold we allow for new replies. Please post a new topic.

If you intended to correct an error in the post then please contact us.

Recommended Posts

I'm using http://www.joryanick.com/memcpy.htm as the reference site, but I'm having trouble since I don't know asm. Here's what I have so far:
  
#include "stdafx.h"
#include "fastmemcpy.h"
#include <stddef.h>

// Size in bytes of one copy block; fastmemcpy() compares nbytes against it
// and uses it to compute the leftover tail (nbytes % BLOCK_SIZE).
const int BLOCK_SIZE = 2048;
// Staging buffer referenced by name from the inline assembly in fastmemcpy().
// NOTE(review): this declares an array of BLOCK_SIZE *pointers* to BYTE, not
// BLOCK_SIZE bytes. A plain byte buffer was presumably intended -- confirm
// before changing, because the inline assembly operand size depends on it.
BYTE *tbuf[BLOCK_SIZE];

int whatCPUExtensions(void) {				
	__asm {							
		mov edi,0						
			pushfd							
			pop eax							
			mov edx,eax						
			xor eax,200000h						
			push eax						
			popfd							
			pushfd							
			pop eax							
			and eax,200000h						
			and edx,200000h						
			cmp eax,edx						
			je ExtensionsEnd					
			mov eax,1						
			/* 586 specific code */					
			cpuid							
			test edx,00800000h  // Test bit 23, for MMX existence.	
			je ExtensionsNoMMX					
			or edi,1						
ExtensionsNoMMX:					
		test edx, 02000000h // Test bit 25, for Streaming	
			je ExtensionsNoKNI  //  SIMD Extensions existence.	
			or edi,4						
ExtensionsNoKNI:					
		mov eax,080000000h  // Test CPUID bit 32, for		
			cpuid		    //  3DNow existence.		
			test eax,eax						
			jz ExtensionsEnd					
			mov eax,080000001h					
			cpuid							
			test edx,080000000h					
			je ExtensionsEnd					
			or edi,2						
ExtensionsEnd:						
		mov eax,edi						
	}							
	/*return eax;	*/
}	

/*
 * initmemcpy - query the CPU once and set the HasMMX / HasSIMD flags that
 * fastmemcpy() uses to pick a copy routine.
 *
 * whatCPUExtensions() returns a bitmask (1 = MMX, 2 = 3DNow!, 4 = SSE), so
 * the bits are tested individually; a CPU normally reports several at once.
 * Both flags are always assigned, so calling this again is harmless.
 */
void initmemcpy(void)
{
	const int opt = whatCPUExtensions();

	HasMMX  = (opt & 1) ? 1 : 0;	// bit 0: MMX
	HasSIMD = (opt & 4) ? 1 : 0;	// bit 2: SSE (prefetchnta / movntq)
}
/*
 * fastmemcpy - copy nbytes bytes from src to dst (regions must not overlap).
 *
 * Target: 32-bit x86, MSVC inline assembler (Intel/MASM syntax).
 *
 * Strategy:
 *   - nbytes <= 0                      : nothing to do.
 *   - nbytes < BLOCK_SIZE, or no MMX/SSE: plain memcpy().
 *   - HasSIMD                          : 2048-byte blocks are prefetched into
 *                                        the staging buffer tbuf, then written
 *                                        to dst with non-temporal stores.
 *   - HasMMX only                      : 64 bytes per iteration with movq.
 *   The assembly paths move only whole BLOCK_SIZE chunks; the remaining
 *   (nbytes % BLOCK_SIZE) bytes are finished with memcpy().
 *
 * HasMMX / HasSIMD must have been set beforehand (see initmemcpy()).
 * The shared staging buffer makes the SSE path non-reentrant.
 */
void fastmemcpy(void *dst, void *src, long nbytes)
{
	if (nbytes <= 0)
		return;

	// Decide BEFORE touching the assembly loops: they count down from the
	// number of blocks and would wrap around to 2^32 iterations on zero.
	if (nbytes < BLOCK_SIZE || (!HasMMX && !HasSIMD))
	{
		memcpy(dst, src, nbytes);
		return;
	}

	long Balance = nbytes % BLOCK_SIZE;	// tail bytes left for memcpy()
	long bulk    = nbytes - Balance;	// multiple of BLOCK_SIZE, >= BLOCK_SIZE

	if (HasSIMD)
	{
		__asm {
			mov  esi, src               // esi = read cursor in src
			mov  edi, dst               // edi = write cursor in dst
			mov  eax, bulk
			shr  eax, 11                // eax = number of 2048-byte blocks (>= 1)

loop2k:
			// Phase 1: pull one block from memory into the staging buffer.
			lea  edx, tbuf              // ADDRESS of the buffer, not its contents
			mov  ecx, 32                // 2048 bytes / 64 bytes per iteration

loopMemToL1:
			prefetchnta [esi + 64]      // prefetch the next iteration's data
			prefetchnta [esi + 96]

			movq mm1, [esi]
			movq mm2, [esi + 8]
			movq mm3, [esi + 16]
			movq mm4, [esi + 24]
			movq mm5, [esi + 32]
			movq mm6, [esi + 40]
			movq mm7, [esi + 48]
			movq mm0, [esi + 56]

			movq [edx],      mm1
			movq [edx + 8],  mm2
			movq [edx + 16], mm3
			movq [edx + 24], mm4
			movq [edx + 32], mm5
			movq [edx + 40], mm6
			movq [edx + 48], mm7
			movq [edx + 56], mm0

			add  esi, 64
			add  edx, 64
			dec  ecx
			jnz  loopMemToL1

			// Phase 2: stream the staged block out to the destination.
			lea  edx, tbuf
			mov  ecx, 32

loopL1ToMem:
			movq mm1, [edx]
			movq mm2, [edx + 8]
			movq mm3, [edx + 16]
			movq mm4, [edx + 24]
			movq mm5, [edx + 32]
			movq mm6, [edx + 40]
			movq mm7, [edx + 48]
			movq mm0, [edx + 56]

			movntq [edi],      mm1      // non-temporal: bypass the cache
			movntq [edi + 8],  mm2
			movntq [edi + 16], mm3
			movntq [edi + 24], mm4
			movntq [edi + 32], mm5
			movntq [edi + 40], mm6
			movntq [edi + 48], mm7
			movntq [edi + 56], mm0

			add  edx, 64
			add  edi, 64
			dec  ecx
			jnz  loopL1ToMem

			dec  eax                    // next 2048-byte block
			jnz  loop2k

			sfence                      // make the non-temporal stores visible
			emms                        // release the MMX state for x87 code
		}
	}
	else
	{
		// MMX without SSE: movntq is not available here, so use plain movq
		// stores.
		__asm {
			mov  esi, src
			mov  edi, dst
			mov  ecx, bulk
			shr  ecx, 6                 // 64 bytes per iteration (>= 32 iterations)

loop1:
			movq mm1, [esi]
			movq mm2, [esi + 8]
			movq mm3, [esi + 16]
			movq mm4, [esi + 24]
			movq mm5, [esi + 32]
			movq mm6, [esi + 40]
			movq mm7, [esi + 48]
			movq mm0, [esi + 56]

			movq [edi],      mm1
			movq [edi + 8],  mm2
			movq [edi + 16], mm3
			movq [edi + 24], mm4
			movq [edi + 32], mm5
			movq [edi + 40], mm6
			movq [edi + 48], mm7
			movq [edi + 56], mm0

			add  esi, 64
			add  edi, 64
			dec  ecx
			jnz  loop1

			emms                        // release the MMX state for x87 code
		}
	}

	// Finish whatever did not fill a whole block.
	if (Balance > 0)
	{
		BYTE *dest   = (BYTE*)dst + bulk;
		BYTE *source = (BYTE*)src + bulk;
		memcpy(dest, source, Balance);
	}
}
  
I get one warning and one error: L:\C++\Video Project\vppcap0203\fastmemcpy.cpp(48) : warning C4035: 'whatCPUExtensions' : no return value L:\C++\Video Project\vppcap0203\fastmemcpy.cpp(79) : fatal error C1601: unsupported inline assembly opcode. I don't think the warning matters that much, but I have no idea how to fix the unsupported inline assembly opcode. Here's what it points to:
  
			prefetchnta 64[ESI] // Prefetch next loop, non-temporal 

			prefetchnta 96[ESI] 
  
What should I do? Are there any alternatives to fixing this? Also, I've tested the MMX copy from http://alekdm.search.bg/details/mmx.html and it made my whole routine (I didn't test just the memcpy portion) about 3% faster. I'm hoping this SGI one will be faster with its block copying. As a side note, why do I need to include stdafx.h to add this code to my MFC-based app?

Share this post


Link to post
Share on other sites
Advertisement
If you are using VC6 Professional, download and install the Processor Pack (make sure you have VC6 SP5 installed before you install the Processor Pack). It gives native SSE, SSE2, and 3DNow! support to VC6. Alternatively, you can just #define the opcodes in.

-potential energy is easily made kinetic-

Share this post


Link to post
Share on other sites
i have the standard edition...

and sorry about the multiple posts; the script keeps giving me errors.

Share this post


Link to post
Share on other sites
Download the PDFs from AMD's and Intel's sites. The following opcode is from one of the AMD 3DNow! references (22466.pdf on their website):

PREFETCHNTA - Move Data Closer to the Processor Using the NTA Reference - opcode bytes 0Fh 18h /0 (ModRM reg field = 0)

EDIT - This is from the Athlon extensions to the MMX and 3DNow! PDF, so it is an Athlon-or-better instruction. If you want to include K6-2 support, you'll either have to use the PREFETCH instruction instead or implement a different code path for the K6-2. I don't have the SSE reference handy, so you'll have to look up Intel's SSE prefetch instruction yourself.

-potential energy is easily made kinetic-

Edited by - Infinisearch on February 3, 2002 10:10:19 PM

Share this post


Link to post
Share on other sites
The thing is, I don't know anything about assembly. I have no idea what that is or how I should implement it in my code.
I guess I'm asking for a complete set of code, since I have no idea how to write it myself.

Share this post


Link to post
Share on other sites
quote:
Original post by Infinisearch
If you are using VC6 Professional, download and install the Processor Pack (make sure you have VC6 SP5 installed before you install the Processor Pack). It gives native SSE, SSE2, and 3DNow! support to VC6. Alternatively, you can just #define the opcodes in.

-potential energy is easily made kinetic-



OK, I've managed to get my hands on a copy of Professional. I have the service pack and the Processor Pack. Is it automatic? I timed it and I don't notice much improvement. And what kind of opcodes can I #define?

Share this post


Link to post
Share on other sites
I have a few comments about copying memory:

I was looking through my game and couldn't find any instances of memcpy that copied enough memory to be slow. I guess if this were an image-processing app it would be important.

For copying small buffers, the intrinsic function is just as good. VC++ optimizes it into a 'rep movsd' opcode, which copies data 32 bits at a time. I don't know how you can make a single opcode faster.

and the example uses movntq, which may cause cache problems

I love optimizing and tweaking, but I just can't find any real opportunity in my game; at most I have to copy 3 KB of data. If this were an image program, then it would certainly be useful.

does anyone have any comments about memory copying?

Share this post


Link to post
Share on other sites
quote:
Original post by Cybertron
...does anyone have any comments about memory copying?


I do copies of up to a few hundred KB, usually using one of the string move instructions. If you're doing serious graphics work and you want to use some of the advanced instructions like MMX, try the Intel Image Processing Library. Last time I checked it was a big download (>10 MB), but what you get is a series of DLLs and documentation. Check out the link for a list of all the things it does. (It's free; you just have to register and agree you won't distribute hacked versions of the DLLs they provide.)

Edited by - Michalson on March 9, 2002 12:58:04 PM

Share this post


Link to post
Share on other sites
quote:
Original post by Cybertron
I guess if this was an image processing app it would be important



Yeah, I'm doing some real-time processing of video frames (30 fps), so I sometimes copy 10-30 MB of data per second. The copying itself doesn't take much of the loop (according to the profiler), but a 200-300% increase using SSE/3DNow! should be significant. I will try looking at the Intel Image Processing Library.

Share this post


Link to post
Share on other sites

  • Advertisement
×

Important Information

By using GameDev.net, you agree to our community Guidelines, Terms of Use, and Privacy Policy.

Participate in the game development conversation and more when you create an account on GameDev.net!

Sign me up!