I am trying to pass a list of pointers from one thread to another as fast as I possibly can. The solution I came up with is basically a ring buffer. It uses arrays of pointers in a linked list. When one "page" fills up it moves to the next one in the ring, adding a page if the next page has unread data. My exact implementation can be found below. It's written in asm, but the C++ equivalent is also included.
For testing I made a simple program that passed a series of integers instead of pointers from one thread to another. I noticed a major performance degradation when running it on a hyper-threading processor and an even greater degradation when run on a multicore system. My initial results for 100,000,000 integers were:
Quote:Athlon XP 1800+: 3.2sec
Pentium 4(hyper-threading): 9.3sec
Pentium D (multicore): 12.1sec
I found the real killer was the use of memory fences. After removing the fences, my results for 100,000,000 integers were:
Quote:Athlon XP 1800+: 2.3sec
Pentium 4(hyper-threading): .9sec
Pentium D (multicore): .7sec
From 12.1sec to .7sec is a significant difference. I realize that my game will never need to pass 100,000,000 pointers from one thread to another in less than a second. But in the interest of optimization, I started looking for alternatives to memory fences. After reading the Intel and AMD developer's manuals I started to question if they are really even needed.
I need a memory fence in two places basically. When I add a pointer I need to make sure the pointer is written before I move the write pointer, which means I need a store fence to enforce write ordering. When I read a pointer I have to make sure that the pointer is read before I move the read pointer, which means I need a full memory fence to make sure the write does not precede the read.
From the Intel developer's manual:
Quote:Writes are not reordered with older reads.
...
Writes to memory are not reordered with other writes, with the exception of writes executed with the CLFLUSH instruction and streaming stores (writes) executed with the non-temporal move instructions (MOVNTI, MOVNTQ, MOVNTDQ, MOVNTPS, and MOVNTPD).
From the AMD developer's manual:
Quote:Generally, out-of-order writes are not allowed. Write instructions executed out-of-order cannot commit (write) their result to memory until all previous instructions have completed in program order. The processor can, however, hold the result of an out-of-order write instruction in a private buffer (not visible to software) until that result can be committed to memory.
This seems contrary to what I have read in many places on the internet before, including these forums. One common example is data being filled in an array, array[1] and array[0] can be written by the processor in any order. The compiler may reorder writes, but according to the information above writes cannot be reordered by processors under normal circumstances. In addition, the processor will not commit a write before a read that comes first in program order. Am I missing something here or do I really not need the memory fences?
// Number of data slots in one ring page.
#define SZ_PAGE 1000
// One fixed-size page of the ring buffer.
// LAYOUT CONTRACT: the naked-asm Write/Read hard-code these field offsets
// (write=+0, read=+4, next=+8, data=+12). Do NOT reorder or insert fields
// without updating every [eax+N]/[ebx+N] displacement in the asm.
struct SafeListPage
{
unsigned write; // index of the next slot the producer will fill (producer-owned)
unsigned read; // index of the next slot the consumer will drain (consumer-owned)
SafeListPage* next; // next page in the circular linked list
unsigned data[SZ_PAGE]; // payload slots
};
// Single-producer / single-consumer unbounded FIFO of unsigned values,
// implemented as a circular linked list of fixed-size pages. Grows by
// splicing in a new page when the producer catches up to the consumer.
// LAYOUT CONTRACT: the naked-asm Write/Read address the members by offset
// (m_write=+0 via [ecx], m_read=+4 via [ecx+4]) -- do not reorder them.
class CSafeListPage
{
public:
CSafeListPage();
~CSafeListPage();
// Producer side only: append one value, growing the ring if needed.
void __fastcall Write(unsigned data);
// Consumer side only: pop one value. Returns 0 when the queue is empty,
// which is indistinguishable from a stored value of 0 -- see the note at
// Read's definition.
unsigned __fastcall Read();
private:
SafeListPage* m_write; // page the producer is currently filling (offset 0)
SafeListPage* m_read; // page the consumer is currently draining (offset 4)
};
// Construct the queue as a one-page ring: a single page whose next pointer
// refers back to itself, serving as both the producer's and consumer's page.
CSafeListPage::CSafeListPage()
{
	SafeListPage* const first = new SafeListPage;
	first->write = 0;
	first->read = 0;
	first->next = first;	// circular list of one element
	m_write = first;
	m_read = first;
}
// Destroy the queue: walk the circular page list exactly once, deleting
// every page. Any values not yet consumed are discarded with their pages.
CSafeListPage::~CSafeListPage()
{
	SafeListPage* const first = m_write;
	SafeListPage* page = first;
	for (;;)
	{
		SafeListPage* const following = page->next;
		delete page;
		if (following == first)	// completed one full lap of the ring
			break;
		page = following;
	}
}
// Producer-side enqueue. Single producer only: m_write and page->write are
// mutated exclusively here. __fastcall + naked: this arrives in ECX, data in
// EDX; no prologue/epilogue is generated, so the function manages its own
// callee-saved registers (esi, ebx) and its own ret.
//
// NOTE(review): the sfence instructions are commented out on the strength of
// the store-ordering quotes above (x86 does not reorder store-store). Even if
// that holds at the CPU level, it is the hand-written *instruction order*
// that preserves it here; the commented C++ equivalent has no volatile or
// compiler barrier, so an optimizing compiler could legally reorder those
// stores -- confirm before using the C++ version. Also confirm behavior on
// any non-x86 target, where plain stores may be reordered by hardware.
__declspec(naked) void __fastcall CSafeListPage::Write(unsigned data)
{
/*
C++ equivalent of the asm below:

SafeListPage* page;
unsigned write;
write=m_write->write; //get current write pos on page
m_write->data[write]=data; //store the payload FIRST...
if(++write<SZ_PAGE) //page not yet full: common fast path
{
__asm{sfence}; //...THEN publish: data must be visible before the new index
m_write->write=write;
return;
}
page=m_write->next;
if(page==m_read) //next page still holds unread data:
{
page=new SafeListPage; //splice a fresh page in between
page->next=m_write->next;
m_write->next=page;
}
page->write=0; //init the page the producer is moving onto
page->read=0; //(must happen BEFORE publishing write=SZ_PAGE below:
__asm{sfence}; // the reader only follows next after seeing write==SZ_PAGE)
m_write->write=write;
m_write=page;
*/
__asm
{
push esi //save callee-saved register
//ecx=this
//edx=data
mov eax,[ecx] //eax=m_write (m_write at offset 0 of this)
mov esi,[eax] //esi=m_write->write (write index at offset 0 of page)
mov [eax+12+esi*4],edx //m_write->data[m_write->write]=data (data[] at +12)
add esi,1 //esi+=1
cmp esi,SZ_PAGE
jge next_page //page full: advance to the next page
//sfence //(removed) would order the data store above before the index store below
mov [eax],esi //m_write->write=esi -- publishes the new slot to the reader
pop esi //restore register
ret
next_page:
push ebx //save second callee-saved register
mov edx,[ecx+4] //edx=m_read (m_read at offset 4 of this)
mov ebx,[eax+8] //ebx=m_write->next (next at offset 8)
cmp ebx,edx
jne set_page //next page already drained: recycle it
mov ebx,ecx //ebx=this (ecx/edx are caller-saved, clobbered by the call)
push SIZE SafeListPage
call new //operator new(size); new page pointer returned in eax
//NOTE(review): if new throws inside this naked function, the pushed
//esi/ebx are never popped and there is no unwind info -- confirm this
//build runs with exceptions disabled or new that returns null.
add esp,4 //pop the size argument (cdecl callee does not clean up)
mov ecx,[ebx] //ecx=m_write (reload -- ecx was clobbered by the call)
mov edx,[ecx+8] //edx=m_write->next
mov [eax+8],edx //new->next=m_write->next (splice new page into the ring)
mov [ecx+8],eax //m_write->next=new
mov dword ptr [eax],0 //new->write=0 (init BEFORE publishing old page's index)
mov dword ptr [eax+4],0 //new->read=0
//sfence //(removed) would order the page init before the publish below
mov [ecx],esi //m_write->write=esi (==SZ_PAGE) -- publish last slot of old page
mov [ebx],eax //m_write=new
pop ebx //restore registers
pop esi
ret
set_page:
mov dword ptr [ebx],0 //m_write->next->write=0 (re-init the recycled page)
mov dword ptr [ebx+4],0 //m_write->next->read=0
//sfence //(removed) see note at top of function
mov [eax],esi //m_write->write=esi (==SZ_PAGE) -- publish last slot
mov [ecx],ebx //m_write=m_write->next
pop ebx //restore registers
pop esi
ret
};
}
// Consumer-side dequeue. Single consumer only: m_read and page->read are
// mutated exclusively here. __fastcall + naked: this arrives in ECX; the
// result is returned in EAX.
//
// CAVEAT: returns 0 when the queue is empty, which is indistinguishable from
// a dequeued value of 0 -- callers must never store 0 (fine for pointers).
//
// NOTE(review): the mfence is commented out relying on the quoted rule
// "writes are not reordered with older reads": the data load must retire
// before the read-index store becomes visible, otherwise the producer could
// recycle the slot before the value is read. That ordering is carried by the
// hand-written instruction order on x86; it is NOT guaranteed for the C++
// equivalent (no volatile/barrier) or on weaker memory models -- confirm.
__declspec(naked) unsigned __fastcall CSafeListPage::Read()
{
/*
C++ equivalent of the asm below:

unsigned data;
if(m_read->read==m_read->write) //don't pass the write index
return 0; //empty (or nothing published yet on this page)
data=m_read->data[m_read->read]; //read data FIRST...
m_read->read++; //...THEN release the slot back to the producer
if(m_read->read<SZ_PAGE) //not yet at end of page: common fast path
return data;
__asm{mfence}; //make sure data is read before we leave the page
m_read=m_read->next; //page exhausted: move to the next page
return data; //(producer re-inits read/write before we can get here)
*/
__asm
{
push ebx //save callee-saved register
//ecx=this
mov ebx,[ecx+4] //ebx=m_read (m_read at offset 4 of this)
mov edx,[ebx+4] //edx=m_read->read (read index at offset 4 of page)
mov eax,[ebx] //eax=m_read->write (write index at offset 0 of page)
cmp edx,eax
jne get_data
xor eax,eax //empty: return 0 (ambiguous with a stored 0 -- see caveat)
pop ebx //restore register
ret
get_data:
mov eax,[ebx+12+4*edx] //eax=m_read->data[m_read->read] (data[] at +12)
add edx,1 //edx+=1
mov [ebx+4],edx //m_read->read=edx -- releases the slot to the producer
cmp edx,SZ_PAGE //reached end of page?
jge next_page
pop ebx //restore register
ret
next_page:
//mfence //(removed) would order the data load above before leaving the page
mov edx,[ebx+8] //edx=m_read->next (next at offset 8)
mov [ecx+4],edx //m_read=m_read->next
pop ebx //restore register
ret
};
}