The work is divided by simply splitting up the list of floats, so
1 thread has to do the entire list
2 threads have half the list each
3 threads have a third of the list each
and so on
I know that there are some division/remainder issues, and that there is no cleanup performed, but I wanted to keep it simple.
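For what it's worth, a minimal sketch of one way the remainder could be handled (this is not what the test below does; the test just ignores the leftover elements, and LaunchBalanced is a hypothetical name): give the first NumFloats % NumThreads threads one extra element each, so every float is covered.

void LaunchBalanced( float* pFloats, int NumFloats, int NumThreads )
{
    int Base = NumFloats / NumThreads;
    int Rem  = NumFloats % NumThreads;
    int Offset = 0;
    for( int j = 0; j < NumThreads; ++j )
    {
        int Count = Base + ( j < Rem ? 1 : 0 ); // first Rem threads take one extra float
        // a thread launched here would process pFloats[Offset] .. pFloats[Offset + Count - 1]
        Offset += Count;
    }
}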
Since I have a quad core, I expected the results to be best around the 4-thread mark and then get gradually worse as the number of threads increased. But that's not what I got at all, and I was wondering why?
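(As an aside, the logical processor count can be confirmed at runtime with GetSystemInfo; a minimal sketch, and not something the test program below does:)

SYSTEM_INFO si;
GetSystemInfo( &si );                                              // fills in dwNumberOfProcessors
cout << "Logical processors: " << si.dwNumberOfProcessors << endl;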
#include <windows.h>
#include <iostream>

#pragma comment( lib, "winmm.lib" ) // timeGetTime lives in winmm

using namespace std;

const int NumFloats = 12000000;
float* pFloatArray = 0;

struct Task
{
    float* pFloatArray; // start of this thread's slice
    int Count;          // number of floats in the slice

    Task( float* _pFloatArray, int _Count )
        : pFloatArray(_pFloatArray), Count(_Count) {}
};

DWORD WINAPI Func( LPVOID pParam )
{
    Task* pTask = (Task*)pParam;
    for( int i = 0; i < pTask->Count; ++i )
    {
        pTask->pFloatArray[i] *= 5;
    }
    return 0;
}

void ThreadTask( int NumThreads )
{
    DWORD t1 = timeGetTime();

    HANDLE* Handles = new HANDLE[NumThreads];
    for( int j = 0; j < NumThreads; ++j )
    {
        // each thread gets an equal slice; any remainder is simply ignored
        int Offset = j * (NumFloats / NumThreads);
        Task* pTask = new Task( pFloatArray + Offset, NumFloats / NumThreads );
        Handles[j] = CreateThread( 0, 0, Func, pTask, 0, 0 );
    }
    WaitForMultipleObjects( NumThreads, Handles, TRUE, INFINITE );

    DWORD t2 = timeGetTime();
    cout << "Threads " << NumThreads << " Time : " << t2 - t1
         << " Rem " << NumFloats % NumThreads << endl;
}

int main()
{
    pFloatArray = new float[NumFloats];

    ThreadTask( 1 );   ThreadTask( 2 );   ThreadTask( 3 );   ThreadTask( 4 );
    ThreadTask( 6 );   ThreadTask( 8 );   ThreadTask( 12 );  ThreadTask( 16 );
    ThreadTask( 24 );  ThreadTask( 30 );  ThreadTask( 40 );  ThreadTask( 50 );
    ThreadTask( 60 );  ThreadTask( 80 );  ThreadTask( 100 ); ThreadTask( 120 );
    ThreadTask( 140 ); ThreadTask( 180 ); ThreadTask( 220 ); ThreadTask( 250 );
    ThreadTask( 300 ); ThreadTask( 350 ); ThreadTask( 400 ); ThreadTask( 450 );
    ThreadTask( 500 );

    cin.get();
}
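One caveat in the code above: WaitForMultipleObjects is documented to accept at most MAXIMUM_WAIT_OBJECTS (64) handles, so the waits for the larger thread counts fail immediately rather than blocking until all threads finish. A minimal sketch of waiting on an arbitrary number of handles in batches instead (WaitForAll is a hypothetical helper, not part of the test):

void WaitForAll( HANDLE* Handles, int Count )
{
    for( int i = 0; i < Count; i += MAXIMUM_WAIT_OBJECTS )
    {
        int Batch = Count - i;
        if( Batch > MAXIMUM_WAIT_OBJECTS )
            Batch = MAXIMUM_WAIT_OBJECTS; // wait on at most 64 handles per call
        WaitForMultipleObjects( Batch, Handles + i, TRUE, INFINITE );
    }
}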
Results
Threads : 1, Time(ms) : 38, Rem 0
Threads : 2, Time(ms) : 19, Rem 0
Threads : 3, Time(ms) : 19, Rem 0
Threads : 4, Time(ms) : 20, Rem 0
Threads : 6, Time(ms) : 21, Rem 0
Threads : 8, Time(ms) : 19, Rem 0
Threads : 12, Time(ms) : 21, Rem 0
Threads : 16, Time(ms) : 19, Rem 0
Threads : 24, Time(ms) : 21, Rem 0
Threads : 30, Time(ms) : 22, Rem 0
Threads : 40, Time(ms) : 21, Rem 0
Threads : 50, Time(ms) : 22, Rem 0
Threads : 60, Time(ms) : 22, Rem 0
Threads : 80, Time(ms) : 15, Rem 0
Threads : 100, Time(ms) : 17, Rem 0
Threads : 120, Time(ms) : 20, Rem 0
Threads : 140, Time(ms) : 24, Rem 40
Threads : 180, Time(ms) : 21, Rem 120
Threads : 220, Time(ms) : 28, Rem 100
Threads : 250, Time(ms) : 33, Rem 0
Threads : 300, Time(ms) : 37, Rem 0
Threads : 350, Time(ms) : 40, Rem 250
Threads : 400, Time(ms) : 46, Rem 0
Threads : 450, Time(ms) : 52, Rem 300
Threads : 500, Time(ms) : 57, Rem 0
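Note that timeGetTime has millisecond granularity at best (and is coarser by default), which is rough for measurements in the 15-60 ms range. A minimal sketch of timing with QueryPerformanceCounter instead (ElapsedMs is a hypothetical helper, not part of the test):

double ElapsedMs( const LARGE_INTEGER& Start, const LARGE_INTEGER& End )
{
    LARGE_INTEGER Freq;
    QueryPerformanceFrequency( &Freq ); // counter ticks per second
    return ( End.QuadPart - Start.QuadPart ) * 1000.0 / Freq.QuadPart;
}

// Usage:
// LARGE_INTEGER t1, t2;
// QueryPerformanceCounter( &t1 );
// ... work ...
// QueryPerformanceCounter( &t2 );
// cout << ElapsedMs( t1, t2 ) << " ms" << endl;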