Loop Unrolling

Started by
12 comments, last by iMalc 15 years, 4 months ago
You might take a look at this: An optimized version of std::copy with boost

And about Duff's device not being readable - it's supposed to be fast not readable. Fast and readable often exclude each other (you should consider if it is worth to make it that fast). However, if the unreadability has a nice interface for the end user and limited to small amounts of code, why not?
Advertisement
Indeed, which the template gives you :)

These things really are groovy, I can't believe I avoided them for so long....wasted youth :\ :p


EDIT: Did some further study, and realised it was memcpy that of course was making all the gains :), but i did implement one that could use custom functionality, sort of policy based design, i think :s. But unfortuately i could only get close to the compilers for loop, not quite beat it, 2,000,000 ops for = 302ms, unroller 312ms :| Ah well it was interesting but i think i need to get back to my portfolio now :)

P.S I had thought about integrating some multithreading but realised then it would not really be a fair test against the for loop.

heres alittle code, i removed alot of repetition in the specializations to keep it small, gives the idea at least.

template<typename T>struct InitialiseDuo{	inline void DoOp(T &Dest, T &Source)	{		srand(12);		Dest = rand();		Source = rand();	}};template <typename T, unsigned unrollCount = 0, template<typename> class Op>class Unroller{public:	__forceinline void Assign ( T* Dest, T* Source ) 	{		//Oper.DoOp(Dest, Source);	}	Op<T> Oper;};template <typename T, template<typename> class Op>class Unroller<T, 1, Op>{public:	__forceinline void Assign ( T* Dest, T* Source ) 	{		Oper.DoOp(Dest[0], Source[0]);	}	Op<T> Oper;};.....****......template <typename T, template<typename> class Op>class Unroller<T, 20, Op>{public:	__forceinline void Assign ( T* Dest, T* Source ) 	{		Oper.DoOp(Dest[0], Source[0]);		Oper.DoOp(Dest[1], Source[1]);		Oper.DoOp(Dest[2], Source[2]);		Oper.DoOp(Dest[3], Source[3]);		Oper.DoOp(Dest[4], Source[4]);		Oper.DoOp(Dest[5], Source[5]);		Oper.DoOp(Dest[6], Source[6]);		Oper.DoOp(Dest[7], Source[7]);		Oper.DoOp(Dest[8], Source[8]);		Oper.DoOp(Dest[9], Source[9]);		Oper.DoOp(Dest[10], Source[10]);		Oper.DoOp(Dest[11], Source[11]);		Oper.DoOp(Dest[12], Source[12]);		Oper.DoOp(Dest[13], Source[13]);		Oper.DoOp(Dest[14], Source[14]);		Oper.DoOp(Dest[15], Source[15]);		Oper.DoOp(Dest[16], Source[16]);		Oper.DoOp(Dest[17], Source[17]);		Oper.DoOp(Dest[18], Source[18]);		Oper.DoOp(Dest[19], Source[19]);	}	Op<T> Oper;};//template <typename T, unsigned unrollCount, template<typename> class Op> unsigned Unroller<T, unrollCount, Op >::CopySize = sizeof(T) * unrollCount;template<typename T, template<typename> class Operation >class UnrollerWrap{public:	__forceinline void AssignmentIterate(T* dest, T* source, unsigned size)	{		unsigned count = (size) % 20;				switch( count )		{		case 1:			size -= count;			r1.Assign(&dest[size], &source[size]);			break;		case 2:			size -= count;			r2.Assign(&dest[size], &source[size]);			break;		case 3:			size -= count;			r3.Assign(&dest[size], &source[size]);			break;		case 4:			size -= count;			r4.Assign(&dest[size], &source[size]);			break;		case 5:			size -= count;			r5.Assign(&dest[size], &source[size]);			break;		case 6:			size -= count;			r6.Assign(&dest[size], &source[size]);			break;		case 7:			size -= count;			r7.Assign(&dest[size], &source[size]);			break;		case 8:			size -= count;			r8.Assign(&dest[size], &source[size]);			break;		case 9:			size -= count;			r9.Assign(&dest[size], &source[size]);			break;		case 10:			size -= count;			r10.Assign(&dest[size], &source[size]);			break;		case 11:			size -= count;			r11.Assign(&dest[size], &source[size]);			break;		case 12:			size -= count;			r12.Assign(&dest[size], &source[size]);			break;		case 13:			size -= count;			r13.Assign(&dest[size], &source[size]);			break;		case 14:			size -= count;			r14.Assign(&dest[size], &source[size]);			break;		case 15:			size -= count;			r15.Assign(&dest[size], &source[size]);			break;		case 16:			size -= count;			r16.Assign(&dest[size], &source[size]);			break;		case 17:			size -= count;			r17.Assign(&dest[size], &source[size]);			break;		case 18:			size -= count;			r18.Assign(&dest[size], &source[size]);			break;		case 19:			size -= count;			r19.Assign(&dest[size], &source[size]);			break;		}		count = size / 20;		switch (count % 10) 		{		case 0: do { size -= 20; r20.Assign(&dest[size], &source[size]);		case 9:      size -= 20; r20.Assign(&dest[size], &source[size]);		case 8:      size -= 20; r20.Assign(&dest[size], &source[size]);		case 7:      size -= 20; r20.Assign(&dest[size], &source[size]);		case 6:      size -= 20; r20.Assign(&dest[size], &source[size]);		case 5:      size -= 20; r20.Assign(&dest[size], &source[size]);		case 4:      size -= 20; r20.Assign(&dest[size], &source[size]);		case 3:      size -= 20; r20.Assign(&dest[size], &source[size]);		case 2:      size -= 20; r20.Assign(&dest[size], &source[size]);		case 1:      size -= 20; r20.Assign(&dest[size], &source[size]);				   } while (size > 0);		}						}private:	Unroller<T, 1, Operation > r1;	Unroller<T, 2, Operation > r2;	Unroller<T, 3, Operation > r3;	Unroller<T, 4, Operation > r4;	Unroller<T, 5, Operation > r5;	Unroller<T, 6, Operation > r6;	Unroller<T, 7, Operation > r7;	Unroller<T, 8, Operation > r8;	Unroller<T, 9, Operation > r9;	Unroller<T, 10, Operation > r10;	Unroller<T, 11, Operation > r11;	Unroller<T, 12, Operation > r12;	Unroller<T, 13, Operation > r13;	Unroller<T, 14, Operation > r14;	Unroller<T, 15, Operation > r15;	Unroller<T, 16, Operation > r16;	Unroller<T, 17, Operation > r17;	Unroller<T, 18, Operation > r18;	Unroller<T, 19, Operation > r19;	Unroller<T, 20, Operation > r20;};


EDIT: This was the finalish version, no better really just less code, just put it up for completeness, Compilers are too good (generally) to spend ages trying to beat :p

template<typename T>struct InitialiseDuo{	__forceinline void DoOp(T &Dest, T &Source)	{		....	}};template <typename T, template<typename> class Op>class Unroller{public:	__forceinline void Assign ( T* &Dest, T* &Source, const unsigned &iterations ) 	{		switch( iterations )		{		case 40 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source;		case 39 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source;		case 38 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source;		case 37 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source;		case 36 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source;		case 35 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source;		case 34 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source;		case 33 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source;		case 32 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source;		case 31 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source;		case 30 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source;		case 29 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source;		case 28 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source;		case 27 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source;		case 26 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source;		case 25 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source;		case 24 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source;		case 23 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source;		case 22 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source;		case 21 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source;		case 20 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source;		case 19 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source;		case 18 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source;		case 17 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source;		case 16 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source;		case 15 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source;		case 14 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source;		case 13 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source;		case 12 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source;		case 11 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source;		case 10 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source;		case 9 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source;		case 8 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source;		case 7 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source;		case 6 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source;		case 5 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source;		case 4 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source;		case 3 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source;		case 2 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source;		case 1 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source;		}	}	Op<T> Oper;};const unsigned StepSize = 40;template<typename T, template<typename> class Operation >class UnrollerWrap{public:	__forceinline void AssignmentIterate(T* dest, T* source, unsigned size)	{		unsigned count = (size) % StepSize;		r1.Assign(dest, source, count);				while( count < size )		{			r1.Assign( dest, source, StepSize );			count += StepSize;		}        }        Unroller<T, Operation > r1;}


[Edited by - Guthur on December 7, 2008 5:34:47 PM]
Innovation not reiterationIf at any point I look as if I know what I'm doing don't worry it was probably an accident.
One way to employ templates might be to let it determine if count is cheaply available or not:

#include <iterator>#include <vector>#include <list>#include <algorithm>#include <cassert>#include <iostream>#include <cstdlib>template <typename I1, typename I2, typename ItType>I2 do_copy(I1 first, I1 last, I2 out, ItType){    std::cout << "Plain\n";    while(first != last)    {        *out = *first;        ++out;        ++first;    }    return out;}//if source iterators are not random access, last - first would not be available and distance would be slowtemplate <typename I1, typename I2>I2 do_copy(I1 first, I1 last, I2 out, std::random_access_iterator_tag){    std::cout << "Optimized\n";    int count = last - first;    int n = (count + 7) / 8;    switch ((count) % 8) {        case 0: do { *out++ = *first++;        case 7:      *out++ = *first++;        case 6:      *out++ = *first++;        case 5:      *out++ = *first++;        case 4:      *out++ = *first++;        case 3:      *out++ = *first++;        case 2:      *out++ = *first++;        case 1:      *out++ = *first++;                } while (--n > 0);    }    return out;}template<typename I1, typename I2>I2 copy(I1 first, I1 last, I2 out){   return do_copy(first, last, out, typename std::iterator_traits<I1>::iterator_category());}int main(){    const unsigned size = 1234;    std::vector<int> a(size);    int b[size];    int c[size];    std::list<int> d;    int e[size];    std::generate_n(b, size, std::rand);    ::copy(b, b + size, a.begin());    ::copy(a.begin(), a.end(), std::back_inserter(d));    ::copy(b, b + size, c);    ::copy(d.begin(), d.end(), e);    assert(std::equal(a.begin(), a.end(), b ));    assert(std::equal(a.begin(), a.end(), c ));    assert(std::equal(a.begin(), a.end(), d.begin()));    assert(std::equal(a.begin(), a.end(), e));}
For furthur studying on the idea of fast array copying, I highly recommend a look into the yasli_vector class from the Loki library. It has some brilliant scheme for detecting when it is safe to use memcpy for example, and also essentially allows classes to specify if/how they can be move-constructed, iirc.[cool]
"In order to understand recursion, you must first understand recursion."
My website dedicated to sorting algorithms

This topic is closed to new replies.

Advertisement