You might take a look at this: An optimized version of std::copy with boost
And about Duff's device not being readable - it's supposed to be fast not readable. Fast and readable often exclude each other (you should consider if it is worth to make it that fast). However, if the unreadability has a nice interface for the end user and limited to small amounts of code, why not?
Loop Unrolling
Indeed, which the template gives you :)
These things really are groovy, I can't believe I avoided them for so long....wasted youth :\ :p
EDIT: Did some further study, and realised it was memcpy that of course was making all the gains :), but i did implement one that could use custom functionality, sort of policy based design, i think :s. But unfortuately i could only get close to the compilers for loop, not quite beat it, 2,000,000 ops for = 302ms, unroller 312ms :| Ah well it was interesting but i think i need to get back to my portfolio now :)
P.S I had thought about integrating some multithreading but realised then it would not really be a fair test against the for loop.
heres alittle code, i removed alot of repetition in the specializations to keep it small, gives the idea at least.
EDIT: This was the finalish version, no better really just less code, just put it up for completeness, Compilers are too good (generally) to spend ages trying to beat :p
[Edited by - Guthur on December 7, 2008 5:34:47 PM]
These things really are groovy, I can't believe I avoided them for so long....wasted youth :\ :p
EDIT: Did some further study, and realised it was memcpy that of course was making all the gains :), but i did implement one that could use custom functionality, sort of policy based design, i think :s. But unfortuately i could only get close to the compilers for loop, not quite beat it, 2,000,000 ops for = 302ms, unroller 312ms :| Ah well it was interesting but i think i need to get back to my portfolio now :)
P.S I had thought about integrating some multithreading but realised then it would not really be a fair test against the for loop.
heres alittle code, i removed alot of repetition in the specializations to keep it small, gives the idea at least.
template<typename T>struct InitialiseDuo{ inline void DoOp(T &Dest, T &Source) { srand(12); Dest = rand(); Source = rand(); }};template <typename T, unsigned unrollCount = 0, template<typename> class Op>class Unroller{public: __forceinline void Assign ( T* Dest, T* Source ) { //Oper.DoOp(Dest, Source); } Op<T> Oper;};template <typename T, template<typename> class Op>class Unroller<T, 1, Op>{public: __forceinline void Assign ( T* Dest, T* Source ) { Oper.DoOp(Dest[0], Source[0]); } Op<T> Oper;};.....****......template <typename T, template<typename> class Op>class Unroller<T, 20, Op>{public: __forceinline void Assign ( T* Dest, T* Source ) { Oper.DoOp(Dest[0], Source[0]); Oper.DoOp(Dest[1], Source[1]); Oper.DoOp(Dest[2], Source[2]); Oper.DoOp(Dest[3], Source[3]); Oper.DoOp(Dest[4], Source[4]); Oper.DoOp(Dest[5], Source[5]); Oper.DoOp(Dest[6], Source[6]); Oper.DoOp(Dest[7], Source[7]); Oper.DoOp(Dest[8], Source[8]); Oper.DoOp(Dest[9], Source[9]); Oper.DoOp(Dest[10], Source[10]); Oper.DoOp(Dest[11], Source[11]); Oper.DoOp(Dest[12], Source[12]); Oper.DoOp(Dest[13], Source[13]); Oper.DoOp(Dest[14], Source[14]); Oper.DoOp(Dest[15], Source[15]); Oper.DoOp(Dest[16], Source[16]); Oper.DoOp(Dest[17], Source[17]); Oper.DoOp(Dest[18], Source[18]); Oper.DoOp(Dest[19], Source[19]); } Op<T> Oper;};//template <typename T, unsigned unrollCount, template<typename> class Op> unsigned Unroller<T, unrollCount, Op >::CopySize = sizeof(T) * unrollCount;template<typename T, template<typename> class Operation >class UnrollerWrap{public: __forceinline void AssignmentIterate(T* dest, T* source, unsigned size) { unsigned count = (size) % 20; switch( count ) { case 1: size -= count; r1.Assign(&dest[size], &source[size]); break; case 2: size -= count; r2.Assign(&dest[size], &source[size]); break; case 3: size -= count; r3.Assign(&dest[size], &source[size]); break; case 4: size -= count; r4.Assign(&dest[size], &source[size]); break; case 5: size -= count; r5.Assign(&dest[size], &source[size]); break; case 6: size -= count; r6.Assign(&dest[size], &source[size]); break; case 7: size -= count; r7.Assign(&dest[size], &source[size]); break; case 8: size -= count; r8.Assign(&dest[size], &source[size]); break; case 9: size -= count; r9.Assign(&dest[size], &source[size]); break; case 10: size -= count; r10.Assign(&dest[size], &source[size]); break; case 11: size -= count; r11.Assign(&dest[size], &source[size]); break; case 12: size -= count; r12.Assign(&dest[size], &source[size]); break; case 13: size -= count; r13.Assign(&dest[size], &source[size]); break; case 14: size -= count; r14.Assign(&dest[size], &source[size]); break; case 15: size -= count; r15.Assign(&dest[size], &source[size]); break; case 16: size -= count; r16.Assign(&dest[size], &source[size]); break; case 17: size -= count; r17.Assign(&dest[size], &source[size]); break; case 18: size -= count; r18.Assign(&dest[size], &source[size]); break; case 19: size -= count; r19.Assign(&dest[size], &source[size]); break; } count = size / 20; switch (count % 10) { case 0: do { size -= 20; r20.Assign(&dest[size], &source[size]); case 9: size -= 20; r20.Assign(&dest[size], &source[size]); case 8: size -= 20; r20.Assign(&dest[size], &source[size]); case 7: size -= 20; r20.Assign(&dest[size], &source[size]); case 6: size -= 20; r20.Assign(&dest[size], &source[size]); case 5: size -= 20; r20.Assign(&dest[size], &source[size]); case 4: size -= 20; r20.Assign(&dest[size], &source[size]); case 3: size -= 20; r20.Assign(&dest[size], &source[size]); case 2: size -= 20; r20.Assign(&dest[size], &source[size]); case 1: size -= 20; r20.Assign(&dest[size], &source[size]); } while (size > 0); } }private: Unroller<T, 1, Operation > r1; Unroller<T, 2, Operation > r2; Unroller<T, 3, Operation > r3; Unroller<T, 4, Operation > r4; Unroller<T, 5, Operation > r5; Unroller<T, 6, Operation > r6; Unroller<T, 7, Operation > r7; Unroller<T, 8, Operation > r8; Unroller<T, 9, Operation > r9; Unroller<T, 10, Operation > r10; Unroller<T, 11, Operation > r11; Unroller<T, 12, Operation > r12; Unroller<T, 13, Operation > r13; Unroller<T, 14, Operation > r14; Unroller<T, 15, Operation > r15; Unroller<T, 16, Operation > r16; Unroller<T, 17, Operation > r17; Unroller<T, 18, Operation > r18; Unroller<T, 19, Operation > r19; Unroller<T, 20, Operation > r20;};
EDIT: This was the finalish version, no better really just less code, just put it up for completeness, Compilers are too good (generally) to spend ages trying to beat :p
template<typename T>struct InitialiseDuo{ __forceinline void DoOp(T &Dest, T &Source) { .... }};template <typename T, template<typename> class Op>class Unroller{public: __forceinline void Assign ( T* &Dest, T* &Source, const unsigned &iterations ) { switch( iterations ) { case 40 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source; case 39 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source; case 38 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source; case 37 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source; case 36 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source; case 35 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source; case 34 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source; case 33 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source; case 32 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source; case 31 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source; case 30 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source; case 29 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source; case 28 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source; case 27 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source; case 26 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source; case 25 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source; case 24 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source; case 23 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source; case 22 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source; case 21 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source; case 20 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source; case 19 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source; case 18 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source; case 17 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source; case 16 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source; case 15 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source; case 14 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source; case 13 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source; case 12 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source; case 11 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source; case 10 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source; case 9 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source; case 8 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source; case 7 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source; case 6 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source; case 5 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source; case 4 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source; case 3 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source; case 2 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source; case 1 : Oper.DoOp(*Dest, *Source); ++Dest; ++Source; } } Op<T> Oper;};const unsigned StepSize = 40;template<typename T, template<typename> class Operation >class UnrollerWrap{public: __forceinline void AssignmentIterate(T* dest, T* source, unsigned size) { unsigned count = (size) % StepSize; r1.Assign(dest, source, count); while( count < size ) { r1.Assign( dest, source, StepSize ); count += StepSize; } } Unroller<T, Operation > r1;}
[Edited by - Guthur on December 7, 2008 5:34:47 PM]
One way to employ templates might be to let it determine if count is cheaply available or not:
#include <iterator>#include <vector>#include <list>#include <algorithm>#include <cassert>#include <iostream>#include <cstdlib>template <typename I1, typename I2, typename ItType>I2 do_copy(I1 first, I1 last, I2 out, ItType){ std::cout << "Plain\n"; while(first != last) { *out = *first; ++out; ++first; } return out;}//if source iterators are not random access, last - first would not be available and distance would be slowtemplate <typename I1, typename I2>I2 do_copy(I1 first, I1 last, I2 out, std::random_access_iterator_tag){ std::cout << "Optimized\n"; int count = last - first; int n = (count + 7) / 8; switch ((count) % 8) { case 0: do { *out++ = *first++; case 7: *out++ = *first++; case 6: *out++ = *first++; case 5: *out++ = *first++; case 4: *out++ = *first++; case 3: *out++ = *first++; case 2: *out++ = *first++; case 1: *out++ = *first++; } while (--n > 0); } return out;}template<typename I1, typename I2>I2 copy(I1 first, I1 last, I2 out){ return do_copy(first, last, out, typename std::iterator_traits<I1>::iterator_category());}int main(){ const unsigned size = 1234; std::vector<int> a(size); int b[size]; int c[size]; std::list<int> d; int e[size]; std::generate_n(b, size, std::rand); ::copy(b, b + size, a.begin()); ::copy(a.begin(), a.end(), std::back_inserter(d)); ::copy(b, b + size, c); ::copy(d.begin(), d.end(), e); assert(std::equal(a.begin(), a.end(), b )); assert(std::equal(a.begin(), a.end(), c )); assert(std::equal(a.begin(), a.end(), d.begin())); assert(std::equal(a.begin(), a.end(), e));}
For furthur studying on the idea of fast array copying, I highly recommend a look into the yasli_vector class from the Loki library. It has some brilliant scheme for detecting when it is safe to use memcpy for example, and also essentially allows classes to specify if/how they can be move-constructed, iirc.[cool]
This topic is closed to new replies.
Advertisement
Popular Topics
Advertisement