fakeheap.H

Go to the documentation of this file.
00001 #ifndef FAKEHEAP_HAEDER_
00002 #define FAKEHEAP_HEADER_
00003 
00010 #include <vector>
00011 #include <algorithm>
00012 
00013 /*
00014  Der Standard-Vektor wird um die typischen PriorityQueue-Funktionen erweitert.
00015  Es handelt sich aber nicht wirklich um eine PriorityQueue; die Schnittstellen
00016  dienen lediglich dazu, in bestimmten Programmstellen langsamere echte
00017  Priorityqueues durch dieses Fake zu ersetzen, sofern bestimmte
00018  Nebenbedingungen erfüllt werden:
00019 
00020  Zugriff ist nur blockweise gestattet. Zwischen push-Operationen einerseits
00021  und top/pop-Operationen andererseits ist "sort" aufzurufen.
00022 
00023  Ohne FakeHeap sind zwar alle Priority-Queues, die das STL-Interface
00024  verwenden, einsetzbar (z.B. Fibonacci-Heaps). Aber nun ist es möglich, auch
00025  Implementierungen, die unsortiert einfügen und ausgeben (und die Sortierung
00026  erst auf explizite Aufforderung vornehmen), einzusetzen. Dies bereitet
00027  einen größeren Raum für Effizienzvergleiche.
00028 
00029  Echte Priority-Queue-Implementierungen können selbstverständlich ebenfalls
00030  dieses Interface verwenden und den sort()-Aufruf einfach ignorieren.
00031 */
00032 
00033 
00034 //#define FAKEHEAP_DEBUG
00035 
00058 template <class T> class FakeHeap : private std::vector<T>
00059 {
00060 private:
00061 
00062 #ifdef FAKEHEAP_DEBUG
00063  mutable bool push_locked;
00064 #endif
00065 
00066 public:
00067 
00068 #ifdef FAKEHEAP_DEBUG
00069  inline FakeHeap() : std::vector<T>(), push_locked(false) { cout << "FakeHeap:: constructor..." << endl; };
00070  inline ~FakeHeap() { };
00071 #endif
00072 
00073 #ifdef FAKEHEAP_DEBUG
00074  inline void clear()
00075   {
00076     cout << "FakeHeap:: CLEAR..." << endl;
00077     std::vector<T>::clear();
00078     push_locked=false;
00079   }
00080 #else
00081  inline void clear() { std::vector<T>::clear(); }
00082 #endif
00083 
00084 
00085  inline bool empty() const { return std::vector<T>::empty(); }
00086 
00087  inline void sort(void)
00088   {
00089     //std::cout << "FakeHeap: sorting " << std::vector<T>::size() << " elements... " << std::flush;
00090     std::sort(this->begin(),this->end());
00091     //std::cout << "done." << std::endl;
00092 #ifdef FAKEHEAP_DEBUG
00093     push_locked=true;
00094 #endif
00095   }
00096  inline void push(const T &x)
00097   {
00098 #ifdef FAKEHEAP_DEBUG
00099     if (push_locked)
00100      { 
00101        std::cerr << "FakeHeap:: Hey?!!! push is locked!!!" << endl;
00102        push_back(x);
00103        sort();
00104        return;
00105      }
00106 #endif
00107     push_back(x);
00108   }
00109  inline void pop(void)
00110   {
00111 #ifdef FAKEHEAP_DEBUG
00112     if (!push_locked)
00113      {
00114        std::cerr << "FakeHeap:: Hey?!!! pop is locked!!!" << endl;
00115        //exit(1);
00116      }
00117     push_locked=true;
00118 #endif
00119     std::vector<T>::pop_back();
00120   }
00121  inline const T& top(void) const
00122   {
00123 #ifdef FAKEHEAP_DEBUG
00124     if (!push_locked)
00125      {
00126        std::cerr << "FakeHeap:: Hey?!!! top is locked!!!" << endl;
00127        //exit(1);
00128      }
00129     push_locked=true;
00130 #endif
00131     return std::vector<T>::back();
00132   } 
00133 };
00134 
00135 
00136 #if 1 && (defined (ASM_MMX) || defined(ASM_ATHLON) || defined(ASM_SSE) || defined(ASM_X86_64))
00137  #ifdef DEBUG
00138   #warning "new experimental FAKEHEAP specialization enabled"
00139  #endif
00140 // specialization of the fakeheap using optimized
00141 // median-of-three quicksort with insertion sort near the leaves.
00142 
00143 template<> class FakeHeap<TSieve_Delta>
00144 {
00145 private:
00146  static const int MORESIZE = 0x10000;
00147  TSieve_Delta* p;
00148  int size, capacity;
00149 
00150  inline void swap_p (const long int i, const long int j)
00151   {
00152     asm volatile( \
00153      "movq (%[p],%[i],8),%%mm0 \n\t" \
00154      "movq (%[p],%[j],8),%%mm1 \n\t" \
00155      "movq %%mm0,(%[p],%[j],8) \n\t" \
00156      "movq %%mm1,(%[p],%[i],8)" \
00157      :
00158      : [p] "r" (p), [i] "r" (i), [j] "r" (j)
00159      : "memory", "mm0", "mm1");
00160   }
00161 
00162  void quicksort(register long int l, register long int r)
00163   {
00164     // we use iteration instead of recursion, therefore we need a stack:
00165     struct { int L,R; } stack[256];
00166     int stackpointer = 0;
00167 
00168     // okay, let's start the iteration:
00169   iteration_loop:
00170     if (r>l)
00171      {
00172        if (r-l<32)
00173         {
00174           // insertion sort (when near the leaves)
00175 #if 0
00176           for (int i=l+1; i<=r; ++i)
00177            {
00178              TSieve_Delta v=p[i];
00179              int j=i;
00180              while (j>l && p[j-1].delta<v.delta) { p[j]=p[j-1]; --j; }
00181              p[j]=v;
00182            }
00183 #elif defined(ASM_ATHLON)
00184           asm volatile( \
00185            "movl %[l],%%edx \n\t" \
00186            "jmp 2f \n\t" \
00187            ".balign 16 \n\t" \
00188            "3: movl %%edx,%%ecx \n\t " \
00189            "decl %%ecx \n\t" \
00190            "movq (%[p],%%edx,8),%%mm1 \n\t" \
00191            "movl (%[p],%%edx,8),%%eax \n\t" \
00192            "0: cmpl (%[p],%%ecx,8),%%eax \n\t" \
00193            "jle 1f \n\t" \
00194            "movq (%[p],%%ecx,8),%%mm0 \n\t" \
00195            "movq %%mm0,8(%[p],%%ecx,8) \n\t" \
00196            "decl %%ecx \n\t" \
00197            "jns 0b \n\t" \
00198            "1: movq %%mm1,8(%[p],%%ecx,8) \n\t" \
00199            "2: incl %%edx \n\t" \
00200            "cmpl %[r],%%edx \n\t" \
00201            "jle 3b" \
00202            :
00203            : [p] "r" (p), [l] "g" (l), [r] "g" (r)
00204            : "cc", "memory", "eax", "ecx", "edx", "mm0", "mm1");
00205 #elif defined(ASM_X86_64)
00206           asm volatile( \
00207            "mov %[l],%%rdx \n\t" \
00208            "jmp 2f \n\t" \
00209            ".balign 16 \n\t" \
00210            "3: mov %%rdx,%%rcx \n\t " \
00211            "dec %%rcx \n\t" \
00212            "mov (%[p],%%rdx,8),%%rax \n\t" \
00213            "0: cmpl (%[p],%%rcx,8),%%eax \n\t" \
00214            "jle 1f \n\t" \
00215            "movq (%[p],%%rcx,8),%%mm0 \n\t" \
00216            "movq %%mm0,8(%[p],%%rcx,8) \n\t" \
00217            "dec %%rcx \n\t" \
00218            "jns 0b \n\t" \
00219            "1: mov %%rax,8(%[p],%%rcx,8) \n\t" \
00220            "2: inc %%rdx \n\t" \
00221            "cmp %[r],%%rdx \n\t" \
00222            "jle 3b" \
00223            :
00224            : [p] "r" (p), [l] "g" (l), [r] "g" (r)
00225            : "cc", "memory", "rax", "rcx", "rdx", "mm0");
00226 #else  /* defined(ASM_MMX) || defined(ASM_SSE) and not an Athlon */
00227           asm volatile( \
00228            "movl %[l],%%edx \n\t" \
00229            "jmp 2f \n\t" \
00230            "3: movl %%edx,%%ecx \n\t " \
00231            "sub $1,%%ecx \n\t" \
00232            "movq (%[p],%%edx,8),%%mm1 \n\t" \
00233            "movl (%[p],%%edx,8),%%eax \n\t" \
00234            "0: cmpl (%[p],%%ecx,8),%%eax \n\t" \
00235            "jle 1f \n\t" \
00236            "movq (%[p],%%ecx,8),%%mm0 \n\t" \
00237            "movq %%mm0,8(%[p],%%ecx,8) \n\t" \
00238            "sub $1,%%ecx \n\t" \
00239            "jns 0b \n\t" \
00240            "1: movq %%mm1,8(%[p],%%ecx,8) \n\t" \
00241            "2: add $1,%%edx \n\t" \
00242            "cmpl %[r],%%edx \n\t" \
00243            "jle 3b" \
00244            :
00245            : [p] "r" (p), [l] "g" (l), [r] "g" (r)
00246            : "cc", "memory", "eax", "ecx", "edx", "mm0", "mm1");
00247 #endif
00248         }
00249        else
00250         {
00251           // quicksort with median of three
00252           //std::cout << l << " " << r << ":" << std::endl;
00253           register long int i = (l+r)>>1;
00254           if (p[l].delta<p[r].delta) swap_p(l,r);
00255           if (p[l].delta<p[i].delta) swap_p(l,i);
00256           if (p[r].delta<p[i].delta) swap_p(r,i);
00257 #if 0
00258           // quicksort in C++
00259           const int pivot=p[r].delta;
00260           i=l;
00261           int j=r;
00262           while (true)
00263            {
00264              do ++i; while (p[i].delta>pivot);
00265              do --j; while (p[j].delta<pivot);
00266              if (i>=j) break;
00267              swap_p(i,j);
00268            }
00269           swap_p(i,r);
00270 #elif defined(ASM_ATHLON)  /* and ATHLON-XP! */
00271           // pivot element in %%eax (key) + %%mm2 (all data)
00272           asm( \
00273            "movq (%[p],%[r],8),%%mm2 \n\t" \
00274            "movl (%[p],%[r],8),%%eax \n\t" \
00275            "movl %[l],%[i] \n\t " \
00276            "movl %[r],%%ecx \n\t" \
00277            "0: incl %[i] \n\t"
00278            "cmpl (%[p],%[i],8),%%eax \n\t" \
00279            "jl 0b \n\t" \
00280            "prefetchw 192(%[p],%[i],8) \n\t" \
00281            "1: decl %%ecx \n\t"
00282            "cmpl (%[p],%%ecx,8),%%eax \n\t" \
00283            "jg 1b \n\t" \
00284            "cmpl %%ecx,%[i] \n\t" \
00285            "movq (%[p],%[i],8),%%mm0 \n\t" \
00286            "jge 2f \n\t" \
00287            "movq (%[p],%%ecx,8),%%mm1 \n\t" \
00288            "movq %%mm0,(%[p],%%ecx,8) \n\t" \
00289            "movq %%mm1,(%[p],%[i],8) \n\t" \
00290            "prefetchw -192(%[p],%%ecx,8) \n\t" \
00291            "jmp 0b \n\t" \
00292            "2: movq %%mm2,(%[p],%[i],8) \n\t" \
00293            "movq %%mm0,(%[p],%[r],8)" \
00294            : [i] "=&r" (i) // important: early clobbered as it may conflict with l!
00295            : [p] "r" (p), [l] "r" (l), [r] "r" (r)
00296            : "cc", "memory", "eax", "ecx", "mm0", "mm1", "mm2");
00297 #elif defined(ASM_X86_64)
00298           // pivot element in %%eax (key) + %%rax (all data)
00299           asm( \
00300            "mov (%[p],%[r],8),%%rax \n\t" \
00301            "mov %[l],%[i] \n\t " \
00302            "mov %[r],%%rcx \n\t" \
00303            "0: inc %[i] \n\t"
00304            "cmpl (%[p],%[i],8),%%eax \n\t" \
00305            "jl 0b \n\t" \
00306            "#prefetchw 192(%[p],%[i],8) \n\t" \
00307            "1: dec %%ecx \n\t"
00308            "cmpl (%[p],%%rcx,8),%%eax \n\t" \
00309            "jg 1b \n\t" \
00310            "cmp %%rcx,%[i] \n\t" \
00311            "movq (%[p],%[i],8),%%mm0 \n\t" \
00312            "jge 2f \n\t" \
00313            "movq (%[p],%%rcx,8),%%mm1 \n\t" \
00314            "movq %%mm0,(%[p],%%rcx,8) \n\t" \
00315            "movq %%mm1,(%[p],%[i],8) \n\t" \
00316            "#prefetchw -192(%[p],%%rcx,8) \n\t" \
00317            "jmp 0b \n\t" \
00318            "2: mov %%rax,(%[p],%[i],8) \n\t" \
00319            "movq %%mm0,(%[p],%[r],8)" \
00320            : [i] "=&r" (i) // important: early clobbered as it may conflict with l!
00321            : [p] "r" (p), [l] "r" (l), [r] "r" (r)
00322            : "cc", "memory", "rax", "rcx", "mm0", "mm1");
00323 #else  /* defined(ASM_MMX) || defined(ASM_SSE) and not an Athlon */
00324           // pivot element in %%eax (key) + %%mm2 (all data)
00325           asm( \
00326            "lea (%[p],%[r],8),%%ecx \n\t" \
00327            "lea (%[p],%[l],8),%[i] \n\t " \
00328            "movl (%%ecx),%%eax \n\t" \
00329            "movq (%%ecx),%%mm2 \n\t" \
00330            "0: add $8,%[i] \n\t"
00331            "cmpl (%[i]),%%eax \n\t" \
00332            "jl 0b \n\t" \
00333            "movq (%[i]),%%mm0 \n\t" \
00334            "1: sub $8,%%ecx \n\t"
00335            "cmpl (%%ecx),%%eax \n\t" \
00336            "jg 1b \n\t" \
00337            "cmpl %%ecx,%[i] \n\t" \
00338            "jge 2f \n\t" \
00339            "movq (%%ecx),%%mm1 \n\t" \
00340            "movq %%mm0,(%%ecx) \n\t" \
00341            "movq %%mm1,(%[i]) \n\t" \
00342            "jmp 0b \n\t" \
00343            "2: movq %%mm2,(%[i]) \n\t" \
00344            "subl %[p],%[i] \n\t" \
00345            "shrl $3,%[i] \n\t" \
00346            "movq %%mm0,(%[p],%[r],8)" \
00347            : [i] "=&r" (i) // important: early clobbered as it may conflict with l!
00348            : [p] "r" (p), [l] "r" (l), [r] "r" (r)
00349            : "cc", "memory", "eax", "ecx", "mm0", "mm1", "mm2");
00350 #endif
00351           //std::cout << l << " " << i << " " << r << std::endl;
00352 
00353           // to save stack space, we always push the larger part on stack
00354           if (r-i<i-l)
00355            {
00356              stack[stackpointer].L=l; stack[stackpointer].R=i-1; l=i+1;
00357            }
00358           else
00359            {
00360              stack[stackpointer].L=i+1; stack[stackpointer].R=r; r=i-1;
00361            }
00362           ++stackpointer;
00363           goto iteration_loop; // and iterate the smaller part
00364         }
00365      }
00366     if (stackpointer)
00367      {
00368        --stackpointer;
00369        l=stack[stackpointer].L; r=stack[stackpointer].R;
00370        goto iteration_loop;
00371      }
00372   }
00373 
00374 public:
00375 
00376  inline FakeHeap() : p(new TSieve_Delta[MORESIZE]), size(0), capacity(MORESIZE) { }
00377  inline ~FakeHeap()
00378   {
00379     delete [] p;
00380 #ifdef DEBUG
00381     MARK; cout << "cleared " << capacity*8/1024 << " KB." << endl;
00382 #endif
00383   }
00384 
00385  inline void clear() { size=0; }
00386  inline bool empty() const { return size==0; }
00387 
00388  inline void sort(void)
00389   {
00390     //std::cout << "FakeHeap<TSieve_Delta>: sorting " << size << " elements... " << std::flush;
00391     if (size>1)
00392      {
00393        quicksort(0,size-1);
00394        asm volatile ("emms");
00395      }
00396     //std::cout << "done." << std::endl;
00397 #if 0 || defined(DEBUG)
00398     for (int i=1; i<size; ++i) if (p[i-1].delta<p[i].delta) { std::cout << "not sorted " << i << ": " << p[i-1].delta << " " << p[i].delta << std::endl; MARK; exit(1); }
00399 #endif
00400     //for (int i=0; i<size; ++i) std::cout << p[i].delta << " ";
00401     //std::cout << std::endl;
00402   }
00403  inline void push(const TSieve_Delta &x)
00404   {
00405     if (size>=capacity)
00406      {
00407        TSieve_Delta* p_old = p;
00408        capacity+=MORESIZE;
00409        p = new TSieve_Delta[capacity];
00410        for (int i=0; i<size; ++i) p[i]=p_old[i];
00411        delete [] p_old;
00412      }
00413     p[size]=x; ++size;
00414     __builtin_prefetch (&p[size+24], 1, 1);
00415   }
00416  inline void pop(void)
00417   {
00418     --size;
00419   }
00420  inline const TSieve_Delta& top(void) const
00421   {
00422     __builtin_prefetch (&p[size-16], 0, 0);
00423     return p[size-1];
00424   } 
00425 };
00426 
00427 #endif
00428 
00429 #endif

Generated on Wed Nov 7 23:29:25 2007 for Qsieve by  doxygen 1.5.4