myBitString_X86_64.H

Go to the documentation of this file.
00001 #ifndef MYBITSTRING_HEADER_
00002 #define MYBITSTRING_HEADER_
00003 
00010 #include <new>
00011 using std::nothrow;
00012 
00013 #include "utils.H"
00014 
00015 #if 0
00016  // use 32 bits
00017  typedef unsigned int BS_unsigned_int;
00018  typedef signed int BS_signed_int;
00019  #define DW_per_BS_unsigned_int 1
00020  #define BS_shifts 5
00021  #define BS_bits 32 
00022  #define BS_mask 31 
00023 #else
00024  // use 64 bits
00025  typedef unsigned long int BS_unsigned_int;
00026  typedef signed long int BS_signed_int;
00027  #define DW_per_BS_unsigned_int 2
00028  #define BS_shifts 6
00029  #define BS_bits 64 
00030  #define BS_mask 63 
00031 #endif
00032 
00033 class myBitString
00034 {
00035 protected:
00036   int size;
00037   BS_unsigned_int *data;
00038   inline void resize(int newsize)
00039     {
00040       if (newsize < size) return;
00041       //cerr << "Resize " << size << " to " << newsize << endl;
00042       BS_unsigned_int *olddata = data;
00043       data = new(nothrow) BS_unsigned_int[newsize];
00044       if (!data)
00045         {
00046           cerr << "myBitString.resize(" << newsize << ") out of memory" << endl;
00047           exit(1);
00048         }
00049       for (int i=0; i<size; i++) data[i] = olddata[i];
00050       for (int i=size; i< newsize; i++) data[i]=0;
00051       size=newsize;
00052       if (olddata) delete [] olddata;
00053     }
00054 public:
00055   inline void optisize(void)
00056     {
00057       int i=size-1;
00058       while (i>=0 && data[i]==0) i--;
00059       if (i==size-1) return; // nothing to do...
00060 
00061       if (size<128/DW_per_BS_unsigned_int)
00062        {
00063          // to avoid heap pollution for small sizes, simply decrease
00064          // the size without reallocating any heap memory
00065 #ifdef VERBOSE
00066          cout << "MyBitString::Optisize, no reallocate " << size << " to " << i+1 << endl;
00067 #endif
00068          size=i+1; return;
00069        }
00070 
00071 #ifdef VERBOSE
00072       cout << "MyBitString::Optisize, reallocate" << size << " to " << i+1 << endl;
00073 #endif
00074       if (i<0) { delete [] data; data=NULL; size=0; return; } // now empty
00075 
00076       size=i+1; // new size
00077       BS_unsigned_int *olddata = data;
00078       data = new(nothrow) BS_unsigned_int[size];
00079       if (!data)
00080         {
00081           cerr << "myBitString.optisize(" << size << ") out of memory" << endl;
00082           exit(1);
00083         }
00084       for (; i>=0; --i) data[i] = olddata[i];
00085       if (olddata) delete [] olddata;
00086     }
00087 
00088   inline myBitString(void) : size(0), data(NULL) { }
00089   inline ~myBitString(void)
00090     {
00091       if (data) delete [] data;
00092       data=NULL;
00093       size=0;
00094     }
00095   inline myBitString(const myBitString &rhs) : size(rhs.size), data(NULL)
00096    {
00097      data = new(nothrow) BS_unsigned_int[size];
00098      if (!data)
00099        {
00100          cerr << "myBitString.operator=(" << size << ") out of memory" << endl;
00101          exit(1);
00102        }
00103      for (int i=size-1; i>=0; --i) data[i] = rhs.data[i];
00104    }
00105   inline myBitString& operator= (const myBitString &rhs)
00106    {
00107      if (data) delete [] data;
00108      size=rhs.size;
00109      data = new(nothrow) BS_unsigned_int[size];
00110      if (!data)
00111        {
00112          cerr << "myBitString.operator=(" << size << ") out of memory" << endl;
00113          exit(1);
00114        }
00115      for (int i=size-1; i>=0; --i) data[i] = rhs.data[i];
00116      return *this;
00117    }
00118   inline int count(const bool b = true) const
00119     {
00120       int z=0;
00121       if (!b) return -1;
00122       for (int i=0; i<size; i++)
00123         {
00124           register BS_unsigned_int h = data[i];
00125           while (h)
00126             {
00127               if (h&1) z++;
00128               h>>=1;
00129             }
00130         }
00131       return z;
00132     }
00133   inline int first(const bool b = true) const
00134     {
00135       if (b)
00136         {
00137           for (int i=0; i<size; i++)
00138             {
00139               register BS_unsigned_int h = data[i];
00140               if (h)
00141                 {
00142                   BS_signed_int j;
00143                   asm("bsf %0,%1" : "+g" (h), "=c" (j) );
00144                   return BS_bits*i+j;
00145                 }
00146             }
00147           //cerr << "first(1)=-1" << endl;
00148           return -1;
00149         }
00150       else
00151         {
00152           cerr << "myBitString.first(0) not implemented" << endl;
00153           exit(1);
00154         }
00155     }
00156   inline int next(int pos, const bool b = true) const
00157     {
00158       int i,j;
00159       if (pos<0) return first(b);
00160       pos++;
00161       if (b)
00162         {
00163           i=pos>>BS_shifts;
00164           j=pos&BS_mask;
00165           if (i>=size) return -1;
00166           register BS_unsigned_int h = data[i];
00167           h>>=j;
00168           if (h)
00169             {
00170               BS_signed_int j;
00171               asm("bsf %0,%1" : "+g" (h), "=c" (j) );
00172               return pos+j;
00173             }
00174           else
00175             {
00176               j = 0;
00177               for (i++;i<size; i++)
00178                 {
00179                   h = data[i];
00180                   if (h)
00181                     {
00182                       BS_signed_int j;
00183                       asm("bsf %0,%1" : "+g" (h), "=c" (j) );
00184                       return BS_bits*i+j;
00185                     }
00186                 }
00187             }
00188           return -1;
00189         }
00190       else
00191         {
00192           cerr << "myBitString.next(0) not implemented" << endl;
00193           exit(1);
00194         }
00195     }
00196   inline int last(const bool b = true) const
00197     {
00198       if (b)
00199         {
00200           for (register int i=size-1; i>=0; i--)
00201             {
00202               //cerr << "i=" << i << endl;
00203               register BS_unsigned_int h = data[i];
00204               if (h)
00205                 {
00206                   BS_signed_int j;
00207                   asm("bsr %0,%1" : "+g" (h), "=c" (j) );
00208                   return BS_bits*i+j;
00209                 }
00210             }
00211           //cerr << "last(1)=-1"<<endl;
00212           return -1;
00213         }
00214       else
00215         {
00216           cerr << "myBitString.last(0) not implemented" << endl;
00217           exit(1);
00218         }
00219     }
00220   inline int prev(int pos, const bool b = true) const
00221     {
00222       if (pos>BS_bits*size) return last(b);
00223       if (pos<=0) return -1;
00224       pos--;
00225       if (b)
00226         {
00227           int i=pos>>BS_shifts;
00228           int j=pos&BS_mask;
00229           register BS_unsigned_int h = data[i];
00230           h<<=BS_mask-j;
00231           if (h)
00232             {
00233               BS_signed_int j;
00234               asm("bsr %0,%1" : "+g" (h), "=c" (j) );
00235               return pos-(BS_mask-j);
00236             }
00237           else
00238             {
00239               for (i--;i>=0; i--)
00240                 {
00241                   h = data[i];
00242                   if (h)
00243                     {
00244                       BS_signed_int j;
00245                       asm("bsr %0,%1" : "+g" (h), "=c" (j) );
00246                       return BS_bits*i+j;
00247                     }
00248                 }
00249             }
00250           return -1;
00251         }
00252       else
00253         {
00254           cerr << "myBitString.prev(0) not implemented" << endl;
00255           exit(1);
00256         }
00257     }
00258   inline void invert(const int pos)
00259     {
00260       //cerr << "invert("<<pos<<")"<<endl;
00261       int i=pos>>BS_shifts; // BS_bits bits = 2^BS_shifts
00262       if (i>=size) resize(i+1);
00263 
00264       // remark: as we modify only one bit, we use btcl instead of btcq
00265       asm volatile ("btcl %1,(%0)" : : "q" (data), "r" (pos) : "cc"); // bit-test and complement
00266       // actually data[pos/BS_bits] gets modified (but we do not inform the compiler about this...)
00267     }
00268   inline bool test_and_invert(const int pos)
00269     {
00270       //cerr << "invert("<<pos<<")"<<endl;
00271       int i=pos>>BS_shifts; // BS_bits bits = 2^BS_shifts
00272       if (i>=size) resize(i+1);
00273       bool ret;
00274       asm ("btcl %[pos],(%[data])\n\tsetc %[flag]" : [flag] "=g" (ret) : [data] "q" (data), [pos] "r" (pos): "cc"); // bit-test and complement
00275        // actually data[pos/BS_bits] gets modified (but we do not inform the compiler about this...)
00276       return ret;
00277     }
00278   inline bool test(const int pos) const
00279     {
00280       if ((pos>>BS_shifts)>=size) return false;
00281       bool r;
00282       asm ("btl %2,(%1)\n\tsetc %0" : "=r" (r) : "q" (data), "r" (pos) : "cc"); // bit-test and r=CarryFlag
00283       return r;
00284     }
00285   inline void set(const int pos, const bool b)
00286     {
00287       //cerr << "set("<<pos<< "," << b << ")" << endl;
00288       register int i=pos>>BS_shifts;
00289       if (i>=size)
00290        {
00291         if (b)
00292           resize(i+1);
00293         else
00294           return;
00295        }
00296       if (b)
00297         asm volatile ("btsl %1,(%0)" : : "q" (data), "r" (pos) : "cc"); //data[i] |= bitmask[j];
00298       else
00299         asm volatile ("btrl %1,(%0)" : : "q" (data), "r" (pos) : "cc"); //data[i] &= ~bitmask[j];
00300       // actually data[pos/BS_bits] gets modified (but we do not inform the compiler about this...)
00301     }
00302   inline void set(const int pos)
00303     {
00304       register int i=pos>>BS_shifts;
00305       if (i>=size) resize(i+1);
00306       asm  ("btsl %1,(%0)" : : "q" (data), "r" (pos) : "cc"); //data[i] |= bitmask[j];
00307       // actually data[pos/BS_bits] gets modified (but we do not inform the compiler about this...)
00308     }
00309   inline void _xor(const myBitString &s)
00310     {
00311       int i=s.size-1;
00312       while (i>=0 && s.data[i]==0) i--; // Nullen des zweiten BitStrings ignorieren
00313       if (i>=size) // <-> i+1>size
00314         {
00315           cerr << "myBitString.xor calls resize" << endl;
00316           resize(i+1);
00317         }
00318       for (; i>=0; --i)
00319         data[i] ^= s.data[i];
00320     }
00321   inline const myBitString& operator ^= (const myBitString &s)
00322     {
00323       _xor(s); return *this;   
00324     }
00325 
00326   inline void _and(const myBitString &s1, const myBitString &s2)
00327   // der eigene Bitstring wird durch s1&s2 (komponentenweise) überschrieben
00328     {
00329       int i = MIN(s1.size,s2.size)-1;
00330       if (i>=size) // <-> i+1>size
00331         {
00332           // CAUTION:  (this == &s1 || this == &s2) -> BOOM!!
00333           // But since (this == &s1 || this == &s2) also
00334           // implies MIN(s1.size,s2.size)-1 < size, we can safely
00335           // destroy the array...
00336           if (data) delete [] data; // alten Vektor löschen
00337           size=i+1;
00338           data = new BS_unsigned_int[size];
00339           // initialisieren der Bits des neuen Vektors nicht notwendig (geschieht ja gleich durch das and)
00340         }
00341       else
00342        {
00343          register int j=size-1;
00344          while (j>i) data[j--]=0; // überschüssige Bits im Resultat löschen
00345        }
00346       for (; i>=0; --i)
00347         data[i] = s1.data[i] & s2.data[i];
00348     }
00349 
00350   template<typename T> void test_and_add_carry(const myBitString &s2, T &CarryVec) const
00351   // generic version, but still fast (will be automatically used, if no
00352   // specialized version is defined below)
00353   // temporäre (komponentenweise) AND-Verknüpfung den eigenen Bitstrings
00354   // mit s2; den Vektor CarryVector an den korrespondierenden gesetzten
00355   // Positionen inkrementieren.
00356     {
00357       const int s = MIN(size,s2.size);
00358       const int Bits = 8*sizeof(BS_unsigned_int);
00359       for (int i=0; i<s; ++i)
00360        {
00361          register BS_unsigned_int h = data[i] & s2.data[i];
00362          register int j=0;
00363          while (h)
00364           {
00365             if (h&0xff)
00366              {
00367                CarryVec[Bits*i+j  ]+=h&1;
00368                CarryVec[Bits*i+j+1]+=(h>>1)&1;
00369                CarryVec[Bits*i+j+2]+=(h>>2)&1;
00370                CarryVec[Bits*i+j+3]+=(h>>3)&1;
00371                CarryVec[Bits*i+j+4]+=(h>>4)&1;
00372                CarryVec[Bits*i+j+5]+=(h>>5)&1;
00373                CarryVec[Bits*i+j+6]+=(h>>6)&1;
00374                CarryVec[Bits*i+j+7]+=(h>>7)&1;
00375              }
00376             h>>=8;
00377             j+=8;
00378           }
00379        }
00380     }
00381 
00382 #if 1 && defined(ASM_X86_64)
00383  #if defined(VERBOSE)
00384   #warning "support for test_and_add_carry(...), (uint16, using SSE2)"
00385  #endif
00386   void test_and_add_carry(const myBitString &s2, unsigned short int CarryVec[]) const
00387   // temporäre (komponentenweise) AND-Verknüpfung den eigenen Bitstrings
00388   // mit s2; den Vektor CarryVector an den korrespondierenden gesetzten
00389   // Positionen inkrementieren.
00390     {
00391       //cout << "spezialisierte Variante uint16 (using SSE2)" << endl;
00392   
00393       static const unsigned short int PackedMultipl[8] __attribute__ ((aligned (16))) = { 128,64,32,16,8,4,2,1 };
00394        // these multipliers are needed to shift the bits of the packed words to the desired position
00395 
00396 #ifdef DEBUG
00397       if (reinterpret_cast<unsigned long int>(&PackedMultipl[0]) & 0x0fUL)
00398        {
00399          cerr << "Address of PackedMultipl is " << &PackedMultipl[0]
00400               << " which is not 16 byte aligned as requested!" << endl;
00401        }
00402 #endif
00403 
00404       long int s = MIN(size,s2.size)*DW_per_BS_unsigned_int -1; // doublewords (minus 1) to process
00405       // int s = MIN(last(),s2.last()); if (s>0) s/=32;
00406 
00407       // important:
00408       //  (i) we preload -4(%[data1],%[i],4) and -4(%[data2],%[i],4);
00409       //      fencepost error (reading -4(%[data]), resp. -4(%[data2]) is avoided by using %%eax instead of %[i]
00410       //  (ii) CarryVec MUST be 16-byte aligned!! (otherwise the program segfaults!)
00411 
00412       asm volatile ( \
00413        "test %[i],%[i] \n\t" \
00414        "js 9f \n\t" \
00415        "movl $0x00010001,%%eax \n\t" \
00416        "mov %[i],%%rdx \n\t" \
00417        "movd %%eax,%%xmm7 \n\t" \
00418        "movdqa %[PM],%%xmm6 \n\t" \
00419        "movl (%[data1],%[i],4),%%eax \n\t" \
00420        "pshufd $0x00,%%xmm7,%%xmm7 \n\t" \
00421        "shl $6,%%rdx \n\t" \
00422        "andl (%[data2],%[i],4),%%eax \n\t" \
00423        "add %[V],%%rdx \n\t" \
00424        "movd %%eax,%%xmm4 \n\t" \
00425        "mov %[i],%%rax \n\t" \
00426        "1: \n\t" \
00427        "dec %%rax \n\t" \
00428        "cmovs %[i],%%rax \n\t" \
00429        "prefetchnta -72(%[data2],%[i],4) \n\t" \
00430        "pshuflw $0x00,%%xmm4,%%xmm1 \n\t" \
00431        "pshuflw $0x55,%%xmm4,%%xmm3 \n\t" \
00432        "movd (%[data1],%%rax,4),%%xmm5 \n\t" \
00433        "pshufd $0x00,%%xmm1,%%xmm1 \n\t" \
00434        "pshufd $0x00,%%xmm3,%%xmm2 \n\t" \
00435        "pmullw %%xmm6,%%xmm1 \n\t" \
00436        "movd (%[data2],%%rax,4),%%xmm4 \n\t" \
00437        "movdqa %%xmm1,%%xmm0 \n\t" \
00438        "psrlw $15,%%xmm1 \n\t" \
00439        "pmullw %%xmm6,%%xmm2 \n\t" \
00440        "psrlw $7,%%xmm0 \n\t" \
00441        "movdqa %%xmm2,%%xmm3 \n\t" \
00442        "psrlw $7,%%xmm2 \n\t" \
00443        "pand %%xmm7,%%xmm0 \n\t" \
00444        "psrlw $15,%%xmm3 \n\t" \
00445        "pand %%xmm7,%%xmm2 \n\t" \
00446        "paddw (%%rdx),%%xmm0 \n\t" \
00447        "paddw 16(%%rdx),%%xmm1 \n\t" \
00448        "pand %%xmm5,%%xmm4 \n\t" \
00449        "paddw 32(%%rdx),%%xmm2 \n\t" \
00450        "paddw 48(%%rdx),%%xmm3 \n\t" \
00451        "movdqa %%xmm0,(%%rdx) \n\t" \
00452        "movdqa %%xmm1,16(%%rdx) \n\t" \
00453        "movdqa %%xmm2,32(%%rdx) \n\t" \
00454        "movdqa %%xmm3,48(%%rdx) \n\t" \
00455        "3: sub $64,%%rdx \n\t" \
00456        "dec %[i] \n\t" \
00457        "jns 1b \n\t" \
00458        "9: " \
00459        : [i] "+r" (s)
00460        : [PM] "m" (PackedMultipl[0]), [data1] "r" (data), [data2] "r" (s2.data), [V] "D" (CarryVec)
00461        : "memory", "cc", "rax", "rdx", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
00462     }
00463 #else 
00464   void test_and_add_carry(const myBitString &s2, unsigned short int CarryVec[]) const
00465   // temporäre (komponentenweise) AND-Verknüpfung den eigenen Bitstrings
00466   // mit s2; den Vektor CarryVector an den korrespondierenden gesetzten
00467   // Positionen inkrementieren.
00468     {
00469       //cout << "spezialisierte Variante uint16" << endl;
00470       long int s = MIN(size,s2.size)-1;
00471       asm volatile ( \
00472        "test %[i],%[i] \n\t" \
00473        "js 9f \n\t" \
00474        "1: movl (%[data1],%[i],4),%%eax \n\t" \
00475        "mov %[i],%%rdx \n\t" \
00476        "andl (%[data2],%[i],4),%%eax \n\t" \
00477        "shl $5,%%rdx \n\t" \
00478        "2: shrl %%eax \n\t" \
00479        "adcw $0,(%[V],%%rdx,2) \n\t" \
00480        "shrl %%eax \n\t" \
00481        "adcw $0,2(%[V],%%rdx,2) \n\t" \
00482        "shrl %%eax \n\t" \
00483        "adcw $0,4(%[V],%%rdx,2) \n\t" \
00484        "shrl %%eax \n\t" \
00485        "adcw $0,6(%[V],%%rdx,2) \n\t" \
00486        "shrl %%eax \n\t" \
00487        "adcw $0,8(%[V],%%rdx,2) \n\t" \
00488        "shrl %%eax \n\t" \
00489        "adcw $0,10(%[V],%%rdx,2) \n\t" \
00490        "shrl %%eax \n\t" \
00491        "adcw $0,12(%[V],%%rdx,2) \n\t" \
00492        "shrl %%eax \n\t" \
00493        "adcw $0,14(%[V],%%rdx,2) \n\t" \
00494        "add $8,%%rdx \n\t" \
00495        "test %%eax,%%eax \n\t" \
00496        "jnz 2b \n\t" \
00497        "dec %[i] \n\t" \
00498        "jns 1b \n\t" \
00499        "9: " \
00500        : [i] "+r" (s)
00501        : [data1] "r" (data), [data2] "r" (s2.data), [V] "D" (CarryVec)
00502        : "memory", "cc", "eax", "rdx");
00503     }
00504 #endif
00505 
00506 
00507 #if 0
00508   void test_and_add_carry(const myBitString &s2, unsigned int CarryVec[]) const
00509   // temporäre (komponentenweise) AND-Verknüpfung den eigenen Bitstrings
00510   // mit s2; den Vektor CarryVector an den korrespondierenden gesetzten
00511   // Positionen inkrementieren.
00512     {
00513       //cout << "spezialisierte Variante uint32" << endl;
00514       int s = MIN(size,s2.size)-1;
00515       asm volatile ( \
00516        "test %[i],%[i] \n\t" \
00517        "js 9f \n\t" \
00518        "1: movl (%[data1],%[i],4),%%eax \n\t" \
00519        "movl %[i],%%edx \n\t" \
00520        "andl (%[data2],%[i],4),%%eax \n\t" \
00521        "shll $5,%%edx \n\t" \
00522        "2: btl $0,%%eax \n\t" \
00523        "adcl $0,(%[V],%%edx,4) \n\t" \
00524        "btl $1,%%eax \n\t" \
00525        "adcl $0,4(%[V],%%edx,4) \n\t" \
00526        "btl $2,%%eax \n\t" \
00527        "adcl $0,8(%[V],%%edx,4) \n\t" \
00528        "btl $3,%%eax \n\t" \
00529        "adcl $0,12(%[V],%%edx,4) \n\t" \
00530        "btl $4,%%eax \n\t" \
00531        "adcl $0,16(%[V],%%edx,4) \n\t" \
00532        "btl $5,%%eax \n\t" \
00533        "adcl $0,20(%[V],%%edx,4) \n\t" \
00534        "btl $6,%%eax \n\t" \
00535        "adcl $0,24(%[V],%%edx,4) \n\t" \
00536        "btl $7,%%eax \n\t" \
00537        "adcl $0,28(%[V],%%edx,4) \n\t" \
00538        "addl $8,%%edx \n\t" \
00539        "shrl $8,%%eax \n\t" \
00540        "jnz 2b \n\t" \
00541        "decl %[i] \n\t" \
00542        "jns 1b \n\t" \
00543        "9: " \
00544        : [i] "+r" (s)
00545        : [data1] "r" (data), [data2] "r" (s2.data), [V] "D" (CarryVec)
00546        : "memory", "cc", "eax", "edx");
00547     }
00548 #endif
00549 
00550 };
00551 
00552 #endif

Generated on Wed Nov 7 23:29:25 2007 for Qsieve by  doxygen 1.5.4