Sieving.cc

Go to the documentation of this file.
00001 
00006 #include "Sieving.H"
00007 #include "qsieve.H"
00008 #include "modulo.H"
00009 using namespace numtheory;
00010 
00011 #include "mpqsPolynom.H"
00012 extern CmpqsPolynom Polynom;
00013 
00014 
00015 int SieveOffset = -LogicalSieveSize;
00016 /* In general the logical sieve interval for each mpqs polynomial will exceed
00017    the size of the physical sieve. The logical sieve has to be sieved in
00018    several parts/steps. SieveOffset is the offset of the logical sieve at
00019    the starting point of a physical sieve.
00020  */
00021 
00022 
00023 // "virtual" forward declaration (will be resolved at link time!)
00024 // IMPORTANT: never ever link against the wrong version, this would result
00025 //            in strange behaviour and a glorious mess!!
00026 class StaticRelations
00027 {
00028  public:
00029   static void insert(CRelation *GL, const bool do_multi_combine_init=true);
00030 };
00031 
00032 TSieveElement SieveArray_[PhysicalSieveSize+64+8] __attribute__ ((aligned (64))); // sieve memory
00033 
00034 TSieveElement log_Primzahl_of_PrimePowers[StaticFactorbase::max_additional_Powers];
00035 // this is the place for converted logarithms of further prime-powers
00036 
00037 TSieveElement SieveControl::log_PrimeNumbers[StaticFactorbase::MaxSize] __attribute__ ((aligned (16)));
00038 TSieveElement SieveControl::log_PrimeNumbers_dirty[StaticFactorbase::MaxSize] __attribute__ ((aligned (16)));
00039 int SieveControl::myFBStartIndex = SieveControl::FBLowestStartIndex;
00040 int SieveControl::myFBStartIndexHint = SieveControl::FBLowestStartIndex;
00041 signed int SieveControl::DirtinessPegel = 0;
00042 
00043 // for efficient access the values (per sieve interval) are placed here.
00044 int CDeltaComputations::Delta_of_PrimeNumbers[StaticFactorbase::MaxSize][2] __attribute__ ((aligned (16)));
00045 int CDeltaComputations::Delta_of_PrimePowers[StaticFactorbase::max_additional_Powers][2] __attribute__ ((aligned (16)));
00046 
00047 int CDeltaComputations::D_mod_FBPrime[StaticFactorbase::MaxSize];
00048 signed int CDeltaComputations::D_difference_to_last = 0;
00049 CDeltaComputations::CmpqsDmodPUpdater CDeltaComputations::mpqsDmodPUpdater;
00050 
00051 void CDeltaComputations::CmpqsDmodPUpdater::update()
00052 {
00053   //cout << "-MPQS PolyD update- " << Polynom.get_D() << endl;
00054   mpz_sub(act_D,Polynom.get_D(),act_D);
00055   if (mpz_fits_uint_p(act_D))
00056    {
00057      const int d = D_difference_to_last = mpz_get_ui(act_D);
00058 #ifdef VERBOSE
00059      MARK; cout<< "D update difference is " << d << ", #FB=" << StaticFactorbase::Size() << endl;
00060 #endif
00061      // start with element 1 (instead of element 0, which is expected to be -1)
00062      int i;
00063      for (i=2; i+1<StaticFactorbase::Size() && PrimeNumbers[i+1]<46340; i+=2)
00064       {
00065         D_mod_FBPrime[i+1]+=d;
00066         while (D_mod_FBPrime[i+1]>=D_mod_FBPrime[i]) D_mod_FBPrime[i+1]-=D_mod_FBPrime[i];
00067       }
00068      for (; i<StaticFactorbase::Size(); ++i)
00069       {
00070         D_mod_FBPrime[i]+=d;
00071         while (D_mod_FBPrime[i]>=PrimeNumbers[i]) D_mod_FBPrime[i]-=PrimeNumbers[i];
00072       }
00073      mpz_set(act_D,Polynom.get_D());
00074    }
00075   else
00076    {
00077 #ifdef VERBOSE
00078      MARK; cout<< "D update difference is " << act_D << ", #FB=" << StaticFactorbase::Size() << endl;
00079 #endif 
00080      mpz_set(act_D,Polynom.get_D());
00081      int i;
00082      for (i=2; i+1<StaticFactorbase::Size() && PrimeNumbers[i+1]<46340; i+=2)
00083       {
00084         D_mod_FBPrime[i]=PrimeNumbers[i]*PrimeNumbers[i+1];
00085         D_mod_FBPrime[i+1]=mpz_remainder_ui(act_D,D_mod_FBPrime[i]);
00086       }
00087      for (; i<StaticFactorbase::Size(); ++i) D_mod_FBPrime[i]=mpz_remainder_ui(act_D,PrimeNumbers[i]);
00088    }
00089 
00090 #ifdef DEBUG
00091   int i;
00092   for (i=2; i+1<StaticFactorbase::Size() && PrimeNumbers[i+1]<46340; i+=2)
00093     if (mpz_remainder_ui(Polynom.get_D(),D_mod_FBPrime[i]) != D_mod_FBPrime[i+1])
00094      {
00095        MARK; cerr << "sanity check failed for i=" << i << ": "
00096                   << mpz_remainder_ui(Polynom.get_D(),D_mod_FBPrime[i])
00097                   << " != " << D_mod_FBPrime[i+1] << endl;
00098        exit(1);
00099      }
00100    for (; i<StaticFactorbase::Size(); ++i)
00101     if (mpz_remainder_ui(Polynom.get_D(),PrimeNumbers[i]) != D_mod_FBPrime[i])
00102      {
00103        MARK; cerr << "sanity check failed for i=" << i << ": "
00104                   << mpz_remainder_ui(Polynom.get_D(),PrimeNumbers[i])
00105                   << " != " << D_mod_FBPrime[i] << endl;
00106        exit(1);
00107      }
00108 #endif
00109 }
00110 
00111 
00112 #if (TSIEVEELEMENTSIZE == 1)
00113 unsigned short int CSieveStaticFactors::LogValRuns[128];
00114 unsigned short int CSieveStaticFactors::PrimeNumberDiffs[CSieveStaticFactors::MaxSize];
00115 #endif
00116 
00117 int* CSieveStaticFactors::DefaultHelperTable[2] = { NULL, NULL };
00118 
00119 unsigned int SieveControl::PhysicalSieveIntervalIndex; // number of the current physical interval to sieve with
00120 // starts with 0 for the start of each new logical sieve interval
00121 // and is increased by 1 for each new physical sieve interval
00122 
00123 unsigned int SieveControl::BestPSI[2];
00124 // indices of the two best PhysicalSieveIntervals
00125 
00126 TSieveElement SieveControl::PhysicalSieveIntervalThresholdCorrector[1024] = { 0 };
00127 // idea: the logical sieve interval differs slightly in quality for different
00128 // physical sieve intervals. So let's roughly correct the threshold for
00129 // each physical sieve interval from time to time. This may increase the quality of
00130 // detected relations.
00131 // SieveControl::GetLogThreshold() will always add this correction value to its
00132 // computed value. So it behaves totally neutral. You may change these values on the fly...
00133 // initial default values: 0, therefore neutral and no correction.
00134 
00135 
00136 void SieveControl::set_logVal_for_Primenumber(const int pos, const TSieveElement val, const TSieveElement dirtyval)
00137 {
00138   log_PrimeNumbers[pos]=val;
00139   log_PrimeNumbers_dirty[pos]=dirtyval;
00140 }
00141 
00142 void SieveControl::set_logVal_for_Primepower(const int pos, const TSieveElement val)
00143 {
00144   log_Primzahl_of_PrimePowers[pos]=val;
00145 }
00146 
00147 bool SieveControl::Hit_after_DirtyCorrection(const signed int SievePos, TSieveElement Value)
00148 {
00149   // An dieser Stelle im Sieb wurde ein Treffer gefunden;
00150   // aber wegen der Dirty-Siebfaktoren, die für alle Siebelemente als Treffer gezählt werden,
00151   // könnte es tatsächlich gar kein Treffer sein.
00152   // Wir müssen daher den Dirty-Siebwert um die nicht-teilenden Dirty-Faktorbasiselemente korrigieren.
00153   // Dieser Korrekturwert wird hier bestimmt.
00154 
00155   // Wenn diese Routine aufgerufen wird, ist möglicherweise ein Siebtreffer gelandet worden;
00156   // Da es sich um ein Regelungssystem handelt, ist dies (zwar kein perfekter, aber)
00157   // ein sinnvoller Indikator, um nun wieder etwas weniger Dirtiness zu verlangen,
00158   // wenn doch kein Siebtreffer vorlag...
00159 
00160   for (int i=FBStartIndex()-1; i>=FBLowestStartIndex; --i)
00161    {
00162      register int d = normalized_signed_mod(SievePos,StaticFactorbase::PrimeNumbers[i],StaticFactorbase::PrimeNumberReciprocals[i]);
00163      // same as: SievePos%PrimeNumbers[i]; if (d<0) d+=PrimeNumbers[i];
00164      // remark: %-operation does *not* eliminate the sign!
00165      //         ugly: -1=1 (mod 2), but (-1)%2=-1, 1%2=1
00166      // -> so we have to eliminate the sign!!!
00167      const bool Treffer = (d==CDeltaComputations::Delta_of_PrimeNumbers[i][0] || d==CDeltaComputations::Delta_of_PrimeNumbers[i][1]);
00168      if (Treffer) // Treffer means: hit
00169       {
00170         // Value is decreasing
00171         Value+=PrimzahlSieveWeight_dirty(i); // remove dirty-Weight
00172         Value-=PrimzahlSieveWeight(i); // and replace it by the real Weight
00173       }
00174      else
00175       {
00176         // no hit
00177         // Value is increasing
00178         Value+=PrimzahlSieveWeight_dirty(i); // remove dirty-weight, because we haven't had a hit
00179 
00180         // shortcut "return false" on value>=0 doesn't work anymore, since it's not monotone increasing
00181       }
00182    }
00183 
00184   if (Value>=0)
00185    {
00186      SieveControl::RequestLessDirtiness(); // "steer in the opposite direction"
00187      return false; // Threshold exceeded, no sieve hit
00188    }
00189   return true; // Threshold not exceeded -> sieve hit
00190 }
00191 
00192 
00193 TSieveElement SieveControl::log_Threshold = 0; // will be initialized with the "real" value while the whole sieving stuff is initialized.
00194 TSieveElement SieveControl::raw_log_Threshold = 0; // will be initialized with the "real" value while the whole sieving stuff is initialized.
00195 TSieveElement SieveControl::log_ThresholdDeltaHint = 0; // initial value
00196 
00197 void SieveControl::compute_SieveThreshold(void)
00198 {
00199   // Here we determine/estimate the Threshold
00200   
00201   // Constraints for calling this initialization:
00202   // the values for 
00203   // 1. LogicalSieveSize (for interval [-LogicalSieveSize,LogicalSieveSize])
00204   // 2. kN (pre-multiplier of N to increase quality of static factorbase)
00205   // 3. and the static factorbase itself
00206   // have to be already initialized.
00207  
00208   // computing logarithmic threshold:
00209   mpz_t x; mpz_init(x);
00210   mpz_div_ui(x,kN,2); mpz_sqrt(x,kN); mpz_mul_ui(x,x,LogicalSieveSize);
00211   double d = log(fabs(mpz_get_d(x))) - (Factor_Threshold*log(StaticFactorbase::BiggestPrime()));
00212   mpz_clear(x);
00213   if (StaticFactorbase::PrimeNumbers[1]==2) d-=log(2); // If 2 is in the factorbase, then 2 divides all sieve entries.
00214   d*=log_SieveEntryMultiplier;
00215 #ifdef VERBOSE_INFO
00216   cout << "Threshold (initial sieve entries) for detecting relations set to " << d << endl;
00217 #endif
00218   raw_log_Threshold=log_Threshold=static_cast<TSieveElement>(floor(d));
00219   if (log_Threshold!=floor(d)) 
00220    {
00221      cerr << "With the compiled-in types and constants this" << endl;
00222      cerr << "factorization cannot be done, sorry." << endl; 
00223      cerr << "TSieveElement (compile-time type) needs to be increased to fit the threshold!" << endl;
00224      cerr << "Please modify log_SieveEntryMultiplier and/or TSieveElement" << endl;
00225      cerr << "and compile the program again." << endl;
00226      exit(1);
00227    }
00228 
00229 #if TSIEVEELEMENTSIZE > 0
00230   // sanity check...
00231   if (sizeof(TSieveElement)!=TSIEVEELEMENTSIZE)
00232    {
00233      cerr << "Compiled types and values for " << endl
00234           << "TSieveElement (Size=" << sizeof(TSieveElement) << ")"
00235           << " and TSIEVEELEMENTSIZE (=" << TSIEVEELEMENTSIZE << ")" << endl
00236           << "do not match. Please correct them!" << endl;
00237      exit(1);
00238    }
00239 #endif
00240 }
00241 
00242 
00243 
00244 
00245 #if defined(ASM_386) || defined(ASM_X86_64)
00246 #define SIEBASM_386
00247 
00248 unsigned int clobbered_int; // dummy
00249 
00250 #if TSIEVEELEMENTSIZE == 1
00251 #if defined(ASM_SSE2) || defined(ASM_X86_64)
00252  #ifdef DEBUG
00253   #warning "Initialize Sieve: SSE2 Sieve initialization activated."
00254  #endif
00255 #define asm_sieb_init(logValue) \
00256   asm volatile(\
00257    "movd %%eax,%%xmm0 # -> (x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,a) \n\t" \
00258    "punpcklbw %%xmm0,%%xmm0 # (x,x,x,x,x,x,x,x,x,x,x,x,x,x,a,a) \n\t" \
00259    "movl %[count],%%eax \n\t" \
00260    "punpcklbw %%xmm0,%%xmm0 # (x,x,x,x,x,x,x,x,x,x,x,x,a,a,a,a) \n\t" \
00261    "punpcklbw %%xmm0,%%xmm0 # (x,x,x,x,x,x,x,x,a,a,a,a,a,a,a,a) \n\t" \
00262    "punpcklbw %%xmm0,%%xmm0 # (a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a) \n\t" \
00263    ".balign 16 \n\t" \
00264    "1: \n\t" \
00265    "movdqa %%xmm0,(%[Sieve]) \n\t" \
00266    "movdqa %%xmm0,0x10(%[Sieve]) \n\t" \
00267    "movdqa %%xmm0,0x20(%[Sieve]) \n\t" \
00268    "movdqa %%xmm0,0x30(%[Sieve]) \n\t" \
00269    "add $0x40,%[Sieve] \n\t" \
00270    "sub $0x40,%%eax \n\t" \
00271    "jnz 1b \n\t" \
00272    "emms \n" \
00273    : "=D" (clobbered_int), "=a" (clobbered_int) : [threshold] "a" (logValue), [Sieve] "D" (SieveArray), [count] "i" (PhysicalSieveSize) : "cc", "memory", "xmm0");
00274 #elif defined(ASM_MMX)
00275  #ifdef DEBUG
00276   #warning "Initialize Sieve: MMX Sieve initialization activated."
00277  #endif
00278 #define asm_sieb_init(logValue) \
00279   asm volatile(\
00280    "movd %%eax,%%mm0 # -> (x,x,x,x,x,x,x,a) \n\t" \
00281    "punpcklbw %%mm0,%%mm0 # (x,x,x,x,x,x,a,a) \n\t" \
00282    "movl %[count],%%eax \n\t" \
00283    "punpcklbw %%mm0,%%mm0 # (x,x,x,x,a,a,a,a) \n\t" \
00284    "punpcklbw %%mm0,%%mm0 # (a,a,a,a,a,a,a,a) \n\t" \
00285    ".balign 16 \n\t" \
00286    "1: \n\t" \
00287    "movq %%mm0,(%[Sieve]) \n\t" \
00288    "movq %%mm0,8(%[Sieve]) \n\t" \
00289    "movq %%mm0,16(%[Sieve]) \n\t" \
00290    "movq %%mm0,24(%[Sieve]) \n\t" \
00291    "movq %%mm0,32(%[Sieve]) \n\t" \
00292    "movq %%mm0,40(%[Sieve]) \n\t" \
00293    "movq %%mm0,48(%[Sieve]) \n\t" \
00294    "movq %%mm0,56(%[Sieve]) \n\t" \
00295    "add $64,%[Sieve] \n\t" \
00296    "sub $64,%%eax \n\t" \
00297    "jnz 1b \n\t" \
00298    "emms \n" \
00299    : "=D" (clobbered_int), "=a" (clobbered_int) : [threshold] "a" (logValue), [Sieve] "D" (SieveArray), [count] "i" (PhysicalSieveSize) : "cc", "memory", "mm0");
00300 #else
00301 #define asm_sieb_init(logValue) \
00302   asm volatile ( \
00303        "movb %%al,%%ah\n\t" \
00304        "pushw %%ax\n\t" \
00305        "shll $16,%%eax\n\t" \
00306        "popw %%ax\n\t" \
00307        "shrl $2,%%ecx\n\t" \
00308        "cld\n\t" \
00309        "rep\n\t" \
00310        "stosl" \
00311        : "=c" (clobbered_int), "=D" (clobbered_int) : "a" (logValue), "D" (SieveArray), "c" (PhysicalSieveSize) : "cc", "memory");
00312 #endif
00313 
00314 #if defined(ASM_ATHLON) || defined(ASM_SSE) || defined(ASM_X86_64)
00315 // ATHLON-optimized code
00316 #ifdef DEBUG
00317 #warning "using IA-32/ATHLON version for macro: asm_search_sieb"
00318 #endif
00319 #define asm_search_sieb(offset) \
00320   asm volatile ( \
00321        "0: # calibrate alignment \n\t" \
00322        "test $7,%[pos] \n\t" \
00323        "jz 1f # is aligned \n\t" \
00324        "dec %[pos] \n\t" \
00325        "testb $0x80,(%[sieb],%[pos]) \n\t" \
00326        "jns 0b \n\t" \
00327        "js 4f # hit detected \n\t" \
00328        "1: # search for hit, (big steps) loop \n\t" \
00329        "prefetcht2 -256(%[sieb],%[pos]) # tuning cache!! \n\t" \
00330        "sub $64,%[pos] \n\t" \
00331        "movq  (%[sieb],%[pos]),%%mm0 \n\t" \
00332        "movq 8(%[sieb],%[pos]),%%mm1 \n\t" \
00333        "por 16(%[sieb],%[pos]),%%mm0 \n\t" \
00334        "por 24(%[sieb],%[pos]),%%mm1 \n\t" \
00335        "por 32(%[sieb],%[pos]),%%mm0 \n\t" \
00336        "por 40(%[sieb],%[pos]),%%mm1 \n\t" \
00337        "por 48(%[sieb],%[pos]),%%mm0 \n\t" \
00338        "por 56(%[sieb],%[pos]),%%mm1 \n\t" \
00339        "por %%mm1,%%mm0 \n\t" \
00340        "pmovmskb %%mm0,%%eax # highest bits of packed bytes to %%ah \n\t" \
00341        "testl %%eax,%%eax \n\t" \
00342        "jz 1b \n\t" \
00343        "emms # clear MMX state \n\t" \
00344        "2: # search for hit, (little steps) init \n\t" \
00345        "add $64,%[pos] \n\t" \
00346        "3: # search for hit, (little steps) loop \n\t" \
00347        "sub $4,%[pos] \n\t" \
00348        "testl $0x80808080,(%[sieb],%[pos]) \n\t" \
00349        "jz 3b \n\t" \
00350        "movl (%[sieb],%[pos]),%%eax \n\t" \
00351        "add $3,%[pos] \n\t" \
00352        "testl $0x80000000,%%eax \n\t" \
00353        "jnz 4f \n\t" \
00354        "dec %[pos] \n\t" \
00355        "testl $0x00800000,%%eax \n\t" \
00356        "jnz 4f \n\t" \
00357        "dec %[pos] \n\t" \
00358        "testl $0x00008000,%%eax \n\t" \
00359        "jnz 4f \n\t" \
00360        "dec %[pos] \n\t" \
00361        "4: # hit detected" \
00362        : [pos] "=q" (offset) : "[pos]" (offset), [sieb] "r" (SieveArray) : "cc", "eax", "mm0", "mm1");
00363 #else
00364 // optimized i386 code
00365 #define asm_search_sieb(offset) \
00366   asm volatile ( \
00367        "0: # calibrate alignment \n\t" \
00368        "testl $3,%[pos] \n\t" \
00369        "jz 1f # is aligned \n\t" \
00370        "decl %[pos] \n\t" \
00371        "testb $0x80,(%[sieb],%[pos]) \n\t" \
00372        "jns 0b \n\t" \
00373        "js 2f # hit detected \n\t" \
00374        "1: # start search \n\t" \
00375        "subl $4,%[pos] \n\t" \
00376        "testl $0x80808080,(%[sieb],%[pos]) \n\t" \
00377        "jz 1b \n\t" \
00378        "addl $3,%[pos] \n\t" \
00379        "testb $0x80,(%[sieb],%[pos]) \n\t" \
00380        "js 2f \n\t" \
00381        "decl %[pos] \n\t" \
00382        "testb $0x80,(%[sieb],%[pos]) \n\t" \
00383        "js 2f \n\t" \
00384        "decl %[pos] \n\t" \
00385        "testb $0x80,(%[sieb],%[pos]) \n\t" \
00386        "js 2f \n\t" \
00387        "decl %[pos] \n\t" \
00388        "2: # hit detected" \
00389        : [pos] "=r" (offset) : "[pos]" (offset), [sieb] "q" (SieveArray) : "cc");
00390 #endif
00391 #if defined(ASM_X86_64)
00392 // code containing cmov optimizations for X86_64
00393  #ifdef DEBUG
00394   #warning "using CMOV X86_64 version for macro: asm_sieb"
00395  #endif
00396 #define asm_sieb(lp, d0, d1, P) \
00397   asm volatile ( \
00398        "cmp %[disp1],%[disp0]\n\t" \
00399        "cmova %[disp0],%%rax # conditional swap part1\n\t" \
00400        "cmova %[disp1],%[disp0] # part2 \n\t" \
00401        "cmova %%rax,%[disp1] # part3 \n\t" \
00402        "movl %[limit],%%eax\n\t" \
00403        "jmp 1f \n\t" \
00404        ".balign 16 \n\t" \
00405        "0: subb %[val],(%[sieb],%[disp1])\n\t" \
00406        "add %[step],%[disp1]\n\t" \
00407        "subb %[val],(%[sieb],%[disp0])\n\t" \
00408        "add %[step],%[disp0]\n\t" \
00409        "1: cmp %%rax,%[disp1]\n\t" \
00410        "jb 0b\n\t" \
00411        "cmp %%rax,%[disp0] \n\t" \
00412        "cmovb %[disp0],%%rax \n\t" \
00413        "subb %[val],(%[sieb],%%rax)\n\t" \
00414        "xor %%rax,%%rax \n\t" \
00415        "cmp %[limit],%[disp0] \n\t" \
00416        "cmovb %[step],%%rax \n\t" \
00417        "add %%rax,%[disp0] \n" \
00418        : [disp0] "+q" (d0), [disp1] "+q" (d1) : [step] "r" (P), [val] "r" (lp), [sieb] "r" (&SieveArray[0]), [limit] "i" (PhysicalSieveSize) : "cc", "rax");
00419 
00420 // do the same as for asm_sieb (but optimized for small P´s)
00421 #define asm_sieb_small(lp, d0, d1, P) \
00422   asm volatile ( \
00423        "cmp %[disp1],%[disp0]\n\t" \
00424        "cmova %[disp0],%%rax # conditional swap part1\n\t" \
00425        "cmova %[disp1],%[disp0] # part2 \n\t" \
00426        "cmova %%rax,%[disp1] # part3 \n\t" \
00427        "mov %[limit],%%eax\n\t" \
00428        "jmp 0f \n\t" \
00429        ".balign 16 \n\t" \
00430        "0: subb %[val],(%[sieb],%[disp1])\n\t" \
00431        "add %[step],%[disp1]\n\t" \
00432        "subb %[val],(%[sieb],%[disp0])\n\t" \
00433        "add %[step],%[disp0]\n\t" \
00434        "1: cmp %%rax,%[disp1]\n\t" \
00435        "jb 0b\n\t" \
00436        "cmp %%rax,%[disp0] \n\t" \
00437        "cmovb %[disp0],%%rax \n\t" \
00438        "subb %[val],(%[sieb],%%rax)\n\t" \
00439        "xor %%rax,%%rax \n\t" \
00440        "cmp %[limit],%[disp0] \n\t" \
00441        "cmovb %[step],%%rax \n\t" \
00442        "add %%rax,%[disp0] \n" \
00443        : [disp0] "+q" (d0), [disp1] "+q" (d1) : [step] "r" (P), [val] "r" (lp), [sieb] "r" (&SieveArray[0]), [limit] "i" (PhysicalSieveSize) : "cc", "rax");
00444 #elif defined(ASM_CMOV)
00445 // code containing cmov optimizations
00446  #ifdef DEBUG
00447   #warning "using CMOV version for macro: asm_sieb"
00448  #endif
00449 #define asm_sieb(lp, d0, d1, P) \
00450   asm volatile ( \
00451        "cmpl %[disp1],%[disp0]\n\t" \
00452        "cmova %[disp0],%%eax # conditional swap part1\n\t" \
00453        "cmova %[disp1],%[disp0] # part2 \n\t" \
00454        "cmova %%eax,%[disp1] # part3 \n\t" \
00455        "movl %[limit],%%eax\n\t" \
00456        "jmp 1f \n\t" \
00457        ".balign 16 \n\t" \
00458        "0: subb %[val],(%[sieb],%[disp1])\n\t" \
00459        "addl %[step],%[disp1]\n\t" \
00460        "subb %[val],(%[sieb],%[disp0])\n\t" \
00461        "addl %[step],%[disp0]\n\t" \
00462        "1: cmpl %%eax,%[disp1]\n\t" \
00463        "jb 0b\n\t" \
00464        "cmpl %%eax,%[disp0] \n\t" \
00465        "cmovb %[disp0],%%eax \n\t" \
00466        "subb %[val],(%[sieb],%%eax)\n\t" \
00467        "xorl %%eax,%%eax \n\t" \
00468        "cmpl %[limit],%[disp0] \n\t" \
00469        "cmovb %[step],%%eax \n\t" \
00470        "addl %%eax,%[disp0] \n" \
00471        : [disp0] "+q" (d0), [disp1] "+q" (d1) : [step] "r" (P), [val] "r" (lp), [sieb] "r" (&SieveArray[0]), [limit] "i" (PhysicalSieveSize) : "cc", "eax");
00472 
00473 // do the same as for asm_sieb (but optimized for small P´s)
00474 #define asm_sieb_small(lp, d0, d1, P) \
00475   asm volatile ( \
00476        "cmpl %[disp1],%[disp0]\n\t" \
00477        "cmova %[disp0],%%eax # conditional swap part1\n\t" \
00478        "cmova %[disp1],%[disp0] # part2 \n\t" \
00479        "cmova %%eax,%[disp1] # part3 \n\t" \
00480        "movl %[limit],%%eax\n\t" \
00481        "jmp 0f \n\t" \
00482        ".balign 16 \n\t" \
00483        "0: subb %[val],(%[sieb],%[disp1])\n\t" \
00484        "addl %[step],%[disp1]\n\t" \
00485        "subb %[val],(%[sieb],%[disp0])\n\t" \
00486        "addl %[step],%[disp0]\n\t" \
00487        "1: cmpl %%eax,%[disp1]\n\t" \
00488        "jb 0b\n\t" \
00489        "cmpl %%eax,%[disp0] \n\t" \
00490        "cmovb %[disp0],%%eax \n\t" \
00491        "subb %[val],(%[sieb],%%eax)\n\t" \
00492        "xorl %%eax,%%eax \n\t" \
00493        "cmpl %[limit],%[disp0] \n\t" \
00494        "cmovb %[step],%%eax \n\t" \
00495        "addl %%eax,%[disp0] \n" \
00496        : [disp0] "+q" (d0), [disp1] "+q" (d1) : [step] "r" (P), [val] "r" (lp), [sieb] "r" (&SieveArray[0]), [limit] "i" (PhysicalSieveSize) : "cc", "eax");
00497 #else
00498 #define asm_sieb(lp, d0, d1, P) \
00499   asm volatile ( \
00500        "cmp %[disp1],%[disp0]\n\t" \
00501        "jbe 1f \n\t" \
00502        "xchg %[disp0],%[disp1]\n\t" \
00503        "jmp 1f \n\t" \
00504        "0: subb %[val],(%[sieb],%[disp1])\n\t" \
00505        "add %[step],%[disp1]\n\t" \
00506        "subb %[val],(%[sieb],%[disp0])\n\t" \
00507        "add %[step],%[disp0]\n\t" \
00508        "1: cmp %[limit],%[disp1]\n\t" \
00509        "jb 0b\n\t" \
00510        "cmp %[limit],%[disp0] \n\t" \
00511        "jae 2f \n\t \n\t" \
00512        "subb %[val],(%[sieb],%[disp0])\n\t" \
00513        "add %[step],%[disp0] \n\t" \
00514        "2: #done \n" \
00515        : [disp0] "+q" (d0), [disp1] "+q" (d1) : [step] "r" (P), [val] "r" (lp), [sieb] "r" (&SieveArray[0]), [limit] "i" (PhysicalSieveSize) : "cc");
00516 
00517 #define asm_sieb_small(lp, d0, d1, P) \
00518   asm volatile ( \
00519        "cmp %[disp1],%[disp0]\n\t" \
00520        "jbe 0f \n\t" \
00521        "xchg %[disp0],%[disp1]\n\t" \
00522        "0: subb %[val],(%[sieb],%[disp1])\n\t" \
00523        "add %[step],%[disp1]\n\t" \
00524        "subb %[val],(%[sieb],%[disp0])\n\t" \
00525        "add %[step],%[disp0]\n\t" \
00526        "1: cmp %[limit],%[disp1]\n\t" \
00527        "jb 0b\n\t" \
00528        "cmp %[limit],%[disp0] \n\t" \
00529        "jae 2f \n\t \n\t" \
00530        "subb %[val],(%[sieb],%[disp0])\n\t" \
00531        "add %[step],%[disp0] \n\t" \
00532        "2: #done \n" \
00533        : [disp0] "+q" (d0), [disp1] "+q" (d1) : [step] "r" (P), [val] "r" (lp), [sieb] "r" (&SieveArray[0]), [limit] "i" (PhysicalSieveSize) : "cc");
00534 #endif
00535 #elif TSIEVEELEMENTSIZE == 2
00536 #define asm_sieb_init(logValue) \
00537   asm volatile ( \
00538        "pushw %%ax\n\t" \
00539        "shll $16,%%eax\n\t" \
00540        "popw %%ax\n\t" \
00541        "shrl $1,%%ecx\n\t" \
00542        "cld\n\t" \
00543        "rep\n\t" \
00544        "stosl" \
00545        : "=c" (clobbered_int), "=D" (clobbered_int) : "a" (logValue), "D" (SieveArray), "c" (PhysicalSieveSize) : "cc", "memory");
00546 #define asm_search_sieb(offset) \
00547   asm volatile ( \
00548        "0: # calibrate alignment \n\t" \
00549        "testl $1,%[pos] \n\t" \
00550        "jz 1f # is aligned \n\t" \
00551        "decl %[pos] \n\t" \
00552        "testw $0x8000,(%[sieb],%[pos],2) \n\t" \
00553        "js 2f # hit detected \n\t" \
00554        "1: # Treffer suchen \n\t" \
00555        "subl $2,%[pos] \n\t" \
00556        "testl $0x80008000,(%[sieb],%[pos],2) \n\t" \
00557        "jz 1b \n\t" \
00558        "incl %[pos] \n\t" \
00559        "testw $0x8000,(%[sieb],%[pos],2) \n\t" \
00560        "js 2f \n\t" \
00561        "decl %[pos] \n\t" \
00562        "2: # hit detected" \
00563        : [pos] "=r" (offset) : "[pos]" (offset), [sieb] "q" (SieveArray) : "cc");
00564 #define asm_sieb(lp, d0, d1, P) \
00565   asm volatile ( \
00566        "cmpl %[disp1],%[disp0]\n\t" \
00567        "jbe 1f \n\t" \
00568        "xchgl %[disp0],%[disp1]\n\t" \
00569        "jmp 1f \n\t" \
00570        "0: subw %[val],(%[sieb],%[disp1],2)\n\t" \
00571        "subw %[val],(%[sieb],%[disp0],2)\n\t" \
00572        "addl %[step],%[disp1]\n\t" \
00573        "addl %[step],%[disp0]\n\t" \
00574        "1: cmpl %[limit],%[disp1]\n\t" \
00575        "jb 0b\n\t" \
00576        "cmpl %[limit],%[disp0] \n\t" \
00577        "jae 2f \n\t \n\t" \
00578        "subw %[val],(%[sieb],%[disp0],2)\n\t" \
00579        "addl %[step],%[disp0] \n\t" \
00580        "2: \n" \
00581        : [disp0] "+q" (d0), [disp1] "+q" (d1) : [step] "r" (P), [val] "r" (lp), [sieb] "r" (&SieveArray[0]), [limit] "i" (PhysicalSieveSize) : "cc");
00582 #elif TSIEVEELEMENTSIZE == 4
00583 #define asm_sieb_init(logValue) \
00584   asm volatile ( \
00585        "cld\n\t" \
00586        "rep\n\t" \
00587        "stosl" \
00588        : "=c" (clobbered_int), "=D" (clobbered_int) : "a" (logValue), "D" (SieveArray), "c" (PhysicalSieveSize) : "memory");
00589 #define asm_search_sieb(offset) \
00590   asm volatile ( \
00591        "1: # start search \n\t" \
00592        "decl %[pos] \n\t" \
00593        "testl $0x80008000,(%[sieb],%[pos],4) \n\t" \
00594        "jz 1b \n\t" \
00595        "2: # hit detected" \
00596        : [pos] "=r" (offset) : "[pos]" (offset), [sieb] "q" (SieveArray) : "cc");
00597 #define asm_sieb(lp, d0, d1, P) \
00598   asm volatile ( \
00599        "cmpl %[disp1],%[disp0]\n\t" \
00600        "jbe 1f \n\t" \
00601        "xchgl %[disp0],%[disp1]\n\t" \
00602        "jmp 1f \n\t" \
00603        "0: subl %[val],(%[sieb],%[disp1],4)\n\t" \
00604        "subl %[val],(%[sieb],%[disp0],4)\n\t" \
00605        "addl %[step],%[disp1]\n\t" \
00606        "addl %[step],%[disp0]\n\t" \
00607        "1: cmpl %[limit],%[disp1]\n\t" \
00608        "jb 0b\n\t" \
00609        "cmpl %[limit],%[disp0] \n\t" \
00610        "jae 2f \n\t \n\t" \
00611        "subl %[val],(%[sieb],%[disp0],4)\n\t" \
00612        "addl %[step],%[disp0] \n\t" \
00613        "2: \n" \
00614        : [disp0] "+q" (d0), [disp1] "+q" (d1) : [step] "r" (P), [val] "r" (lp), [sieb] "r" (&SieveArray[0]), [limit] "i" (PhysicalSieveSize) : "cc");
00615 #else
00616 #undef SIEBASM_386
00617 #warning "No optimized assemblercode available for sieving with TSieveElement!"
00618 #endif
00619  #if !defined(asm_sieb_small)
00620   #warning "no optimized asm_sieb_small, therefore using asm_sieb"
00621   #define asm_sieb_small asm_sieb
00622  #endif
00623 #endif
00624 
00625 
00626 void SieveControl::RecalibrateThresholds()
00627 {
00628 #ifdef VERBOSE_NOTICE
00629   cout << "recalibrating MPQS thresholds" << endl;
00630 #endif
00631 
00632   const double FB_Threshold = log_SieveEntryMultiplier*Factor_Threshold*log(StaticFactorbaseSettings::BiggestPrime());
00633   TSieveElement Min=std::numeric_limits<TSieveElement>::max(), Max=std::numeric_limits<TSieveElement>::min();
00634   unsigned int I=0;
00635   for (signed int mySieveOffset=-LogicalSieveSize; mySieveOffset<LogicalSieveSize; mySieveOffset+=PhysicalSieveSize)
00636    {
00637     double wMin = log_SieveEntryMultiplier*Polynom.get_logval(mySieveOffset);
00638     double wMax = wMin;
00639     double dAvg = 0.0; unsigned int samplecount = 0;
00640     for (int i=0; i<PhysicalSieveSize; i+=8) // or use "i+=1" for finest granularity
00641      {
00642        const double w = log_SieveEntryMultiplier*Polynom.get_logval(mySieveOffset+i);
00643        if (w<wMin) wMin=w; // find minimum of the samples
00644        if (w>wMax) wMax=w; // find maximum of the samples
00645        dAvg+=w; ++samplecount;
00646      }
00647     dAvg /= samplecount;
00648     //cout << "pre I: " << I << ": " << dAvg << ", " << wMin << ", " << wMax << endl;
00649 
00650     // now we are able to set a reasonable threshold for this interval:
00651     register double d = dAvg; // you can try wAvg, wMin, wMax, (wMin+wMax)/2 or something else...
00652     //cout << "d=" << d << endl;
00653     d-=FB_Threshold;
00654     if (StaticFactorbaseSettings::StaticPrime(1)==2) d-=log_SieveEntryMultiplier*log(2);
00655     d-=GetRawLogThreshold();
00656     if ( d>std::numeric_limits<TSieveElement>::max() || d<std::numeric_limits<TSieveElement>::min() )
00657      {
00658        // threshold does not fit in TSieveElement...
00659        MARK; cerr << "numeric_limits<TSieveElement> exceeded. Out of range." << endl;
00660        exit(1);
00661      }
00662     register const TSieveElement w = static_cast<TSieveElement>(d); // convert to SieveElement type
00663     PhysicalSieveIntervalThresholdCorrector[I++]=w;
00664     //cout << "I: " << I-1 << ": " << (int)PhysicalSieveIntervalThresholdCorrector[I-1] << ", " << d << endl;
00665 
00666     #if TSIEVEELEMENTSIZE == 1
00667        Min=MIN(Min,w); Max=MAX(Max,w);
00668     #endif
00669    }
00670 
00671   // and now find out the two best physical intervals
00672   BestPSI[0]=0; BestPSI[1]=I-1;
00673   for (unsigned int i=1; i<I-i; ++i)
00674    {
00675      //cout << (int)(PhysicalSieveIntervalThresholdCorrector[i]) << " " << (int)(PhysicalSieveIntervalThresholdCorrector[I-1-i]) << endl;
00676      if (PhysicalSieveIntervalThresholdCorrector[i]<PhysicalSieveIntervalThresholdCorrector[BestPSI[0]]) BestPSI[0]=i;
00677      if (PhysicalSieveIntervalThresholdCorrector[I-1-i]<PhysicalSieveIntervalThresholdCorrector[BestPSI[1]]) BestPSI[1]=I-1-i;
00678    }
00679   cout << "Best two physical sieve intervals are " << BestPSI[0] << " and " << BestPSI[1] << endl;
00680 
00681 #if defined(VERBOSE_INFO)
00682  #if TSIEVEELEMENTSIZE == 1
00683   // do you want to get the picture?
00684   cout << "MPQS polynomial thresholds over physical intervals:" << endl;
00685   cout << "+";  for (unsigned int j=0; j<I; ++j) cout << "-"; cout << "+" << endl;
00686   for (signed int i=Max; i>=Min; --i)
00687    {
00688      if (i) cout << "|"; else cout << "+";
00689      const char ch = i? ' ' : '-'; 
00690      for (unsigned int j=0; j<I; ++j)
00691       if (PhysicalSieveIntervalThresholdCorrector[j]==i) cout << "*"; else cout << ch;
00692      cout << "|" << endl;
00693    }
00694   cout << "+";  for (unsigned int j=0; j<I; ++j) cout << "-"; cout << "+" << endl;
00695  #endif
00696 #endif /* VERBOSE_INFO */
00697 }
00698 
00699 void initialize_Sieve()
00700 {
00701   //cout << "Initializing sieve from " << SieveOffset
00702   //     << " to " << SieveOffset+PhysicalSieveSize << endl;
00703 
00704   // for each "false positive hit" the dirtiness gets decreased,
00705   // and for each physical interval the dirtiness is increased,
00706   // -> adaptive control system for dirtiness
00707   SieveControl::RequestMoreDirtiness();
00708 
00709 #if 1
00710   // static version using an overall precomputed threshold
00711   register TSieveElement w = SieveControl::GetLogThreshold();
00712 #else
00713   // should only be activated for debugging sessions, since the above code
00714   // can already make use of recalibrating MPQS interval thresholds! (Thorsten 2005-01-21)
00715 
00716   // dynamic version: compute an adaptive threshold for each physical sieve interval
00717     const TSieveElement FB_Threshold
00718       = static_cast<TSieveElement>(log_SieveEntryMultiplier*Factor_Threshold*log(StaticFactorbaseSettings::BiggestPrime()));
00719     register TSieveElement w = static_cast<TSieveElement>(log_SieveEntryMultiplier*Polynom.get_logval(SieveOffset));
00720 
00721     w-=FB_Threshold;
00722     if (StaticFactorbaseSettings::StaticPrime(1)==2) w-=static_cast<TSieveElement>(log_SieveEntryMultiplier*log(2));
00723     w-=SieveControl::GetLogDirtyCorrection();
00724 #endif
00725   /* log_Threshold denotes the threshold for detecting a relation.
00726      Instead of adding a logarithm at each place where a static factor hits
00727      the sieve (and comparing the sieve-entry with the threshold
00728      afterwards), we proceed the other way round: We initialize each sieve
00729      entry by using the threshold, then we're going subtract logarithms for
00730      each hit. This eases detection of hits, because the only thing to do
00731      is checking for negative sieve entries after the sieving is finished
00732      for a certain interval.
00733    */
00734   
00735   // Initialize sieve with the roughly calculated threshold
00736   // (hint: -> never calculate separate thresholds for each entry, as this slows down performance!)
00737 #ifdef SIEBASM_386
00738   asm_sieb_init(w); // confer the appropriate Assembler-Macro
00739 #else
00740   register int i=PhysicalSieveSize; do SieveArray[--i]=w; while (i);
00741 #endif
00742 
00743   // activate this block to compare the polynomial value and its logval against the actually used threshold
00744 #if 0
00745   {
00746     TSieveElement w_test = static_cast<TSieveElement>(log_SieveEntryMultiplier*Polynom.get_logval(SieveOffset));
00747     TSieveElement FB_Threshold
00748       = static_cast<TSieveElement>(log_SieveEntryMultiplier*Factor_Threshold*log(StaticFactorbaseSettings::BiggestPrime()));
00749     w_test-=FB_Threshold;
00750     if (StaticFactorbaseSettings::StaticPrime(1)==2) w_test-=static_cast<TSieveElement>(log_SieveEntryMultiplier*log(2));
00751     w_test-=SieveControl::GetLogDirtyCorrection();
00752 
00753     static unsigned long int overall_intervals = 0;
00754     static unsigned long int differing_intervals = 0;
00755 
00756     ++overall_intervals;
00757     if (w_test!=w)
00758      {
00759        ++differing_intervals;
00760        cout << "Thresholds bad/all: " << differing_intervals << "/" << overall_intervals << " ";
00761        cout << "Offset: " << SieveOffset;
00762     #if TSIEVEELEMENTSIZE == 1
00763        // chars are not printed as numbers, so we cast them:
00764        cout << " -> logval: " << static_cast<signed int>(w_test)
00765             << " <-> threshold: " << static_cast<signed int>(w) << endl;
00766     #else
00767        cout << " -> logval: " << w_test << " <-> threshold: " << w << endl;
00768     #endif
00769      }
00770   }
00771 #endif
00772 }
00773 
00774 
00775 void SieveControl::create_sieve_image(int* &Image, int mySieveOffset)
00776 {
00777 #ifdef VERBOSE
00778   cout << "creating MPQS physical sieve image for SieveOffset " << SieveOffset << endl;
00779 #endif
00780   if (Image==NULL) Image = new int[PhysicalSieveSize];
00781   const TSieveElement FB_Threshold
00782     = static_cast<TSieveElement>(log_SieveEntryMultiplier*Factor_Threshold*log(StaticFactorbaseSettings::BiggestPrime()));
00783   for (int i=0; i<PhysicalSieveSize; ++i)
00784    {
00785      register TSieveElement w = static_cast<TSieveElement>(log_SieveEntryMultiplier*Polynom.get_logval(mySieveOffset+i));
00786      w-=FB_Threshold;
00787      if (StaticFactorbaseSettings::StaticPrime(1)==2) w-=static_cast<TSieveElement>(log_SieveEntryMultiplier*log(2));
00788      w-=GetRawLogThreshold();
00789      Image[i]=w;
00790    }
00791 }
00792 
00793 
00794 void initialize_Sieve(const int Image[])
00795 {
00796   //cout << "Initializing sieve from " << SieveOffset
00797   //     << " to " << SieveOffset+PhysicalSieveSize << endl;
00798 
00799   // for each "false positive hit" the dirtiness gets decreased,
00800   // and for each physical interval the dirtiness is increased,
00801   // -> adaptive control system for dirtiness
00802   SieveControl::RequestMoreDirtiness();
00803 
00804 #if (defined(ASM_MMX) || defined(ASM_X86_64)) && (TSIEVEELEMENTSIZE == 1)
00805  #ifdef DEBUG
00806   #warning "Initialize Sieve: MMX Sieve Image copy activated."
00807  #endif
00808  // about using prefetchnta: it's not a pure MMX instruction, but a streaming extension;
00809  // also if the sieve image will fit into cache, "prefetchnta" is counterproductive.
00810   asm volatile(\
00811    "movd %%eax,%%mm0 # -> (x,x,x,x,x,x,x,a) \n\t" \
00812    "punpcklbw %%mm0,%%mm0 # (x,x,x,x,x,x,a,a) \n\t" \
00813    "movl %[count],%%eax \n\t" \
00814    "punpcklbw %%mm0,%%mm0 # (x,x,x,x,a,a,a,a) \n\t" \
00815    "punpcklbw %%mm0,%%mm0 # (a,a,a,a,a,a,a,a) \n\t" \
00816    ".balign 16 \n\t" \
00817    "1: \n\t" \
00818    "#prefetchnta 200(%[Image]) # this is an extension, not pure MMX \n\t" \
00819    "movq (%[Image]),%%mm1 \n\t" \
00820    "movq 8(%[Image]),%%mm2 \n\t" \
00821    "movq 16(%[Image]),%%mm3 \n\t" \
00822    "movq 24(%[Image]),%%mm4 \n\t" \
00823    "paddb %%mm0,%%mm1 \n\t" \
00824    "paddb %%mm0,%%mm2 \n\t" \
00825    "paddb %%mm0,%%mm3 \n\t" \
00826    "paddb %%mm0,%%mm4 \n\t" \
00827    "movq %%mm1,(%[Sieve]) \n\t" \
00828    "movq %%mm2,8(%[Sieve]) \n\t" \
00829    "movq %%mm3,16(%[Sieve]) \n\t" \
00830    "movq %%mm4,24(%[Sieve]) \n\t" \
00831    "movq 32(%[Image]),%%mm1 \n\t" \
00832    "movq 40(%[Image]),%%mm2 \n\t" \
00833    "movq 48(%[Image]),%%mm3 \n\t" \
00834    "movq 56(%[Image]),%%mm4 \n\t" \
00835    "paddb %%mm0,%%mm1 \n\t" \
00836    "paddb %%mm0,%%mm2 \n\t" \
00837    "paddb %%mm0,%%mm3 \n\t" \
00838    "paddb %%mm0,%%mm4 \n\t" \
00839    "movq %%mm1,32(%[Sieve]) \n\t" \
00840    "movq %%mm2,40(%[Sieve]) \n\t" \
00841    "movq %%mm3,48(%[Sieve]) \n\t" \
00842    "movq %%mm4,56(%[Sieve]) \n\t" \
00843    "add $64,%[Image] \n\t" \
00844    "add $64,%[Sieve] \n\t" \
00845    "sub $64,%%eax \n\t" \
00846    "jnz 1b \n\t" \
00847    "emms \n" \
00848    : "=D" (clobbered_int), "=S" (clobbered_int), "=a" (clobbered_int) /* these registers are clobbered, but we needed the input... */
00849    : [threshold] "a" (SieveControl::log_Threshold), [Image] "S" (Image),
00850      [Sieve] "D" (SieveArray), [count] "i" (PhysicalSieveSize)
00851    : "cc", "memory", "mm0", "mm1", "mm2", "mm3", "mm4");
00852 #else
00853   // generic version
00854   register TSieveElement w = SieveControl::log_Threshold;
00855   register int i=PhysicalSieveSize-1;
00856   do
00857    {
00858      register TSieveElement x=w+Image[i];
00859      SieveArray[i]=x;
00860    } while (--i>=0);
00861 #endif
00862 }
00863 
00864 
00865 int CDeltaComputations::LocalPhysInterval_Delta_of_PrimeNumbers[StaticFactorbase::MaxSize][2] __attribute__ ((aligned (16)));
00866 int CDeltaComputations::LocalPhysInterval_Delta_of_PrimePowers[StaticFactorbase::max_additional_Powers][2] __attribute__ ((aligned (16)));
00867 
00868 
00869 void CSieveStaticFactors::init_LocalDeltas()
00870 {
00871   SieveControl::RenewThresholds();
00872   // determine the first hit for each primenumber in the first physical sieve interval
00873   for (int nr=StaticFactorbase::Size()-1; nr>=SieveControl::FBStartIndex(); --nr) 
00874     {
00875       register int Primzahl=PrimeNumbers[nr];
00876       register int d0,d1;
00877       {
00878         register int h=normalized_signed_mod(SieveOffset,Primzahl,PrimeNumberReciprocals[nr]); // damned signed %-Operation...
00879         d0=Delta_of_PrimeNumbers[nr][0]-h;
00880         d1=Delta_of_PrimeNumbers[nr][1]-h;
00881         if (d1<0)
00882           d1+=Primzahl;
00883         if (d0<0)
00884           d0+=Primzahl;
00885       }
00886       LocalPhysInterval_Delta_of_PrimeNumbers[nr][0]=d0;
00887       LocalPhysInterval_Delta_of_PrimeNumbers[nr][1]=d1;
00888     }
00889 
00890   // higher powers of primes
00891   for (int nr=0; nr<NumberOf_more_PrimePowers; ++nr)
00892     {
00893       register int PPotenz = PrimePowers[nr];
00894       register int d0,d1;
00895       {
00896         register int h=normalized_signed_mod(SieveOffset,PPotenz,PrimePowerReciprocals[nr]); // damned signed %-Operation...
00897         d0=Delta_of_PrimePowers[nr][0]-h;
00898         d1=Delta_of_PrimePowers[nr][1]-h;
00899         if (d1<0)
00900           d1+=PPotenz;
00901         if (d0<0)
00902           d0+=PPotenz;
00903       }
00904       LocalPhysInterval_Delta_of_PrimePowers[nr][0]=d0;
00905       LocalPhysInterval_Delta_of_PrimePowers[nr][1]=d1;
00906     }
00907 }
00908 
00909 
00910 void CSieveStaticFactors::create_Tables_for_init_LocalDeltas(int* &Table1, int* &Table2, const int SieveOffset)
00911 {
00912   if (Table1==NULL) Table1 = new int[StaticFactorbase::Size()];
00913   if (Table2==NULL) Table2 = new int[StaticFactorbase::Size()];
00914   for (int nr=StaticFactorbase::Size()-1; nr>=SieveControl::FBStartIndex(); --nr) 
00915     Table1[nr]=normalized_signed_mod(SieveOffset,PrimeNumbers[nr],PrimeNumberReciprocals[nr]); // damned signed %-Operation...
00916 
00917   // higher powers of primes
00918   for (int nr=0; nr<NumberOf_more_PrimePowers; ++nr)
00919     Table2[nr]=normalized_signed_mod(SieveOffset,PrimePowers[nr],PrimePowerReciprocals[nr]); // damned signed %-Operation...
00920 }
00921 
00922 
00923 void CSieveStaticFactors::init_LocalDeltas(int const Table1[], int const Table2[])
00924 {
00925   SieveControl::RenewThresholds();
00926   // determine the first hit for each primenumber in the first physical sieve interval
00927   for (int nr=StaticFactorbase::Size()-1; nr>=SieveControl::FBStartIndex(); --nr) 
00928     {
00929       register int Primzahl=PrimeNumbers[nr];
00930       register int d0,d1;
00931       {
00932         register int h=Table1[nr]; // = normalized_signed_mod(SieveOffset,Primzahl);
00933         d0=Delta_of_PrimeNumbers[nr][0]-h;
00934         d1=Delta_of_PrimeNumbers[nr][1]-h;
00935         if (d1<0)
00936           d1+=Primzahl;
00937         if (d0<0)
00938           d0+=Primzahl;
00939       }
00940       LocalPhysInterval_Delta_of_PrimeNumbers[nr][0]=d0;
00941       LocalPhysInterval_Delta_of_PrimeNumbers[nr][1]=d1;
00942     }
00943 
00944   // higher powers of primes
00945   for (int nr=0; nr<NumberOf_more_PrimePowers; ++nr)
00946     {
00947       register int PPotenz = PrimePowers[nr];
00948       register int d0,d1;
00949       {
00950         register int h=Table2[nr]; // = normalized_signed_mod(SieveOffset,PPotenz);
00951         d0=Delta_of_PrimePowers[nr][0]-h;
00952         d1=Delta_of_PrimePowers[nr][1]-h;
00953         if (d1<0)
00954           d1+=PPotenz;
00955         if (d0<0)
00956           d0+=PPotenz;
00957       }
00958       LocalPhysInterval_Delta_of_PrimePowers[nr][0]=d0;
00959       LocalPhysInterval_Delta_of_PrimePowers[nr][1]=d1;
00960     }
00961 }
00962 
00963 
00964 void CSieveStaticFactors::do_sieving_StaticFactors()
00965 {
00966   //cout << "Sieving with primes of static FB: " << SieveOffset<< endl;
00967   
00968   /* We do not sieve with the first couples of primes of the static factorbase.
00969      We want to sieve fast, quick and dirty! 
00970 
00971      The lower the first sieving prime, the slower the sieving! But this
00972      introduces errors which must be compensated!
00973        -> increase Factor_Threshold slightly if necessary.
00974        -> sieve with powers of primes (so that the interesting hits can be corrected anyway)
00975 
00976      CAUTION: Do not define a too high value for shortcut! Otherwise many relations remain undetected!
00977      Attention! Starting value needs to be at least "2" to guarantee that -1 and 2 will be ignored! (For any other
00978                 elements of the static factorbase there are exactly two Deltas defined. But not for -1 and 2...)
00979                 -> relevant for looping the smaller prime numbers
00980   */
00981 
00982   // standard code, but uses also various assembler optimizations
00983 
00984   register long int nr = StaticFactorbase::Size()-1;
00985 
00986 #if 1 && defined(SIEBASM_386) && (TSIEVEELEMENTSIZE == 1) && defined(ASM_X86_64)
00987 // code containing cmov optimizations plus prefetching for Athlon 64
00988 // FIXME: prefetch need to be tuned according to cpu speed and memory timing,
00989 // optimal settings can speed up sieving for more than 10% !
00990 // TODO: this is adapted for Athlon-64 64bit mode, but not yet optimized...
00991 // at the moment this code seems to be slighly slower for small factorbases than the generic code!
00992  #ifdef DEBUG
00993   #warning "experimental (cmov) static factor sieving code X86_64"
00994  #endif
00995   asm volatile ( \
00996        "movb (%[SieveWeight],%[nr]),%%al \n\t" \
00997        "movb $1,%%ah # position of next run length \n\t" \
00998        "bswap %%eax # eax holds 3 different values\n\t" \
00999        "movw (%[LogValRuns]),%%ax # first run-length \n\t" \
01000        "movl (%[PrimeNumbers],%[nr],4),%%edi \n\t" \
01001        "incw %%ax \n\t" \
01002        "jmp 2f \n\t" \
01003        ".balign 16 \n\t" \
01004        "1: movl (%[Deltas],%[nr],8),%%esi\n\t" \
01005        "prefetchw -256(%[Deltas],%[nr],8) # tuning cache!! \n\t" \
01006        "prefetch -128(%[PrimeDiffs],%[nr],2)# tuning cache!! \n\t" \
01007        "bswap %%eax \n\t" \
01008        "movl %%esi,%%edx\n\t" \
01009        "subl %%ecx,%%esi\n\t" \
01010        "cmovnsl %%ecx,%%edx\n\t" \
01011        "movzx %%cl,%%ecx # assuming cl=0\n\t" \
01012        "movsx %%edx,%%rdx \n\t" \
01013        "cmovsl %%edi,%%ecx\n\t" \
01014        "subb %%al,(%[feld],%%rdx)\n\t" \
01015        "addl %%ecx,%%esi\n\t" \
01016        "movl %[PhSiSi],%%ecx\n\t" \
01017        "movl %%esi,(%[Deltas],%[nr],8)\n\t" \
01018        "movl 4(%[Deltas],%[nr],8),%%esi \n\t" \
01019        "movl %%esi,%%edx\n\t" \
01020        "subl %%ecx,%%esi\n\t" \
01021        "cmovnsl %%ecx,%%edx\n\t" \
01022        "movzx %%cl,%%ecx # assuming cl=0\n\t" \
01023        "movsx %%edx,%%rdx \n\t" \
01024        "cmovsl %%edi,%%ecx\n\t" \
01025        "subb %%al,(%[feld],%%rdx)\n\t" \
01026        "addl %%ecx,%%esi\n\t" \
01027        "movl %%esi,4(%[Deltas],%[nr],8)\n\t" \
01028        "bswap %%eax \n\t" \
01029        "movzwl (%[PrimeDiffs],%[nr],2),%%edx\n\t" \
01030        "dec %[nr] \n\t" \
01031        "subl %%edx,%%edi \n\t" \
01032        "2: movl %[PhSiSi],%%ecx\n\t" \
01033        "decw %%ax \n\t " \
01034        "jnz 1b \n\t" \
01035        "bswap %%eax \n\t" \
01036        "movzx %%ah,%%edx \n\t" \
01037        "dec %%al # LogVal for run \n\t" \
01038        "inc %%ah # next run \n\t" \
01039        "movsx %%edx,%%rdx \n\t" \
01040        "bswap %%eax \n\t" \
01041        "movw (%[LogValRuns],%%rdx,2),%%ax # run length \n\t" \
01042        "test %%ax,%%ax \n\t" \
01043        "jnz 1b \n" \
01044        : [nr] "+r" (nr)
01045        : [SieveWeight] "r" (&SieveControl::log_PrimeNumbers[0]),
01046          [PrimeNumbers] "r" (&PrimeNumbers[0]),
01047          [PrimeDiffs] "r" (&PrimeNumberDiffs[0]),
01048          [Deltas] "r" (&LocalPhysInterval_Delta_of_PrimeNumbers[0][0]),
01049          [feld] "r" (&SieveArray[0]),
01050          [LogValRuns] "r" (&LogValRuns[0]),
01051          [PhSiSi] "i" (PhysicalSieveSize)
01052        : "cc", "memory", "rax", "rcx", "rdx", "rsi", "rdi");
01053 
01054 #elif 1 && defined(SIEBASM_386) && (TSIEVEELEMENTSIZE == 1) && defined(ASM_ATHLON)
01055 // code containing cmov optimizations plus prefetching for Athlon
01056 // Hint: MMX code (with prefetching) for Athlon seems to be very fast, too!
01057 //       So you may want to skip this block and use MMX instead.
01058 
01059 // FIXME: prefetch need to be tuned according to cpu speed and memory timing,
01060 // optimal settings can speed up sieving for more than 10% !
01061 // Athlon 1.2 Ghz, SD-RAM:
01062 //   prefetcht2 %[Deltas]-256(,%[nr],8) # tuning cache!!
01063 //   prefetcht2 %[PrimeDiffs]-128(,%[nr],2)# tuning cache!!
01064 // Athlon-XP 1.75 Ghz, Athlon64 (32bit mode) 1.8 Ghz, DDR-RAM:
01065 //   prefetch %[Deltas]-128(,%[nr],8) # tuning cache!!
01066 //   prefetch %[PrimeDiffs]-64(,%[nr],2)# tuning cache!!
01067 // P3,P4:
01068 //   this routine also works on these processors (using prefetcht2; better: not using software prefetch at all),
01069 //   but it runs much slower than a specialized version!
01070  #ifdef DEBUG
01071   #warning "experimental (cmov) static factor sieving code Athlon (and Athlon XP/Athlon-64 (32bit))"
01072  #endif
01073   asm volatile ( \
01074        "movb %[SieveWeight](%[nr]),%%al \n\t" \
01075        "movb $1,%%ah # position of next run length \n\t" \
01076        "bswap %%eax # eax holds 3 different values\n\t" \
01077        "movw %[LogValRuns],%%ax # first run-length \n\t" \
01078        "movl %[PrimeNumbers](,%[nr],4),%%edi \n\t" \
01079        "incw %%ax \n\t" \
01080        "jmp 2f \n\t" \
01081        ".balign 16 \n\t" \
01082        "1: movl %[Deltas](,%[nr],8),%%esi\n\t" \
01083        "prefetchw %[Deltas]-128(,%[nr],8) # tuning cache!! \n\t" \
01084        "prefetch %[PrimeDiffs]-64(,%[nr],2)# tuning cache!! \n\t" \
01085        "bswap %%eax \n\t" \
01086        "movl %%esi,%%edx\n\t" \
01087        "subl %%ecx,%%esi\n\t" \
01088        "cmovnsl %%ecx,%%edx\n\t" \
01089        "movzx %%cl,%%ecx # assuming cl=0\n\t" \
01090        "cmovsl %%edi,%%ecx\n\t" \
01091        "subb %%al,%[feld](%%edx)\n\t" \
01092        "addl %%ecx,%%esi\n\t" \
01093        "movl %[PhSiSi],%%ecx\n\t" \
01094        "movl %%esi,%[Deltas](,%[nr],8)\n\t" \
01095        "movl %[Deltas]+4(,%[nr],8),%%esi \n\t" \
01096        "movl %%esi,%%edx\n\t" \
01097        "subl %%ecx,%%esi\n\t" \
01098        "cmovnsl %%ecx,%%edx\n\t" \
01099        "movzx %%cl,%%ecx # assuming cl=0\n\t" \
01100        "cmovsl %%edi,%%ecx\n\t" \
01101        "subb %%al,%[feld](%%edx)\n\t" \
01102        "addl %%ecx,%%esi\n\t" \
01103        "movl %%esi,%[Deltas]+4(,%[nr],8)\n\t" \
01104        "bswap %%eax \n\t" \
01105        "movzwl %[PrimeDiffs](,%[nr],2),%%edx\n\t" \
01106        "dec %[nr] \n\t" \
01107        "subl %%edx,%%edi \n\t" \
01108        "2: movl %[PhSiSi],%%ecx\n\t" \
01109        "decw %%ax \n\t " \
01110        "jnz 1b \n\t" \
01111        "bswap %%eax \n\t" \
01112        "movzx %%ah,%%edx \n\t" \
01113        "dec %%al # LogVal for run \n\t" \
01114        "inc %%ah # next run \n\t" \
01115        "bswap %%eax \n\t" \
01116        "movw %[LogValRuns](,%%edx,2),%%ax # run length \n\t" \
01117        "test %%ax,%%ax \n\t" \
01118        "jnz 1b \n" \
01119        : [nr] "+r" (nr)
01120        : [SieveWeight] "o" (SieveControl::log_PrimeNumbers[0]),
01121          [PrimeNumbers] "o" (PrimeNumbers[0]),
01122          [PrimeDiffs] "o" (PrimeNumberDiffs[0]),
01123          [Deltas] "o" (LocalPhysInterval_Delta_of_PrimeNumbers[0][0]),
01124          [feld] "o" (SieveArray[0]),
01125          [LogValRuns] "o" (LogValRuns[0]),
01126          [PhSiSi] "i" (PhysicalSieveSize)
01127        : "cc", "memory", "eax", "ecx", "edx", "esi", "edi");
01128 
01129 #elif 1 && defined(SIEBASM_386) && (TSIEVEELEMENTSIZE == 1) && (defined(ASM_3DNOW) && !defined(ASM_ATHLON))
01130 //   cpu type ------------------------------------------------------>       -- AMD K6 3D --
01131 // MMX variant using specialized 3DNow prefetching code
01132  #ifdef DEBUG
01133   #warning "experimental static factor sieving code (3DNow!) for K6 3D"
01134  #endif
01135   asm volatile ( \
01136        "movl %[PhSiSi],%%edx \n\t" \
01137        "movd %%edx,%%mm7 \n\t" \
01138        "punpckldq %%mm7,%%mm7 # lo/hi->lo/lo \n\t" \
01139        "jmp 2f \n\t" \
01140        ".balign 32,,8 \n\t" \
01141        "1: movq %[Deltas]-8(,%[nr],8),%%mm0\n\t" \
01142        "movq %[Deltas](,%[nr],8),%%mm1\n\t" \
01143        "pcmpgtd %%mm4,%%mm4 # set mm4 to 0xffff... \n\t" \
01144        "pcmpgtd %%mm5,%%mm5 # set mm5 to 0xffff... \n\t" \
01145        "movq %[PrimeNumbers]-4(,%[nr],4),%%mm2 \n\t" \
01146        "psubd %%mm7,%%mm0 \n\t" \
01147        "movw %[SieveWeight]-1(%[nr]),%%ax\n\t" \
01148        "sub $2,%[nr] \n\t" \
01149        "movq %%mm2,%%mm3 \n\t" \
01150        "psubd %%mm7,%%mm1 \n\t" \
01151        "punpckldq %%mm2,%%mm2 # lo/hi->lo/lo \n\t" \
01152        "pcmpgtd %%mm0,%%mm4 \n\t" \
01153        "punpckhdq %%mm3,%%mm3 # lo/hi->hi/hi \n\t" \
01154        "pcmpgtd %%mm1,%%mm5 \n\t" \
01155        "pand %%mm4,%%mm2 \n\t" \
01156        "pand %%mm5,%%mm3 \n\t" \
01157        "pand %%mm0,%%mm4 \n\t" \
01158        "pand %%mm1,%%mm5 \n\t" \
01159        "prefetchw %[Deltas]-128(,%[nr],8) # tuning cache!! \n\t" \
01160        "prefetch %[PrimeNumbers]-64(,%[nr],4)# tuning cache!! \n\t" \
01161        "movd %%mm4,%%esi \n\t" \
01162        "movd %%mm5,%%edi \n\t" \
01163        "paddd %%mm2,%%mm0 \n\t" \
01164        "punpckhdq %%mm4,%%mm4 # get hi to lo \n\t" \
01165        "subb %%al,%[feldEnd](%%esi)\n\t" \
01166        "paddd %%mm3,%%mm1 \n\t" \
01167        "punpckhdq %%mm5,%%mm5 # get hi to lo \n\t" \
01168        "subb %%ah,%[feldEnd](%%edi)\n\t" \
01169        "movd %%mm4,%%esi \n\t" \
01170        "movd %%mm5,%%edi \n\t" \
01171        "movq %%mm0,%[Deltas]+8(,%[nr],8)\n\t" \
01172        "movq %%mm1,%[Deltas]+16(,%[nr],8)\n\t" \
01173        "subb %%al,%[feldEnd](%%esi)\n\t" \
01174        "subb %%ah,%[feldEnd](%%edi)\n\t" \
01175        "2: cmpl %%edx,%[PrimeNumbers]-4(,%[nr],4)\n\t" \
01176        "jge 1b \n\t" \
01177        "cmpl %%edx,%[PrimeNumbers](,%[nr],4)\n\t" \
01178        "jl 3f \n\t" \
01179        "movq %[Deltas](,%[nr],8),%%mm1\n\t" \
01180        "pcmpgtd %%mm5,%%mm5 # set mm5 to 0xffff... \n\t" \
01181        "movq %[PrimeNumbers](,%[nr],4),%%mm3 \n\t" \
01182        "movb %[SieveWeight](%[nr]),%%al\n\t" \
01183        "psubd %%mm7,%%mm1 \n\t" \
01184        "punpckldq %%mm3,%%mm3 # lo/hi->lo/lo \n\t" \
01185        "pcmpgtd %%mm1,%%mm5 \n\t" \
01186        "pand %%mm5,%%mm3 \n\t" \
01187        "pand %%mm1,%%mm5 \n\t" \
01188        "movd %%mm5,%%esi \n\t" \
01189        "paddd %%mm3,%%mm1 \n\t" \
01190        "punpckhdq %%mm5,%%mm5 # get hi to lo \n\t" \
01191        "movd %%mm5,%%edi \n\t" \
01192        "movq %%mm1,%[Deltas](,%[nr],8)\n\t" \
01193        "subb %%al,%[feldEnd](%%esi)\n\t" \
01194        "subb %%al,%[feldEnd](%%edi)\n\t" \
01195        "sub $1,%[nr] \n\t" \
01196        "3: femms \n" \
01197        : [nr] "+r" (nr)
01198        : [SieveWeight] "o" (SieveControl::log_PrimeNumbers[0]),
01199          [PrimeNumbers] "o" (PrimeNumbers[0]),
01200          [Deltas] "o" (LocalPhysInterval_Delta_of_PrimeNumbers[0][0]),
01201          [feldEnd] "o" (SieveArray[PhysicalSieveSize]),
01202          [PhSiSi] "i" (PhysicalSieveSize)
01203        : "cc", "memory", "eax", "edx", "esi", "edi", "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7");
01204 
01205 #elif 1 && defined(SIEBASM_386) && (TSIEVEELEMENTSIZE == 1) && (defined(ASM_ATHLON) || defined(ASM_SSE))
01206 // code containing optimizations for MMX (plus prefetching) machines
01207  #ifdef DEBUG
01208   #warning "experimental static factor sieving code for MMX (plus prefetching MMX-ext)"
01209  #endif
01210   asm volatile ( \
01211        "movl %[PhSiSi],%%edx \n\t" \
01212        "movd %%edx,%%mm7 \n\t" \
01213        "punpckldq %%mm7,%%mm7 # lo/hi->lo/lo \n\t" \
01214        "jmp 2f \n\t" \
01215        ".balign 16 \n\t" \
01216        "1: movq %[Deltas]-8(,%[nr],8),%%mm0\n\t" \
01217        "movq %[Deltas](,%[nr],8),%%mm1\n\t" \
01218        "pcmpgtd %%mm4,%%mm4 # set mm4 to 0xffff... \n\t" \
01219        "pcmpgtd %%mm5,%%mm5 # set mm5 to 0xffff... \n\t" \
01220        "movq %[PrimeNumbers]-4(,%[nr],4),%%mm2 \n\t" \
01221        "movw %[SieveWeight]-1(%[nr]),%%ax\n\t" \
01222        "sub $2,%[nr] \n\t" \
01223        "movq %%mm2,%%mm3 \n\t" \
01224        "psubd %%mm7,%%mm0 \n\t" \
01225        "psubd %%mm7,%%mm1 \n\t" \
01226        "punpckldq %%mm2,%%mm2 # lo/hi->lo/lo \n\t" \
01227        "punpckhdq %%mm3,%%mm3 # lo/hi->hi/hi \n\t" \
01228        "pcmpgtd %%mm0,%%mm4 \n\t" \
01229        "pcmpgtd %%mm1,%%mm5 \n\t" \
01230        "pand %%mm4,%%mm2 \n\t" \
01231        "pand %%mm5,%%mm3 \n\t" \
01232        "pand %%mm0,%%mm4 \n\t" \
01233        "pand %%mm1,%%mm5 \n\t" \
01234        "prefetcht2 %[Deltas]-256(,%[nr],8) # tuning cache!! \n\t" \
01235        "prefetcht2 %[PrimeNumbers]-128(,%[nr],4)# tuning cache!! \n\t" \
01236        "movd %%mm4,%%esi \n\t" \
01237        "movd %%mm5,%%edi \n\t" \
01238        "paddd %%mm2,%%mm0 \n\t" \
01239        "punpckhdq %%mm4,%%mm4 # get hi to lo \n\t" \
01240        "paddd %%mm3,%%mm1 \n\t" \
01241        "punpckhdq %%mm5,%%mm5 # get hi to lo \n\t" \
01242        "subb %%al,%[feldEnd](%%esi)\n\t" \
01243        "subb %%ah,%[feldEnd](%%edi)\n\t" \
01244        "movd %%mm4,%%esi \n\t" \
01245        "movd %%mm5,%%edi \n\t" \
01246        "movq %%mm0,%[Deltas]+8(,%[nr],8)\n\t" \
01247        "movq %%mm1,%[Deltas]+16(,%[nr],8)\n\t" \
01248        "subb %%al,%[feldEnd](%%esi)\n\t" \
01249        "subb %%ah,%[feldEnd](%%edi)\n\t" \
01250        "2: cmpl %%edx,%[PrimeNumbers]-4(,%[nr],4)\n\t" \
01251        "jge 1b \n\t" \
01252        "cmpl %%edx,%[PrimeNumbers](,%[nr],4)\n\t" \
01253        "jl 3f \n\t" \
01254        "movq %[Deltas](,%[nr],8),%%mm1\n\t" \
01255        "pcmpgtd %%mm5,%%mm5 # set mm5 to 0xffff... \n\t" \
01256        "movq %[PrimeNumbers](,%[nr],4),%%mm3 \n\t" \
01257        "movb %[SieveWeight](%[nr]),%%al\n\t" \
01258        "psubd %%mm7,%%mm1 \n\t" \
01259        "punpckldq %%mm3,%%mm3 # lo/hi->lo/lo \n\t" \
01260        "pcmpgtd %%mm1,%%mm5 \n\t" \
01261        "pand %%mm5,%%mm3 \n\t" \
01262        "pand %%mm1,%%mm5 \n\t" \
01263        "movd %%mm5,%%esi \n\t" \
01264        "paddd %%mm3,%%mm1 \n\t" \
01265        "punpckhdq %%mm5,%%mm5 # get hi to lo \n\t" \
01266        "movd %%mm5,%%edi \n\t" \
01267        "movq %%mm1,%[Deltas](,%[nr],8)\n\t" \
01268        "subb %%al,%[feldEnd](%%esi)\n\t" \
01269        "subb %%al,%[feldEnd](%%edi)\n\t" \
01270        "sub $1,%[nr] \n\t" \
01271        "3: emms \n" \
01272        : [nr] "+r" (nr)
01273        : [SieveWeight] "o" (SieveControl::log_PrimeNumbers[0]),
01274          [PrimeNumbers] "o" (PrimeNumbers[0]),
01275          [Deltas] "o" (LocalPhysInterval_Delta_of_PrimeNumbers[0][0]),
01276          [feldEnd] "o" (SieveArray[PhysicalSieveSize]),
01277          [PhSiSi] "i" (PhysicalSieveSize)
01278        : "cc", "memory", "eax", "edx", "esi", "edi", "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7");
01279 
01280 #elif 1 && defined(SIEBASM_386) && (TSIEVEELEMENTSIZE == 1) && defined(ASM_MMX)
01281 // code containing optimizations for MMX compatible machines
01282  #ifdef DEBUG
01283   #warning "experimental static factor sieving code for MMX compatible machines"
01284  #endif
01285   asm volatile ( \
01286        "movl %[PhSiSi],%%edx \n\t" \
01287        "movd %%edx,%%mm7 \n\t" \
01288        "punpckldq %%mm7,%%mm7 # lo/hi->lo/lo \n\t" \
01289        "jmp 2f \n\t" \
01290        "1: movq %[Deltas]-8(,%[nr],8),%%mm0\n\t" \
01291        "movq %[Deltas](,%[nr],8),%%mm1\n\t" \
01292        "pcmpgtd %%mm4,%%mm4 # set mm4 to 0xffff... \n\t" \
01293        "pcmpgtd %%mm5,%%mm5 # set mm5 to 0xffff... \n\t" \
01294        "movq %[PrimeNumbers]-4(,%[nr],4),%%mm2 \n\t" \
01295        "movw %[SieveWeight]-1(%[nr]),%%ax\n\t" \
01296        "sub $2,%[nr] \n\t" \
01297        "movq %%mm2,%%mm3 \n\t" \
01298        "psubd %%mm7,%%mm0 \n\t" \
01299        "psubd %%mm7,%%mm1 \n\t" \
01300        "punpckldq %%mm2,%%mm2 # lo/hi->lo/lo \n\t" \
01301        "punpckhdq %%mm3,%%mm3 # lo/hi->hi/hi \n\t" \
01302        "pcmpgtd %%mm0,%%mm4 \n\t" \
01303        "pcmpgtd %%mm1,%%mm5 \n\t" \
01304        "pand %%mm4,%%mm2 \n\t" \
01305        "pand %%mm5,%%mm3 \n\t" \
01306        "pand %%mm0,%%mm4 \n\t" \
01307        "pand %%mm1,%%mm5 \n\t" \
01308        "#prefetcht2 %[Deltas]-256(,%[nr],8) # tuning cache!! \n\t" \
01309        "#prefetcht2 %[PrimeNumbers]-128(,%[nr],4)# tuning cache!! \n\t" \
01310        "movd %%mm4,%%esi \n\t" \
01311        "movd %%mm5,%%edi \n\t" \
01312        "paddd %%mm2,%%mm0 \n\t" \
01313        "punpckhdq %%mm4,%%mm4 # get hi to lo \n\t" \
01314        "paddd %%mm3,%%mm1 \n\t" \
01315        "punpckhdq %%mm5,%%mm5 # get hi to lo \n\t" \
01316        "subb %%al,%[feldEnd](%%esi)\n\t" \
01317        "subb %%ah,%[feldEnd](%%edi)\n\t" \
01318        "movd %%mm4,%%esi \n\t" \
01319        "movd %%mm5,%%edi \n\t" \
01320        "movq %%mm0,%[Deltas]+8(,%[nr],8)\n\t" \
01321        "movq %%mm1,%[Deltas]+16(,%[nr],8)\n\t" \
01322        "subb %%al,%[feldEnd](%%esi)\n\t" \
01323        "subb %%ah,%[feldEnd](%%edi)\n\t" \
01324        "2: cmpl %%edx,%[PrimeNumbers]-4(,%[nr],4)\n\t" \
01325        "jge 1b \n\t" \
01326        "cmpl %%edx,%[PrimeNumbers](,%[nr],4)\n\t" \
01327        "jl 3f \n\t" \
01328        "movq %[Deltas](,%[nr],8),%%mm1\n\t" \
01329        "pcmpgtd %%mm5,%%mm5 # set mm5 to 0xffff... \n\t" \
01330        "movq %[PrimeNumbers](,%[nr],4),%%mm3 \n\t" \
01331        "movb %[SieveWeight](%[nr]),%%al\n\t" \
01332        "psubd %%mm7,%%mm1 \n\t" \
01333        "punpckldq %%mm3,%%mm3 # lo/hi->lo/lo \n\t" \
01334        "pcmpgtd %%mm1,%%mm5 \n\t" \
01335        "pand %%mm5,%%mm3 \n\t" \
01336        "pand %%mm1,%%mm5 \n\t" \
01337        "movd %%mm5,%%esi \n\t" \
01338        "paddd %%mm3,%%mm1 \n\t" \
01339        "punpckhdq %%mm5,%%mm5 # get hi to lo \n\t" \
01340        "movd %%mm5,%%edi \n\t" \
01341        "movq %%mm1,%[Deltas](,%[nr],8)\n\t" \
01342        "subb %%al,%[feldEnd](%%esi)\n\t" \
01343        "subb %%al,%[feldEnd](%%edi)\n\t" \
01344        "sub $1,%[nr] \n\t" \
01345        "3: emms \n" \
01346        : [nr] "+r" (nr)
01347        : [SieveWeight] "o" (SieveControl::log_PrimeNumbers[0]),
01348          [PrimeNumbers] "o" (PrimeNumbers[0]),
01349          [Deltas] "o" (LocalPhysInterval_Delta_of_PrimeNumbers[0][0]),
01350          [feldEnd] "o" (SieveArray[PhysicalSieveSize]),
01351          [PhSiSi] "i" (PhysicalSieveSize)
01352        : "cc", "memory", "eax", "edx", "esi", "edi", "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7");
01353 
01354 #else
01355   // generic version
01356   // larger prime numbers (maximal 1 hit per Delta) 
01357   for (;; --nr) 
01358     {
01359       register int Primzahl=PrimeNumbers[nr];
01360       if (Primzahl<PhysicalSieveSize) break;
01361 
01362       register TSieveElement log_Primzahl = SieveControl::PrimzahlSieveWeight(nr);
01363       /* i.e. log_Primzahl gets the value (TSieveElement) (log_SieveEntryMultiplier*log(Primzahl)) */
01364 
01365       // d0/d1-calculation
01366       register int d0,d1;
01367       
01368       d0=LocalPhysInterval_Delta_of_PrimeNumbers[nr][0];
01369       d1=LocalPhysInterval_Delta_of_PrimeNumbers[nr][1];
01370 
01371       // hint: PhysicalSieveSize is also a dummy position inside the sieve;
01372       //       writing to a dummy avoids expensive branch instructions.
01373       SieveArray[d0<PhysicalSieveSize ? d0 : PhysicalSieveSize]-=log_Primzahl;
01374       SieveArray[d1<PhysicalSieveSize ? d1 : PhysicalSieveSize]-=log_Primzahl;
01375 
01376       d0-=PhysicalSieveSize; if (d0<0) d0+=Primzahl;
01377       d1-=PhysicalSieveSize; if (d1<0) d1+=Primzahl;
01378 
01379       LocalPhysInterval_Delta_of_PrimeNumbers[nr][0]=d0;
01380       LocalPhysInterval_Delta_of_PrimeNumbers[nr][1]=d1;
01381     }
01382 #endif
01383 
01384   // smaller primes (many hits per Delta)
01385   const int NonDirtyIndex = SieveControl::FBStartIndex();
01386   for (; nr>=NonDirtyIndex; --nr) 
01387     {
01388       register long int Primzahl=PrimeNumbers[nr];
01389       register TSieveElement log_Primzahl = SieveControl::PrimzahlSieveWeight(nr);
01390       /* i.e. log_Primzahl gets the value (TSieveElement) (log_SieveEntryMultiplier*log(Primzahl)) */
01391 
01392       // d0/d1-calculation
01393       register long int d0,d1;
01394       
01395       d0=LocalPhysInterval_Delta_of_PrimeNumbers[nr][0];
01396       d1=LocalPhysInterval_Delta_of_PrimeNumbers[nr][1];
01397 
01398 #ifdef SIEBASM_386
01399       asm_sieb_small(log_Primzahl, d0, d1, Primzahl);
01400 #else
01401       if (d0>d1) { register int h=d0; d0=d1; d1=h; }
01402       // now: d0<=d1
01403       while (d1<PhysicalSieveSize)
01404        {
01405          SieveArray[d0]-=log_Primzahl; d0+=Primzahl;
01406          SieveArray[d1]-=log_Primzahl; d1+=Primzahl;
01407        }
01408       if (d0<PhysicalSieveSize)
01409        {
01410          SieveArray[d0]-=log_Primzahl; d0+=Primzahl;
01411        }
01412 #endif
01413       d0-=PhysicalSieveSize; if (d0<0) d0+=Primzahl;
01414       d1-=PhysicalSieveSize; if (d1<0) d1+=Primzahl;
01415 
01416       LocalPhysInterval_Delta_of_PrimeNumbers[nr][0]=d0;
01417       LocalPhysInterval_Delta_of_PrimeNumbers[nr][1]=d1;
01418     }
01419 
01420   // higher powers of primes
01421   for (nr=0; nr<NumberOf_more_PrimePowers; nr++)
01422     {
01423       register long int PPotenz = PrimePowers[nr];
01424       register TSieveElement log_Primzahl = log_Primzahl_of_PrimePowers[nr];
01425       /* i.e. log_Primzahl gets the value (TSieveElement) (log_SieveEntryMultiplier*log(Primzahl)) */
01426       // notice, that: if we sieve with these primes, then not log(Primzahl^k),
01427       // but only log(Primzahl) has to be subtracted!
01428       // Analogy: If you have already divided by p, then you would ending in dividing by p^3 all together,
01429       //          should you decide to divide by p^2 instead of p!
01430 
01431       // d0/d1-calculation
01432       register long int d0,d1;
01433       
01434       d0=LocalPhysInterval_Delta_of_PrimePowers[nr][0];
01435       d1=LocalPhysInterval_Delta_of_PrimePowers[nr][1];
01436 
01437 #ifdef SIEBASM_386
01438       asm_sieb(log_Primzahl, d0, d1, PPotenz);
01439 #else
01440       if (d0>d1) { register int h=d0; d0=d1; d1=h; }
01441       // now: d0<=d1
01442       while (d1<PhysicalSieveSize)
01443        {
01444          SieveArray[d0]-=log_Primzahl; d0+=PPotenz;
01445          SieveArray[d1]-=log_Primzahl; d1+=PPotenz;
01446        }
01447       if (d0<PhysicalSieveSize)
01448        {
01449          SieveArray[d0]-=log_Primzahl; d0+=PPotenz;
01450        }
01451 #endif
01452       d0-=PhysicalSieveSize; if (d0<0) d0+=PPotenz;
01453       d1-=PhysicalSieveSize; if (d1<0) d1+=PPotenz;
01454 
01455       LocalPhysInterval_Delta_of_PrimePowers[nr][0]=d0;
01456       LocalPhysInterval_Delta_of_PrimePowers[nr][1]=d1;
01457     }
01458 
01459 }
01460 
01461 
01462 void do_scanning_Sieve(void)
01463 {
01464   //cout << "Auslesen des (gesiebten) Siebintervalls" << endl;
01465 
01466   // only a small fraction of sieve entries are expected to be hits!
01467   //  --> search for hits can be accelerated!
01468   // Constraint: the 64 positions below the sieve (SieveArray[-64]..SieveArray[-1]) are already set to -1.
01469   register long int d=PhysicalSieveSize;
01470   do
01471    {
01472 #ifdef SIEBASM_386
01473      asm_search_sieb(d); // search for hits...
01474      //if (d>=0 && SieveArray[d]<0) cout << "hit for position " << d << endl;
01475      //else cout << "??? for " << d << endl; // d==-1 is okay...
01476 #else
01477      while (SieveArray[--d]>=0) { } // search for hit...
01478 #endif
01479      if (d>=0) // check whether the detected hit is a provoked dummy-hit to exit the loop.
01480       {
01481         // a regular hit
01482         if (SieveControl::Hit_after_DirtyCorrection(SieveOffset+d,SieveArray[d]))
01483           StaticRelations::insert(new CRelation(SieveOffset+d)); // insert a possible relation
01484       }
01485    } while (d>0); // as long as more hits are possible: continue search
01486 // Werteverteilung_im_Sieb_ausgeben();
01487 }
01488 
01489 
01490 #if defined(ASM_X86_64) /* special code using SSE2 instructions (X86_64) */
01491 
01492 void CDeltaComputations::GetStaticFactorHits(std::vector<unsigned int> &Factors, const signed int SievePos)
01493 {
01494   // find factors in static factorbase
01495   // (starting with the 3. prime of the FB (see above) )
01496 
01497   // SSE2 specific code
01498  #if 1 //def DEBUG
01499   #warning "X86_64/SSE2 specific code to detect static factors"
01500  #endif
01501   typedef int v4sf __attribute__ ((vector_size (16))); // 16 bytes -> 4 short floats
01502   register v4sf XMM6 asm("xmm6"); // XMM6 is now bound to %xmm6 (intention: sign correction)
01503   register v4sf XMM7 asm("xmm7"); // XMM7 is now bound to %xmm7
01504 
01505    // refer GCC manual, "Specifying registers for local variables";
01506    // but remember: that does not necessarily mean, that %xmm7 cannot be
01507    // used for other variables as well! (It is only guaranteed, that if we access
01508    // XMM7, this access is synonym to accessing %xmm7; but accessing %xmm7 is not necessarily
01509    // synonym to accessing XMM7 (because if the compiler assumes, that XMM7 is dead, %xmm7 could
01510    // be used for other variables; and the compiler does *not* know, that we want
01511    // XMM7 to be alive by accessing %xmm7!).
01512 
01513   asm volatile ( \
01514        "emms                       # enter mmx\n\t" \
01515        "movd %[Sieboffs],%%mm0     # mm0=(0,Sieboffs)\n\t" \
01516        "xorps %[_xmm6],%[_xmm6]    # L2AM   clear mm6 \n\t" \
01517        "punpckldq %%mm0,%%mm0      # L2AM   (0,Sieboffs) -> (Sieboffs,Sieboffs) \n\t" \
01518        "cvtpi2ps %%mm0,%[_xmm7]    # packed int to float \n\t" \
01519        "shufps $0x44,%[_xmm7],%[_xmm7] \n\t" \
01520        "cmpnleps %[_xmm7],%[_xmm6] # L2AM   set bits if signed, clear bits if unsigned" \
01521        : [_xmm7] "=x" (XMM7), [_xmm6] "=x" (XMM6) : [Sieboffs] "rm" (SievePos) : "mm0");
01522 
01523   // PrimeNumbers[0] and PrimeNumbers[1] get left out (since special treatment is necessary)
01524   // PrimeNumbers[2] und PrimeNumbers[3] get left out (so that the remaining number of primes is divisible by 4)
01525   {
01526      register int d = normalized_signed_mod(SievePos,PrimeNumbers[2],PrimeNumberReciprocals[2]);
01527      if (d==Delta_of_PrimeNumbers[2][0] || d==Delta_of_PrimeNumbers[2][1]) Factors.push_back(2);
01528      d = normalized_signed_mod(SievePos,PrimeNumbers[3],PrimeNumberReciprocals[3]);
01529      if (d==Delta_of_PrimeNumbers[3][0] || d==Delta_of_PrimeNumbers[3][1]) Factors.push_back(3);
01530   }
01531   
01532   long int nr;
01533   for (nr=4; nr<StaticFactorbase::Size() && PrimeNumbers[nr]<PhysicalSieveSize; nr+=4) 
01534     {
01535       register unsigned int pflags;
01536       asm ( \
01537        "movaps %[_xmm7],%%xmm2 \n\t" \
01538        "cvtdq2ps (%[divisors],%[i],4),%%xmm1 # convert 4 packed int to float \n\t" \
01539        "mulps (%[FloatRecips],%[i],4),%%xmm2 \n\t" \
01540        "movaps %%xmm1,%%xmm3 \n\t" \
01541        "cvttps2dq %%xmm2,%%xmm0 \n\t" \
01542        "movaps %[_xmm7],%%xmm2 \n\t" \
01543        "cvtdq2ps %%xmm0,%%xmm0 \n\t" \
01544        "mulps %%xmm1,%%xmm0 \n\t" \
01545        "subps %%xmm0,%%xmm2 \n\t" \
01546        "andps %[_xmm6],%%xmm1     # correct sign (part 1) \n\t" \
01547        "addps %%xmm1,%%xmm2       # correct sign (part 2) \n\t" \
01548        "cmpneqps %%xmm2,%%xmm3    #  assure modp=0 instead of modp=p \n\t" \
01549        "andps %%xmm3,%%xmm2   \n\t" \
01550        "cvttps2dq %%xmm2,%%xmm3 \n\t" \
01551        "pshufd $0x50,%%xmm3,%%xmm0 \n\t" \
01552        "pshufd $0xfa,%%xmm3,%%xmm1 \n\t" \
01553        "pcmpeqd (%[Deltas],%[i],8),%%xmm0 \n\t" \
01554        "pcmpeqd 16(%[Deltas],%[i],8),%%xmm1 \n\t" \
01555        "packssdw %%xmm1,%%xmm0 \n\t" \
01556        "pmovmskb %%xmm0,%[pflags] # create bytemask" \
01557        : [pflags] "=r" (pflags)
01558        : [divisors] "r" (&PrimeNumbers[0]), [Deltas] "r" (&Delta_of_PrimeNumbers[0][0]),
01559          [FloatRecips] "r" (&PrimeNumberFloatReciprocals[0]),
01560          [_xmm7] "x" (XMM7), [_xmm6] "x" (XMM6), [i] "r" (nr)
01561        : "xmm0", "xmm1", "xmm2", "xmm3");
01562 
01563       //cout << "SievePos: " << SievePos << ": " << PackedResult[0] << "," << PackedResult[1] << endl;
01564  
01565 #if defined(DEBUG)
01566        {
01567          // check pflags for correct byte mask
01568          int d=normalized_signed_mod(SievePos,PrimeNumbers[nr]);
01569          if ( (d==Delta_of_PrimeNumbers[nr][0]) != ((pflags&0x0003)==0x0003) ) { MARK; cerr << nr << "!!" << (pflags&0x0003) << endl; exit(7); }
01570          if ( (d==Delta_of_PrimeNumbers[nr][1]) != ((pflags&0x000c)==0x000c) ) { MARK; cerr << nr << "!!" << (pflags&0x000c) << endl; exit(7); }
01571          d=normalized_signed_mod(SievePos,PrimeNumbers[nr+1]);
01572          if ( (d==Delta_of_PrimeNumbers[nr+1][0]) != ((pflags&0x0030)==0x0030) ) { MARK; cerr << nr+1 << "!!" << (pflags&0x0030) << endl; exit(7); }
01573          if ( (d==Delta_of_PrimeNumbers[nr+1][1]) != ((pflags&0x00c0)==0x00c0) ) { MARK; cerr << nr+1 << "!!" << (pflags&0x00c0) << endl; exit(7); }
01574          d=normalized_signed_mod(SievePos,PrimeNumbers[nr+2]);
01575          if ( (d==Delta_of_PrimeNumbers[nr+2][0]) != ((pflags&0x0300)==0x0300) ) { MARK; cerr << nr+2 << "!!" << (pflags&0x0300) << endl; exit(7); }
01576          if ( (d==Delta_of_PrimeNumbers[nr+2][1]) != ((pflags&0x0c00)==0x0c00) ) { MARK; cerr << nr+2 << "!!" << (pflags&0x0c00) << endl; exit(7); }
01577          d=normalized_signed_mod(SievePos,PrimeNumbers[nr+3]);
01578          if ( (d==Delta_of_PrimeNumbers[nr+3][0]) != ((pflags&0x3000)==0x3000) ) { MARK; cerr << nr+3 << "!!" << (pflags&0x3000) << endl; exit(7); }
01579          if ( (d==Delta_of_PrimeNumbers[nr+3][1]) != ((pflags&0xc000)==0xc000) ) { MARK; cerr << nr+3 << "!!" << (pflags&0xc000) << endl; exit(7); }
01580        }
01581 #endif
01582       if ( pflags & 0x000f ) Factors.push_back(nr);
01583       if ( pflags & 0x00f0 ) Factors.push_back(nr+1);
01584       if ( pflags & 0x0f00 ) Factors.push_back(nr+2);
01585       if ( pflags & 0xf000 ) Factors.push_back(nr+3);
01586 
01587     }
01588 
01589   const signed int mySievePos = SievePos-SieveOffset-PhysicalSieveSize;
01590   asm volatile ( \
01591        "movd %[mySievePos],%[_xmm7]   # -      mm0=(0,mySiebPos)\n\t" \
01592        "shufps $0x00,%[_xmm7],%[_xmm7] \n\t" \
01593        : [_xmm7] "=x" (XMM7) : [mySievePos] "r" (mySievePos));
01594 
01595   for (; nr<StaticFactorbase::Size(); nr+=4)
01596    {
01597       register unsigned int pflags;
01598       asm ( \
01599        "movdqa (%[PrimeNumbers],%[i],4),%%xmm2   # L2AMS  get y : x (as packed int)\n\t" \
01600        "paddd %[_xmm7],%%xmm2        # L2AM \n\t" \
01601        "movdqa 16(%[LocDeltas],%[i],8),%%xmm3 \n\t" \
01602        "pshufd $0x50,%%xmm2,%%xmm0 \n\t" \
01603        "pshufd $0xfa,%%xmm2,%%xmm1 \n\t" \
01604        "pcmpeqd (%[LocDeltas],%[i],8),%%xmm0 \n\t" \
01605        "pcmpeqd %%xmm3,%%xmm1 \n\t" \
01606        "packssdw %%xmm1,%%xmm0 \n\t" \
01607        "pmovmskb %%xmm0,%[pflags] # create bytemask" \
01608        : [pflags] "=r" (pflags)
01609        : [PrimeNumbers] "r" (&PrimeNumbers[0]), [LocDeltas] "r" (&LocalPhysInterval_Delta_of_PrimeNumbers[0][0]),
01610          [_xmm7] "x" (XMM7), [i] "r" (nr)
01611        : "xmm0", "xmm1", "xmm2", "xmm3");
01612 
01613 #if 0 || defined(DEBUG)
01614        {
01615          // check pflags for correct byte mask
01616          int d = mySievePos+PrimeNumbers[nr];
01617          if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr][0]) != ((pflags&0x0003)==0x0003) ) { MARK; cerr << nr << "!!" << (pflags&0x0003) << endl; exit(7); }
01618          if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr][1]) != ((pflags&0x000c)==0x000c) ) { MARK; cerr << nr << "!!" << (pflags&0x000c) << endl; exit(7); }
01619          d=mySievePos+PrimeNumbers[nr+1];
01620          if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+1][0]) != ((pflags&0x0030)==0x0030) ) { MARK; cerr << nr+1 << "!!" << (pflags&0x0030) << endl; exit(7); }
01621          if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+1][1]) != ((pflags&0x00c0)==0x00c0) ) { MARK; cerr << nr+1 << "!!" << (pflags&0x00c0) << endl; exit(7); }
01622          d=mySievePos+PrimeNumbers[nr+2];
01623          if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+2][0]) != ((pflags&0x0300)==0x0300) ) { MARK; cerr << nr+2 << "!!" << (pflags&0x0300) << endl; exit(7); }
01624          if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+2][1]) != ((pflags&0x0c00)==0x0c00) ) { MARK; cerr << nr+2 << "!!" << (pflags&0x0c00) << endl; exit(7); }
01625          d=mySievePos+PrimeNumbers[nr+3];
01626          if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+3][0]) != ((pflags&0x3000)==0x3000) ) { MARK; cerr << nr+3 << "!!" << (pflags&0x3000) << endl; exit(7); }
01627          if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+3][1]) != ((pflags&0xc000)==0xc000) ) { MARK; cerr << nr+3 << "!!" << (pflags&0xc000) << endl; exit(7); }
01628        }
01629 #endif
01630       if (pflags==0) continue;
01631       if ( pflags & 0x000f ) Factors.push_back(nr);
01632       if ( pflags & 0x00f0 ) Factors.push_back(nr+1);
01633       if ( pflags & 0x0f00 ) Factors.push_back(nr+2);
01634       if ( pflags & 0xf000 ) Factors.push_back(nr+3);
01635    }
01636   asm volatile ("emms              # exit mmx");
01637 
01638 }
01639 // end of SSE2 optimized version, X86_64 mode
01640 
01641 
01642 #elif defined(ASM_SSE2) /* special code using SSE2 instructions (P4,Athlon-64) */
01643 
01644 void CDeltaComputations::GetStaticFactorHits(std::vector<unsigned int> &Factors, const signed int SievePos)
01645 {
01646   // find factors in static factorbase
01647   // (starting with the 3. prime of the FB (see above) )
01648 
01649   // (Pentium4) SSE2 specific code
01650  #ifdef DEBUG
01651   #warning "Pentium4/SSE2 specific code to detect static factors"
01652  #endif
01653   typedef int v4sf __attribute__ ((vector_size (16))); // 16 bytes -> 4 short floats
01654   register v4sf XMM6 asm("xmm6"); // XMM6 is now bound to %xmm6 (intention: sign correction)
01655   register v4sf XMM7 asm("xmm7"); // XMM7 is now bound to %xmm7
01656 
01657    // refer GCC manual, "Specifying registers for local variables";
01658    // but remember: that does not necessarily mean, that %xmm7 cannot be
01659    // used for other variables as well! (It is only guaranteed, that if we access
01660    // XMM7, this access is synonym to accessing %xmm7; but accessing %xmm7 is not necessarily
01661    // synonym to accessing XMM7 (because if the compiler assumes, that XMM7 is dead, %xmm7 could
01662    // be used for other variables; and the compiler does *not* know, that we want
01663    // XMM7 to be alive by accessing %xmm7!).
01664 
01665   asm volatile ( \
01666        "emms                       # enter mmx\n\t" \
01667        "movd %[Sieboffs],%%mm0     # mm0=(0,Sieboffs)\n\t" \
01668        "xorps %[_xmm6],%[_xmm6]    # L2AM   clear mm6 \n\t" \
01669        "punpckldq %%mm0,%%mm0      # L2AM   (0,Sieboffs) -> (Sieboffs,Sieboffs) \n\t" \
01670        "cvtpi2ps %%mm0,%[_xmm7]    # packed int to float \n\t" \
01671        "shufps $0x44,%[_xmm7],%[_xmm7] \n\t" \
01672        "cmpnleps %[_xmm7],%[_xmm6] # L2AM   set bits if signed, clear bits if unsigned" \
01673        : [_xmm7] "=x" (XMM7), [_xmm6] "=x" (XMM6) : [Sieboffs] "rm" (SievePos) : "mm0");
01674 
01675   // PrimeNumbers[0] and PrimeNumbers[1] get left out (since special treatment is necessary)
01676   // PrimeNumbers[2] und PrimeNumbers[3] get left out (so that the remaining number of primes is divisible by 4)
01677   {
01678      register int d = normalized_signed_mod(SievePos,PrimeNumbers[2],PrimeNumberReciprocals[2]);
01679      if (d==Delta_of_PrimeNumbers[2][0] || d==Delta_of_PrimeNumbers[2][1]) Factors.push_back(2);
01680      d = normalized_signed_mod(SievePos,PrimeNumbers[3],PrimeNumberReciprocals[3]);
01681      if (d==Delta_of_PrimeNumbers[3][0] || d==Delta_of_PrimeNumbers[3][1]) Factors.push_back(3);
01682   }
01683   
01684   int nr;
01685   for (nr=4; nr<StaticFactorbase::Size() && PrimeNumbers[nr]<PhysicalSieveSize; nr+=4) 
01686     {
01687       register unsigned int pflags;
01688       asm ( \
01689        "movaps %[_xmm7],%%xmm2 \n\t" \
01690        "cvtdq2ps %[divisors](,%[i],4),%%xmm1 # convert 4 packed int to float \n\t" \
01691        "mulps %[FloatRecips](,%[i],4),%%xmm2 \n\t" \
01692        "movaps %%xmm1,%%xmm3 \n\t" \
01693        "cvttps2dq %%xmm2,%%xmm0 \n\t" \
01694        "movaps %[_xmm7],%%xmm2 \n\t" \
01695        "cvtdq2ps %%xmm0,%%xmm0 \n\t" \
01696        "mulps %%xmm1,%%xmm0 \n\t" \
01697        "subps %%xmm0,%%xmm2 \n\t" \
01698        "andps %[_xmm6],%%xmm1     # correct sign (part 1) \n\t" \
01699        "addps %%xmm1,%%xmm2       # correct sign (part 2) \n\t" \
01700        "cmpneqps %%xmm2,%%xmm3    #  assure modp=0 instead of modp=p \n\t" \
01701        "andps %%xmm3,%%xmm2   \n\t" \
01702        "cvttps2dq %%xmm2,%%xmm3 \n\t" \
01703        "pshufd $0x50,%%xmm3,%%xmm0 \n\t" \
01704        "pshufd $0xfa,%%xmm3,%%xmm1 \n\t" \
01705        "pcmpeqd %[Deltas](,%[i],8),%%xmm0 \n\t" \
01706        "pcmpeqd 16+%[Deltas](,%[i],8),%%xmm1 \n\t" \
01707        "packssdw %%xmm1,%%xmm0 \n\t" \
01708        "pmovmskb %%xmm0,%[pflags] # create bytemask" \
01709        : [pflags] "=r" (pflags)
01710        : [divisors] "o" (PrimeNumbers[0]), [Deltas] "o" (Delta_of_PrimeNumbers[0][0]),
01711          [FloatRecips] "o" (PrimeNumberFloatReciprocals[0]),
01712          [_xmm7] "x" (XMM7), [_xmm6] "x" (XMM6), [i] "r" (nr)
01713        : "xmm0", "xmm1", "xmm2", "xmm3");
01714 
01715       //cout << "SievePos: " << SievePos << ": " << PackedResult[0] << "," << PackedResult[1] << endl;
01716  
01717 #if defined(DEBUG)
01718        {
01719          // check pflags for correct byte mask
01720          int d=normalized_signed_mod(SievePos,PrimeNumbers[nr]);
01721          if ( (d==Delta_of_PrimeNumbers[nr][0]) != ((pflags&0x0003)==0x0003) ) { MARK; cerr << nr << "!!" << (pflags&0x0003) << endl; exit(7); }
01722          if ( (d==Delta_of_PrimeNumbers[nr][1]) != ((pflags&0x000c)==0x000c) ) { MARK; cerr << nr << "!!" << (pflags&0x000c) << endl; exit(7); }
01723          d=normalized_signed_mod(SievePos,PrimeNumbers[nr+1]);
01724          if ( (d==Delta_of_PrimeNumbers[nr+1][0]) != ((pflags&0x0030)==0x0030) ) { MARK; cerr << nr+1 << "!!" << (pflags&0x0030) << endl; exit(7); }
01725          if ( (d==Delta_of_PrimeNumbers[nr+1][1]) != ((pflags&0x00c0)==0x00c0) ) { MARK; cerr << nr+1 << "!!" << (pflags&0x00c0) << endl; exit(7); }
01726          d=normalized_signed_mod(SievePos,PrimeNumbers[nr+2]);
01727          if ( (d==Delta_of_PrimeNumbers[nr+2][0]) != ((pflags&0x0300)==0x0300) ) { MARK; cerr << nr+2 << "!!" << (pflags&0x0300) << endl; exit(7); }
01728          if ( (d==Delta_of_PrimeNumbers[nr+2][1]) != ((pflags&0x0c00)==0x0c00) ) { MARK; cerr << nr+2 << "!!" << (pflags&0x0c00) << endl; exit(7); }
01729          d=normalized_signed_mod(SievePos,PrimeNumbers[nr+3]);
01730          if ( (d==Delta_of_PrimeNumbers[nr+3][0]) != ((pflags&0x3000)==0x3000) ) { MARK; cerr << nr+3 << "!!" << (pflags&0x3000) << endl; exit(7); }
01731          if ( (d==Delta_of_PrimeNumbers[nr+3][1]) != ((pflags&0xc000)==0xc000) ) { MARK; cerr << nr+3 << "!!" << (pflags&0xc000) << endl; exit(7); }
01732        }
01733 #endif
01734       if ( pflags & 0x000f ) Factors.push_back(nr);
01735       if ( pflags & 0x00f0 ) Factors.push_back(nr+1);
01736       if ( pflags & 0x0f00 ) Factors.push_back(nr+2);
01737       if ( pflags & 0xf000 ) Factors.push_back(nr+3);
01738 
01739     }
01740 
01741   const signed int mySievePos = SievePos-SieveOffset-PhysicalSieveSize;
01742   asm volatile ( \
01743        "movd %[mySievePos],%[_xmm7]   # -      mm0=(0,mySiebPos)\n\t" \
01744        "shufps $0x00,%[_xmm7],%[_xmm7] \n\t" \
01745        : [_xmm7] "=x" (XMM7) : [mySievePos] "r" (mySievePos));
01746 
01747   for (; nr<StaticFactorbase::Size(); nr+=4)
01748    {
01749       register unsigned int pflags;
01750       asm ( \
01751        "movdqa %[PrimeNumbers](,%[i],4),%%xmm2   # L2AMS  get y : x (as packed int)\n\t" \
01752        "paddd %[_xmm7],%%xmm2        # L2AM \n\t" \
01753        "movdqa 16+%[LocDeltas](,%[i],8),%%xmm3 \n\t" \
01754        "pshufd $0x50,%%xmm2,%%xmm0 \n\t" \
01755        "pshufd $0xfa,%%xmm2,%%xmm1 \n\t" \
01756        "pcmpeqd %[LocDeltas](,%[i],8),%%xmm0 \n\t" \
01757        "pcmpeqd %%xmm3,%%xmm1 \n\t" \
01758        "packssdw %%xmm1,%%xmm0 \n\t" \
01759        "pmovmskb %%xmm0,%[pflags] # create bytemask" \
01760        : [pflags] "=r" (pflags)
01761        : [PrimeNumbers] "o" (PrimeNumbers[0]), [LocDeltas] "o" (LocalPhysInterval_Delta_of_PrimeNumbers[0][0]),
01762          [_xmm7] "x" (XMM7), [i] "r" (nr)
01763        : "xmm0", "xmm1", "xmm2", "xmm3");
01764 
01765 #if 0 || defined(DEBUG)
01766        {
01767          // check pflags for correct byte mask
01768          int d = mySievePos+PrimeNumbers[nr];
01769          if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr][0]) != ((pflags&0x0003)==0x0003) ) { MARK; cerr << nr << "!!" << (pflags&0x0003) << endl; exit(7); }
01770          if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr][1]) != ((pflags&0x000c)==0x000c) ) { MARK; cerr << nr << "!!" << (pflags&0x000c) << endl; exit(7); }
01771          d=mySievePos+PrimeNumbers[nr+1];
01772          if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+1][0]) != ((pflags&0x0030)==0x0030) ) { MARK; cerr << nr+1 << "!!" << (pflags&0x0030) << endl; exit(7); }
01773          if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+1][1]) != ((pflags&0x00c0)==0x00c0) ) { MARK; cerr << nr+1 << "!!" << (pflags&0x00c0) << endl; exit(7); }
01774          d=mySievePos+PrimeNumbers[nr+2];
01775          if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+2][0]) != ((pflags&0x0300)==0x0300) ) { MARK; cerr << nr+2 << "!!" << (pflags&0x0300) << endl; exit(7); }
01776          if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+2][1]) != ((pflags&0x0c00)==0x0c00) ) { MARK; cerr << nr+2 << "!!" << (pflags&0x0c00) << endl; exit(7); }
01777          d=mySievePos+PrimeNumbers[nr+3];
01778          if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+3][0]) != ((pflags&0x3000)==0x3000) ) { MARK; cerr << nr+3 << "!!" << (pflags&0x3000) << endl; exit(7); }
01779          if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+3][1]) != ((pflags&0xc000)==0xc000) ) { MARK; cerr << nr+3 << "!!" << (pflags&0xc000) << endl; exit(7); }
01780        }
01781 #endif
01782       if (pflags==0) continue;
01783       if ( pflags & 0x000f ) Factors.push_back(nr);
01784       if ( pflags & 0x00f0 ) Factors.push_back(nr+1);
01785       if ( pflags & 0x0f00 ) Factors.push_back(nr+2);
01786       if ( pflags & 0xf000 ) Factors.push_back(nr+3);
01787    }
01788   asm volatile ("emms              # exit mmx");
01789 
01790 }
01791 // end of SSE2 optimized version
01792 
01793 
01794 #elif defined(ASM_SSE) /* special code using SSE instructions (P3,Athlon-XP) */
01795 
01796 void CDeltaComputations::GetStaticFactorHits(std::vector<unsigned int> &Factors, const signed int SievePos)
01797 {
01798   // find factors in static factorbase
01799   // (starting with the 3. prime of the FB (see above) )
01800 
01801   // (Pentium3) SSE specific code
01802  #ifdef DEBUG
01803   #warning "Pentium3/SSE specific code to detect static factors"
01804  #endif
01805   typedef int v4sf __attribute__ ((vector_size (16))); // 16 bytes -> 4 short floats
01806   register v4sf XMM6 asm("xmm6"); // XMM6 is now bound to %xmm6 (intention: sign correction)
01807   register v4sf XMM7 asm("xmm7"); // XMM7 is now bound to %xmm7
01808 
01809    // refer GCC manual, "Specifying registers for local variables";
01810    // but remember: that does not necessarily mean, that %xmm7 cannot be
01811    // used for other variables as well! (It is only guaranteed, that if we access
01812    // XMM7, this access is synonym to accessing %xmm7; but accessing %xmm7 is not necessarily
01813    // synonym to accessing XMM7 (because if the compiler assumes, that XMM7 is dead, %xmm7 could
01814    // be used for other variables; and the compiler does *not* know, that we want
01815    // XMM7 to be alive by accessing %xmm7!).
01816 
01817   asm volatile ( \
01818        "emms                       # enter mmx\n\t" \
01819        "movd %[Sieboffs],%%mm0   # -      mm0=(0,Sieboffs)\n\t" \
01820        "xorps %[_xmm6],%[_xmm6]    # L2AM   clear mm6 \n\t" \
01821        "punpckldq %%mm0,%%mm0  # L2AM   (0,Sieboffs) -> (Sieboffs,Sieboffs) \n\t" \
01822        "cvtpi2ps %%mm0,%[_xmm7]  # packed int to float \n\t" \
01823        "shufps $0x44,%[_xmm7],%[_xmm7] \n\t" \
01824        "cmpnleps %[_xmm7],%[_xmm6] # L2AM   set bits if signed, clear bits if unsigned" \
01825        : [_xmm7] "=x" (XMM7), [_xmm6] "=x" (XMM6) : [Sieboffs] "rm" (SievePos) : "mm0");
01826 
01827   // PrimeNumbers[0] and PrimeNumbers[1] get left out (since special treatment is necessary)
01828   // PrimeNumbers[2] und PrimeNumbers[3] get left out (so that the remaining number of primes is divisible by 4)
01829   {
01830      register int d = normalized_signed_mod(SievePos,PrimeNumbers[2],PrimeNumberReciprocals[2]);
01831      if (d==Delta_of_PrimeNumbers[2][0] || d==Delta_of_PrimeNumbers[2][1]) Factors.push_back(2);
01832      d = normalized_signed_mod(SievePos,PrimeNumbers[3],PrimeNumberReciprocals[3]);
01833      if (d==Delta_of_PrimeNumbers[3][0] || d==Delta_of_PrimeNumbers[3][1]) Factors.push_back(3);
01834   }
01835   
01836   int nr;
01837   for (nr=4; nr<StaticFactorbase::Size() && PrimeNumbers[nr]<PhysicalSieveSize; nr+=4) 
01838     {
01839       register unsigned int pflags, pflags2;
01840       asm ( \
01841        "movaps %[_xmm7],%%xmm2 \n\t" \
01842        "movq %[divisors](,%[i],4),%%mm4   # L2AMS  get y : x (as packed int) \n\t" \
01843        "movq 8+%[divisors](,%[i],4),%%mm5   # L2AMS  get y : x (as packed int) \n\t" \
01844        "cvtpi2ps %%mm4,%%xmm0 # convert packed int to float \n\t" \
01845        "cvtpi2ps %%mm5,%%xmm1 # convert packed int to float \n\t" \
01846        "mulps %[FloatRecips](,%[i],4),%%xmm2 \n\t" \
01847        "movlhps %%xmm1,%%xmm0 \n\t" \
01848        "movaps %%xmm0,%%xmm1 \n\t" \
01849        "movaps %%xmm0,%%xmm3 \n\t" \
01850        "movhlps %%xmm2,%%xmm4 \n\t" \
01851        "cvttps2pi %%xmm2,%%mm0 \n\t" \
01852        "cvttps2pi %%xmm4,%%mm1 \n\t" \
01853        "movaps %[_xmm7],%%xmm2 \n\t" \
01854        "cvtpi2ps %%mm0,%%xmm0 \n\t" \
01855        "cvtpi2ps %%mm1,%%xmm4 \n\t" \
01856        "movlhps %%xmm4,%%xmm0 \n\t" \
01857        "mulps %%xmm1,%%xmm0 \n\t" \
01858        "subps %%xmm0,%%xmm2 \n\t" \
01859        "andps %[_xmm6],%%xmm1      # correct sign (part 1) \n\t" \
01860        "addps %%xmm1,%%xmm2        # correct sign (part 2) \n\t" \
01861        "movaps %%xmm2,%%xmm0 \n\t" \
01862        "cmpneqps %%xmm3,%%xmm0      #  assure modp=0 instead of modp=p \n\t" \
01863        "andps %%xmm0,%%xmm2   \n\t" \
01864        "movhlps %%xmm2,%%xmm4 \n\t" \
01865        "cvttps2pi %%xmm2,%%mm0 \n\t" \
01866        "pshufw $0x44,%%mm0,%%mm1   # L2AM \n\t" \
01867        "pshufw $0xee,%%mm0,%%mm2   # L2AM \n\t" \
01868        "pcmpeqd %[Deltas](,%[i],8),%%mm1   # L2AM \n\t" \
01869        "pcmpeqd 8+%[Deltas](,%[i],8),%%mm2 # L2AM \n\t" \
01870        "packssdw %%mm2,%%mm1       # L2AM  pack comparison results together \n\t" \
01871        "pmovmskb %%mm1,%[pflags]   # L6+ (VectorPath)  into bytemask \n\t" \
01872        "cvttps2pi %%xmm4,%%mm0 \n\t" \
01873        "pshufw $0x44,%%mm0,%%mm1   # L2AM \n\t" \
01874        "pshufw $0xee,%%mm0,%%mm2   # L2AM \n\t" \
01875        "pcmpeqd 16+%[Deltas](,%[i],8),%%mm1 # L2AM \n\t" \
01876        "pcmpeqd 24+%[Deltas](,%[i],8),%%mm2 # L2AM \n\t" \
01877        "packssdw %%mm2,%%mm1       # L2AM  pack comparison results together \n\t" \
01878        "pmovmskb %%mm1,%[pflags2]  # L6+ (VectorPath)  into bytemask" \
01879        : [pflags] "=r" (pflags), [pflags2] "=r" (pflags2)
01880        : [divisors] "o" (PrimeNumbers[0]), [Deltas] "o" (Delta_of_PrimeNumbers[0][0]),
01881          [FloatRecips] "o" (PrimeNumberFloatReciprocals[0]),
01882          [_xmm7] "x" (XMM7), [_xmm6] "x" (XMM6), [i] "r" (nr)
01883        : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
01884 
01885       //cout << "SievePos: " << SievePos << ": " << PackedResult[0] << "," << PackedResult[1] << endl;
01886  
01887 #if defined(DEBUG)
01888        {
01889          // check pflags for correct byte mask
01890          const int d=normalized_signed_mod(SievePos,PrimeNumbers[nr]);
01891          if ( (d==Delta_of_PrimeNumbers[nr][0]) != ((pflags&0x03)==0x03) ) { MARK; cerr << nr << "!!" << (pflags&0x03) << endl; exit(7); }
01892          if ( (d==Delta_of_PrimeNumbers[nr][1]) != ((pflags&0x0c)==0x0c) ) { MARK; cerr << nr << "!!" << (pflags&0x0c) << endl; exit(7); }
01893        }
01894 #endif
01895       if ( pflags & 0x0f ) Factors.push_back(nr);
01896 
01897 #if defined(DEBUG)
01898        {
01899          // check pflags for correct byte mask
01900          const int d=normalized_signed_mod(SievePos,PrimeNumbers[nr+1]);
01901          if ( (d==Delta_of_PrimeNumbers[nr+1][0]) != ((pflags&0x30)==0x30) ) { MARK; cerr << nr+1 << "!!" << (pflags&0x30) << endl; exit(7); }
01902          if ( (d==Delta_of_PrimeNumbers[nr+1][1]) != ((pflags&0xc0)==0xc0) ) { MARK; cerr << nr+1 << "!!" << (pflags&0xc0) << endl; exit(7); }
01903        }
01904 #endif
01905       if ( pflags & 0xf0 ) Factors.push_back(nr+1);
01906 
01907 #if defined(DEBUG)
01908        {
01909          // check pflags2 for correct byte mask
01910          const int d=normalized_signed_mod(SievePos,PrimeNumbers[nr+2]);
01911          if ( (d==Delta_of_PrimeNumbers[nr+2][0]) != ((pflags2&0x03)==0x03) ) { MARK; cerr << nr+2 << "!!" << (pflags2&0x03) << endl; exit(7); }
01912          if ( (d==Delta_of_PrimeNumbers[nr+2][1]) != ((pflags2&0x0c)==0x0c) ) { MARK; cerr << nr+2 << "!!" << (pflags2&0x0c) << endl; exit(7); }
01913        }
01914 #endif
01915       if ( pflags2 & 0x0f ) Factors.push_back(nr+2);
01916 
01917 #if defined(DEBUG)
01918        {
01919          // check pflags2 for correct byte mask
01920          const int d=normalized_signed_mod(SievePos,PrimeNumbers[nr+3]);
01921          if ( (d==Delta_of_PrimeNumbers[nr+3][0]) != ((pflags2&0x30)==0x30) ) { MARK; cerr << nr+3 << "!!" << (pflags2&0x30) << endl; exit(7); }
01922          if ( (d==Delta_of_PrimeNumbers[nr+3][1]) != ((pflags2&0xc0)==0xc0) ) { MARK; cerr << nr+3 << "!!" << (pflags2&0xc0) << endl; exit(7); }
01923        }
01924 #endif
01925       if ( pflags2 & 0xf0 ) Factors.push_back(nr+3);
01926     }
01927 
01928   const signed int mySievePos = SievePos-SieveOffset-PhysicalSieveSize;
01929   typedef int v2di __attribute__ ((vector_size (8))); // 8 bytes -> 2 double words
01930   register v2di MM7 asm("mm7"); // MM7 is now bound to %mm7
01931 
01932   asm volatile ( \
01933        "movd %[mySievePos],%%mm7   # -      mm0=(0,mySiebPos)\n\t" \
01934        "punpckldq %%mm7,%%mm7  # L2AM   (0,mySievePos) -> (mySievePos,mySievePos) \n\t"
01935        : [_mm7] "=y" (MM7) : [mySievePos] "r" (mySievePos));
01936 
01937   for (; nr<StaticFactorbase::Size(); nr+=2)
01938    {
01939       register unsigned int pflags;
01940       asm ( \
01941        "movq %[PrimeNumbers](,%[i],4),%%mm0   # L2AMS  get y : x (as packed int)\n\t" \
01942        "paddd %[_mm7],%%mm0        # L2AM \n\t" \
01943        "pshufw $0x44,%%mm0,%%mm1   # L2AM \n\t" \
01944        "pshufw $0xee,%%mm0,%%mm2   # L2AM \n\t" \
01945        "pcmpeqd %[LocDeltas](,%[i],8),%%mm1   # L2AM \n\t" \
01946        "pcmpeqd 8+%[LocDeltas](,%[i],8),%%mm2 # L2AM \n\t" \
01947        "packssdw %%mm2,%%mm1       # L2AM  pack comparison results together \n\t" \
01948        "pmovmskb %%mm1,%[pflags]   # L6+ (VectorPath)  into bytemask" \
01949        : [pflags] "=r" (pflags)
01950        : [PrimeNumbers] "o" (PrimeNumbers[0]), [LocDeltas] "o" (LocalPhysInterval_Delta_of_PrimeNumbers[0][0]),
01951          [_mm7] "y" (MM7), [i] "r" (nr)
01952        : "mm0", "mm1", "mm2");
01953 
01954 #if 0 || defined(DEBUG)
01955        {
01956          // check pflags for correct byte mask
01957          register const int d = mySievePos+PrimeNumbers[nr];
01958          if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr][0]) != ((pflags&0x03)==0x03) ) { MARK; exit(7); }
01959          if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr][1]) != ((pflags&0x0c)==0x0c) ) { MARK; exit(7); }
01960        }
01961 #endif
01962       if ( pflags & 0x0f ) Factors.push_back(nr);
01963 
01964 #if 0 || defined(DEBUG)
01965        {
01966          // check pflags for correct byte mask
01967          register const int d = mySievePos+PrimeNumbers[nr+1];
01968          if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+1][0]) != ((pflags&0x30)==0x30) ) { MARK; exit(7); }
01969          if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+1][1]) != ((pflags&0xc0)==0xc0) ) { MARK; exit(7); }
01970        }
01971 #endif
01972       if ( pflags & 0xf0 ) Factors.push_back(nr+1);
01973 
01974    }
01975   asm volatile ("emms              # exit mmx");
01976 
01977 }
01978 // end of SSE optimized version
01979 
01980 
01981 #elif defined(ASM_ATHLON) /* special code for AMD Athlon 3DNow! */
01982 
01983 void CDeltaComputations::GetStaticFactorHits(std::vector<unsigned int> &Factors, const signed int SievePos)
01984 {
01985   // find factors in static factorbase
01986   // (beginnend mit der 3. Primzahl der FB (s.o) )
01987 
01988   // Athlon 3DNow! specific code
01989  #ifdef DEBUG
01990   #warning "AMD Athlon 3DNow! specific code to detect static factors"
01991  #endif
01992   typedef int v2sf __attribute__ ((vector_size (8))); // 8 bytes -> 2 short floats
01993   register v2sf MM7 asm("mm7"); // MM7 is now bound to %mm7
01994   register v2sf MM6 asm("mm6"); // MM6 is now bound to %mm6 (intention: sign correction)
01995 
01996    // refer GCC manual, "Specifying registers for local variables";
01997    // but remember: that does not necessarily mean, that %mm7 cannot be
01998    // used for other variables as well! (It is only guaranteed, that if we access
01999    // MM7, this access is synonym to accessing %mm7; but accessing %mm7 is not necessarily
02000    // synonym to accessing MM7 (because if the compiler assumes, that MM7 is dead, %mm7 could
02001    // be used for other variables; and the compiler does *not* know, that we want
02002    // MM7 to be alive by accessing %mm7!).
02003 
02004   asm volatile ( \
02005        "femms                      # L2AMS  fast enter mmx\n\t" \
02006        "movd %[Sieboffs],%[_mm7]   # -      MM7=(0,Sieboffs)\n\t" \
02007        "punpckldq %[_mm7],%[_mm7]  # L2AM   (0,Sieboffs) -> (Sieboffs,Sieboffs) \n\t" \
02008        "movq %[_mm7],%%mm0         # L2AM \n\t" \
02009        "pxor %[_mm6],%[_mm6]       # L2AM   clear mm6 \n\t" \
02010        "pi2fd %[_mm7],%[_mm7]      # L4A    int to float \n\t" \
02011        "pcmpgtd %%mm0,%%mm6        # L2AM   set bits if signed, clear bits if unsigned" \
02012        : [_mm7] "=y" (MM7), [_mm6] "=y" (MM6) : [Sieboffs] "rm" (SievePos) : "mm0");
02013 
02014   // PrimeNumbers[0] und PrimeNumbers[1] are left out (because they need special treatment)
02015   int nr;
02016   for (nr=2; nr<StaticFactorbase::Size() && PrimeNumbers[nr]<PhysicalSieveSize; nr+=2) 
02017     {
02018       register unsigned int pflags;
02019       asm ( \
02020        "movq %[_mm7],%%mm2         # L2AMS \n\t" \
02021        "movq %[divisors](,%[i],4),%%mm4   # L2AMS  get y : x (as packed int)\n\t" \
02022        "#prefetch %[FloatRecips]+64(,%[i],4) \n\t" \
02023        "pfmul %[FloatRecips](,%[i],4),%%mm2 # L4M    z/y : w/x \n\t" \
02024        "pi2fd %%mm4,%%mm0          # L4A    get y : x (as packed float)\n\t" \
02025        "movq %%mm4,%%mm5           # L2AMS \n\t" \
02026        "pf2id %%mm2,%%mm1          # L4A    packed float to int \n\t" \
02027        "pand %%mm6,%%mm4           # L2AM   correct sign (part 1) \n\t" \
02028        "pi2fd %%mm1,%%mm1          # L4A    and back (->floor) \n\t" \
02029        "pfmul %%mm0,%%mm1          # L4M    multiply \n\t" \
02030        "pfsubr %[_mm7],%%mm1       # L4A    subtract \n\t" \
02031        "pf2id %%mm1,%%mm0          # L4A    ->modulo \n\t" \
02032        "paddd %%mm4,%%mm0          # L2AM   correct sign (part 2) \n\t" \
02033        "pcmpeqd %%mm0,%%mm5        # L2AM  assure modp=0 instead of modp=p \n\t" \
02034        "pandn %%mm0,%%mm5          # L2AM \n\t" \
02035        "pshufw $0x44,%%mm5,%%mm1   # L2AM \n\t" \
02036        "pshufw $0xee,%%mm5,%%mm2   # L2AM \n\t" \
02037        "pcmpeqd %[Deltas](,%[i],8),%%mm1   # L2AM \n\t" \
02038        "pcmpeqd 8+%[Deltas](,%[i],8),%%mm2 # L2AM \n\t" \
02039        "packssdw %%mm2,%%mm1       # L2AM  pack comparison results together \n\t" \
02040        "pmovmskb %%mm1,%[pflags]   # L6+ (VectorPath)  into bytemask" \
02041        : [pflags] "=r" (pflags)
02042        : [divisors] "o" (PrimeNumbers[0]), [FloatRecips] "o" (PrimeNumberFloatReciprocals[0]),
02043          [Deltas] "o" (Delta_of_PrimeNumbers[0][0]),
02044          [_mm7] "y" (MM7), [_mm6] "y" (MM6), [i] "r" (nr)
02045        : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5");
02046 
02047       //cout << "SievePos: " << SievePos << ": " << PackedResult[0] << "," << PackedResult[1] << endl;
02048  
02049 #ifdef DEBUG
02050        {
02051          // check pflags for correct byte mask
02052          const int d=normalized_signed_mod(SievePos,PrimeNumbers[nr]);
02053          if ( (d==Delta_of_PrimeNumbers[nr][0]) != ((pflags&0x03)==0x03) ) { MARK; exit(7); }
02054          if ( (d==Delta_of_PrimeNumbers[nr][1]) != ((pflags&0x0c)==0x0c) ) { MARK; exit(7); }
02055        }
02056 #endif
02057       if ( pflags & 0x0f ) Factors.push_back(nr);
02058 
02059 #ifdef DEBUG
02060        {
02061          // check pflags for correct byte mask
02062          const int d=normalized_signed_mod(SievePos,PrimeNumbers[nr+1]);
02063          if ( (d==Delta_of_PrimeNumbers[nr+1][0]) != ((pflags&0x30)==0x30) ) { MARK; exit(7); }
02064          if ( (d==Delta_of_PrimeNumbers[nr+1][1]) != ((pflags&0xc0)==0xc0) ) { MARK; exit(7); }
02065        }
02066 #endif
02067       if ( pflags & 0xf0 ) Factors.push_back(nr+1);
02068     }
02069 
02070   const signed int mySievePos = SievePos-SieveOffset-PhysicalSieveSize;
02071 
02072   asm volatile ( \
02073        "movd %[mySievePos],%%mm7   # -      mm0=(0,mySiebPos)\n\t" \
02074        "punpckldq %%mm7,%%mm7  # L2AM   (0,mySievePos) -> (mySievePos,mySievePos) \n\t"
02075        : [_mm7] "=y" (MM7) : [mySievePos] "r" (mySievePos));
02076 
02077   for (; nr<StaticFactorbase::Size(); nr+=2)
02078    {
02079       register unsigned int pflags;
02080       asm ( \
02081        "movq %[PrimeNumbers](,%[i],4),%%mm0   # L2AMS  get y : x (as packed int)\n\t" \
02082        "paddd %[_mm7],%%mm0        # L2AM \n\t" \
02083        "pshufw $0x44,%%mm0,%%mm1   # L2AM \n\t" \
02084        "pshufw $0xee,%%mm0,%%mm2   # L2AM \n\t" \
02085        "pcmpeqd %[LocDeltas](,%[i],8),%%mm1   # L2AM \n\t" \
02086        "pcmpeqd 8+%[LocDeltas](,%[i],8),%%mm2 # L2AM \n\t" \
02087        "packssdw %%mm2,%%mm1       # L2AM  pack comparison results together \n\t" \
02088        "pmovmskb %%mm1,%[pflags]   # L6+ (VectorPath)  into bytemask" \
02089        : [pflags] "=r" (pflags)
02090        : [PrimeNumbers] "o" (PrimeNumbers[0]), [LocDeltas] "o" (LocalPhysInterval_Delta_of_PrimeNumbers[0][0]),
02091          [_mm7] "y" (MM7), [i] "r" (nr)
02092        : "mm0", "mm1", "mm2");
02093 
02094 #if 0 || defined(DEBUG)
02095        {
02096          // check pflags for correct byte mask
02097          register const int d = mySievePos+PrimeNumbers[nr];
02098          if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr][0]) != ((pflags&0x03)==0x03) ) { MARK; exit(7); }
02099          if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr][1]) != ((pflags&0x0c)==0x0c) ) { MARK; exit(7); }
02100        }
02101 #endif
02102       if ( pflags & 0x0f ) Factors.push_back(nr);
02103 
02104 #if 0 || defined(DEBUG)
02105        {
02106          // check pflags for correct byte mask
02107          register const int d = mySievePos+PrimeNumbers[nr+1];
02108          if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+1][0]) != ((pflags&0x30)==0x30) ) { MARK; exit(7); }
02109          if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+1][1]) != ((pflags&0xc0)==0xc0) ) { MARK; exit(7); }
02110        }
02111 #endif
02112       if ( pflags & 0xf0 ) Factors.push_back(nr+1);
02113 
02114    }
02115   asm volatile ("femms             # L2AMS  fast exit mmx");
02116 
02117 }
02118 // end of AMD Athlon 3DNow! optimized version
02119 
02120 #elif defined(ASM_3DNOW) /* special code for 3DNow! (without extensions) */
02121 
02122 void CDeltaComputations::GetStaticFactorHits(std::vector<unsigned int> &Factors, const signed int SievePos)
02123 {
02124   // find factors in static factorbase
02125   // (beginnend mit der 3. Primzahl der FB (s.o) )
02126 
02127   // AMD-K6 3DNow! specific code (without using MMX/3DNow! extensions)
02128  #ifdef DEBUG
02129   #warning "AMD K6 3DNow! specific code to detect static factors"
02130  #endif
02131   typedef int v2sf __attribute__ ((vector_size (8))); // 8 bytes -> 2 short floats
02132   register v2sf MM7 asm("mm7"); // MM7 is now bound to %mm7
02133   register v2sf MM6 asm("mm6"); // MM6 is now bound to %mm6 (intention: sign correction)
02134 
02135    // refer GCC manual, "Specifying registers for local variables";
02136    // but remember: that does not necessarily mean, that %mm7 cannot be
02137    // used for other variables as well! (It is only guaranteed, that if we access
02138    // MM7, this access is synonym to accessing %mm7; but accessing %mm7 is not necessarily
02139    // synonym to accessing MM7 (because if the compiler assumes, that MM7 is dead, %mm7 could
02140    // be used for other variables; and the compiler does *not* know, that we want
02141    // MM7 to be alive by accessing %mm7!).
02142 
02143   asm volatile ( \
02144        "femms                      # L2AMS  fast enter mmx\n\t" \
02145        "movd %[Sieboffs],%[_mm7]   # -      MM7=(0,Sieboffs)\n\t" \
02146        "punpckldq %[_mm7],%[_mm7]  # L2AM   (0,Sieboffs) -> (Sieboffs,Sieboffs) \n\t" \
02147        "movq %[_mm7],%%mm0         # L2AM \n\t" \
02148        "pxor %[_mm6],%[_mm6]       # L2AM   clear mm6 \n\t" \
02149        "pi2fd %[_mm7],%[_mm7]      # L4A    int to float \n\t" \
02150        "pcmpgtd %%mm0,%%mm6        # L2AM   set bits if signed, clear bits if unsigned" \
02151        : [_mm7] "=y" (MM7), [_mm6] "=y" (MM6) : [Sieboffs] "rm" (SievePos) : "mm0");
02152 
02153   // PrimeNumbers[0] und PrimeNumbers[1] are left out (because they need special treatment)
02154   int nr;
02155   for (nr=2; nr<StaticFactorbase::Size() && PrimeNumbers[nr]<PhysicalSieveSize; nr+=2) 
02156     {
02157       register unsigned int pflags;
02158       asm ( \
02159        "movq %[_mm7],%%mm2         # L2AMS \n\t" \
02160        "movq %[divisors](,%[i],4),%%mm4   # L2AMS  get y : x (as packed int)\n\t" \
02161        "pfmul %[FloatRecips](,%[i],4),%%mm2 # L4M    z/y : w/x \n\t" \
02162        "pi2fd %%mm4,%%mm0          # L4A    get y : x (as packed float)\n\t" \
02163        "movq %%mm4,%%mm5           # L2AMS \n\t" \
02164        "pf2id %%mm2,%%mm1          # L4A    packed float to int \n\t" \
02165        "pand %%mm6,%%mm4           # L2AM   correct sign (part 1) \n\t" \
02166        "pi2fd %%mm1,%%mm1          # L4A    and back (->floor) \n\t" \
02167        "pfmul %%mm0,%%mm1          # L4M    multiply \n\t" \
02168        "pfsubr %[_mm7],%%mm1       # L4A    subtract \n\t" \
02169        "pf2id %%mm1,%%mm0          # L4A    ->modulo \n\t" \
02170        "prefetch %[FloatRecips]+96(,%[i],4) \n\t" \
02171        "paddd %%mm4,%%mm0          # L2AM   correct sign (part 2) \n\t" \
02172        "pcmpeqd %%mm0,%%mm5        # L2AM  assure modp=0 instead of modp=p \n\t" \
02173        "pandn %%mm0,%%mm5          # L2AM \n\t" \
02174        "movq %%mm5,%%mm1 \n\t" \
02175        "punpckldq %%mm5,%%mm5 \n\t" \
02176        "punpckhdq %%mm1,%%mm1 \n\t" \
02177        "pcmpeqd %[Deltas](,%[i],8),%%mm5   # L2AM \n\t" \
02178        "pcmpeqd 8+%[Deltas](,%[i],8),%%mm1 # L2AM \n\t" \
02179        "packssdw %%mm1,%%mm5 \n\t" \
02180        "packsswb %%mm0,%%mm5 # mm0 is not relevant\n\t" \
02181        "movd %%mm5,%[pflags]   # L5+ (VectorPath)" \
02182        : [pflags] "=r" (pflags)
02183        : [divisors] "o" (PrimeNumbers[0]), [FloatRecips] "o" (PrimeNumberFloatReciprocals[0]),
02184          [Deltas] "o" (Delta_of_PrimeNumbers[0][0]),
02185          [_mm7] "y" (MM7), [_mm6] "y" (MM6), [i] "r" (nr)
02186        : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5");
02187 
02188       //cout << "SievePos: " << SievePos << ": " << PackedResult[0] << "," << PackedResult[1] << endl;
02189  
02190 #if 0 || defined(DEBUG)
02191        {
02192          // check pflags for correct byte mask
02193          const int d=normalized_signed_mod(SievePos,PrimeNumbers[nr]);
02194          if ( (d==Delta_of_PrimeNumbers[nr][0]) != ((pflags&0x000000ffU)==0x000000ffU) ) { MARK; exit(7); }
02195          if ( (d==Delta_of_PrimeNumbers[nr][1]) != ((pflags&0x0000ff00U)==0x0000ff00U) ) { MARK; exit(7); }
02196        }
02197 #endif
02198       if ( pflags & 0x0000ffffU ) Factors.push_back(nr);
02199 
02200 #if 0 || defined(DEBUG)
02201        {
02202          // check pflags for correct byte mask
02203          const int d=normalized_signed_mod(SievePos,PrimeNumbers[nr+1]);
02204          if ( (d==Delta_of_PrimeNumbers[nr+1][0]) != ((pflags&0x00ff0000U)==0x00ff0000U) ) { MARK; exit(7); }
02205          if ( (d==Delta_of_PrimeNumbers[nr+1][1]) != ((pflags&0xff000000U)==0xff000000U) ) { MARK; exit(7); }
02206        }
02207 #endif
02208       if ( pflags & 0xffff0000U ) Factors.push_back(nr+1);
02209     }
02210 
02211   const signed int mySievePos = SievePos-SieveOffset-PhysicalSieveSize;
02212 
02213   asm volatile ( \
02214        "movd %[mySievePos],%%mm7   # -      mm0=(0,mySiebPos)\n\t" \
02215        "punpckldq %%mm7,%%mm7  # L2AM   (0,mySievePos) -> (mySievePos,mySievePos) \n\t"
02216        : [_mm7] "=y" (MM7) : [mySievePos] "r" (mySievePos));
02217 
02218   for (; nr<StaticFactorbase::Size(); nr+=2)
02219    {
02220       register unsigned int pflags;
02221       asm ( \
02222        "movq %[PrimeNumbers](,%[i],4),%%mm0   # L2AMS  get y : x (as packed int)\n\t" \
02223        "paddd %[_mm7],%%mm0        # L2AM \n\t" \
02224        "movq %%mm0,%%mm1 \n\t" \
02225        "punpckldq %%mm0,%%mm0 \n\t" \
02226        "punpckhdq %%mm1,%%mm1 \n\t" \
02227        "pcmpeqd %[LocDeltas](,%[i],8),%%mm0   # L2AM \n\t" \
02228        "pcmpeqd 8+%[LocDeltas](,%[i],8),%%mm1 # L2AM \n\t" \
02229        "packssdw %%mm1,%%mm0 \n\t" \
02230        "packsswb %%mm1,%%mm0 \n\t" \
02231        "movd %%mm0,%[pflags]   # L5+ (VectorPath)" \
02232        : [pflags] "=r" (pflags)
02233        : [PrimeNumbers] "o" (PrimeNumbers[0]), [LocDeltas] "o" (LocalPhysInterval_Delta_of_PrimeNumbers[0][0]),
02234          [_mm7] "y" (MM7), [i] "r" (nr)
02235        : "mm0", "mm1");
02236 
02237 #if 0 || defined(DEBUG)
02238        {
02239          // check pflags for correct byte mask
02240          register const int d = mySievePos+PrimeNumbers[nr];
02241          if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr][0]) != ((pflags&0x000000ffU)==0x000000ffU) ) { MARK; exit(7); }
02242          if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr][1]) != ((pflags&0x0000ff00U)==0x0000ff00U) ) { MARK; exit(7); }
02243        }
02244 #endif
02245       if ( pflags & 0x0000ffffU ) Factors.push_back(nr);
02246 
02247 #if 0 || defined(DEBUG)
02248        {
02249          // check pflags for correct byte mask
02250          register const int d = mySievePos+PrimeNumbers[nr+1];
02251          if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+1][0]) != ((pflags&0x00ff0000U)==0x00ff0000U) ) { MARK; exit(7); }
02252          if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+1][1]) != ((pflags&0xff000000U)==0xff000000U) ) { MARK; exit(7); }
02253        }
02254 #endif
02255       if ( pflags & 0xffff0000U ) Factors.push_back(nr+1);
02256 
02257    }
02258   asm volatile ("femms             # L2AMS  fast exit mmx");
02259 
02260 }
02261 // end of AMD 3DNow! optimized version
02262 
02263 #else /* generic version */
02264 
02265 void CDeltaComputations::GetStaticFactorHits(std::vector<unsigned int> &Factors, const signed int SievePos)
02266 {
02267   // find factors in static factorbase
02268   // (beginnend mit der 3. Primzahl der FB (s.o) )
02269 
02270   int i;
02271   for (i=2; i<StaticFactorbase::Size() && PrimeNumbers[i]<PhysicalSieveSize; ++i)
02272    {
02273      // shortcut:
02274      /* avoid expensive mpz-divisions!
02275         We can re-check for sieved hits at SievePos. This is cheaper!
02276       */
02277 
02278      register int d = normalized_signed_mod(SievePos,PrimeNumbers[i],PrimeNumberReciprocals[i]);
02279      // same as: SievePos%PrimeNumbers[i]; if (d<0) d+=PrimeNumbers[i];
02280      // Bemerkung: %-operation does *not* eliminate the sign!
02281      //            ugly: -1=1 (mod 2), but (-1)%2=-1, 1%2=1
02282      // -> so we have to eliminate the sign!!!
02283      // and don't try to use unsigned int d, this won't work!
02284 
02285      if (d==Delta_of_PrimeNumbers[i][0] || d==Delta_of_PrimeNumbers[i][1]) // hit?
02286        Factors.push_back(i); // these are the divisors!
02287    }
02288 
02289   const signed int mySievePos = SievePos-SieveOffset-PhysicalSieveSize;
02290   for (; i<StaticFactorbase::Size(); ++i)
02291    {
02292      // shortcut:
02293      /* avoid expensive mpz-divisions!
02294         We can re-check for sieved hits at SievePos. This is cheaper!
02295         For the bigger primefactors this is even easier and needs no division:
02296         Simply repeat the same procedure as done in the sieving code; then
02297         compare whether the result will fit...
02298       */
02299 
02300      register int d = mySievePos+PrimeNumbers[i];
02301      if (d==LocalPhysInterval_Delta_of_PrimeNumbers[i][0] || d==LocalPhysInterval_Delta_of_PrimeNumbers[i][1]) // hit?
02302        Factors.push_back(i); // these are the divisors!
02303    }
02304 
02305 
02306 }
02307 #endif /* generic code */
02308 
02309 
02310 #include "Sieving-inc.cc"

Generated on Wed Nov 7 23:29:26 2007 for Qsieve by  doxygen 1.5.4