Qsieve: dft.cc Source File

00001 // Discrete Fast Fourier Transform
00002 // written by Thorsten Reinecke, 2003-07-04
00003 // last change: 2005-05-24
00004 
00011 /*
00012    A(x)=a[0]+a[1]*x^1+a[2]*x^2+...+a[k]*x^m  mod(N)
00013    B(x)=b[0]+b[1]*x^1+b[2]*x^2+...+b[k]*x^n  mod(N)
00014    -> C(x)=A(x)*B(x)=c[0]+c[1]*x+...+c[m+n]*x^(m+n)   mod(N)
00015 
00016    Problem: Quadratwurzeln können wir modulo N nicht ziehen,
00017    jedenfalls nicht in erträglicher Zeit... (da N im allgemeinen eine
00018    zusammengesetzte Zahl ist, deren Teiler uns nicht bekannt sind)
00019    ... also können wir die diskrete FFT nicht durchführen!
00020 
00021    Lösung:
00022    Beobachtung, dass die einzelnen Koeffizienten des Polynoms A(x)*B(x)
00023    nicht größer werden können als (m+n)*N^2, wenn die einzelnen
00024    Koeffizienten von A(x) und B(x) im Bereich { 0 ...N-1 } liegen.
00025    (Beweis: Worst-Case-Überschlagsrechnung bei naiver Polynommultiplikation)
00026 
00027    [ Nebenbei: Das ist auch der Grund dafür, warum sich
00028      mpz_mod innerhalb der Rekursion bei der 
00029      Karatsuba-Polynommultiplikation nicht lohnt... ]
00030 
00031    Wir können uns also eine Primzahl P > (m+n)*N^2 suchen,
00032    für die wir Quadratwurzeln ziehen können!
00033    Wir führen die DFT dann modulo P durch.
00034    Das Ergebnis sollte immer mit dem Ergebnis ohne Modulorechnung
00035    übereinstimmen.
00036    -> Also  C(x) mod P = C(x) mod (m+n)*N^2
00037    -> (C(x) mod P) mod N ist dann unser gesuchtes Polynom!
00038 
00039    So viel zum Prinzipiellen...
00040 
00041    Leider wird die skalare Multiplikation dann unheimlich teuer
00042    und die Koeffizienten ziemlich groß, so dass man diese
00043    mod P reduzieren muß (allein schon, um Speicher zu sparen).
00044    Die Laufzeitgewinne schrumpfen also bei größeren Zahlen
00045    schnell wieder ein...
00046     
00047    Alternativ können wir aber statt eines großen P viele kleine P's
00048    verwenden und die Teilergebnisse dann mit dem chinesischen Restsatz 
00049    zum Gesamtergebnis zusammensetzen.
00050 
00051    eventuell noch implementierbar:
00052 
00053    Da die Teilmultiplikationen nach dem SIMD-Prinzip (Single Instruction
00054    Multiple Data) durchgeführt werden können, eignet sich das gut für
00055    Vektorrechner und dürfte auch gut für eine spätere SSE-Optimierung
00056    geeignet sein.
00057 
00058    32-bittige unsigned ints sind dafür allerdings nicht unbedingt zu empfehlen,
00059    da es im Intervall [2^31,2^32] nur 199 Primzahlen gibt mit (p-1)=2^20*rest,
00060    wie nachfolgende Prozedur zeigt:
00061 
00062      c:=0;
00063      for i from 2^31+1 to 2^32 by 2^20 do 
00064        if isprime(i) then c:=c+1; print(c,i); fi;
00065      od;
00066 
00067    Damit ließen sich dann grob überschlagen Dezimalzahlen der Stelligkeit
00068    lg(((2^31)^(199/2))/(2^20))=(31*99.5-20)*lg(2)=3064.5*0.30103=922.5
00069    beackern, da wir die 199 Primzahlen über den chinesischen Restsatz
00070    kombinieren und die Größe der Polynome auf maximal 2^20 Koeffizienten
00071    begrenzen müssen. (Koeffizientengröße von etwa 2^20*N^2 muß noch korrekt
00072    rekonstruierbar sein!)
00073    Hier heißt es also auf 64bit umzusteigen oder gleich in Fließkomma
00074    zu rechnen.
00075 */
00076 
00077 
00078 #include "modulo.H"
00079 #include "mpz_wrapper.H"
00080 
00081 namespace polynomial
00082 {
00083 
00084 using namespace my_mpz_wrapper;
00085 
00086 class CDFT_base0 : private ForbidAssignment
00087 {
00088  public:
00089   const unsigned int max_size;
00090 
00091   // static helper function (for constructor to initialize max_size)
00092   static unsigned int calc_max_size(const unsigned int x_size)
00093    {
00094      unsigned int i=2;
00095      while (i<x_size) i<<=1;
00096      return i;
00097    }
00098 
00099   inline unsigned int use_size(const unsigned int input_size) const
00100   {
00101     unsigned int i = 2;
00102     while (i<input_size) i<<=1;
00103     if (i>max_size)
00104      {
00105        MARK;
00106        cerr << "input_size is invalid!" << endl;
00107        exit(1);
00108      }
00109     return i;
00110   }
00111 
00112   explicit CDFT_base0(const unsigned int x_size)
00113    : max_size(calc_max_size(x_size))
00114    {
00115    }
00116 };
00117 
00118 class CDFT_base : public CDFT_base0
00119 {
00120 public:
00121   // this procedure calculates "count" valid primes (beginning at "Start") suitable for
00122   // doing dft (with recursion depth "Depth" and returns in a newly created Polynom "primes",
00123   // which needs to be initially an empty reference.
00124   static void get_valid_primes_for(TPolynom &primes, const unsigned int count,
00125                                    const mpz_t Start, const unsigned Depth);
00126 
00127  private:
00128   mpz_t h; // auxiliary variable, that any method is allowed to use
00129   mpz_t inverse[32];
00130   TPolynom w;
00131 
00132  protected:
00133   int size;
00134   mpz_t M; // coset: we compute all results (mod M)!
00135 
00136   inline const mpz_t& invpow2(const unsigned int i) const { return inverse[i]; }
00137 
00138   void calc_roots_and_inverse();
00139   void convolute(const TPolynom p, const unsigned int n);
00140 
00141  public:
00142   explicit CDFT_base(const unsigned int x_size)
00143    : CDFT_base0(x_size), w(new mpz_t[max_size]), size(0)
00144    {
00145 #if defined(VERBOSE_INFO)
00146      MARK; cout << "CDFT_base-constructor for maximum size=" << max_size << endl;
00147 #endif
00148      mpz_init(M);
00149      mpz_init(h);
00150      for (unsigned int i=0; i<max_size; ++i) mpz_init(w[i]);
00151      for (unsigned int i=0; i<32; ++i) mpz_init(inverse[i]);
00152    }
00153 
00154   CDFT_base(const unsigned int x_size, const mpz_t x_M)
00155    : CDFT_base0(x_size), w(new mpz_t[max_size]),  size(0)
00156    {
00157 #if defined(VERBOSE_INFO)
00158      MARK; cout << "CDFT_base-constructor for maximum size=" << max_size << endl;
00159 #endif
00160      mpz_init(M);
00161      mpz_init(h);
00162      for (unsigned int i=0; i<max_size; ++i) mpz_init(w[i]);
00163      for (unsigned int i=0; i<32; ++i) mpz_init(inverse[i]);
00164      mpz_set(M,x_M);
00165      calc_roots_and_inverse();
00166    }
00167 
00168   virtual ~CDFT_base()
00169    {
00170      for (int i=31; i>=0; --i) mpz_clear(inverse[i]);
00171 
00172      for (int i=max_size-1; i>=0; --i) mpz_clear(w[i]);
00173      delete [] w;
00174 
00175      mpz_clear(h);
00176      mpz_clear(M);
00177    }
00178 
00179   int dftmul(const TPolynom R, const int kR,
00180              const TconstPolynom P1, const int k1,
00181              const TconstPolynom P2, const int k2);
00182 
00183   friend class CDFT_chinrem;
00184 };
00185 
00186 
00187 
00188 class CDFT : public CDFT_base
00189 {
00190  private:
00191   mpz_t N; // this is our underlying number!
00192 
00193   // helper function for constructor 
00194   void calc_field_and_roots_and_inverse();
00195 
00196  protected:
00197   int internal_mul(const TPolynom R, const int kR, 
00198                    const TconstPolynom P1, const int k1,
00199                    const TconstPolynom P2, const int k2,
00200                    const bool reduce_result_modN);
00201 
00202  public:
00203 
00204   // DFT (discrete fourier transform) to multiply to polynomials,
00205   // whose product have no more than <size> coefficients.
00206   // x_N denotes the coset, in which the result must be still correct.
00207 
00208   CDFT(const unsigned int x_size, const mpz_t x_N)
00209    : CDFT_base(x_size)
00210    {
00211 #if defined(VERBOSE_INFO)
00212      MARK; cout << "CDFT-constructor for maximum size=" << max_size << endl;
00213 #endif
00214      mpz_init_set(N,x_N);
00215      calc_field_and_roots_and_inverse();
00216    }
00217 
00218   virtual ~CDFT()
00219    {
00220      mpz_clear(N);
00221    }
00222 
00223  inline const mpz_t& get_N(void) const { return N; }
00224 
00225  inline int mul(const TPolynom R, const int kR, 
00226                 const TconstPolynom P1, const int k1,
00227                 const TconstPolynom P2, const int k2)
00228   {
00229     return internal_mul(R,kR,P1,k1,P2,k2,false);
00230   }
00231 
00232  inline int mulmod(const TPolynom R, const int kR,
00233                    const TconstPolynom P1, const int k1,
00234                    const TconstPolynom P2, const int k2)
00235   {
00236     return internal_mul(R,kR,P1,k1,P2,k2,true);
00237   }
00238 
00239  inline int square(const TPolynom R, const int kR,
00240                    const TconstPolynom P, const int k)
00241   {
00242     return internal_mul(R,kR,P,k,P,k,false);
00243   }
00244 
00245  inline int squaremod(const TPolynom R, const int kR,
00246                       const TconstPolynom P, const int k)
00247   {
00248     return internal_mul(R,kR,P,k,P,k,true);
00249   }
00250 
00251 };
00252 
00253 
00254 class CDFT_chinrem : public CDFT_base0
00255 {
00256  private:
00257   mpz_t N; // this is our underlying number!
00258 
00259   // helper function for constructor 
00260   void calc_field_and_roots_and_inverse();
00261 
00262  protected:
00263 
00264   struct tnode
00265    {
00266      mpz_t M;
00267      mpz_t inv_first_M_mod_second_M;
00268      tnode *left, *right;
00269      CDFT_base *first_dft, *second_dft;
00270    };
00271   void create_nodes(tnode &node, int &count, int depth = 1);
00272   void delete_nodes(tnode &node);
00273   void recurse_dftmul(tnode &node,
00274                const TPolynom R, const int kR,
00275                const TconstPolynom P1, const int k1,
00276                const TconstPolynom P2, const int k2);
00277   tnode root_node;
00278 
00279   int anz_dft;
00280   TPolynom Ms;
00281   CDFT_base** dft;
00282   int internal_mul(const TPolynom R, const int kR, 
00283                    const TconstPolynom P1, const int k1,
00284                    const TconstPolynom P2, const int k2,
00285                    const bool reduce_result_modN);
00286 
00287  public:
00288 
00289   // DFT (discrete fourier transform) to multiply to polynomials,
00290   // whose product have no more than <size> coefficients.
00291   // x_N denotes the coset, in which the result must be still correct.
00292 
00293   CDFT_chinrem(const unsigned int x_size, const mpz_t x_N)
00294    : CDFT_base0(x_size), anz_dft(0), Ms(NULL), dft(NULL)
00295    {
00296 #if defined(VERBOSE_INFO)
00297      MARK; cout << "CDFT-constructor for maximum size=" << max_size << endl;
00298 #endif
00299      mpz_init_set(N,x_N);
00300      calc_field_and_roots_and_inverse();
00301    }
00302 
00303   virtual ~CDFT_chinrem()
00304    {
00305      delete_nodes(root_node);
00306      for (int i=0; i<anz_dft; ++i) delete dft[i];
00307      delete [] dft;
00308      for (int i=0; i<anz_dft; ++i) mpz_clear(Ms[i]);
00309      delete [] Ms;
00310      mpz_clear(N);
00311    }
00312 
00313  inline const mpz_t& get_N(void) const { return N; }
00314 
00315  inline int mul(const TPolynom R, const int kR, 
00316                 const TconstPolynom P1, const int k1,
00317                 const TconstPolynom P2, const int k2)
00318   {
00319     return internal_mul(R,kR,P1,k1,P2,k2,false);
00320   }
00321 
00322  inline int mulmod(const TPolynom R, const int kR,
00323                    const TconstPolynom P1, const int k1,
00324                    const TconstPolynom P2, const int k2)
00325   {
00326     return internal_mul(R,kR,P1,k1,P2,k2,true);
00327   }
00328 
00329  inline int square(const TPolynom R, const int kR,
00330                    const TconstPolynom P, const int k)
00331   {
00332     return internal_mul(R,kR,P,k,P,k,false);
00333   }
00334 
00335  inline int squaremod(const TPolynom R, const int kR,
00336                       const TconstPolynom P, const int k)
00337   {
00338     return internal_mul(R,kR,P,k,P,k,true);
00339   }
00340 
00341 };
00342 
00343 
00344 
00345 // ----------------- Implementation ---------------------------------------
00346 
00347 
00348 void CDFT_base::get_valid_primes_for(TPolynom &primes, const unsigned int count,
00349                                      const mpz_t Start, const unsigned Depth)
00350 {
00351   // this procedure calculates "count" valid primes (beginning at "Start") suitable for
00352   // doing dft (with recursion depth "Depth" and returns in a newly created Polynom "primes",
00353   // which needs to be initially an empty reference.
00354 
00355  if (primes!=NULL)
00356   {
00357     cout << __FILE__ << ", " << __FUNCTION__ << ": line " <<  __LINE__ << endl;
00358     cerr << "First parameter is a call by reference," << endl;
00359     cerr << "it should initially point to NULL (to avoid memory-leaks)," << endl;
00360     cerr << "because a new pointer to new data will be generated and" << endl;
00361     cerr << "there is no need for initially data pointed by \"primes!\"" << endl;
00362     exit(1);
00363   }
00364 
00365  primes = new mpz_t[count];
00366  for (unsigned int i=0; i<count; ++i) mpz_init(primes[i]);
00367 
00368   mpz_t x,M;
00369   mpz_init(x); mpz_init(M);
00370 
00371   const size_t bits = mpz_sizeinbase(Start,2)+1; // magnitude ld(Start)+1
00372   // + "safety-bits", in case that any coefficients of polynomials are slightly too big...
00373   mpz_set_ui(M,1); mpz_mul_2exp(M,M,bits);
00374 #if 1
00375   // paranoid
00376   if (mpz_cmp_ui(M,Depth)<0) mpz_set_ui(M,Depth);
00377 #endif
00378   mpz_add_ui(M,M,1);
00379 
00380   const unsigned int interval = 10000;
00381 
00382 #ifdef VERBOSE
00383   MARK;
00384 #endif
00385 
00386   for (unsigned int bisher=0; bisher<count; ++bisher)
00387    {
00388 #ifdef VERBOSE
00389     cerr << bisher+1 << "/" << count << ": ";
00390 #endif
00391     do
00392      {
00393       // sieve[i] -> true, if M+i*Depth is composite
00394       // sieve[i] -> false: unknown
00395       bool sieve[interval] = { false };
00396       for (unsigned int p=3; p<1000; p+=2) if (numtheory::is_prime(p))
00397        {
00398          unsigned int i = 0;
00399          unsigned int r = mpz_fdiv_ui(M,p);
00400          while (r) { r=(r+Depth)%p; ++i; }
00401          while (i<interval) { sieve[i]=true; i+=p; }
00402        }
00403       sieve[interval-1]=false;
00404       unsigned int i=0;
00405       while(i<interval)
00406        {
00407          while(sieve[i]) ++i;
00408          mpz_set_ui(x,Depth); mpz_mul_ui(x,x,i); mpz_add(x,x,M);
00409 #ifdef VERBOSE
00410          cerr << i << " ";
00411 #endif
00412          if (mpz_probab_prime_p(x,10)) break;
00413          ++i;
00414        }
00415       mpz_set_ui(x,Depth); mpz_mul_ui(x,x,i); mpz_add(M,M,x);
00416 #ifdef VERBOSE
00417       cerr << " # +" << i << endl;
00418 #endif
00419      } while (mpz_probab_prime_p(M,10)==0);
00420     mpz_set(primes[bisher],M); mpz_add_ui(M,M,Depth);
00421    }
00422 
00423   mpz_clear(M); mpz_clear(x);
00424 }
00425 
00426 
00427 void CDFT::calc_field_and_roots_and_inverse()
00428 {
00429   mpz_mul(M,N,N); mpz_mul_ui(M,M,max_size);
00430   mpz_mul_ui(M,M,4); // + "Safety-Bits", in case that any coefficients are slightly too big...
00431 
00432   TPolynom MyField = NULL;
00433   get_valid_primes_for(MyField,1,M,max_size);
00434   mpz_set(M,MyField[0]);
00435   mpz_clear(MyField[0]); delete [] MyField;
00436   calc_roots_and_inverse();
00437 }
00438 
00439 
00440 void CDFT_chinrem::delete_nodes(tnode &node)
00441 {
00442   if (node.left) delete_nodes(*node.left);
00443   if (node.right) delete_nodes(*node.right);
00444   mpz_clear(node.inv_first_M_mod_second_M); mpz_clear(node.M); 
00445   delete node.right; delete node.left;
00446 }
00447 
00448 void CDFT_chinrem::create_nodes(tnode &node, int &count, int depth)
00449 {
00450   if ( 1<<depth >= anz_dft )
00451    {
00452      // leaf is reached
00453      node.first_dft=dft[count++];
00454 #if defined(VERBOSE_INFO)
00455      cout << "created dft " << count << ": " << node.first_dft->M << endl;
00456 #endif
00457      node.second_dft=dft[count++];
00458 #if defined(VERBOSE_INFO)
00459      cout << "created dft " << count << ": " << node.second_dft->M << endl;
00460 #endif
00461      node.left=node.right=NULL;
00462      mpz_init(node.M); mpz_mul(node.M,node.first_dft->M,node.second_dft->M);
00463      mpz_init(node.inv_first_M_mod_second_M);
00464      if (!mpz_invert(node.inv_first_M_mod_second_M,node.first_dft->M,node.second_dft->M))
00465       {
00466         MARK; cerr << "BUG!! Inverse MUST exist!" << endl;
00467         exit(1);
00468       }
00469    }
00470   else
00471    {
00472      node.first_dft=node.second_dft=NULL;
00473      node.left  = new tnode;
00474      create_nodes(*node.left,count,depth+1);
00475      node.right = new tnode;
00476      create_nodes(*node.right,count,depth+1);
00477      mpz_init(node.M); mpz_mul(node.M,node.left->M,node.right->M);
00478      mpz_init(node.inv_first_M_mod_second_M);
00479      if (!mpz_invert(node.inv_first_M_mod_second_M,node.left->M,node.right->M))
00480       {
00481         MARK; cerr << "BUG!! Inverse MUST exist!" << endl;
00482         exit(1);
00483       }
00484    }
00485 #if defined(VERBOSE_INFO)
00486   cout << "node value: " << node.M << endl;
00487 #endif
00488 }
00489 
00490 void CDFT_chinrem::calc_field_and_roots_and_inverse()
00491 {
00492   typedef CDFT_base* Pdft; 
00493   mpz_t M;
00494   mpz_init(M);
00495   mpz_mul(M,N,N); mpz_mul_ui(M,M,max_size);
00496   mpz_sqrt(M,M); // chinese remaindering using two primenumbers -> p*q > max_size*N
00497   anz_dft=2;
00498   while (mpz_sizeinbase(M,10)>125)
00499    {
00500      // increase depth of binary tree for chinese remaindering until
00501      // modulo value is below a given threshold
00502      mpz_sqrt(M,M); anz_dft*=2;
00503    }
00504   Ms = NULL; CDFT_base::get_valid_primes_for(Ms,anz_dft,M,max_size);
00505   dft = new Pdft[anz_dft];
00506   for (int i=0; i<anz_dft; ++i) dft[i] = new CDFT_base(max_size,Ms[i]);
00507 #if defined(VERBOSE_INFO)
00508   cout << "N = " << N << endl;
00509   cout << anz_dft << " nodes have been prepared for chinese remaindering." << endl;
00510 #endif
00511   int count=0; create_nodes(root_node,count); 
00512   mpz_clear(M);
00513 }
00514 
00515 
00516 void CDFT_base::calc_roots_and_inverse()
00517 {
00518   mpz_t x,e;
00519   mpz_init(x); mpz_init(e);
00520 
00521   if (!mpz_probab_prime_p(M,10))
00522    {
00523      MARK;
00524      cerr << "invalid M for dft!" << endl;
00525      exit(1);
00526    }
00527 
00528   mpz_sub_ui(e,M,1);
00529   if (mpz_div_ui(e,e,max_size)!=0)
00530    {
00531      MARK;
00532      cerr << "invalid M for dft!" << endl;
00533      exit(1);
00534    }
00535 
00536   unsigned int r=911;
00537 try_r:
00538   mpz_set_ui(x,r); mpz_powm(w[1],x,e,M); mpz_powm_ui(x,w[1],max_size/2,M);
00539   mpz_add_ui(x,x,1); mpz_mod(x,x,M);
00540   //cout << "Restklasse " << "M="; mpz_out_str(stdout,10,M); cout << endl;
00541   //cout << "-1? ";  mpz_out_str(stdout,10,x); cout << endl << endl;
00542   if (mpz_cmp_ui(x,0)!=0)
00543    {
00544      r+=2; if (r<2000) goto try_r;
00545      cerr << "unable to find valid roots..." << endl;
00546      exit(1);
00547    }
00548 
00549   // otherwise w[1] is the first root of unity...
00550   mpz_set_ui(w[0],1); // w^0 = 1
00551   for (unsigned int i=2; i<max_size; ++i)
00552    {
00553      mpz_mul(x,w[i-1],w[1]); mpz_mod(w[i],x,M);
00554      if (mpz_cmp_ui(w[i],1)==0)
00555       {
00556         MARK;
00557         cerr << "invalid roots..." << endl;
00558         exit(1);
00559       }
00560    }
00561 
00562   mpz_clear(x); mpz_clear(e);
00563 
00564   // finally precalculate inverse of 2^k (mod M), k=0..31
00565   mpz_set_ui(inverse[0],1);
00566   mpz_set_ui(h,2);
00567   if (!mpz_invert(h,h,M)) { MARK; cerr << "inverse of 2 does not exist!" << endl; exit(1); }
00568   mpz_mod(h,h,M); mpz_set(inverse[1],h);
00569   for (int i=2; i<32; ++i)
00570    {
00571      mpz_mul(h,inverse[i-1],inverse[1]); mpz_mod(h,h,M);
00572      mpz_set(inverse[i],h);
00573    }
00574 }
00575 
00576 void CDFT_base::convolute(const TPolynom p, const unsigned int n)
00577 {
00578   // we assume n = 2^k, k>1 !!
00579   if (n==4)
00580    {
00581      mpz_add(p[0],p[0],p[2]); mpz_mul_2exp(p[2],p[2],1); mpz_sub(p[2],p[0],p[2]);
00582      mpz_add(p[1],p[1],p[3]); mpz_add(p[0],p[0],p[1]); mpz_mul_2exp(p[3],p[3],1); mpz_sub(p[3],p[1],p[3]);
00583      mpz_mul(h,p[3],w[max_size>>2]); mpz_mod(h,h,M); mpz_sub(p[3],p[2],h); mpz_add(p[2],p[2],h);
00584      mpz_mul_2exp(p[1],p[1],1); mpz_sub(p[1],p[0],p[1]);
00585      mpz_swap(p[1],p[2]);
00586    }
00587   else
00588    {
00589      const unsigned int nh = n>>1;
00590 
00591 #if 1
00592      {
00593        mpz_t* const temp = new mpz_t[nh]; // will be only used for swapping, so no initialization is needed...
00594        for (unsigned int i=0, j=0; i<nh; ++i)
00595         {
00596           mpz_swap(p[i],p[j++]);
00597           mpz_swap(temp[i],p[j++]);
00598         }
00599        for (unsigned int i=0; i<nh; ++i) mpz_swap(p[i+nh],temp[i]);
00600        delete [] temp;
00601      }
00602 #else
00603      // alternative method: using more temporary memory, but less movements...
00604      {
00605        mpz_t* const temp = new mpz_t[n]; // will be only used for swapping, so no initialization is needed...
00606        memcpy(temp,p,n*sizeof(mpz_t));
00607        for (unsigned int i=0, j=0; i<nh; ++i)
00608         {
00609           memcpy(&p[i],&temp[j++],sizeof(mpz_t));
00610           memcpy(&p[i+nh],&temp[j++],sizeof(mpz_t));
00611         }
00612        delete [] temp;
00613      }
00614 #endif
00615 
00616      convolute(p,nh);
00617      convolute(&p[nh],nh);
00618 
00619      const unsigned int dj = max_size/n;
00620      for (unsigned int i=0,j=0; i<nh; ++i,j+=dj)
00621       {
00622         mpz_mul(h,p[i+nh],w[j]); mpz_mod(h,h,M);
00623         mpz_sub(p[i+nh],p[i],h);
00624         mpz_add(p[i],p[i],h);
00625       }
00626    }
00627 }
00628 
00629 
00630 int CDFT_base::dftmul(const TPolynom R, const int kR,
00631                       const TconstPolynom P1, const int k1,
00632                       const TconstPolynom P2, const int k2)
00633 {
00634   const unsigned int estimated_memusage_in_bits = mpz_sizeinbase(M,2)+32; // for optimizing mpz-heap allocation
00635   const int result_size = k1+k2-1;
00636 #if defined(VERBOSE)
00637   cout << "dftmul: " << k1 << ", " << k2 << ", " << result_size << endl;
00638 #endif
00639   size = use_size(result_size);
00640 
00641   // sanity checks
00642   if (result_size>size)
00643    {
00644      MARK; cerr << "(result_size>size)" << endl;
00645      exit(1);
00646    }
00647   if (kR<result_size)
00648    {
00649      MARK; cerr << "destination polynomial is too small!" << endl;
00650      exit(1);
00651    }
00652 
00653   // use R, if memory in R suffices for convolution;
00654   // otherwise uses temporary memory...
00655   const TPolynom p = (kR>=size) ? R : new mpz_t[size];
00656 
00657   if (p!=R) for (int i=0; i<size; ++i) mpz_init2(p[i],estimated_memusage_in_bits);
00658   else for (int i=k1; i<size; ++i) mpz_set_ui(p[i],0); // padding with zeros
00659 
00660   for (int i=0; i<k1; ++i) mpz_mod(p[i],P1[i],M); // get first multiplicand
00661   convolute(p,size); // do fft
00662 
00663   const TPolynom q = (P1==P2 && k1==k2) ? p : new mpz_t[size]; // for special case p*q = p^2
00664   if (p!=q)
00665    {
00666      for (int i=0; i<size; ++i) mpz_init2(q[i],estimated_memusage_in_bits);
00667      // is done already by init2!! for (int i=k2; i<size; ++i) mpz_init(q[i]); // padding with zeros
00668 
00669      for (int i=0; i<k2; ++i) mpz_mod(q[i],P2[i],M); // get second multiplicand
00670      convolute(q,size); // do fft
00671    }
00672 
00673   // IMPORTANT: store result for last fft in p (to save memory space)
00674   for (int i=0; i<size; ++i)
00675    {
00676      mpz_mul(h,p[i],q[i]);
00677      mpz_mod(p[i],h,M);
00678    }
00679    // the result will be in p now!!!
00680 
00681   if (q!=p)
00682    {
00683      // we can delete the temporary polynomial q
00684      for (int i=0; i<size; ++i) mpz_clear(q[i]);
00685      delete [] q;
00686    }
00687 
00688   convolute(p,size); // do fft
00689   for (int i=1; i<size/2; ++i) mpz_swap(p[i],p[size-i]);
00690 
00691   int inv_index=0;
00692   for (int i=1; i<size; i<<=1) ++inv_index;
00693 
00694   for (int i=0; i<result_size; ++i)
00695    {
00696      mpz_mul(h,p[i],invpow2(inv_index));
00697      mpz_mod(p[i],h,M);
00698    }
00699 
00700   if (p!=R)
00701    {
00702      // copy result and release temporary polynomial
00703      for (int i=result_size-1; i>=0; --i) mpz_set(R[i],p[i]); // faster would be mpz_swap, but: memory fragmentation?
00704      if (size-result_size>10) cout << size-result_size << " computations saved..." << endl;
00705      // release temporary polynomial
00706      for (int i=0; i<size; ++i) mpz_clear(p[i]);
00707      delete [] p;
00708    }
00709   else
00710    for (int i=result_size; i<kR; ++i) mpz_set_ui(R[i],0); // leading zeroes seem apparently necessary (in spite of returning result_size!)
00711   return result_size; // return size of result
00712 }
00713 
00714 
00715 int CDFT::internal_mul(const TPolynom R, const int kR,
00716                const TconstPolynom P1, const int k1,
00717                const TconstPolynom P2, const int k2,
00718                const bool reduce_result_modN)
00719 {
00720   const size_t ld_N = mpz_sizeinbase(N,2); // ld(any input coefficient)>ld_N -> result could be wrong!!
00721   const unsigned int estimated_memusage_in_bits = mpz_sizeinbase(M,2)*2+5; // for optimizing mpz-heap allocation
00722   const int result_size = k1+k2-1;
00723 #if defined(VERBOSE_INFO)
00724   cout << "_mul (dft): ";
00725   if (P1==P2) cout << "(SQUARE) ";
00726   cout << k1 << ", " << k2 << ", " << result_size << endl; 
00727 #endif
00728   size = use_size(result_size);
00729 
00730   // sanity checks
00731   if (result_size>size)
00732    {
00733      MARK; cerr << "(result_size>size)" << endl;
00734      exit(1);
00735    }
00736   if (kR<result_size)
00737    {
00738      MARK; cerr << "destination polynomial is too small!" << endl;
00739      exit(1);
00740    }
00741 
00742   // use R, if memory in R suffices for convolution;
00743   // otherwise uses temporary memory...
00744   const TPolynom p = (kR>=size && mpz_sizeinbase(R[0],2)>=estimated_memusage_in_bits) ? R : new mpz_t[size];
00745 
00746   // IMPORTANT!!! all coefficients of P1 and P2 must be
00747   // between 0 and N-1!
00748   // This is *very* important for DFFT!!
00749 
00750   if (p!=R) for (int i=0; i<size; ++i) mpz_init2(p[i],estimated_memusage_in_bits);
00751   for (int i=0; i<k1; ++i) mpz_set(p[i],P1[i]); // get first multiplicand
00752 
00753   // and this is done by mpz_init already :-)
00754   // for (int i=k1; i<size; ++i) mpz_set_ui(p[i],0); // padding with zeros
00755 
00756 
00757 #if 0
00758   for (int i=0; i<k1; ++i) mpz_mod(p[i],p[i],N); // just to be on the safe side
00759 #else
00760   {
00761    int j=0;
00762    for (int i=0; i<k1; ++i)
00763     {
00764      if ( mpz_sgn(p[i])<0 || mpz_sizeinbase(p[i],2)>ld_N )
00765       {
00766         ++j; mpz_mod(p[i],p[i],N); // just to be on the safe side
00767       }
00768     }
00769 #if defined(VERBOSE)
00770    if (j) cout << "P1: " << j << " out of " << k1 << " coefficients corrected." << endl;
00771 #endif
00772   }
00773 #endif
00774 
00775   convolute(p,size); // do fft
00776 
00777   const TPolynom q = (P1==P2 && k1==k2) ? p : new mpz_t[size]; // for special case p*q = p^2
00778   if (p!=q)
00779    {
00780      // IMPORTANT!!! all input coefficients of P1 and P2 must be
00781      // between 0 and N-1!
00782      // This is *very* important for DFFT!!
00783 
00784      for (int i=0; i<size; ++i) mpz_init2(q[i],estimated_memusage_in_bits);
00785      for (int i=0; i<k2; ++i) mpz_set(q[i],P2[i]); // get second multiplicand
00786      // is done already by init2!! for (int i=k2; i<size; ++i) mpz_init(q[i]); // padding with zeros
00787 
00788 #if 0
00789      for (int i=0; i<k2; ++i) mpz_mod(q[i],q[i],N); // just to be on the safe side
00790 #else
00791      {
00792       int j=0;
00793       for (int i=0; i<k2; ++i)
00794        {
00795         if (mpz_sgn(q[i])<0 || mpz_sizeinbase(q[i],2)>ld_N )
00796          {
00797            ++j; mpz_mod(q[i],q[i],N); // just to be on the safe side
00798          }
00799        }
00800 #if defined(VERBOSE)
00801       if (j) cout << "P2: " << j << " out of " << k2 << " coefficients corrected." << endl;
00802 #endif
00803      }
00804 #endif
00805 
00806       convolute(q,size); // do fft
00807     }
00808 
00809   // IMPORTANT: store result for last fft in p (to save memory space)
00810   for (int i=0; i<size; ++i)
00811    {
00812      mpz_mul(p[i],p[i],q[i]);
00813      mpz_mod(p[i],p[i],M);
00814    }
00815    // the result will be in p now!!!
00816 
00817   if (q!=p)
00818    {
00819      // we can delete the temporary polynomial q
00820      for (int i=0; i<size; ++i) mpz_clear(q[i]);
00821      delete [] q;
00822    }
00823 
00824   convolute(p,size); // do fft
00825   for (int i=1; i<size/2; ++i) mpz_swap(p[i],p[size-i]);
00826 
00827   int inv_index=0;
00828   for (int i=1; i<size; i<<=1) ++inv_index;
00829 
00830   for (int i=0; i<result_size; ++i)
00831    {
00832      mpz_mul(p[i],p[i],invpow2(inv_index));
00833      mpz_mod(p[i],p[i],M);
00834      if (reduce_result_modN) mpz_mod(p[i],p[i],N);
00835    }
00836 
00837   if (p!=R)
00838    {
00839      // copy result and release temporary polynomial
00840      for (int i=result_size-1; i>=0; --i) mpz_set(R[i],p[i]); // mpz_swap would be faster, but: memory fragmentation?
00841      if (size-result_size>10) cout << size-result_size << " computations saved..." << endl;
00842 #ifdef DEBUG
00843      // sanity check
00844      for (int i=result_size; i<size; ++i)
00845       {
00846         mpz_mul(p[i],p[i],invpow2(inv_index));
00847         mpz_mod(p[i],p[i],M);
00848         if (mpz_cmp_ui(p[i],0)!=0)
00849          {
00850            MARK;
00851            cerr << "These values should be ZERO!" << endl;
00852          }
00853       }
00854 #endif
00855      // release temporary polynomial
00856      for (int i=0; i<size; ++i) mpz_clear(p[i]);
00857      delete [] p;
00858    }
00859   //for (int i=result_size; i<kR; ++i) mpz_set_ui(R[i],0); // leading zeroes
00860   return result_size; // return size of result
00861 }
00862 
00863 
00864 
00865 void CDFT_chinrem::recurse_dftmul(tnode &node,
00866                const TPolynom R, const int kR,
00867                const TconstPolynom P1, const int k1,
00868                const TconstPolynom P2, const int k2)
00869 {
00870   if (node.left || node.right)
00871    {
00872      // recurse deeper
00873      //cout << "downwards left" << endl;
00874      const TPolynom p1 = new mpz_t[k1];
00875      for (int i=0; i<k1; ++i) { mpz_init(p1[i]); mpz_mod(p1[i],P1[i],node.M); }
00876      const TPolynom p2 = (P1==P2) ? p1 : new mpz_t[k2];
00877      if (p1!=p2)
00878       for (int i=0; i<k2; ++i) { mpz_init(p2[i]); mpz_mod(p2[i],P2[i],node.M); }
00879 
00880      recurse_dftmul(*node.left,R,kR,p1,k1,p2,k2);
00881      TTempPolynom R2(kR);
00882      //cout << "downwards right" << endl;
00883      recurse_dftmul(*node.right,R2,kR,p1,k1,p2,k2);
00884 
00885      if (p1!=p2)
00886       {
00887         for (int i=0; i<k2; ++i) mpz_clear(p2[i]);
00888         delete [] p2;
00889       }
00890      for (int i=0; i<k1; ++i) mpz_clear(p1[i]);
00891      delete [] p1;
00892 
00893      // and returned from recursion.
00894      // -> do chinese remaindering
00895      mpz_t h; mpz_init(h);
00896      for (int i=0; i<k1+k2-1; ++i)
00897       {
00898         mpz_sub(h,R2[i],R[i]);
00899         mpz_mul(h,h,node.inv_first_M_mod_second_M);
00900         mpz_mod(h,h,node.right->M); mpz_addmul(R[i],h,node.left->M);
00901         //mpz_mod(R[i],R[i],node.M);
00902       }
00903      mpz_clear(h);
00904    }
00905   else
00906    {
00907      // reached a leaf
00908      //cout << "reached a leaf" << endl;
00909 
00910      const TPolynom p1 = new mpz_t[k1];
00911      for (int i=0; i<k1; ++i) { mpz_init(p1[i]); mpz_mod(p1[i],P1[i],node.M); }
00912      const TPolynom p2 = (P1==P2) ? p1 : new mpz_t[k2];
00913      if (p1!=p2)
00914       for (int i=0; i<k2; ++i) { mpz_init(p2[i]); mpz_mod(p2[i],P2[i],node.M); }
00915 
00916      node.first_dft->dftmul(R,kR,p1,k1,p2,k2);
00917      TTempPolynom R2(kR);
00918      node.second_dft->dftmul(R2,kR,p1,k1,p2,k2);
00919 
00920      if (p1!=p2)
00921       {
00922         for (int i=0; i<k2; ++i) mpz_clear(p2[i]);
00923         delete [] p2;
00924       }
00925      for (int i=0; i<k1; ++i) mpz_clear(p1[i]);
00926      delete [] p1;
00927 
00928      // and now: chinese remaindering
00929      mpz_t h; mpz_init(h);
00930      for (int i=0; i<k1+k2-1; ++i)
00931       {
00932         mpz_sub(h,R2[i],R[i]);
00933         mpz_mul(h,h,node.inv_first_M_mod_second_M);
00934         mpz_mod(h,h,node.second_dft->M); mpz_addmul(R[i],h,node.first_dft->M);
00935         //mpz_mod(R[i],R[i],node.M);
00936       }
00937      mpz_clear(h);
00938    }
00939 }
00940 
00941 
00942 int CDFT_chinrem::internal_mul(const TPolynom R, const int kR,
00943                const TconstPolynom P1, const int k1,
00944                const TconstPolynom P2, const int k2,
00945                const bool reduce_result_modN)
00946 {
00947   const int result_size = k1+k2-1;
00948 #if defined(VERBOSE_INFO)
00949   cout << "_mul (dft,chinrem): ";
00950   if (P1==P2) cout << "(SQUARE) ";
00951   cout << k1 << ", " << k2 << ", " << result_size << endl; 
00952 #endif
00953   // sanity check
00954   if (kR<result_size)
00955    {
00956      MARK; cerr << "destination polynomial is too small!" << endl;
00957      exit(1);
00958    }
00959 
00960   // use R, if memory in R suffices for convolution;
00961   // otherwise uses temporary memory...
00962   const TPolynom p1 = new mpz_t[k1];
00963 
00964   // IMPORTANT!!! all coefficients of P1 and P2 must be
00965   // between 0 and N-1!
00966   // This is *very* important for DFFT!!
00967 
00968   for (int i=0; i<k1; ++i) mpz_init(p1[i]); // get first multiplicand
00969   for (int i=0; i<k1; ++i) mpz_mod(p1[i],P1[i],N); // just to be on the safe side
00970 
00971   const TPolynom p2 = (P1==P2 && k1==k2) ? p1 : new mpz_t[k2]; // special case p*q = p^2
00972   if (p1!=p2)
00973    {
00974      // IMPORTANT!!! all input coefficients of P1 and P2 must be
00975      // between 0 and N-1!
00976      // This is *very* important for DFFT!!
00977 
00978      for (int i=0; i<k2; ++i) mpz_init(p2[i]); // get second multiplicand
00979      for (int i=0; i<k2; ++i) mpz_mod(p2[i],P2[i],N); // just to be on the safe side
00980    }
00981 
00982   const int size = use_size(result_size);
00983   if (kR>=size)
00984    {
00985      recurse_dftmul(root_node,R,kR,p1,k1,p2,k2);
00986      if (reduce_result_modN)
00987       for (int i=0; i<result_size; ++i) mpz_mod(R[i],R[i],N);
00988    }
00989   else
00990    {
00991      TTempPolynom myR(size);
00992      recurse_dftmul(root_node,myR,size,p1,k1,p2,k2);
00993      if (reduce_result_modN)
00994       for (int i=0; i<result_size; ++i) mpz_mod(R[i],myR[i],N);
00995      else
00996       for (int i=0; i<result_size; ++i) mpz_set(R[i],myR[i]);
00997    }
00998 
00999   if (p1!=p2)
01000    {
01001      for (int i=0; i<k2; ++i) mpz_clear(p2[i]);
01002      delete [] p2;
01003    }
01004   for (int i=0; i<k1; ++i) mpz_clear(p1[i]);
01005   delete [] p1;
01006 
01007   //for (int i=result_size; i<kR; ++i) mpz_set_ui(R[i],0); // leading zeroes
01008   return result_size; // return size of result
01009 }
01010 
01011 
01012 
01013 // ------------------------------------------------------------------------
01014 
01015 
01016 
01017 // the following lines are more specific for our application...
01018 
01019 //typedef CDFT TDFT;// dft without chinese remaindering
01020 typedef CDFT_chinrem TDFT; // dft with chinese remaindering
01021 typedef TDFT* PDFT;
01022 
01023 
01024 #if 1
01025 inline bool dft_mul_is_recommended(const int k1, const int k2)
01026 {
01027   // tune...
01028   if (k1<14000 || k2<14000) return false;
01029   if (k1<=16384) return true;
01030   if (k1>25000 && k2>25000) return true;
01031   return false;
01032 }
01033 
01034 inline bool dft_square_is_recommended(const int k)
01035 {
01036   // tune...
01037   return k>=8192;
01038 }
01039 
01040 #else
01041 
01042 inline bool dft_mul_is_recommended(const int k1, const int k2)
01043 {
01044   return k1+k2>=8;
01045   //return k1+k2>=512;
01046 }
01047 
01048 inline bool dft_square_is_recommended(const int k)
01049 {
01050   return k>=4;
01051   //return k>=256;
01052 }
01053 
01054 #endif
01055 
01056 
01057 
01058 const PDFT get_dft(const unsigned int n, const mpz_t m)
01059 {
01060  static PDFT pdft = NULL;
01061  if (!pdft)
01062   {
01063     if (n<=0) return NULL;
01064     pdft = new TDFT(n>32768 ? n : 32768,m);
01065   }
01066  
01067  if ( n > pdft->max_size // resize is necessary!
01068        ||
01069       mpz_cmp(pdft->get_N(),m)!=0 // modulo-base has changed!
01070        ||
01071       n==0 // request to clear the temporary object
01072     ) 
01073   {
01074     delete pdft; pdft=NULL;
01075     if (n>0)
01076      {
01077        cout << "renewing dft-object..." << endl;
01078        pdft = new TDFT(n,m);
01079      }
01080     else
01081      {
01082        // if n<=0, then no new dft-object will be created...
01083        cout << "dft-object is released..." << endl;
01084      }
01085     return pdft;
01086   }
01087  return pdft;
01088 }
01089 
01090 void clear_dft_tempmemory()
01091 {
01092   mpz_t x;
01093   mpz_init(x);
01094   get_dft(0,x); // trigger releasing of the DFT-object...
01095   mpz_clear(x);
01096 }
01097 
01098 } // namespace polynomial
01099