Colin Champion

Summary: this page contains software for two FFT routines, one in C++, the other in Javascript.

functiondescription

doc : licence : source : rfft test : cfft test : rfft speed test

rfft contains four fft subroutines and utilities spectrum and dotrfft. The prototypes are

void rfft(double *x,double *y,int n) ;
void invrfft(double *x,double *y,int n) ;
void cfft(double *x,double *y,int n) ;
void invcfft(double *x,double *y,int n) ;

double spectrum(double *x,double *y,int n) ;
void dotrfft(double *x,double *y,double *z,int n) ;

In all four cases:

The first two routines take real inputs; the second two take complex data stored as pairs of reals.

All four FFTs decompose the order n as 2r×3s×k and implement the transform as r mod 2 D-L passes, s mod 3 D-L passes, and (if k>1) a single mod k D-L pass. (D-L = Danielson-Lanczos.) Hence the transform is truly fast if k is 1 or prime.

The inverse FFTs invert the forwards FFTs (there is no unwanted scale factor such as occurs in NR/manyfft).

The definition of a fourier transform for these purposes is

x'(ω) = ∑ x(t)eiωt

The order of real transforms must be even.

The output from rfft is an array of n reals. The first is the fft at 0, the second is the fft at the Nyquist rate. Both of these values are real. Then come a sequence of (real,imag) pairs giving fft values at the remaining principal frequencies in increasing order.

The complex transforms work on n complex numbers encoded as 2n doubles.

spectrum computes n/2+1 bin energies from the fft of a real signal. The fft (as computed by rfft) should be provided in x; the bin energies will be returned in y in the natural order (0, ..., nyquist rate).

The return value from spectrum is ∑|x'(ω)|2 = ∑x(t)2. The bin energies are the squared magnitudes of the FFT except for the first and last bins: these outputs are half the squared magnitudes.

The halving is performed because this is the behaviour consistent with Parseval’s theorem. On the other hand the interpretation of the fourier coefficients as sinusoidal energies at bin frequencies breaks down at the edges, so cannot be invoked to repudiate the Parsevalian adjustment.

dotrfft computes in z the dot product (qua complexes) of the two FFTs of n-long real signals provided in x and y.

There’s no specific download block; download one of the test programs instead, eg. rffttest.c.

These routines were written in May 2015 as a replacement for manyfft.

rfft (including its documentation and test programs) is made available free of charge and without restriction to anyone who wants to use it. It is supplied ‘as is’ with no claim that it is correct, satisfactory, or fit for any purpose. No responsibility is accepted for any misadventure resulting from its use.

• swap2     • assign2     • fetch2     • gentab     • powerof2     • powerof3     • shuffle     • dlmod2     • dlmod3     • dlmodn     • rfft     • invrfft     • inlinecfft     • cfft     • invcfft     • spectrum     • dotrfft

#include <math.h>
#include "memory.h"
#include "unreal.h"

#define pi 3.141592653589793
#define i complex(0,1)
#define maxlog2order 40 
#define maxlog3order 30 
#define workspace 486

static inline void swap2(double *a,double *b) 
{ double c=b[0],d=b[1] ; b[0] = a[0] ; a[0] = c ; b[1] = a[1] ; a[1] = d ; }
static inline void assign2(double *a,double *b) { a[0] = b[0] ; a[1] = b[1] ; }
static inline void assign2(double *a,complex b) { a[0] = b.re ; a[1] = b.im ; }
static inline complex fetch2(double *x) { return complex(x[0],x[1]) ; } 

/* ----------------------------- pretabulation ------------------------------ */

static double tab[1+maxlog2order] ; 
static int threetothe[1+maxlog3order] ; 
static unsigned char tab243[243] ;
static unsigned char rev3[3],rev9[9],rev27[27],rev81[81],rev243[243] ; 

static complex gentab()
{ int t,inc,base,j,k,l,m,n ;
  double q ; 

  // sine table
  for(q=pi,t=0;t<=maxlog2order;q/=2,t++) tab[t] = sin(q) ;

  // log3 table
  for(base=0,inc=1;inc<729;inc*=3,base++)
    for(t=0;t<243;t+=inc) tab243[t] = base ; 

  // power of 3 table
  for(threetothe[0]=t=1;t<=maxlog3order;t++) threetothe[t] = 3*threetothe[t-1] ;

  // base 3 reversals
  for(j=0;j<3;j++) rev3[j] = j ;  
  for(j=0;j<3;j++) for(k=0;k<3;k++) rev9[j+3*k] = 3*j + k ;  
  for(j=0;j<3;j++) for(k=0;k<3;k++) for(l=0;l<3;l++) 
    rev27[j+3*k+9*l] = 9*j + 3*k + l ;  
  for(j=0;j<3;j++) for(k=0;k<3;k++) for(l=0;l<3;l++) for(m=0;m<3;m++) 
    rev81[j+3*k+9*l+27*m] = 27*j + 9*k + 3*l + m ;  
  for(j=0;j<3;j++) for(k=0;k<3;k++) for(l=0;l<3;l++) for(m=0;m<3;m++) 
    for(n=0;n<3;n++) 
      rev243[j+3*k+9*l+27*m+81*n] = 81*j + 27*k + 9*l + 3*m + n ;  

  return exp(2*pi*i/3) ; 
}
static complex zomega=gentab() ;

/* -------------------------------------------------------------------------- */
/*page*/
/* -------------------------- find factors of 2 & 3 ------------------------- */

static inline int powerof2(int n)
{ int logn ; 
  static char grey[]={0,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0} ;
  for(logn=0;(n&15)==0;logn+=4,n>>=4) ; 
  return logn + grey[n&15] ; 
}
static inline int powerof3(int n)
{ div_t r ; 
  int base ; 
  if(n<243) return tab243[n] ; 
  for(base=0;base<maxlog3order;base+=5,n=r.quot) 
  { r = div(n,243) ; if(r.rem) return base + tab243[r.rem] ; } 
  return -1 ; 
}
/* --------- rearrange pairs of values into "reversed-bitwise order" -------- */

static inline void shuffle(double *x,double *y,int radix,int pow2,int pow3)
{ int h,j,k,nn,n3,m=1<<pow2,h0,h1,h2,h3,h4,h5,h0lim,hn3,u,v,w ; 
  unsigned char *revl[]={0,rev3,rev9,rev27,rev81,rev243},*rev ;

  if(pow3==0) // power of 2 (and one more)
  { if(radix==1&&y==x) for(j=k=0;k<m;k+=2,j+=nn) 
    { if(j>k) swap2(y+k,y+j) ; for(nn=m/2;nn>=2&&j>=nn;nn>>=1) j -= nn ; } 

    else if(radix==1) for(j=k=0;k<m;k+=2,j+=nn) 
    { assign2(&y[j],&x[k]) ; for(nn=m/2;nn>=2&&j>=nn;nn>>=1) j -= nn ; } 

    else if(radix==3) for(j=k=0;k<m;k+=2,j+=nn) 
    { assign2(y+k,x+3*j) ; assign2(y+k+m,x+3*j+2) ; 
      assign2(y+k+(2<<pow2),x+3*j+4) ; 
      for(nn=m/2;nn>=2&&j>=nn;nn>>=1) j -= nn ; 
    }

    else if(radix==5) for(j=k=0;k<m;k+=2,j+=nn) 
    { assign2(y+k,x+5*j) ; assign2(y+k+m,x+5*j+2) ; 
      assign2(y+k+(2<<pow2),x+5*j+4) ; assign2(y+k+(3<<pow2),x+5*j+6) ; 
      assign2(y+k+(4<<pow2),x+5*j+8) ;
      for(nn=m/2;nn>=2&&j>=nn;nn>>=1) j -= nn ; 
    }

    else for(j=k=0;k<m;k+=2,j+=nn) 
    { for(h=0;h<radix;h++) assign2(y+k+(h<<pow2),x+radix*j+2*h) ;
      for(nn=m/2;nn>=2&&j>=nn;nn>>=1) j -= nn ; 
    }
    return ; 
  }
  /* ------------------------------------------------------------------------ */
  /*page*/
  /* ------------------- powers of 2 & 3 (and one more) --------------------- */

  n3 = threetothe[pow3] ; 
  h0lim = threetothe[(pow3-1)%5+1] ; 
  rev = revl[(pow3-1)%5+1] ;
  if(radix==1&&pow3<=5) for(j=k=0;k<m;k+=2,j+=nn) 
  { for(h0=0;h0<n3;h0++) assign2(y+k+(rev[h0]<<pow2),x+radix*(n3*j+2*h0)) ;
    for(nn=m/2;nn>=2&&j>=nn;nn>>=1) j -= nn ; 
  }
  else if(pow3<=5) for(j=k=0;k<m;k+=2,j+=nn) 
  { for(h=0;h<radix;h++) for(hn3=h*n3,w=n3*j,h0=0;h0<n3;h0++) 
      assign2(y+k+((rev[h0]+hn3)<<pow2),x+radix*(w+2*h0)+2*h) ;
    for(nn=m/2;nn>=2&&j>=nn;nn>>=1) j -= nn ; 
  }
  else if(radix==1&&pow3<=10) for(j=k=0;k<m;k+=2,j+=nn)
  { for(u=h0lim<<pow2,h0=0;h0<h0lim;h0++) 
      for(v=k+(rev[h0]<<pow2),w=n3*j+2*h0*243,h1=0;h1<243;h1++)
        assign2(y+v+u*rev243[h1],x+w+2*h1) ;
    for(nn=m/2;nn>=2&&j>=nn;nn>>=1) j -= nn ; 
  }
  else if(pow3<=10) for(j=k=0;k<m;k+=2,j+=nn)
  { for(u=h0lim<<pow2,h=0;h<radix;h++) for(hn3=h*n3,h0=0;h0<h0lim;h0++) 
      for(v=k+((rev[h0]+hn3)<<pow2),w=n3*j+2*h0*243,h1=0;h1<243;h1++)
        assign2(y+v+u*rev243[h1],x+radix*(w+2*h1)+2*h) ;
    for(nn=m/2;nn>=2&&j>=nn;nn>>=1) j -= nn ; 
  }
  else if(pow3<=15) for(j=k=0;k<m;k+=2,j+=nn) 
  { for(u=(h0lim*243)<<pow2,h=0;h<radix;h++) for(hn3=h*n3,h0=0;h0<h0lim;h0++) 
      for(h1=0;h1<243;h1++)
        for(v=k+((rev[h0]+h0lim*rev243[h1]+hn3)<<pow2),
            w=n3*j+2*(h0*243+h1)*243,h2=0;h2<243;h2++)
          assign2(y+v+u*rev243[h2],x+radix*(w+2*h2)+2*h) ;
    for(nn=m/2;nn>=2&&j>=nn;nn>>=1) j -= nn ; 
  }
  // the following options untested (they need ints to be >32 bits)
  else if(pow3<=20) for(j=k=0;k<m;k+=2,j+=nn) 
  { for(h=0;h<radix;h++) for(hn3=h*n3,h0=0;h0<h0lim;h0++) 
      for(h1=0;h1<243;h1++) for(h2=0;h2<243;h2++) for(h3=0;h3<243;h3++)
        assign2(y+k+((rev[h0]+h0lim*(rev243[h1]+
                              243*(rev243[h2]+243*rev243[h3]))+hn3)<<pow2),
                x+radix*(n3*j+2*(((h0*243+h1)*243+h2)*243+h3))+2*h) ;
    for(nn=m/2;nn>=2&&j>=nn;nn>>=1) j -= nn ; 
  }
  else if(pow3<=25) for(j=k=0;k<m;k+=2,j+=nn) 
  { for(h=0;h<radix;h++) for(hn3=h*n3,h0=0;h0<h0lim;h0++) for(h1=0;h1<243;h1++) 
      for(h2=0;h2<243;h2++) for(h3=0;h3<243;h3++) for(h4=0;h4<243;h4++)
        assign2(y+k+((rev[h0]+h0lim*(rev243[h1]+243*(rev243[h2]+
                              243*(rev243[h3]+243*rev243[h4])))+hn3)<<pow2),
                x+radix*(n3*j+2*((((h0*243+h1)*243+h2)*243+h3)*243+h4))+2*h) ;
    for(nn=m/2;nn>=2&&j>=nn;nn>>=1) j -= nn ; 
  }
  else if(pow3<=30) for(j=k=0;k<m;k+=2,j+=nn) 
  { for(h=0;h<radix;h++) for(hn3=h*n3,h0=0;h0<h0lim;h0++) 
      for(h1=0;h1<243;h1++) for(h2=0;h2<243;h2++) for(h3=0;h3<243;h3++) 
        for(h4=0;h4<243;h4++) for(h5=0;h5<243;h5++)
          assign2(y+k+((rev[h0]+h0lim*(rev243[h1]+243*(rev243[h2]+
              243*(rev243[h3]+243*(rev243[h4]+243*rev243[h5]))))+hn3)<<pow2),
              x+radix*(n3*j+2*(((((h0*243+h1)*243+h2)*243+h3)*243+h4)*243+h5))+
              2*h) ;
    for(nn=m/2;nn>=2&&j>=nn;nn>>=1) j -= nn ; 
  }
}
/* -------------------------------------------------------------------------- */
/*page*/
/* ---------------------- sequence of mod 2 DL passes ----------------------- */

static inline void dlmod2(double *x,int m,int n,int dir,int opt,double qf) 
{ double *y,*tabptr,u,cosomega,sinomega ;
  complex zn,za,zz,zx,zy,z0,z1,z2,z3 ; 
  int t,k ; 

  if(m==2) { if(opt) for(t=0;t<n;t++) x[t] *= qf ; }
  if(m==4) for(y=x;y<x+n;y+=4) 
  { if(opt) { zx = qf*fetch2(&y[0]) ; zy = qf*fetch2(&y[2]) ; }
    else { zx = fetch2(&y[0]) ; zy = fetch2(&y[2]) ; }
    assign2(&y[0],zx+zy) ; 
    assign2(&y[2],zx-zy) ; 
  }

  if(m>4) for(y=x;y<x+n;y+=8) 
  { if(opt)  
    { z0 = qf*fetch2(&y[0]) ; 
      z1 = qf*fetch2(&y[2]) ; 
      z2 = qf*fetch2(&y[4]) ; 
      z3 = qf*fetch2(&y[6]) ; 
    }
    else
    { z0 = fetch2(&y[0]) ; 
      z1 = fetch2(&y[2]) ; 
      z2 = fetch2(&y[4]) ; 
      z3 = fetch2(&y[6]) ; 
    }
    zx = z0 + z1 ; 
    zy = z2 + z3 ; 
    assign2(&y[0],zx+zy) ; 
    assign2(&y[4],zx-zy) ; 
    zx = z0 - z1 ; 
    if(dir>=0) zy = i*(z2-z3) ; else zy = -i*(z2-z3) ; 
    assign2(&y[2],zx+zy) ; 
    assign2(&y[6],zx-zy) ; 
  }
  
  for(tabptr=tab+3,u=dir*tab[2],k=8;k<m;tabptr++,k*=2) 
  { sinomega = u ; 
    u = dir*tabptr[0] ;
    cosomega = 1 - 2*u*u ; 
    za = complex(cosomega,sinomega) ;
    
    for(zn=1,t=0;t<k;t+=2,zn*=za) for(y=x+t;y<x+n;y+=2*k) 
    { zy = fetch2(&y[0]) ; 
      zz = zn * fetch2(&y[k]) ; 
      assign2(&y[0],zy+zz) ; 
      assign2(&y[k],zy-zz) ; 
    }
  }
}
/* -------------------------------------------------------------------------- */
/*page*/
/* ---------------------- sequence of mod 3 DL passes ----------------------- */

static inline void dlmod3(double *x,int m,int n,int rad,int dir) 
{ int k,t ; 
  double *y,*ylim=x+n*rad ; 
  complex zu,zv,z1,zx,zy,zz,za ; 

  if(dir>=0) zu = zomega ; else zu = conj(zomega) ; 
  zv = zu * zu ;  

  for(k=m;k<n;k*=3) 
  { za = exp(2*dir*pi*i/(3*k/2)) ;
    for(z1=1,t=0;t<k;t+=2,z1*=za) for(y=x+t;y<ylim;y+=3*k)  
    { zx = fetch2(&y[0]) ;
      zy = z1*fetch2(&y[k]) ; 
      zz = z1*z1*fetch2(&y[2*k]) ;
      assign2(&y[  0] , zx +     zy+   zz ) ;
      assign2(&y[  k] , zx + zu*(zy+zu*zz)) ;
      assign2(&y[2*k] , zx + zv*(zy+zv*zz)) ;
    }
  }
}
/* ------------------------ a single mod n DL pass -------------------------- */

static inline void dlmodn(double *x,double *y,int rad,int m,int dir)
{ int t,k,h,n=rad*m ;
  complex zn,zy,zt[25],*zin,za,zu=exp(2*dir*i*pi/(n/2)) ; 
  complex zv=exp(2*dir*i*m*pi/n) ;
  
  if(rad>25) zin = cvector(rad) ; else zin = zt ; 

  for(za=1,t=0;t<m;t+=2,za*=zu) 
  { for(k=0;k<rad;k++) zin[k] = fetch2(&x[t+k*m]) ; 
    for(zn=za,k=0;k<rad;k++,zn*=zv)
    { for(zy=zin[rad-1],h=rad-2;h>=0;h--) zy = zin[h] + zn*zy ; 
      assign2(&y[m*k+t],zy) ; 
    }
  }
  if(rad>25) free(zin) ; 
}
/* -------------------------------------------------------------------------- */

void rfft(double *xx,double *x,int n)
{ int t,rad,pow2,n2,pow3,n3,flag=0 ;
  double u,*y,workarray[workspace] ;
  complex zn,za,z0,z1,zy,zz ; 

  pow2 = powerof2(n) ; 
  n2 = 1 << pow2 ; 
  rad = n >> pow2 ; 

  if(rad==1||xx!=x) y = x ;
  else if(n>workspace) { y = vector(n) ; flag = 1 ; } 
  else y = workarray ; 

  pow3 = powerof3(rad) ; 
  n3 = threetothe[pow3] ; 
  rad /= n3 ; 

  shuffle(xx,y,rad,pow2,pow3) ;
  if(pow2>1) dlmod2(y,n2,n,1,0,0) ; 
  if(pow3>0) dlmod3(y,n2,n2*n3,rad,1) ; 
  if(rad>1) dlmodn(y,y,rad,n2*n3,1) ;

  if(pow3||rad>1) za = exp(2*pi*i/n) ; 
  else { u = tab[pow2] ; za = complex(1-2*u*u,tab[pow2-1]) ; }

  for(zn=-i*za,t=2;t<n/2;t+=2,zn*=za) 
  { z0 = fetch2(&y[t]) ; 
    z1 = conj(fetch2(&y[n-t])); 
    zy = (z0+z1) / 2 ; 
    zz = (zy-z1) * zn ; 
    assign2(&x[t],zy+zz) ; 
    assign2(&x[n-t],conj(zy-zz)) ; 
  }
  assign2(&x[0],conj(fetch2(&y[0])*(1-i))) ; 
  if(n%4==0) assign2(&x[n/2],fetch2(&y[n/2])) ; 
  if(flag) free(y) ; 
}
/* -------------------------------------------------------------------------- */

void invrfft(double *xx,double *x,int n)
{ int t,n2,rad,pow2,pow3,n3,flag=0 ;
  double *y,workarray[workspace],qn=1.0/n ;
  complex z0,z1,zy,zz,za,zn ;
  
  pow2 = powerof2(n) ; 
  n2 = 1 << pow2 ; 
  rad = n >> pow2 ; 
  
  if(rad==1) y = x ; 
  else { if(n>workspace) { y = vector(n) ; flag = 1 ; } else y = workarray ; }

  // Pack the results (this is (12.3.6) of NR)
  assign2(&y[0],qn*conj(fetch2(&xx[0]))*(1+i)) ; 
  if(n2>=4) assign2(&y[n/2],fetch2(&xx[n/2])*2*qn) ; 

  for(zn=za=exp(2*pi*i/n),t=2;2*t<n;t+=2,zn*=za) 
  { z0 = fetch2(&xx[t]) * qn ; 
    z1 = conj(fetch2(&xx[n-t])) * qn ; 
    zy = zn * conj(z0-z1) ;
    zz = z0 + z1 ; 
    assign2(&y[t],zz+i*conj(zy)) ;
    assign2(&y[n-t],conj(zz)+i*zy) ;
  }

  pow3 = powerof3(rad) ; 
  n3 = threetothe[pow3] ; 
  rad /= n3 ; 

  shuffle(y,x,rad,pow2,pow3) ; 
  dlmod2(x,n2,n,-1,0,0) ; 
  if(pow3>0) dlmod3(x,n2,n2*n3,rad,-1) ; 
  if(rad>1) dlmodn(x,x,rad,n2*n3,-1) ;
  if(flag) free(y) ; 
}
/* -------------------------------------------------------------------------- */
/*page*/
/* -------------------------------------------------------------------------- */

static void inlinecfft(double *xx,double *x,int n,int dir)
{ int rad,pow2,n2,pow3,n3,flag=0 ;
  double *y,workarray[workspace] ; 
  complex z0,z1,z2,za,zb,zc,zk,zn,zin0,zin1,zin2,zu,zv,zx,zy,zz ;
    
  if(n<=0) return ;     // safety net 
  pow2 = powerof2(2*n) ; 
  rad = (2*n) >> pow2 ; 
  n2 = 1 << pow2 ;      // there are n2*rad real inputs (n2*rad/2 complexes)

  if(rad==1) 
  { shuffle(xx,x,1,pow2,0) ;
    if(dir==1) dlmod2(x,n2,n2,1,0,0) ; 
    else dlmod2(x,n2,n2,-1,1,1.0/(n2*rad/2)) ; 
    return ; 
  }

  if(xx!=x) y = x ; 
  else if(n2*rad>workspace) { y = vector(n2*rad) ; flag = 1 ; } 
  else y = workarray ;

  pow3 = powerof3(rad) ;  
  n3 = threetothe[pow3] ; 
  rad /= n3 ; 
  // we need a mod-n pass to move from y back to x
  if(rad==1&&y!=x) { pow3 -= 1 ; n3 = threetothe[pow3] ; rad = 3 ; } 

  shuffle(xx,y,rad,pow2,pow3) ;
  if(dir>=0&&pow2>1) dlmod2(y,n2,2*n,1,0,0) ; 
  else if(dir<0) dlmod2(y,n2,2*n,-1,1,1.0/n) ; 
  if(pow3>0) dlmod3(y,n2,n2*n3,rad,dir) ; 
  if(rad>1) dlmodn(y,x,rad,n2*n3,dir) ;
  if(flag) free(y) ; 
}
/* -------------------------------------------------------------------------- */

// cfft - FFT of a complex array (n complex numbers provided in a 
// 2n-long array of doubles).                                          

void cfft(double *xx,double *x,int n) { inlinecfft(xx,x,n,1) ; }
void invcfft(double *xx,double *x,int n) { inlinecfft(xx,x,n,-1) ; }

/* -------------------------------------------------------------------------- */

/* -------------------------------------------------------------------------- */

double spectrum(double *x,double *y,int n)
{ double en,q=x[1]*x[1]/2 ;

  en = y[0] = x[0]*x[0]/2 ;
  for(int t=1;t<n/2;t++) en += y[t] = norm(fetch2(&x[2*t])) ;
  en += y[n/2] = q ;
  return en ;
}
void dotrfft(double *x,double *y,double *z,int n)
{ int t ; 
  for(t=0;t<2;t++) z[t] = x[t] * y[t] ; 
  for(;t<n;t+=2) assign2(&z[t],fetch2(&x[t])*fetch2(&y[t])) ;
}
/* -------------------------------------------------------------------------- */

The first test program transforms forwards and backwards for a range of orders or for a single order, verifying that it’s got back to where it started from. It also verifies for each order and each direction that the in-place and out-of-place transforms give the same results.

Run

rffttest [return]

to run a single random test over each of a range of orders; run

rffttest n [return]

to run a single random test of order n; and run

rffttest n ntests [return]

to run ntests random tests of order n.

You should find that the output says simply ‘0 errors from 83x1 ffts’.

 

A download link such as the one which follows works only in Firefox and only if popup windows are permitted. Click on it and follow the instructions.

 

rfft.c rffttest.c 
utils: ranf.c memory.h unreal.h random.h 
---
g++ -O2 -g -o rffttest rffttest.c rfft.c ranf.c 
rffttest

 

#include <math.h>
#include "memory.h"
#include "unreal.h"
#include "random.h"

void rfft(double *xx,double *x,int n),invrfft(double *xx,double *x,int n) ;

#define pi 3.141592653589793

int main(int argc,char **argv)
{ int maxn=10000,n,t,nerr,terr,ntry=1,usen=0,tryno,size=maxn,nn ;
  complex z,u ; 

  if(argc>1) 
  { size = usen = atoi(argv[1]) ; 
    maxn = 2 ; 
    if(argc>2) ntry = atoi(argv[2]) ; 
  }
  double *x=vector(size),*y=vector(size),*xdash=vector(size) ;

  for(nn=terr=0,n=2;n<=maxn;n+=2) 
    if(n<64||(n<1000&&(n%32==0||n%27==0))||(n<10000&&n%729==0)||n%177147==0)
  { nn += 1 ; 
    if(usen>0) n = usen ; 
    for(nerr=tryno=0;tryno<ntry;tryno++) 
    { ranset(tryno+(n<<20)) ;
      for(t=0;t<n;t++) x[t] = gaussv() ; 

      rfft(x,y,n) ; 
      invrfft(y,xdash,n) ; 

      for(t=0;t<n;t++) if(fabs(x[t]-xdash[t])>1e-6)
      { if(t<2) printf("%3d:%3d: (%6.2f,%6.2f)\n",n,t,x[t],xdash[t]) ; 
        nerr += 1 ; 
      }

      rfft(x,x,n) ; 
      for(t=0;t<n;t++) insist(fabs(x[t]-y[t])<1e-6,
          orscream("in place != out of place for order %d (%d)",n,t)) ; 

      invrfft(y,y,n) ;
      for(t=0;t<n;t++) insist(fabs(xdash[t]-y[t])<1e-6,
          orscream("in place != out of place for inverse of order %d",n)) ; 
    }
    if(nerr) printf("order%4d: incorrect\n",n) ; 
    terr += nerr ; 
  }
  printf("%d errors from %dx%d ffts\n",nerr,nn,ntry) ; 
  free(x,y,xdash) ; 
}

The second test program validates the complex FFT for a rather wider range of orders.

Run

cffttest [return]

to run a single random test over each of a range of orders; run

cffttest n [return]

to run a single random test of order n; and run

cffttest n ntests [return]

to run ntests random tests of order n.

You should find that the output lists the orders used and adds ‘0 errors from 144x1 ffts’.

 

rfft.c cffttest.c 
Inf/486: ranf.c memory.h unreal.h random.h 
---
g++ -O2 -g -o cffttest cffttest.c rfft.c ranf.c 
cffttest

 

#include "memory.h"
#include <math.h>
#include "unreal.h"
#include "random.h"
#define pi 3.141592653589793

void cfft(double *xx,double *x,int n),invcfft(double *xx,double *x,int n) ;
static inline complex fetch2(double *x) { return complex(x[0],x[1]) ; } 

int main(int argc,char **argv)
{ int maxn=1000000,n,t,k,nerr,terr,ntry=1,usen=0,tryno,size=maxn,ntest,nn,ind ;

  if(argc>1) 
  { size = usen = atoi(argv[1]) ; 
    maxn = 1 ; 
    if(argc>2) ntry = atoi(argv[2]) ; 
  }

  double *x=vector(2*size),*y=vector(2*size),*xdash=vector(2*size) ; 
  complex z,zn,u,i(0,1) ; 
  printf("       ") ; 

  for(nn=terr=0,n=1;n<=maxn;n++) 
    if(n<64||(n<1000&&(n%32==0||n%27==0))||(n<10000&&n%729==0)||n%177147==0)
  { nn += 1 ; 
    if(usen>0) n = usen ; 
    if(n>100000) printf("n=%d\n",n) ; 
    else { printf("n=%-5d",n) ; if(nn%10==9) printf("\n") ; } 
    for(nerr=tryno=0;tryno<ntry;tryno++) 
    { ranset(tryno+(n<<20)) ;
      for(t=0;t<2*n;t++) x[t] = gaussv() ; 
      cfft(x,y,n) ;

      ntest = 10000000 / n ; 
      if(ntest<2) ntest = 2 ; 
      if(ntest>200) ntest = 200 ; 
      if(ntest>n) ntest = n ; 

      for(ind=0;ind<ntest;ind++)
      { if(ind==0||ntest==n) k = ind ; else k = 1 + rani(n-1) ; 
        u = exp(2*k*pi*i/n) ; 
        for(zn=1,z=t=0;t<n;t++,zn*=u) z += fetch2(&x[2*t]) * zn ; 
        if(norm(z-fetch2(&y[2*k]))>1e-12)
        { if(k==0) printf("-> %3d:%3d: (%6.2f,%6.2f) should be (%6.2f,%6.2f)\n",
                          n,t,y[2*k],y[2*k+1],z.re,z.im) ; 
          nerr += 1 ; 
        }
      }

      invcfft(y,xdash,n) ; 
      for(t=0;t<2*n;t++) if(fabs(x[t]-xdash[t])>1e-6)
      { if(0) printf("<- %3d:%3d: (%6.2f,%6.2f)\n",n,t,x[t],xdash[t]) ; 
        nerr += 1 ; 
      }
    }
    if(nerr) printf("order %d: incorrect\n",n) ; 
    terr += nerr ; 
  }
  printf("%d errors from %dx%d ffts\n",terr,nn,ntry) ; 
  free(x,y,xdash) ; 
}

The final test program rfftspeedtest.c is used to time a number of transforms over a wide range of orders.

Run

rfftspeedtest [return]

I get the output below.

The first thing to notice is that FFTs become more efficient as the order increases to around 1000–2000 and then decreases. The improving efficiency comes from amortising overheads; the drop, presumably, comes from memory references having to go to RAM rather than a cache. This is a lack of optimisation: probably I should roll several mod 2 D-L passes into a single loop; meanwhile, no interesting conclusions can be drawn from the behaviour for large orders.

For middling orders we notice:

time (ns) per value computed
    16:  5.2     20: 13.0                  24:  7.9                  28: 14.9 
    32:  5.4     40: 10.8                  48:  7.1     54: 10.2     56: 13.6 
    64:  5.8     80: 10.3                  96:  6.9    108:  9.2    112: 13.5 
   128:  6.3    160: 10.5    162: 10.5    192:  7.1    216:  8.7    224: 13.5 
   256:  6.8    320: 10.5    324: 10.0    384:  7.6    432:  9.0    448: 13.7 
   512:  7.4    640: 11.2    648: 10.2    768:  8.2    864:  9.6    896: 14.3 
  1024:  8.2   1280: 11.6   1296: 10.6   1536:  8.9   1728: 10.2   1792: 15.2 
  2048:  9.5   2560: 13.0   2592: 11.2   3072: 10.1   3456: 10.9   3584: 16.4 
  4096: 10.2   5120: 16.8   5184: 13.3   6144: 15.2   6912: 15.0   7168: 22.2 
  8192: 15.9  10240: 20.4  10368: 15.3  12288: 18.1  13824: 17.8  14336: 24.5 
 16384: 18.8  20480: 23.8  20736: 17.8  24576: 22.1  27648: 21.6  28672: 28.8 
 32768: 23.5  40960: 28.5  41472: 23.1  49152: 27.2  55296: 27.2  57344: 34.3 
 65536: 29.2 

time (ns) per n log[2](n) (a measure of efficiency)
    16: 1.30     20: 3.01                  24: 1.72                  28: 3.10 
    32: 1.09     40: 2.02                  48: 1.26     54: 1.77     56: 2.35 
    64: 0.97     80: 1.62                  96: 1.05    108: 1.36    112: 1.98 
   128: 0.89    160: 1.44    162: 1.43    192: 0.94    216: 1.13    224: 1.72 
   256: 0.85    320: 1.26    324: 1.20    384: 0.88    432: 1.03    448: 1.56 
   512: 0.82    640: 1.20    648: 1.09    768: 0.86    864: 0.99    896: 1.46 
  1024: 0.82   1280: 1.13   1296: 1.03   1536: 0.84   1728: 0.95   1792: 1.41 
  2048: 0.86   2560: 1.15   2592: 0.99   3072: 0.87   3456: 0.93   3584: 1.39 
  4096: 0.85   5120: 1.36   5184: 1.08   6144: 1.21   6912: 1.18   7168: 1.73 
  8192: 1.22  10240: 1.53  10368: 1.14  12288: 1.33  13824: 1.29  14336: 1.77 
 16384: 1.34  20480: 1.66  20736: 1.24  24576: 1.52  27648: 1.46  28672: 1.95 
 32768: 1.57  40960: 1.86  41472: 1.51  49152: 1.75  55296: 1.73  57344: 2.17 
 65536: 1.83 

time (ns) per sum of factors of n (a measure of optimisation)
    16: 0.58     20: 1.45                  24: 0.79                  28: 1.35 
    32: 0.49     40: 0.98                  48: 0.59     54: 0.85     56: 1.05 
    64: 0.45     80: 0.79                  96: 0.50    108: 0.65    112: 0.90 
   128: 0.42    160: 0.70    162: 0.70    192: 0.45    216: 0.55    224: 0.79 
   256: 0.40    320: 0.62    324: 0.59    384: 0.42    432: 0.50    448: 0.72 
   512: 0.39    640: 0.59    648: 0.54    768: 0.41    864: 0.48    896: 0.68 
  1024: 0.39   1280: 0.55   1296: 0.51   1536: 0.40   1728: 0.46   1792: 0.66 
  2048: 0.41   2560: 0.56   2592: 0.49   3072: 0.42   3456: 0.46   3584: 0.65 
  4096: 0.41   5120: 0.67   5184: 0.53   6144: 0.58   6912: 0.58   7168: 0.82 
  8192: 0.59  10240: 0.75  10368: 0.57  12288: 0.65  13824: 0.63  14336: 0.84 
 16384: 0.65  20480: 0.82  20736: 0.61  24576: 0.74  27648: 0.72  28672: 0.93 
 32768: 0.76  40960: 0.92  41472: 0.74  49152: 0.85  55296: 0.85  57344: 1.04 
 65536: 0.89 

log10(time) (ns) per transform
    16: 1.92     20: 2.42                  24: 2.28                  28: 2.62 
    32: 2.24     40: 2.63                  48: 2.53     54: 2.74     56: 2.88 
    64: 2.57     80: 2.91                  96: 2.82    108: 3.00    112: 3.18 
   128: 2.90    160: 3.23    162: 3.23    192: 3.14    216: 3.28    224: 3.48 
   256: 3.24    320: 3.53    324: 3.51    384: 3.46    432: 3.59    448: 3.79 
   512: 3.58    640: 3.85    648: 3.82    768: 3.80    864: 3.92    896: 4.11 
  1024: 3.92   1280: 4.17   1296: 4.14   1536: 4.14   1728: 4.25   1792: 4.44 
  2048: 4.29   2560: 4.52   2592: 4.46   3072: 4.49   3456: 4.58   3584: 4.77 
  4096: 4.62   5120: 4.93   5184: 4.84   6144: 4.97   6912: 5.02   7168: 5.20 
  8192: 5.11  10240: 5.32  10368: 5.20  12288: 5.35  13824: 5.39  14336: 5.55 
 16384: 5.49  20480: 5.69  20736: 5.57  24576: 5.74  27648: 5.78  28672: 5.92 
 32768: 5.89  40960: 6.07  41472: 5.98  49152: 6.13  55296: 6.18  57344: 6.29 
 65536: 6.28 

For comparison I get the following timings for manyfft:

log10(time) (ns) per transform
    16: 1.85     20: 2.38                  24: 2.21                  28: 2.65 
    32: 2.21     40: 2.63                  48: 2.50                  56: 2.92 
    64: 2.57     80: 2.93                  96: 2.81                 112: 3.22 
   128: 2.92    160: 3.26                 192: 3.15                 224: 3.54 
   256: 3.26    320: 3.56                 384: 3.46                 448: 3.84 
   512: 3.59    640: 3.89                 768: 3.80                 896: 4.16 
  1024: 3.91   1280: 4.21                1536: 4.14                1792: 4.48 
  2048: 4.26   2560: 4.54                3072: 4.50                3584: 4.81 
  4096: 4.62   5120: 4.97                6144: 5.00                7168: 5.24 
  8192: 5.15  10240: 5.37               12288: 5.40               14336: 5.61 
 16384: 5.53  20480: 5.74               24576: 5.78               28672: 5.97 
 32768: 5.92  40960: 6.13               49152: 6.18               57344: 6.36 
 65536: 6.33 

Evidently no one will prefer one to the other on account of speed. manyfft is marginally faster for very small orders and marginally slower the rest of the time.

 

rfft.c rfftspeedtest.c 
utils: ranf.c memory.h unreal.h random.h 
---
g++ -O2 -g -o rfftspeedtest rfftspeedtest.c rfft.c ranf.c 
rfftspeedtest

 

• getorder     • sumoffactors     • main

void rfft(double *x,double *y,int n) ;
void invrfft(double *x,double *y,int n) ;

#include "memory.h"
#include <math.h>
#include "random.h"
#include <sys/resource.h>

static int getorder(int base,int k) 
{ if(k==0) return base ; 
  else if(k==1) return 5*base/4 ; 
  else if(k==2) { if(base<128) return 0 ; else return 81*base/64 ; } 
  else if(k==3) return 6*base/4 ; 
  else if(k==4) { if(base<32) return 0 ; else return 27*base/16 ; } 
  else /* if(k==5) */ return 7*base/4 ; 
}
static double sumoffactors(int order)
{ int sum = 0 ; 
  while(order%2==0) { sum += 2 ; order /= 2 ; } 
  while(order%3==0) { sum += 3 ; order /= 3 ; } 
  return sum+order ;
} 
int main(int argc,char **argv)
{ double *x,*y ; 
  int t,n,flag=0,iter,oiter,niter,base,order,ind,k,basesum ; 
  double t0,t1,t2,time[75],qsum ; 
  struct rusage clock ; 
  x = vector(65536) ; 
  y = vector(65536) ; 
  for(t=0;t<75;t++) time[t] = 0 ; 

  printf("time (ns) per value computed\n") ; 
  for(ind=0,base=16;base<=65536;base*=2)
  { for(k=0;k<6;k++,ind++)
    { order = getorder(base,k) ; 
      if(order==0) { printf("             ") ; continue ; } 
      else if(order>65536) break ; 
      niter = 1000000/order ; 
      for(t0=oiter=0;oiter<100;oiter++) 
      { for(t=0;t<order;t++) x[t] = gaussv() ; 
        getrusage(0,&clock) ; 
        t1 = clock.ru_utime.tv_sec*1e9+clock.ru_utime.tv_usec*1e3 ;
        for(iter=0;iter<niter;iter++) rfft(x,y,order) ; 
        getrusage(0,&clock) ; 
        t2 = clock.ru_utime.tv_sec*1e9+clock.ru_utime.tv_usec*1e3 ;
        t0 += t2 - t1 ; 
      }
      time[ind] = t0 / (100.0*niter*order) ;
      printf("%6d: %4.1f ",order,time[ind]) ; 
    }
    printf("\n") ; 
  }

  printf("\ntime (ns) per n log[2](n) (a measure of efficiency)\n") ; 
  for(ind=0,base=16;base<=65536;base*=2)
  { for(k=0;k<6;k++,ind++)
    { order = getorder(base,k) ; 
      if(order==0) { printf("             ") ; continue ; } 
      else if(order>65536) break ; 
      printf("%6d: %4.2f ",order,time[ind]/log2(order)) ; 
    }
    printf("\n") ; 
  }

  printf("\ntime (ns) per sum of factors of n (a measure of optimisation)\n") ; 
  for(ind=0,base=16;base<=65536;base*=2)
  { for(k=0;k<6;k++,ind++)
    { order = getorder(base,k) ; 
      if(order==0) { printf("             ") ; continue ; } 
      else if(order>65536) break ; 
      printf("%6d: %4.2f ",order,time[ind]/sumoffactors(order)) ; 
    }
    printf("\n") ; 
  }

  printf("\nlog10(time) (ns) per transform\n") ; 
  for(ind=0,base=16;base<=65536;base*=2)
  { for(k=0;k<6;k++,ind++)
    { order = getorder(base,k) ; 
      if(order==0) { printf("             ") ; continue ; } 
      else if(order>65536) break ; 
      printf("%6d: %4.2f ",order,log10(time[ind]*order)) ; 
    }
    printf("\n") ; 
  }
}

doc : licence : source : rfft test : cfft test : rfft speed test

rfft : spectrum : source

Call

xhat = rfft(x) ;

or

xhat = rfft(x,n) ;

to compute the FFT xhat of the real signal x of length x.length in the first case or n in the second. This length must be a power of 2. Apart from that the code is modelled on the C rfft above.

Call

xspec = spectrum(xhat) ;

or

xspec = spectrum(xhat,n) ;

to compute the spectral estimate xspec of the signal whose Fourier transform is xhat.

• rfft     • spectrum

function rfft(xx,n)
{ var y,j,k,m,mmax,qr,qi,q0,q1,q2,q3,pi=Math.PI ;
  if(n==undefined) n = xx.length ;
  var x=new Array(n) ; 
  for(k=0;k<n;k++) x[k] = xx[k] ; 

  // Rearrange the contents into "reversed-bitwise order" (excluding
  // the final bit, which distinguishes real and imaginary parts of
  // the same entry).

  for(j=k=0;k<n;k+=2) 
  { if(j>k) 
    { y = x[k] ; x[k] = x[j] ; x[j] = y ;         // swap x[j] and x[k]
      y = x[k+1] ; x[k+1] = x[j+1] ; x[j+1] = y ; // swap x[j+1] and x[k+1]
    }
    for(m=n/2;m>=2&&j>=m;m>>=1) j -= m ; 
    j += m ;
  }
  // Perform the Danielson-Lanczos lemma
  for (mmax=2;mmax<n;mmax*=2) for(m=0;m<mmax;m+=2) 
  { qr = Math.cos(m*pi/mmax) ;
    qi = Math.sin(m*pi/mmax) ;
    for (k=m;k<n;k+=2*mmax) 
    { j = k + mmax;    // Modify all pairs mmax apart.
      u0 = qr*x[j] - qi*x[j+1] ;
      u1 = qi*x[j] + qr*x[j+1] ;
      x[j  ]  = x[k  ] - u0 ;
      x[j+1]  = x[k+1] - u1 ;
      x[k  ] += u0 ;
      x[k+1] += u1 ;
    }
  }
  // Unpack the results.
  u1 = x[0] ;
  x[0] = u1 + x[1] ;
  x[1] = u1 - x[1] ;
  
  for(k=2;k<n/2;k+=2) 
  { q0 = (x[k  ] + x[n-k  ])/2 ;
    q1 = (x[k+1] - x[n-k+1])/2 ;
    q2 = (x[k+1] + x[n-k+1])/2 ;
    q3 =-(x[k]   - x[n-k  ])/2 ;
    qr = Math.cos(k*pi/n) ;
    qi = Math.sin(k*pi/n) ;
    u0 = q2*qr - q3*qi ; 
    u1 = q3*qr + q2*qi ;
    x[k]     =  q0 + u0 ;
    x[k+1]   =  q1 + u1 ;
    x[n-k]   =  q0 - u0 ;
    x[n-k+1] = -q1 + u1 ;
  }
  return x ;
}
function spectrum(xx,n)
{ if(n==undefined) n = xx.length ;
  var k,x=new Array(n/2+1) ; 

  // convert fft into a spectral estimate
  for(k=1;k<n/2;k++) x[k] = xx[2*k]*xx[2*k] + xx[2*k+1]*xx[2*k+1] ; 
  x[0] = xx[0]*xx[0]/2 ; 
  x[n/2] = xx[1]*xx[1]/2 ; 
  return x ; 
}