#include #include #include #include #define NZ 1048576 #define N2 NZ/2 #define FLT_REL 0.11920928955078e-06 static float xx[NZ][2],zz[NZ][2]; /* test driver for complex FFT routine ccfft2. WPP 28 June 1996 */ main() { int n,n2exp,i,nit,kk,nexec; /* repetition count: 16, 32, 64, 128, 256, 512, 1024, 2048, | | | | | | | | */ int NX[17]={1000 ,1000 ,1000 ,1000 ,1000 ,100 , 100 , 100 , 100, 100, 10 , 10 , 1, 1, 1, 1, 1}; /* | | | | | | | | | 4096,8192,16384,32768,65536,131072,262144,524288,1048576 */ float eps,z0,z1,fnm1,errCC,ggl(float*); float *x; double ln2,mflops,t0,t1,t2,t3,t4,walltime(double*); static float seed=331.0; int sign; void ccfft2(), rcfft2(), crfft2(), refft2(), rofft2(); /* print headers */ printf(" ----------------------------------- \n"); printf(" loop inversion variant of CFFT2 \n"); printf(" ------------------------------------ \n"); for(n2exp=4;n2exp<21;n2exp++){ n = pow(2,n2exp); eps = ((float)n)*FLT_REL; printf("\n N=%d tests, rms error criterion=%8.2e: \n",n,eps); printf(" ------------------------------------------\n",n); t3 = 100000.; t4 = 100000.; nexec = 1; for(nit=0;nit<10;nit++){ if(nit==0){ for(i=0;ieps){ printf("\n error in CCFFT F/B: errCC=%e\n",errCC); } else { printf("\n CCFFT F/B test OK\n"); } } nexec = NX[n2exp-4]; } t1 = 0.5*(t3+t4)/((float) nexec); t1 = (t1>0.0)?t1:0.0; ln2 = (double) n2exp; if(t1>0.0){ mflops = 5.0*((double) n)*ln2/((1.e+6)*t1); printf(" Time=%8.2e, Mflops=%6.1f for ccfft2 on %d points\n", t1,mflops,n); printf("\n"); } } } float ggl(float *ds) { /* generate u(0,1) distributed random numbers. Seed ds must be saved between calls. ggl is essentially the same as the IMSL routine RNUM. W. Petersen and M. Troyer, 24 Oct. 2002, ETHZ: a modification of a fortran version from I. Vattulainen, Tampere Univ. of Technology, Finland, 1992 */ double t,d2=0.2147483647e10; t = (float) *ds; t = fmod(0.16807e5*t,d2); *ds = (float) t; return((float) ((t-1.0e0)/(d2-1.0e0))); } /* ======================================= */ /* split here for library functions */ /* ======================================= */ void cffti(int n,float *w) /* Initialization routine for binary radix FFTs. OUTPUT w = complex n/2 dimensional array, = exp(2*Pi*i*j/n), for j=0..n/2-1 i.e. n/2 powers of the n-th root of unity W Petersen, SAM, Mathmatik, ETHZ, 25 Aug., 1999 */ { int i,ii; float pi,arg,t; pi = 3.141592653589793; t = 2.0*pi/((float) n); ii = 0; for(i=0;i 1 x = complex n dimensional array sign = integer sign of transform (+- below) = +1, or -1. OUTPUT (where w = n-th root of unity) x_j <- sum(k=0..n-1) w^{+- jk} x_k, for j=0..n-1 W Petersen, SAM, Mathematik, ETHZ, 25 Aug., 1999 */ { void cffti(), cfft_2(); static float *w,*pw1,*pw2; static int n1=-33,n2=-33; int nt; if(n==n1){ w = pw1; } else { if(n==n2){ w = pw2; pw2 = pw1; pw1 = w; nt = n1; n1 = n2; n2 = nt; } else { if(n2>0) free(pw2); w = (void *) malloc(n*sizeof(float)); cffti(n,w); pw2 = pw1; pw1 = w; n2 = n1; n1 = n; } } cfft_2(n,x,w,1,sign); } #define max(a,b) (a>b)?a:b void cfft_2(n,x,w,iw,sign) int n, iw,sign; float x[][2], w[]; /* in-place, in-order binary radix FFT from C. Temperton, SIAM J. on Sci. and Stat. Computing, vol 12, no. 4, July 1991. This is has a loop-inversion optimization: first log2(n)/2 passes are Cooley-Tukey, last log2(n)/2 passes compute 'bugs' in pairs. Two variants of the self-ordering phase are possible - step1 and step2. A schedule for choosing step1 or step2 is given by the empirical choice given by the formula for various machines in the comments. Pick your favorite. Parameter iw=stride in array w, used for real/complex routines (iw=2 in that case). W Petersen, SAM, Mathematik, ETHZ, 25 Aug., 1999 */ { int n2,m,j,mj,p2,p3,p4,BK; void step0(), step1(), step2(); m = (int) (log((float) n)/log(1.99)); /* optimization point for Y-MP: BK = max(m/3-2,0); optimization point for J-90 BK = max((m+1)/6,0); optimization point for IBM: BK = max((m-3)/2,0); optimization point for DEC Alpha: BK = max(m/2-2,0); optimization point for HP9000 BK = max(m/2-3,0); optimization point for SGI Indigo BK = max((m-1)/4,0); optimization point for SGI TFP (R8000) BK = max((m-1)/2,0); optimization point for i860/XP BK = max((20*(m-5)-(m-5)*(m-5))/25),0); optimization point for Intel P-6 BK = max((7+5*(m-4))/10+1,0); optimization point for Sparc-10 BK = max(0,2*m/3-4); optimization point for Mac Power-PC BK = 4*m/9 plain vanilla: */ BK = 0; mj = 1; n2 = n/2; for(j=0;j 0){ #pragma ivdep for(k=0;k 0){ for(i=j;i 0)?w[kw][1]:(-w[kw][1]); for(i=0;i complex FFT: x = real input array (n dimensional) INPUT n = power of 2 dimension of transform x = n/2 dimensional real array OUTPUT (where w = n-th root of unity) x_j <- sum(j=1,n) w^{jk} x_k, j=0..n/2 = "half complex" output array, x_{n-j} = x_j hence, only n/2+1 unique elements. Since input is real, output x_{0} and x_{n/2} are purely real. To be done in-place, then, the data structure of output has a packed first complex element output: x[0][0] = real(x_{0}), and x[0][1] = real(x_{n/2}). Otherwise, x[i][0] = real(x_{i}), and x[i][1] = imag(x_{i}), for i=1..n/2-1. W Petersen, IPS, ETH Zurich, 7 March, 1995 */ { void cffti(),cfft_2(); void post_p(int,float*,float*,int); static float *w,*pw1,*pw2; static int n1=-33,n2=-33; int nt; if(n==n1){ w = pw1; } else { if(n==n2){ w = pw2; pw2 = pw1; pw1 = w; nt = n1; n1 = n2; n2 = nt; } else { if(n2>0) free(pw2); w = (void *) malloc(n*sizeof(float)); cffti(n,w); pw2 = pw1; pw1 = w; n2 = n1; n1 = n; } } cfft_2(n/2,x,w,2,sign); post_p(n,x,w,sign); } void crfft2(int n, float *x, int sign) /* "Half complex" -> real FFT: x = Hermitian complex input assumed, such that for all j=1,n, x_{n-j} = conjg(x_j), hence only n/2+1 unique elements. Furthermore, first and last are purely real. In-place computation has a packed data structure as below. INPUT n = power of 2 dimension of transform x = n/2 dimensional COMPLEX array, where the first elements x[0][0] = real(x_{0}) and, x[0][1] = real(x_{n/2}). Otherwise, for i=1..n/2-1, x[i][0] = real(x_{i}) and x[i][1] = imag(x_{i}), treated as a complex array. OUTPUT (where w = n-th root of unity) x[j], j=0..n-1 get x_j <- sum(k=1,n) w^{jk} x_k, j=0..n-1, are REAL. W Petersen, SAM, Mathematik, ETHZ, 25 Aug, 1999 */ { void cfft_2(),pre_proc(); static float *w,*pw1,*pw2; static int n1=-33,n2=-33; int nt; if(n==n1){ w = pw1; } else { if(n==n2){ w = pw2; pw2 = pw1; pw1 = w; nt = n1; n1 = n2; n2 = nt; } else { if(n2>0) free(pw2); w = (void *) malloc(n*sizeof(float)); cffti(n,w); pw2 = pw1; pw1 = w; n2 = n1; n1 = n; } } pre_proc(n,x,w,sign); cfft_2(n/2,x,w,2,sign); } void post_p(int n, float *z, float *w, int sign) { float pr,pi,qr,qi,zr,zi; float ar,ai,br,bi,wr,wi; int j; zr = *z + *(z+1); zi = *z - *(z+1); *z = zr; *(z+1) = zi; if(sign<0){ for(j=2;j0) free(pw2); w = (void *) malloc(n*sizeof(float)); cffti(n,w); pw2 = pw1; pw1 = w; n2 = n1; n1 = n; } } x1 = x[0] + x[n/2]; x2 = x[0] - x[n/2]; x[n/4] = 2.0*x[n/4]; x[0] = x1; for(i=1;i0) free(pw2); w = (void *) malloc(n*sizeof(float)); cffti(n,w); pw2 = pw1; pw1 = w; n2 = n1; n1 = n; } } for(i=1;i double walltime(double *t0) { double mic, time; double mega=0.000001; struct timeval tp; struct timezone tzp; static long base_sec = 0; static long base_usec = 0; (void) gettimeofday(&tp, &tzp); if (base_sec == 0) { base_sec = tp.tv_sec; base_usec = tp.tv_usec; } time = (double)(tp.tv_sec - base_sec); mic = (double)(tp.tv_usec - base_usec); time = (time + mic * mega) - *t0; return(time); }