#include #include #include #include #include "xmmintrin.h" #define N 256 #define N2 N/2 main() { /* SSE version of cfft2 - uses Intel intrinsics. Expanded version wpp, SAM. Math. ETHZ 21 May, 2002 */ int first,i,icase,it,n; float error,fnm1,seed,sign,z0,z1,ggl(); float *x,*y,*z,*w; float t1,ln2,mflops; void cffti(),cfft2(); /* allocate storage for x,y,z,w on 4-word bndr. */ x = (float *)_mm_malloc(8*N, 16); y = (float *)_mm_malloc(8*N, 16); z = (float *)_mm_malloc(8*N, 16); w = (float *)_mm_malloc(4*N, 16); first = 1; seed = 331.0; for(icase=0;icase<2;icase++){ if(first){ for(i=0;i<2*N;i+=2){ z0 = ggl(&seed); /* real part of array */ z1 = ggl(&seed); /* imaginary part of array */ x[i] = z0; z[i] = z0; /* copy of initial real data */ x[i+1] = z1; z[i+1] = z1; /* copy of initial imag. data */ } } else { for(i=0;i<2*N;i+=2){ z0 = 0; /* real part of array */ z1 = 0; /* imaginary part of array */ x[i] = z0; z[i] = z0; /* copy of initial real data */ x[i+1] = z1; z[i+1] = z1; /* copy of initial imag. data */ } } /* initialize sine/cosine tables */ n = N; cffti(n,w); /* transform forward, back */ if(first){ sign = 1.0; cfft2(n,x,y,w,sign); sign = -1.0; cfft2(n,y,x,w,sign); /* results should be same as initial multiplied by N */ fnm1 = 1.0/((float) n); error = 0.0; for(i=0;i<2*N;i+=2){ error += (z[i] - fnm1*x[i])*(z[i] - fnm1*x[i]) + (z[i+1] - fnm1*x[i+1])*(z[i+1] - fnm1*x[i+1]); } error = sqrt(fnm1*error); printf(" for n=%d, fwd/bck error=%e\n",N,error); first = 0; } else { t1 = ((float)clock())/((float) CLOCKS_PER_SEC); for(it=0;it<1000;it++){ sign = +1.0; cfft2(n,x,y,w,sign); sign = -1.0; cfft2(n,y,x,w,sign); } t1 = ((float)clock())/((float) CLOCKS_PER_SEC) - t1; t1 = t1/2000.0; ln2 = 10.0; // reset this for different N mflops = 5.0*((float) N)*ln2/((1.e+6)*t1); printf(" for n=%d, t1=%e, mflops=%e\n",n,t1,mflops); } } } void cfft2(n,x,y,w,sign) int n; float x[][2],y[][2],w[][2],sign; { int jb, jc, jw, k, k2, lj, m, j, mj, mj2, pass, tgle; float (*a)[2],(*b)[2],(*c)[2],(*d)[2]; float (*aa)[2],(*bb)[2],(*cb)[2],(*dd)[2]; float rp,up,wr[4],wu[4]; __m128 V0,V1,V2,V3,V4,V5,V6,V7; __m128 V8,V9,V10,V11,V12,V13,V14,V15; if(n<=1){ y[0][0] = x[0][0]; y[0][1] = x[0][1]; return; } m = (int) (log((float) n)/log(1.99)); mj = 1; mj2 = 2; lj = n/2; // first pass thru data: x -> y a = (void *)&x[0][0]; b = (void *)&x[n/2][0]; c = (void *)&y[0][0]; d = (void *)&y[1][0]; for(j=0;j float ggl(float *ds) { /* generate u(0,1) distributed random numbers. Seed ds must be saved between calls. ggl is essentially the same as the IMSL routine RNUM. W. Petersen and M. Troyer, 24 Oct. 2002, ETHZ: a modification of a fortran version from I. Vattulainen, Tampere Univ. of Technology, Finland, 1992 */ double t,d2=0.2147483647e10; t = (float) *ds; t = fmod(0.16807e5*t,d2); *ds = (float) t; return((float) ((t-1.0e0)/(d2-1.0e0))); }