#include #include #include #include "mpi.h" #define am(p,q,r) *(a+2*nx*nx*p+2*nx*q+r) int main(int argc, char **argv ) { /* 3-D binary radix FFT using message passing. Data are in slabs. Given "size" processors, there are "size" slabs containing NZ/size planes each. Complex element A(i,j,k) is in slab i mod (NZ/size) and has location &A(i%(NZ/size),j,k) = &A(mod(i,NZ/size),j,k). From Sections 5.8 and 5.9 of Arbenz and Petersen, "Intro. to Parallel Computing," Oxford Univ. Press, 2004. W. Petersen, 22 Oct. 2004 */ int MAX_X=128, MAX_Z=128; int ierr,master,rank,size,nz,nx,i,j,ijk,ip,k,offset; static float seed; float *a,*b,*w,*buff,*acopy,sign,err,fnn; float ggl(); void cffti(int nx, float *w); void Checkres(float *a, float *acopy, int nz, int nx); void FFT3D(float *a,float *w,float sign,int nz,int nx); MPI_Status stat; MPI_Init(&argc,&argv); MPI_Comm_size(MPI_COMM_WORLD, &size); MPI_Comm_rank(MPI_COMM_WORLD, &rank); master = 0; nx = MAX_X; nz = nx/size; a = (float *) malloc(2*nz*nx*nx*sizeof(float)); acopy = (float *) malloc(2*nx*nx*nx*sizeof(float)); w = (float *) malloc(2*nx*sizeof(float)); buff = (float *) malloc(2*nx*nx*nz*sizeof(float)); /* initialize sine/cosine tables */ cffti(nx,w); if(rank==master){ /* now send section of A to each proccessor */ seed = 331.0; for(ip=0;ip0){ ierr=MPI_Send(buff,2*nx*nx*nz,MPI_FLOAT,ip,ip, MPI_COMM_WORLD); } } } else { /* slave parts */ ierr=MPI_Recv(buff,2*nx*nx*nz,MPI_FLOAT,master, MPI_ANY_TAG,MPI_COMM_WORLD,&stat); if(stat.MPI_TAG!=0){ ijk = 0; for(k=0;k0) free(pw); pw = (float *) malloc(2*nx*sizeof(float)); nfirst = nx; } /* X-direction */ for(k=0;k0){ ierr=MPI_Send(a,2*nz*nx*nx,MPI_FLOAT,0,0, MPI_COMM_WORLD); } else { /* master */ /* rank > 0 part of check */ for(is=1;is0) free(buf_io); buf_io = (float *) malloc(nz*n2*nx*sizeof(float)); init = nx; } /* local transpose of first block (in-place) */ for(j = 0; j < nx; j++){ offset = j*2*nx + rank*n2; for(k = 0; k < nz; k ++){ for(i = 0; i < k; i++) { t0 = a[offset + i*np + k*2]; t1 = a[offset + i*np + k*2+1]; a[offset+i*np+k*2] = a[offset+k*np+2*i]; a[offset+i*np+k*2+1] = a[offset+k*np+2*i+1]; a[offset+k*np+2*i] = t0; a[offset+k*np+2*i+1] = t1; } } } /* size-1 communication steps */ for (step = 1; step < size; step ++) { other = rank ^ step; /* fill send buffer */ ijk = 0; for(j=0;j 0.){ #pragma ivdep for(k=0;k 0.){ for(i=j;i 0.){ wku = w[kw][1]; } else { wku = -w[kw][1]; } for(i=0;i float ggl(float *ds) { /* generate u(0,1) distributed random numbers. Seed ds must be saved between calls. ggl is essentially the same as the IMSL routine RNUM. W. Petersen and M. Troyer, 24 Oct. 2002, ETHZ: a modification of a fortran version from I. Vattulainen, Tampere Univ. of Technology, Finland, 1992 */ double t,d2=0.2147483647e10; t = (float) *ds; t = fmod(0.16807e5*t,d2); *ds = (float) t; return((float) ((t-1.0e0)/(d2-1.0e0))); }