/*
 * Solution of exercise 3.2: SSE optimized BLAS level 1 routines SSUM and
 * SAXPY for single precision arithmetic.  From Section 3.2 of Petersen and
 * Arbenz, "Intro to Parallel Computing", Oxford Univ. Press, 2003.
 *
 * Roman Geus, Wissenschaftliches Rechnen, Dept. Informatik, ETHZ, 14 Mar. 2001.
 * Modified by Patric Rousselot, Dept. Mathematics, ETHZ, 5 Jan. 2003.
 */
/* The header names were lost in this listing; standard headers assumed. */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "sse.h"

#define N     1000000   /* length of vectors, divisible by 4 */
#define NTEST 20        /* number of tests used for timing */
#define MHZ   650       /* clock speed of cpu: see /proc/cpuinfo */

/**
 * Function prototypes for the Fortran BLAS routines.
 */
void sscal_(int *, float *, float *, int *);
void saxpy_(int *, float *, float *, int *, float *, int *);

/**
 * Union that stores the 64-bit value of the cycle counter.
 */
union long_number {
  long long no;
  struct {
    unsigned int lo;
    unsigned int hi;
  } s_no;
};

/**
 * Reads the cycle counter.
 * The high part of the cycle counter is returned in hi,
 * the low part in lo.
 */
void rdtcsh(unsigned int *hi, unsigned int *lo)
{
  unsigned int msr;
  __asm __volatile(
      ".byte 0xf; .byte 0x31  # Read the cycle counter (rdtsc)\n\t"
      "movl %%edx, %0         # high order bits\n\t"
      "movl %%eax, %1         # low order bits"
      : "=g" (*hi), "=g" (*lo)
      : "g" (msr)
      : "eax", "edx");
}  /* end-rdtcsh */

/* **************************************************************
 *                                                              *
 *   SSE optimized routines: sscal, saxpy, ssum                 *
 *                                                              *
 * ************************************************************** */

void sscal_sse(int n, float alpha, float *x)
{
  int i;
  sse_t alpha_vec;

  // store a vector of 4 alpha values in an XMM register
  for (i = 0; i < 4; i++)
    alpha_vec.sf[i] = alpha;
  movups_m2r(alpha_vec, XMM0);

  n = n/4;
  for (i = 0; i < n; i++) {
#ifndef NPREFETCH
    // prefetch the data that will be used two iterations later
    prefetchnta(*(((sse_t *) x) + i + 2));
#endif
    // load the next 4 elements of the x vector
    movups_m2r(*(((sse_t *) x) + i), XMM1);
    // parallel multiply: scale the elements with alpha
    mulps_r2r(XMM0, XMM1);
    // store the 4 elements back into the x vector
    movups_r2m(XMM1, *(((sse_t *) x) + i));
  }
}

void saxpy_sse(int n, float alpha, float *x, float *y)
{
  int i;
  sse_t alpha_vec;

  /* store a vector of 4 alpha values in an XMM register */
  for (i = 0; i < 4; i++)
    alpha_vec.sf[i] = alpha;
  movups_m2r(alpha_vec, XMM0);

  n = n/4;
  for (i = 0; i < n; i++) {
#ifndef NPREFETCH
    /* prefetch the data that will be used two iterations later */
    prefetchnta(*(((sse_t *) x) + i + 2));
    prefetchnta(*(((sse_t *) y) + i + 2));
#endif
    /* load the next 4 elements of the x vector */
    movups_m2r(*(((sse_t *) x) + i), XMM1);
    /* parallel multiply: scale the elements with alpha */
    mulps_r2r(XMM0, XMM1);
    /* load the next 4 elements of the y vector */
    movups_m2r(*(((sse_t *) y) + i), XMM2);
    /* add alpha*x to y */
    addps_r2r(XMM1, XMM2);
    /* store the 4 elements back into the y vector */
    movups_r2m(XMM2, *(((sse_t *) y) + i));
  }
}

//pr
float ssum_sse(int n, float *x)
{
  int i;
  sse_t p;

  n = n/4;
  p.sf[0] = 0.0F; p.sf[1] = 0.0F; p.sf[2] = 0.0F; p.sf[3] = 0.0F;
  movups_m2r(p, XMM0);
  for (i = 0; i < n; i++) {
#ifndef NPREFETCH
    // prefetch the data that will be used two iterations later
    prefetchnta(*(((sse_t *) x) + i + 2));
#endif
    // load the next 4 elements of the x vector
    movups_m2r(*(((sse_t *) x) + i), XMM1);
    // add the 4 elements to the partial sums in XMM0
    addps_r2r(XMM1, XMM0);
  }
  movups_r2m(XMM0, p);
  return (p.sf[3] + p.sf[2] + p.sf[1] + p.sf[0]);
}

/* **************************************************************
 *                                                              *
 *   Trivially implemented routines: sscal, saxpy, ssum         *
 *                                                              *
 * ************************************************************** */

void sscal_triv(int n, float alpha, float *x)
{
  int i;
  for (i = 0; i < n; i++)
    x[i] *= alpha;
}

void saxpy_triv(int n, float alpha, float *x, float *y)
{
  int i;
  for (i = 0; i < n; i++)
    y[i] += alpha*x[i];
}

//pr
float ssum_triv(int n, float *x)
{
  int i;
  float s = 0.0;
  for (i = 0; i < n; i++)
    s += x[i];
  return s;
}
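/*
 * Illustrative sketch only (not part of the original exercise code): one way
 * the routines above could be timed with rdtcsh() and cross-checked against
 * the trivial implementation.  The function name time_saxpy_example and the
 * value of alpha are hypothetical; the original driver is not reproduced here.
 */
static void time_saxpy_example(void)
{
  static float x[N], y_sse[N], y_triv[N];
  union long_number t0, t1;
  float alpha = 2.5F;
  double cycles;
  int i, k;

  /* fill the test vectors */
  for (i = 0; i < N; i++) {
    x[i] = (float) i / N;
    y_sse[i] = y_triv[i] = 1.0F;
  }

  /* read the cycle counter before and after the timed loop */
  rdtcsh(&t0.s_no.hi, &t0.s_no.lo);
  for (k = 0; k < NTEST; k++)
    saxpy_sse(N, alpha, x, y_sse);
  rdtcsh(&t1.s_no.hi, &t1.s_no.lo);

  cycles = (double) (t1.no - t0.no) / NTEST;
  printf("saxpy_sse: %.0f cycles per call (%.3f ms at %d MHz)\n",
         cycles, cycles / (MHZ * 1.0e3), MHZ);

  /* sanity check: the SSE and trivial versions should agree element-wise */
  for (k = 0; k < NTEST; k++)
    saxpy_triv(N, alpha, x, y_triv);
  for (i = 0; i < N; i++)
    if (y_sse[i] != y_triv[i]) {
      printf("mismatch at i = %d: %g vs %g\n", i, y_sse[i], y_triv[i]);
      break;
    }
}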