#include #include #include #include "xmmintrin.h" #define N 20 main() { /* SSE unit step isamax with alignment code. From Section 3.5.7 of Petersen and Arbenz "Intro. to Parallel Computing," Oxford Univ. Press, 2004. wpp 31/7/2002 */ float x[N]; int i,im; int isamax0(int,float *); for(i=0;i bbig){ bbig = fabs(x[i]); iebig = i; } } return(iebig); } // n >= NS case done in SSE mode V7 = _mm_set_ps(3.0,2.0,1.0,0.0); V2 = _mm_set_ps(3.0,2.0,1.0,0.0); V6 = _mm_set_ps1(-0.0); offset4 = _mm_set_ps1(4.0); align = ((unsigned int) x >> 2) & 0x03; if(align == 1){ // bres = 3 case bbig = fabsf(x[0]); ibbig = 0; bres = 3.0; nn = n - 3; for(i=1;i<3;i++){ if(fabsf(x[i]) > bbig){ bbig = fabsf(x[i]); ibbig = i; } } } else if(align == 2){ // bres = 2 case bbig = fabsf(x[0]); ibbig = 0; bres = 2.0; nn = n - 2; if(fabsf(x[1]) > bbig){ bbig = fabsf(x[1]); ibbig = 1; } } else if(align == 1){ // bres = 1 case bbig = fabsf(x[0]); ibbig = 0; bres = 1.0; nn = n - 1; } else { // bres = 0 case bbig = 0.0; ibbig = 0; nn = n; bres = 0.0; } xp = x + (int) bres; nsegs = (nn >> 2) - 2; eres = nn - 4*(nsegs+2); V0 = _mm_load_ps(xp); xp += 4; // first four in 4/time seq. V1 = _mm_load_ps(xp); xp += 4; // next four in 4/time seq. V0 = _mm_andnot_ps(V6,V0); // take absolute value for(i=0;i 0){ V0 = _mm_max_ps(V0,V1); V3 = _mm_and_ps(V2,V3); V7 = _mm_max_ps(V7,V3); } V1 = _mm_load_ps(xp); xp += 4; // bottom load next four } // finish up the last segment of 4 V1 = _mm_andnot_ps(V6,V1); // take absolute value V3 = _mm_cmpnle_ps(V1,V0); // compare old max of 4 to new mb = _mm_movemask_ps(V3); // any of 4 bigger? V2 = _mm_add_ps(V2,offset4); // add offset if(mb > 0){ V0 = _mm_max_ps(V0,V1); V3 = _mm_and_ps(V2,V3); V7 = _mm_max_ps(V7,V3); } // Now finish up: segment maxima are in V0, indices in V7 _mm_store_ps(xbig,V0); _mm_store_ps(indx,V7); if(eres>0){ for(i=0;i ebig){ ebig = xbig[i]; iebig = (int) indx[i]; } } return(iebig); } #undef NS