#include <stdio.h>
#include <math.h>
#include <float.h>
#define N 1027
main()
{
/* 
  Mac G-4 unit step isamax for arbitrary N code 
  This is an Altivec version of isamax0 from Section 3.5.7
  in Arbenz and Petersen, "Intro. to Parallel Computing"
  Oxford Univ. Press, 2004
                                           wpp 5/8/2002 
*/
  float x[N];
  float xb;
  int err,flag,i,im,k,ki,kl,ib,n0,n;
  int isamax(int,float *);
  static float seed = 331.0;
  float ggl(float*);
  flag = 0;                         // error flag
  kl   = 3;
  n0   = 1;
  for(k=0;k<5;k++){
     for(ki=0;ki<kl;ki++){
        n  = n0 + ki;
        ib = 0; xb = 0.0;
        for(i=0;i<n;i++){
           x[i] = ggl(&seed);
           if(fabs(x[i]) > xb){
              xb = fabs(x[i]);
              ib = i;
           }
        }
        im  = isamax(n,x);
        err = ib - im;
        if(err != 0){
            printf(" err in isamax: n = %d, ib = %d, im = %d\n",n,ib,im);
            flag = 1;
        }
     }
     n0  *= 4;                    // increase n
     kl   = 4;                    // for n > 1, 3 steps of increase in n
  }
  if(flag==0) printf(" All n tests pass\n");
}
#define NS 12
int isamax(int n, float *x)
{
  float rbig,*xp;
  int i,ii,nres,nsegs,ibig,irbig;
  vector float V0,V1,V6;
  vector bool int V3;
  vector float V2 = (vector float) (0.0,1.0,2.0,3.0);
  vector float V7 = (vector float) (0.0,1.0,2.0,3.0);
  const vector float incr_4 = (vector float) (4.0,4.0,4.0,4.0);
  const vector float minus0 = (vector float) (-0.0,-0.0,-0.0,-0.0);
  float big,xbig[4],indx[4];
// n < NS done in scalar mode
  if(n < NS){
     ibig = 0;
     rbig = 0.0;
     for(i=0;i<n;i++){
        if(fabs(x[i]) > rbig){
           rbig = fabs(x[i]);
           ibig = i;
        }
     }
     return(ibig);
  }
// n >= NS case done with altivec 
  nsegs = (n >> 2) - 1;
  nres  = n - ((nsegs+1) << 2);    // nres = n mod 4
  V2 = vec_add(V2,incr_4);         // increment next index
  xp = x;
  V0 = vec_ld(0,xp); xp += 4;      // first four 
  V1 = vec_ld(0,xp); xp += 4;      // next four
  V0 = vec_abs(V0);                // absolute value of first four
  for(i=0;i<nsegs;i++){
     V1 = vec_abs(V1);             // take absolute value fo next segment
     V3 = vec_cmpgt(V1,V0);        // compare accumulated 4 to next 4
     V0 = vec_sel(V0,V1,V3);       // select new or old accumulation
     V7 = vec_sel(V7,V2,V3);       // select new index or old
     V1 = vec_ld(0,xp); xp += 4;   // bottom load next 4
     V2 = vec_add(V2,incr_4);
  }
  V1 = vec_ld(0,xp); xp += 4;  // bottom load next four
  V3 = vec_cmpgt(V1,V0);      // compare accumulated to last 4
  V0 = vec_sel(V0,V1,V3);     // select accumulation to last 4
  V7 = vec_sel(V7,V2,V3);     // select index of accum. to last 4
// Now finish up: segment maxima are in V0, indices in V7
  vec_st(V0,0,xbig);
  vec_st(V7,0,indx);
  ii   = ((n >> 2) << 2);
  big  = 0.0;
  ibig = 0.0;
  for(i=0;i<nres;i++){
     if(fabs(x[ii]) > big){
        big  = fabs(x[ii]);
        ibig = ii;
     }
     ii++;
  }
  for(i=0;i<4;i++){
     if(xbig[i] > big){
        big  = xbig[i];
        ibig = (int) indx[i];
     }
  }
  return(ibig);
}
#undef NS
#include <math.h>
float ggl(float *ds)
{
 
/* generate u(0,1) distributed random numbers.
   Seed ds must be saved between calls. ggl is
   essentially the same as the IMSL routine RNUM.
 
   W. Petersen and M. Troyer, 24 Oct. 2002, ETHZ:
   a modification of a fortran version from
   I. Vattulainen, Tampere Univ. of Technology,
   Finland, 1992 */
 
   double t,d2=0.2147483647e10;
   t   = (float) *ds;
   t   = fmod(0.16807e5*t,d2);
   *ds = (float) t;
   return((float) ((t-1.0e0)/(d2-1.0e0)));
}