/*
 * Solution of exercise 3.2: SSE optimized BLAS level 1 routines SSUM and
 * SAXPY for single precision arithmetic.  From Section 3.2 of Petersen and
 * Arbenz, "Intro to Parallel Computing", Oxford Univ. Press, 2003.
 *
 * Roman Geus, Wissenschaftliches Rechnen, Dept. Informatik, ETHZ, 14 Mar. 2001.
 * Modified by Patric Rousselot, Dept. Mathematics, ETHZ, 5 Jan. 2003.
 */
/* The header names were lost in this listing; standard headers assumed. */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "sse.h"

#define N     1000000   /* length of vectors, divisible by 4 */
#define NTEST 20        /* number of tests used for timing */
#define MHZ   650       /* clock speed of cpu: see /proc/cpuinfo */

/**
 * Function prototypes for the Fortran BLAS routines.
 */
void sscal_(int *, float *, float *, int *);
void saxpy_(int *, float *, float *, int *, float *, int *);

/**
 * Union that stores the 64-bit value of the cycle counter.
 */
union long_number {
  long long no;
  struct {
    unsigned int lo;
    unsigned int hi;
  } s_no;
};

/**
 * Reads the cycle counter.
 * The high part of the cycle counter is returned in hi,
 * the low part in lo.
 */
void rdtcsh(unsigned int *hi, unsigned int *lo)
{
  unsigned int msr;
  __asm __volatile(
      ".byte 0xf; .byte 0x31  # Read the cycle counter (rdtsc)\n\t"
      "movl %%edx, %0         # high order bits\n\t"
      "movl %%eax, %1         # low order bits"
      : "=g" (*hi), "=g" (*lo)
      : "g" (msr)
      : "eax", "edx");
}  /* end-rdtcsh */

/* **************************************************************
 *                                                              *
 *   SSE optimized routines: sscal, saxpy, ssum                 *
 *                                                              *
 * ************************************************************** */

void sscal_sse(int n, float alpha, float *x)
{
  int i;
  sse_t alpha_vec;

  // store a vector of 4 alpha values in an XMM register
  for (i = 0; i < 4; i++)
    alpha_vec.sf[i] = alpha;
  movups_m2r(alpha_vec, XMM0);

  n = n/4;
  for (i = 0; i < n; i++) {
#ifndef NPREFETCH
    // prefetch the data that will be used two iterations later
    prefetchnta(*(((sse_t *) x) + i + 2));
#endif
    // load the next 4 elements of the x vector
    movups_m2r(*(((sse_t *) x) + i), XMM1);
    // parallel multiply: scale the elements with alpha
    mulps_r2r(XMM0, XMM1);
    // store the 4 elements back into the x vector
    movups_r2m(XMM1, *(((sse_t *) x) + i));
  }
}

void saxpy_sse(int n, float alpha, float *x, float *y)
{
  int i;
  sse_t alpha_vec;

  /* store a vector of 4 alpha values in an XMM register */
  for (i = 0; i < 4; i++)
    alpha_vec.sf[i] = alpha;
  movups_m2r(alpha_vec, XMM0);

  n = n/4;
  for (i = 0; i < n; i++) {
#ifndef NPREFETCH
    /* prefetch the data that will be used two iterations later */
    prefetchnta(*(((sse_t *) x) + i + 2));
    prefetchnta(*(((sse_t *) y) + i + 2));
#endif
    /* load the next 4 elements of the x vector */
    movups_m2r(*(((sse_t *) x) + i), XMM1);
    /* parallel multiply: scale the elements with alpha */
    mulps_r2r(XMM0, XMM1);
    /* load the next 4 elements of the y vector */
    movups_m2r(*(((sse_t *) y) + i), XMM2);
    /* add alpha*x to y */
    addps_r2r(XMM1, XMM2);
    /* store the 4 elements back into the y vector */
    movups_r2m(XMM2, *(((sse_t *) y) + i));
  }
}

//pr
float ssum_sse(int n, float *x)
{
  int i;
  sse_t p;

  n = n/4;
  p.sf[0] = 0.0F; p.sf[1] = 0.0F; p.sf[2] = 0.0F; p.sf[3] = 0.0F;
  movups_m2r(p, XMM0);
  for (i = 0; i < n; i++) {
#ifndef NPREFETCH
    // prefetch the data that will be used two iterations later
    prefetchnta(*(((sse_t *) x) + i + 2));
#endif
    // load the next 4 elements of the x vector
    movups_m2r(*(((sse_t *) x) + i), XMM1);
    // add the 4 elements to the partial sums in XMM0
    addps_r2r(XMM1, XMM0);
  }
  movups_r2m(XMM0, p);
  return (p.sf[3] + p.sf[2] + p.sf[1] + p.sf[0]);
}

/* **************************************************************
 *                                                              *
 *   Trivially implemented routines: sscal, saxpy, ssum         *
 *                                                              *
 * ************************************************************** */

void sscal_triv(int n, float alpha, float *x)
{
  int i;
  for (i = 0; i < n; i++)
    x[i] *= alpha;
}

void saxpy_triv(int n, float alpha, float *x, float *y)
{
  int i;
  for (i = 0; i < n; i++)
    y[i] += alpha*x[i];
}

//pr
float ssum_triv(int n, float *x)
{
  int i;
  float s = 0.0;
  for (i = 0; i < n; i++)
    s += x[i];
  return s;
}
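/*
 * Illustrative sketch only (not part of the original exercise code): one way
 * the routines above could be timed with rdtcsh() and cross-checked against
 * the trivial implementation.  The function name time_saxpy_example and the
 * value of alpha are hypothetical; the original driver is not reproduced here.
 */
static void time_saxpy_example(void)
{
  static float x[N], y_sse[N], y_triv[N];
  union long_number t0, t1;
  float alpha = 2.5F;
  double cycles;
  int i, k;

  /* fill the test vectors */
  for (i = 0; i < N; i++) {
    x[i] = (float) i / N;
    y_sse[i] = y_triv[i] = 1.0F;
  }

  /* read the cycle counter before and after the timed loop */
  rdtcsh(&t0.s_no.hi, &t0.s_no.lo);
  for (k = 0; k < NTEST; k++)
    saxpy_sse(N, alpha, x, y_sse);
  rdtcsh(&t1.s_no.hi, &t1.s_no.lo);

  cycles = (double) (t1.no - t0.no) / NTEST;
  printf("saxpy_sse: %.0f cycles per call (%.3f ms at %d MHz)\n",
         cycles, cycles / (MHZ * 1.0e3), MHZ);

  /* sanity check: the SSE and trivial versions should agree element-wise */
  for (k = 0; k < NTEST; k++)
    saxpy_triv(N, alpha, x, y_triv);
  for (i = 0; i < N; i++)
    if (y_sse[i] != y_triv[i]) {
      printf("mismatch at i = %d: %g vs %g\n", i, y_sse[i], y_triv[i]);
      break;
    }
}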