

#include <ia32intrin.h>
#include <xmmintrin.h>
#include <stdio.h>
#include <math.h>
#include "rdtsc.h"





/*
 * Baseline (hand-written scalar) matrix-vector product y = M * x.
 *
 * M is indexed as a row-major N x N matrix (element (r, c) at M[r*N + c]);
 * x is read and y is overwritten as vectors of N floats. N is a macro
 * defined outside this function.
 *
 * The iteration space is tiled into 4x4 blocks, so N must be a multiple
 * of 4; otherwise the loops index past the end of M, x and y.
 *
 * `static` is required here: in C99 and later a plain `inline` definition
 * does not provide an external definition, so any call the compiler decides
 * not to inline (e.g. when optimisation is off) would fail to link.
 */
static inline void MVM_NxN_novec(float * M, float * x, float * y)
{
	for (int i = 0; i < N; i=i+4)
	{
		// clear the four outputs owned by this block row before accumulating
		y[i] = 0;
		y[i+1] = 0;
		y[i+2] = 0;
		y[i+3] = 0;

		for (int j = 0; j < N; j = j+4)
		{
			// one 4x4 block: rows i..i+3, columns j..j+3
			for (int k = i; k < i+4; k++)
			{
				for (int l = j; l < j+4; l++)
				{
					y[k] += M[k*N+l] * x[l];
				}
			}
		}
	}
}

/*
 * Matrix-vector product y = M * x with scalar replacement.
 *
 * Same 4x4 blocking as the baseline, but the four products of one block row
 * are summed in a local scalar first and added to y[k] once, instead of
 * updating y[k] in memory for every product.
 *
 * M is indexed as a row-major N x N matrix (M[r*N + c]); x is read and y is
 * overwritten as vectors of N floats. N is a macro defined outside this
 * function and must be a multiple of 4, otherwise the loops index past the
 * end of M, x and y.
 *
 * `static` is required here: in C99 and later a plain `inline` definition
 * does not provide an external definition, so any call the compiler decides
 * not to inline (e.g. when optimisation is off) would fail to link.
 */
static inline void MVM_NxN_scalar_replacement(float *M, float *x, float *y)
{
	for (int i = 0; i < N; i=i+4)
	{
		// clear the four outputs owned by this block row before accumulating
		y[i] = 0;
		y[i+1] = 0;
		y[i+2] = 0;
		y[i+3] = 0;

		for (int j = 0; j < N; j = j+4)
		{
			// one 4x4 block: rows i..i+3, columns j..j+3
			for (int k = i; k < i+4; k++)
			{
				// partial dot product of row k with x[j..j+3]
				float t1 = 0;
				for (int l = j; l < j+4; l++)
				{
					t1 += M[k*N+l] * x[l];
				}
				y[k] += t1;
			}
		}
	}
}

/*
 * Matrix-vector product y = M * x with the inner 4-wide loop unrolled.
 *
 * For every 4x4 block the sixteen products M[r][c] * x[c] are stored in the
 * temporary Mt, then each row of Mt is summed into the matching y element.
 *
 * M is indexed as a row-major N x N matrix (M[r*N + c]); x is read and y is
 * overwritten as vectors of N floats. N is a macro defined outside this
 * function and must be a multiple of 4, otherwise the loops index past the
 * end of M, x and y.
 *
 * `static` is required here: in C99 and later a plain `inline` definition
 * does not provide an external definition, so any call the compiler decides
 * not to inline (e.g. when optimisation is off) would fail to link.
 */
static inline void MVM_NxN_partial_unroll(float *M, float *x, float *y)
{
	float Mt[4][4];
	for (int i = 0; i < N; i=i+4)
	{
		// clear the four outputs owned by this block row before accumulating
		y[i] = 0;
		y[i+1] = 0;
		y[i+2] = 0;
		y[i+3] = 0;
		for (int j = 0; j < N; j = j+4)
		{
			//the 4x4 blocking
			for (int k = 0; k < 4; k++)
			{
				// offset of element (i+k, j) in the row-major matrix
				int index = (k+i)*N+j;
				// each column c of the block pairs with x[j+c]
				Mt[k][0] = M[index] * x[j];
				Mt[k][1] = M[index+1] * x[j+1];
				Mt[k][2] = M[index+2] * x[j+2];
				Mt[k][3] = M[index+3] * x[j+3];
			}
			y[i+0] += Mt[0][0] + Mt[0][1] + Mt[0][2] + Mt[0][3];
			y[i+1] += Mt[1][0] + Mt[1][1] + Mt[1][2] + Mt[1][3];
			y[i+2] += Mt[2][0] + Mt[2][1] + Mt[2][2] + Mt[2][3];
			y[i+3] += Mt[3][0] + Mt[3][1] + Mt[3][2] + Mt[3][3];
		}
	}
}



/*
 * SSE-vectorized matrix-vector product y = M * x.
 *
 * Uses the same 4x4 blocking as the scalar variants: for each group of four
 * rows, one 4-lane accumulator per row collects the element-wise products of
 * that row with x. After the whole row group has been traversed, the four
 * accumulators are transposed so that a plain vector add performs the four
 * horizontal sums at once, and the result is stored to y[i..i+3].
 *
 * M is indexed as a row-major N x N matrix (M[r*N + c]); x is read and y is
 * overwritten as vectors of N floats. N is a macro defined outside this
 * function and must be a multiple of 4, otherwise the loads and stores run
 * past the end of M, x and y.
 *
 * Unaligned loads/stores are used so the function does not depend on the
 * caller's buffer alignment.
 *
 * NOTE: the summation order differs from the scalar variants (per-lane
 * partial sums first, then across lanes), so for general inputs the result
 * may differ from them in the last bits.
 *
 * `static` is required here: in C99 and later a plain `inline` definition
 * does not provide an external definition, so any call the compiler decides
 * not to inline (e.g. when optimisation is off) would fail to link.
 */
static inline void MVM_NxN_vec(float * M, float * x, float * y)
{
	for (int i = 0; i < N; i=i+4)
	{
		// start of the four matrix rows handled in this iteration
		const float *row0 = M + (i+0)*N;
		const float *row1 = M + (i+1)*N;
		const float *row2 = M + (i+2)*N;
		const float *row3 = M + (i+3)*N;

		// one vector of four partial sums per output row
		__m128 acc0 = _mm_setzero_ps();
		__m128 acc1 = _mm_setzero_ps();
		__m128 acc2 = _mm_setzero_ps();
		__m128 acc3 = _mm_setzero_ps();

		for (int j = 0; j < N; j = j+4)
		{
			//the 4x4 blocking: x[j..j+3] is shared by all four rows
			__m128 xv = _mm_loadu_ps(x + j);

			acc0 = _mm_add_ps(acc0, _mm_mul_ps(_mm_loadu_ps(row0 + j), xv));
			acc1 = _mm_add_ps(acc1, _mm_mul_ps(_mm_loadu_ps(row1 + j), xv));
			acc2 = _mm_add_ps(acc2, _mm_mul_ps(_mm_loadu_ps(row2 + j), xv));
			acc3 = _mm_add_ps(acc3, _mm_mul_ps(_mm_loadu_ps(row3 + j), xv));
		}

		// after the transpose, lane r of every register belongs to row i+r,
		// so adding the four registers yields the four dot products
		_MM_TRANSPOSE4_PS(acc0, acc1, acc2, acc3);
		_mm_storeu_ps(y + i, _mm_add_ps(_mm_add_ps(acc0, acc1),
		                                _mm_add_ps(acc2, acc3)));
	}
}

/*
 * Benchmark driver.
 *
 * Allocates a 16-byte aligned N x N matrix and three N-element vectors,
 * fills M with 2 and x with 3, then times the four MVM variants. Each
 * variant is called once as warm-up and then NUM_RUNS times between two
 * RDTSC reads; the average cycle count and the speedup relative to the
 * non-vectorized baseline are printed. Finally the vectorized result is
 * compared element-wise with the last scalar result.
 *
 * Returns 0 on success, 1 if an allocation fails or the results differ.
 */
int main(void)
{
	float * M = NULL;
	float * x = NULL;
	float * y = NULL;
	float * y_vec = NULL;
	tsc_counter a, b;
	double cycles, baseline;
	int status = 0;

	//N is a define
	M = (float*)_mm_malloc(sizeof(float)*N*N,16);
	x = (float*)_mm_malloc(sizeof(float)*N,16);
	y = (float*)_mm_malloc(sizeof(float)*N,16);
	y_vec = (float*)_mm_malloc(sizeof(float)*N,16);

	if (M == NULL || x == NULL || y == NULL || y_vec == NULL)
	{
		// release whatever was allocated instead of returning directly
		status = 1;
		goto cleanup;
	}

	//init vars
	for (int i = 0; i < N; i++)
		for (int j = 0; j < N; j++)
			M[i*N+j] = 2;
	for (int i = 0; i<N; i++){
			x[i] = 3;
	}

	//-----------------------------------------Timing 1
	//warm up

	//-----------------No Vec Warmup-------------------
	//lookup task b from the HW here
	//
	//State the Asm. Instructions used by the compiler as a comment here:
	//
	// <--- List of Instructions --->
	//
	MVM_NxN_novec(M,x,y);
	//-----------------No Vec Warmup end --------------

	// two throw-away CPUID/RDTSC rounds before the measured region
	CPUID(); RDTSC(a); CPUID(); RDTSC(b);
	CPUID(); RDTSC(a); CPUID(); RDTSC(b);

	RDTSC(a);
	for(int i=0; i<NUM_RUNS; ++i)
	{
		MVM_NxN_novec(M,x,y);
	}
	RDTSC(b);
	cycles = ((double)COUNTER_DIFF(b, a)) / ((double) NUM_RUNS);
	baseline = cycles;
	printf("%lf cycles -> Not vectorized code - %1.0f\n",cycles,y[0]); //print the result so that the compiler doesn't eliminate the computation as dead code

	//-----------------------------------------Timing 2
	//warm up
	MVM_NxN_scalar_replacement(M,x,y);

	CPUID(); RDTSC(a); CPUID(); RDTSC(b);
	CPUID(); RDTSC(a); CPUID(); RDTSC(b);

	RDTSC(a);
	for(int i=0; i<NUM_RUNS; ++i)
	{
		MVM_NxN_scalar_replacement(M,x,y);
	}
	RDTSC(b);
	cycles = ((double)COUNTER_DIFF(b, a)) / ((double) NUM_RUNS);
	printf("%lf cycles -> Speedup: %2.2f x through Scalar replacement code - %1.0f\n",cycles,baseline/cycles,y[0]);

	//-----------------------------------------Timing 3
	//warm up
	MVM_NxN_partial_unroll(M,x,y);

	CPUID(); RDTSC(a); CPUID(); RDTSC(b);
	CPUID(); RDTSC(a); CPUID(); RDTSC(b);

	RDTSC(a);
	for(int i=0; i<NUM_RUNS; ++i)
	{
		MVM_NxN_partial_unroll(M,x,y);
	}
	RDTSC(b);
	cycles = ((double)COUNTER_DIFF(b, a)) / ((double) NUM_RUNS);
	printf("%lf cycles -> Speedup: %2.2f x through partially unrolled code - %1.0f\n",cycles,baseline/cycles,y[0]);

	//-----------------------------------------Timing 4
	//warm up
	MVM_NxN_vec(M,x,y_vec);

	CPUID(); RDTSC(a); CPUID(); RDTSC(b);
	CPUID(); RDTSC(a); CPUID(); RDTSC(b);

	RDTSC(a);
	for(int i=0; i<NUM_RUNS; ++i)
	{
		MVM_NxN_vec(M,x,y_vec);
	}
	RDTSC(b);
	cycles = ((double)COUNTER_DIFF(b, a)) / ((double) NUM_RUNS);
	printf("%lf cycles -> Speedup: %2.2f x through vectorized code - %1.0f\n",cycles,baseline/cycles,y_vec[0]);

	//Check if the Code is correct
	// y still holds the result of the last scalar variant run above.
	// fabsf is used because the integer abs() would truncate the float
	// difference toward zero and accept any error smaller than 1.
	for (int i = 0; i< N; i++)
	{
		if (fabsf(y_vec[i]-y[i])>10e-8)
		{
			printf("Your vectorized MVM seems to be wrong!!!");
			status = 1;
			break;
		}
	}

cleanup:
	// guarded because the behaviour of _mm_free on a null pointer is not
	// established anywhere in this file
	if (M != NULL)
	{
		_mm_free(M);
	}
	if (y != NULL)
	{
		_mm_free(y);
	}
	if (y_vec != NULL)
	{
		_mm_free(y_vec);
	}
	if (x != NULL)
	{
		_mm_free(x);
	}

	return status;
}

