#pragma once
#include <hls_math.h>
#include <ap_int.h>
#include <ap_fixed.h>
//#include "matrix_multiply.h"


template <unsigned num_elements, unsigned exp_t_width, unsigned W,unsigned I>
class bfp_fpga {
private:
public:
	// Floor of log2(n), usable in constant expressions.
	// Both n == 0 and n == 1 yield 0.
	static constexpr unsigned custom_log2(unsigned n) {
		unsigned shifts = 0;
		// Count how many times n can be halved before it reaches zero,
		// not counting the final halving of the leading bit.
		for (unsigned rest = n >> 1; rest != 0; rest >>= 1) {
			++shifts;
		}
		return shifts;
	}

	// floor(log2(num_elements)) + 1. This many extra bits are added to both the
	// total width and the integer width of the wide intermediate type
	// ap_fixed<2*W+I_test_value_dot, 2*I+I_test_value_dot> used by the routines below.
	static constexpr unsigned I_test_value_dot = custom_log2(num_elements)+1;
	// Fixed-point type of a single block element: W total bits, I integer bits.
	using mantissa_t = ap_fixed<W,I>;
	// Signed integer type of the exponent shared by a block.
	using exp_t = ap_int<exp_t_width>;
	// Signed integer type of a headroom (shift count); same width as exp_t.
	using headroom_t = ap_int<exp_t_width>;



// Block-floating-point vector: num_elements mantissas that share one exponent.
// to_double_array_1() decodes element i as data[i] * 2^exp.
struct bfp_t {
        mantissa_t data[num_elements]; // mantissas; not initialised by the default constructor
        exp_t exp;                     // exponent shared by every element
        headroom_t hr;                 // headroom last stored by bfp_headroom()/renomalize()
        unsigned length;               // set to num_elements by the routines that fill a block

        // Creates a block with exp, hr and length zeroed; data is left untouched.
        bfp_t() : exp(0), hr(0), length(0) {}

        // Copy constructor: element-wise copy of the mantissas plus the scalar fields.
        bfp_t(const bfp_t& other)
            : exp(other.exp), hr(other.hr), length(other.length) {
            for (unsigned i = 0; i < num_elements; ++i) {
                data[i] = other.data[i];
            }
        }

        // Declared explicitly because relying on the implicitly generated copy
        // assignment is deprecated once a copy constructor is user-provided.
        // The defaulted version performs the same member-wise copy.
        bfp_t& operator=(const bfp_t& other) = default;

        // Mutable element access, so block[i] can appear on the left-hand side.
        mantissa_t& operator[](unsigned index) {
            return data[index];
        }

        // Read-only element access.
        const mantissa_t& operator[](unsigned index) const {
            return data[index];
        }
    };

   // Scalar result of the dot-product routines: a single mantissa together
   // with its own exponent.
   struct bfp_dot_product_t {
       mantissa_t data;  // mantissa of the result
       exp_t exp;        // exponent of the result

       // Both fields start at zero.
       bfp_dot_product_t()
           : data(0),
             exp(0) {}
   };

   // Loads num_elements integers into a.data (converted to mantissa_t), then
   // stores the resulting headroom in a.hr through bfp_headroom() so that the
   // shared exponent can be derived from it later.
   // a.exp and a.length are not modified here.
   static void receive_int(bfp_t& a,const int input_data[num_elements])
   {
       for (unsigned idx = 0; idx != num_elements; ++idx) {
           a.data[idx] = input_data[idx];
       }
       bfp_headroom(a);
   }

   // Loads the first num_elements doubles of input_data into a.data (converted
   // to mantissa_t), then stores the resulting headroom in a.hr through
   // bfp_headroom(). a.exp and a.length are not modified here.
   static void receive_double(bfp_t& a,const double input_data[])
   {
       for (unsigned idx = 0; idx != num_elements; ++idx) {
           a.data[idx] = input_data[idx];
       }
       bfp_headroom(a);
   }

   // Normalises b into a: each mantissa of b is shifted left by b.hr and the
   // shared exponent of a is set to -b.hr. The intent is that the largest
   // element ends up with its top magnitude bit directly behind the sign bit.
   // a.hr is not written.
   // NOTE(review): b.hr normally comes from bfp_headroom(), which measures
   // headroom against the wide intermediate type rather than mantissa_t --
   // confirm that the shift cannot push significant bits out of mantissa_t.
   static void bfp_init( bfp_t& a,const bfp_t& b)
   {
       a.exp = -b.hr;
       a.length = num_elements;
       for (unsigned idx = 0; idx != num_elements; ++idx) {
           a.data[idx] = b[idx] << b.hr;
       }
   }

   // Computes the headroom of a's mantissas with headroom_calc() in mode 0,
   // stores it in a.hr and also returns it.
   // NOTE(review): mode 0 applies no offset, so the value is relative to the
   // integer width of the wide type below, not of mantissa_t -- confirm this
   // is what callers that shift mantissas by a.hr expect.
   static headroom_t bfp_headroom(bfp_t& a)
    {
       using wide_t = ap_fixed<2*W+I_test_value_dot,2*I+I_test_value_dot>;

       // headroom_calc() works on the wide type, so widen a copy first.
       wide_t widened[num_elements];
       for (unsigned idx = 0; idx != num_elements; ++idx) {
           widened[idx] = a.data[idx];
       }

       const headroom_t measured = headroom_calc(0, widened);
       a.hr = measured;
       return measured;
    }
   // Leading-zero style count done as a five-step binary search with the
   // 32-bit masks below. Returns 0 when x == 0. Not called by any other member
   // of this class.
   // NOTE(review): the masks are 32-bit integer constants applied to a wide
   // fixed-point value; this looks ported from a 32-bit unsigned routine --
   // confirm it behaves as intended for this type and width.
   // NOTE(review): a conventional clz returns the bit width for a zero input,
   // whereas this returns 0 -- confirm which is wanted.
   static unsigned clz(ap_fixed<2*W+I_test_value_dot,2*I+I_test_value_dot> x) {
       if (x == 0) return 0;
       unsigned int n = 0;
       // Each step tests the upper part of the remaining window; when it is
       // empty the count grows and x is shifted up to expose the lower part.
       if ((x & 0xFFFF0000) == 0) { n += 16; x <<= 16; }
       if ((x & 0xFF000000) == 0) { n += 8; x <<= 8; }
       if ((x & 0xF0000000) == 0) { n += 4; x <<= 4; }
       if ((x & 0xC0000000) == 0) { n += 2; x <<= 2; }
       if ((x & 0x80000000) == 0) { n += 1; }
       return n;
   }
   // Derives a headroom (shift count) from the magnitude of the input.
   //
   // mode_num selects both the input range and the offset subtracted at the end:
   //   0 : largest |v[k]| over num_elements entries, no offset
   //   1 : largest |v[k]| over num_elements entries, offset I + I_test_value_dot
   //   2 : largest |v[k]| over num_elements entries, offset I + I_test_value_dot - 1
   //   3 : |v[0]| only,                              offset I_test_value_dot
   // Any other mode_num behaves like mode 0.
   //
   // Before the offset, the count is relative to the 2*I + I_test_value_dot
   // integer bits of the wide type: 2*I + I_test_value_dot - 1 for a zero input,
   // fewer as the magnitude needs more integer bits, more as the magnitude
   // drops below 1.
   static headroom_t headroom_calc(const int mode_num,const ap_fixed<2*W+I_test_value_dot,2*I+I_test_value_dot> v[]){
	   ap_fixed<2*W+I_test_value_dot,2*I+I_test_value_dot> largest=0;
	   ap_fixed<2*W+I_test_value_dot,2*I+I_test_value_dot> abs_val=0;

	   if(mode_num!=3){
		   // Track the largest absolute value of the whole block.
		   for(int k = 0; k < num_elements; k++){
			   abs_val = (v[k] < 0) ? ap_fixed<2*W+I_test_value_dot,2*I+I_test_value_dot>(-v[k]) : v[k];
			   if(abs_val > largest){
	   		   largest = abs_val;
			   }
	   	   }
	   }
	   else{
	   // Mode 3 inspects a single value (the dot-product accumulator).
	   // NOTE(review): if -v[0] is not representable and wraps to a negative
	   // value, the `largest < 1` loop below would shift it down to zero and
	   // never terminate -- confirm this input cannot occur.
	   largest = (v[0] < 0) ? ap_fixed<2*W+I_test_value_dot,2*I+I_test_value_dot>(-v[0]) : v[0];
	   }
	   unsigned headroom = 0;
	   unsigned headroom_result = 0;
		if (largest == 0) {
			// Nothing but zeros: every integer bit except the sign is free.
			headroom_result =2*I+I_test_value_dot-1;
		 }
		 else if(largest>=1){
			 // Count the right shifts needed to bring the magnitude below 1,
			 // i.e. the number of integer bits it occupies.
			 while (largest >= 1)
			 {
				 largest >>= 1;
				 headroom++;
			 }
			 headroom_result =2*I-headroom-1+I_test_value_dot;
		 }
		 else{
			 // Magnitude below 1: count the left shifts needed to reach 1 or more.
			 while (largest < 1 )
			 {
				 largest <<= 1;
				 headroom++;
			 }
			 headroom_result =2*I+headroom-2+I_test_value_dot;
		 }

	      // The subtraction is done in unsigned arithmetic and then converted to
	      // headroom_t. NOTE(review): a count smaller than the offset wraps
	      // around before the conversion; this relies on the conversion to the
	      // signed headroom_t yielding the intended negative value -- confirm.
	      return headroom_result - (mode_num == 1 ? I + I_test_value_dot : (mode_num == 2 ? I + I_test_value_dot - 1 : (mode_num == 3 ? I_test_value_dot : 0)));
	  }
   // Reports by how many bits the headroom of interm (as computed by
   // headroom_calc() for the same mode_num) falls short of a mode-specific
   // threshold: 0 for mode 1, I for mode 2, I + I_test_value_dot for any other
   // mode. Returns 0 when the headroom is at or above the threshold.
   static exp_t check_overflow(const int mode_num,const ap_fixed<2*W+I_test_value_dot,2*I+I_test_value_dot> interm[]){
       const headroom_t available = headroom_calc(mode_num, interm);

       exp_t threshold;
       switch (mode_num) {
       case 1:
           threshold = 0;
           break;
       case 2:
           threshold = I;
           break;
       default:
           threshold = I + I_test_value_dot;
           break;
       }

       exp_t shortfall;
       if (available > threshold) {
           shortfall = 0;
       } else {
           shortfall = threshold - available;
       }
       return shortfall;
    }


   // Writes b's mantissas into a, each shifted by c bits: to the right when c
   // is zero or positive, to the left by -c when c is negative. Used by add and
   // subtract to bring both operands to a common exponent.
   // Sets a.length; a.exp and a.hr are left as they were. Returns a copy of a.
   static bfp_t denomalize(bfp_t& a,const bfp_t& b,exp_t c){
       // The shift direction is the same for every element, so decide it once.
       if (c < 0) {
           for (unsigned k = 0; k != num_elements; ++k) {
               a.data[k] = b.data[k] << (-c);
           }
       } else {
           for (unsigned k = 0; k != num_elements; ++k) {
               a.data[k] = b.data[k] >> c;
           }
       }
       a.length = num_elements;
       return a;
	   }

   // Re-normalises a in place: measures the headroom of its mantissas with
   // headroom_calc() in mode 0, shifts every mantissa left by that amount,
   // stores the amount in a.hr and sets a.exp to its negation.
   // Also sets a.length and returns a copy of a.
   // NOTE(review): mode 0 measures headroom against the wide type below, not
   // against mantissa_t -- confirm the left shift cannot drop significant bits.
   static bfp_t renomalize(bfp_t& a){
       using wide_t = ap_fixed<2*W+I_test_value_dot,2*I+I_test_value_dot>;

       wide_t widened[num_elements];
       for (unsigned k = 0; k != num_elements; ++k) {
           widened[k] = a.data[k];
       }

       const headroom_t shift = headroom_calc(0, widened);
       a.hr = shift;
       for (unsigned k = 0; k != num_elements; ++k) {
           a.data[k] <<= shift;
       }
       a.exp = -shift;
       a.length = num_elements;
       return a;
   }

   // Scalar counterpart of renomalize(): measures the headroom of a.data with
   // headroom_calc() in mode 3, shifts a.data left by it and sets a.exp to the
   // negated shift. Returns a copy of a.
   static bfp_dot_product_t renomalize_dot_product(bfp_dot_product_t& a){
       using wide_t = ap_fixed<2*W+I_test_value_dot,2*I+I_test_value_dot>;

       // headroom_calc() takes an array, so wrap the single value.
       wide_t widened[1];
       widened[0] = a.data;

       const headroom_t shift = headroom_calc(3, widened);
       a.data <<= shift;
       a.exp = -shift;
       return a;
   }

   // Element-wise sum of two blocks.
   //
   // Steps:
   //   1. Pick a common exponent interm_exp: max(a.exp, b.exp) + 1 when the
   //      exponents differ, 0 when they are equal.
   //   2. Shift both operands to that exponent with denomalize().
   //   3. Add the mantissas into a wide intermediate array.
   //   4. Ask check_overflow() (mode 1) how many bits the sums exceed mantissa_t.
   //      - none: copy the sums, renormalise them and add the renormalised
   //        exponent to interm_exp;
   //      - otherwise: shift the sums right by that many bits and raise the
   //        exponent by the same amount.
   //
   // The returned block has length == num_elements; its hr is left at 0.
   static bfp_t add(const bfp_t& a, const bfp_t& b) {
       unsigned i;
       exp_t exp_val,interm_exp,a_shr,b_shr;
       bfp_t c,d,result,result_temp;
       ap_fixed<2*W+I_test_value_dot,2*I+I_test_value_dot> interm[num_elements];

       if(a.exp<b.exp){
           interm_exp=b.exp+1;
       }
       else if(a.exp>b.exp){
           interm_exp=a.exp+1;
       }
       else{
           // NOTE(review): with equal exponents both operands are shifted all the
           // way to exponent 0 rather than to a.exp + 1. For a positive shared
           // exponent that is a left shift inside mantissa_t -- confirm intended.
           interm_exp=0;
       }

       // Right-shift amounts (negative means left shift) that align each operand.
       a_shr=interm_exp-a.exp;
       b_shr=interm_exp-b.exp;
       denomalize(c,a,a_shr);
       denomalize(d,b,b_shr);

       for(i = 0; i < num_elements; ++i) {
           interm[i] = c.data[i] + d.data[i];
       }

       exp_val=check_overflow(1,interm);
       if(exp_val==0){
           // The sums fit: renormalise so the largest one sits directly behind
           // the sign bit, then combine the exponents.
           for(i = 0; i < num_elements; ++i) {
               result_temp.data[i] =interm[i];
           }
           renomalize(result_temp);
           result.exp=result_temp.exp+interm_exp;
           for(i = 0; i < num_elements; ++i) {
               result.data[i] =result_temp.data[i];
           }
       }
       else{
           // The sums do not fit: drop exp_val low bits. The exponent has to
           // grow by exactly the number of bits shifted out. That is usually 1
           // for an addition, but it is 2 when every operand is the most
           // negative mantissa value.
           for(i = 0; i < num_elements; ++i) {
               result.data[i] =interm[i]>>exp_val;
           }
           result.exp=interm_exp+exp_val;
       }
       result.length = num_elements;
       return result;
   }

   // a + b on blocks; forwards to add().
   friend bfp_t operator+(const bfp_t& a, const bfp_t& b) {
	   return add(a, b);
   }

   // Element-wise difference a - b of two blocks.
   //
   // Steps:
   //   1. Pick a common exponent interm_exp: max(a.exp, b.exp) + 1 when the
   //      exponents differ, 0 when they are equal.
   //   2. Shift both operands to that exponent with denomalize().
   //   3. Subtract the mantissas into a wide intermediate array.
   //   4. Ask check_overflow() (mode 1) how many bits the differences exceed
   //      mantissa_t.
   //      - none: copy the differences, renormalise them and add the
   //        renormalised exponent to interm_exp;
   //      - otherwise: shift the differences right by that many bits and raise
   //        the exponent by the same amount.
   //
   // The returned block has length == num_elements; its hr is left at 0.
   static bfp_t subtract(const bfp_t& a, const bfp_t& b) {
       unsigned i;
       exp_t exp_val,interm_exp,a_shr,b_shr;
       bfp_t c,d,result,result_temp;
       ap_fixed<2*W+I_test_value_dot,2*I+I_test_value_dot> interm[num_elements];

       if(a.exp<b.exp){
           interm_exp=b.exp+1;
       }
       else if(a.exp>b.exp){
           interm_exp=a.exp+1;
       }
       else{
           // NOTE(review): with equal exponents both operands are shifted all the
           // way to exponent 0 rather than to a.exp + 1. For a positive shared
           // exponent that is a left shift inside mantissa_t -- confirm intended.
           interm_exp=0;
       }

       // Right-shift amounts (negative means left shift) that align each operand.
       a_shr=interm_exp-a.exp;
       b_shr=interm_exp-b.exp;
       denomalize(c,a,a_shr);
       denomalize(d,b,b_shr);

       for(i = 0; i < num_elements; ++i) {
           interm[i] = c.data[i] - d.data[i];
       }

       exp_val=check_overflow(1,interm);
       if(exp_val==0){
           // The differences fit: renormalise so the largest one sits directly
           // behind the sign bit, then combine the exponents.
           for(i = 0; i < num_elements; ++i) {
               result_temp.data[i] =interm[i];
           }
           renomalize(result_temp);
           result.exp=result_temp.exp+interm_exp;
           for(i = 0; i < num_elements; ++i) {
               result.data[i] =result_temp.data[i];
           }
       }
       else{
           // The differences do not fit: drop exp_val low bits. The exponent has
           // to grow by exactly the number of bits shifted out, whatever that
           // number is.
           for(i = 0; i < num_elements; ++i) {
               result.data[i] =interm[i]>>exp_val;
           }
           result.exp=interm_exp+exp_val;
       }
       result.length = num_elements;
       return result;
   }

   // a - b on blocks; forwards to subtract().
   friend bfp_t operator-(const bfp_t& a, const bfp_t& b) {
 	  return subtract(a, b);
   }

   // Element-wise product of two blocks.
   //
   // The mantissas are multiplied into a wide intermediate array and the
   // operand exponents are added. check_overflow() (mode 2) then reports how
   // many bits the products exceed the target:
   //   - none: the products are stored and renormalised with renomalize(), and
   //     the renormalised exponent is added to the summed exponent;
   //   - otherwise: the products are shifted right by that many bits and the
   //     exponent is raised by the same amount.
   //
   // The returned block has length == num_elements.
   static bfp_t multiply(const bfp_t& a, const bfp_t& b) {
       unsigned i;
       exp_t exp_val,interm_exp;
       bfp_t result;
       ap_fixed<2*W+I_test_value_dot,2*I+I_test_value_dot> interm[num_elements];

       interm_exp=a.exp+b.exp;
       for (i = 0; i < num_elements; ++i) {
           interm[i] = a.data[i] * b.data[i];
       }

       exp_val=check_overflow(2,interm);
       if(exp_val==0){
           for(i = 0; i < num_elements; ++i) {
               result.data[i] =interm[i];
           }
           // renomalize() sets result.exp to the negated shift it applied;
           // the operand exponents are added on top of that.
           renomalize(result);
           result.exp=result.exp+interm_exp;
       }
       else{
           for(i = 0; i < num_elements; ++i) {
               result.data[i] =interm[i]>>exp_val;
           }
           result.exp=interm_exp+exp_val;
       }
       result.length = num_elements;
       return result;
     }
   // a * b on blocks (element-wise); forwards to multiply().
   friend bfp_t operator*(const bfp_t& a, const bfp_t& b) {
         return multiply(a, b);
     }

   // Dot product of two blocks.
   //
   // All num_elements mantissa products are accumulated in one wide value and
   // the operand exponents are added. check_overflow() (mode 3) then reports
   // how many bits the sum exceeds the target:
   //   - none: the sum is stored and renormalised with
   //     renomalize_dot_product(), and the renormalised exponent is added to
   //     the summed exponent;
   //   - otherwise: the sum is shifted right by that many bits and the exponent
   //     is raised by the same amount.
   static bfp_dot_product_t dot_product(const bfp_t& a, const bfp_t& b) {
       unsigned i;
       bfp_dot_product_t result;
       exp_t exp_val=0,interm_exp;

       // Single-element array because check_overflow() takes an array.
       ap_fixed<2*W+I_test_value_dot,2*I+I_test_value_dot> interm[1];
       // A default-constructed ap_fixed holds no defined value, so the
       // accumulator must be cleared before the first +=.
       interm[0] = 0;

       interm_exp=a.exp+b.exp;
       for (i = 0; i < num_elements; ++i) {
           interm[0] += a.data[i] * b.data[i];
       }

       exp_val=check_overflow(3,interm);
       if(exp_val==0){
           result.data =interm[0];
           renomalize_dot_product(result);
           result.exp=result.exp+interm_exp;
       }
       else{
           result.data =interm[0]>>exp_val;
           result.exp=interm_exp+exp_val;
       }
       return result;
	   }
   // Dot product over a window of two raw mantissa arrays.
   //
   // Multiplies data1[start1 + k] by data2[start2 + k] for k in
   // [0, MATRIX_SIZE) and accumulates the products in one wide value. exp1 and
   // exp2 are the exponents of the two arrays and are added. The overflow and
   // renormalisation handling is the same as in dot_product().
   //
   // A MATRIX_SIZE of zero or less performs no accumulation. The indices are
   // not range-checked: the caller must keep start1 + MATRIX_SIZE and
   // start2 + MATRIX_SIZE within the arrays it passes.
   static bfp_dot_product_t dot_product_m(const mantissa_t data1[num_elements],const mantissa_t data2[num_elements],
		   const exp_t exp1, const exp_t exp2, int start1, int start2,int MATRIX_SIZE) {
       bfp_dot_product_t result;
       exp_t exp_val=0,interm_exp;

       // Single-element array because check_overflow() takes an array.
       ap_fixed<2*W+I_test_value_dot,2*I+I_test_value_dot> interm[1];
       // A default-constructed ap_fixed holds no defined value, so the
       // accumulator must be cleared before the first +=.
       interm[0] = 0;

       interm_exp=exp1+exp2;
       // Signed index: comparing an unsigned index against a negative
       // MATRIX_SIZE would turn the bound into a huge positive number.
       for (int i = 0; i < MATRIX_SIZE; ++i) {
           interm[0] += data1[start1 + i] * data2[start2 + i];
       }

       exp_val=check_overflow(3,interm);
       if(exp_val==0){
           result.data =interm[0];
           renomalize_dot_product(result);
           result.exp=result.exp+interm_exp;
       }
       else{
           result.data =interm[0]>>exp_val;
           result.exp=interm_exp+exp_val;
       }
       return result;
	   }

   // Decodes one mantissa/exponent pair into a double:
   // result_array = data * 2^exp.
   void to_double_array(const mantissa_t data,const exp_t exp, double& result_array) {
       const double mantissa_value = data.to_double();
       const double scale = pow(2, exp.to_double());
       result_array = mantissa_value * scale;
       }
   // Decodes a whole block into doubles:
   // result_array[i] = a.data[i] * 2^a.exp for each of the num_elements entries.
   void to_double_array_1(const bfp_t& a, double result_array[num_elements]) {
       // The exponent is shared, so the scale factor is the same for every element.
       const double scale = pow(2, a.exp.to_double());
       for (unsigned k = 0; k != num_elements; ++k) {
           result_array[k] = a.data[k].to_double() * scale;
       }
       }
};


