/*
 * pulp_gmm.c
 * Manuele Rusci - manuele.rusci@unibo.it
 * Tommaso Polonelli - 
 *
 * Copyright (C) 2020 University of Bologna, Greenwaves Technologies
 *
 * This software may be modified and distributed under the terms
 * of the MIT license.  See the LICENSE file for details.
 *
 * Created on: September 24, 2020
 *
 */
#include "pmsis.h"
#include "pulp_gmm.h"
#include "gmm.h"

// Non-zero compiles in the `#if PERF` sections of the GMMSubtract entry
// points, which configure the performance counters and print the measured
// active cycles (total and per pixel). Set to 0 to compile them out.
#define PERF 1

/*
 * Number of work items each core of the current team should take out of X.
 *
 * The result is X shifted right by __builtin_pulp_fl1(team size), plus one
 * whenever X has bits left under the (team size - 1) mask, i.e. the share is
 * rounded up so that the whole range is covered.
 * NOTE(review): this equals ceil(X / NCore) only if the team size is a power
 * of two and fl1() returns its log2 -- confirm against the toolchain docs.
 */
inline static unsigned int __attribute__((always_inline)) ChunkSize(unsigned int X)
{
    const unsigned int cores = pi_cl_team_nb_cores();
    const unsigned int shift = __builtin_pulp_fl1(cores);
    unsigned int share = X >> shift;

    // round up when X is not an exact multiple of the core count
    if (X & (cores - 1)) {
        share++;
    }
    return share;
}

/*
 * Initialise a GMM background-subtraction operator (fabric-controller side).
 *
 * Allocates a GMM_Settings block in L2, fills it with the image geometry and
 * the fixed-point algorithm parameters, reserves
 * ImgSize * MaxModes * sizeof(GMM) bytes of external RAM for the per-pixel
 * model and clears that region to zero. The settings pointer, the RAM handle
 * and the model address are stored in pulpGMMop. The settings block and the
 * RAM region are not released by this function.
 *
 * Every failure prints a message and calls pmsis_exit():
 *   -2  hystory is not strictly positive
 *   -1  an L2 allocation failed
 *   -4  the external RAM allocation failed
 */
void pulpCV_fc_GMMInit(
	pulp_GMM_Operator * pulpGMMop,		// GMM configuration structure
	int W_IMG, 					// image width
	int H_IMG, 					// image height
	int hystory, 				// GMM history (must be > 0)
	uint8_t Modes, 				// GMM gaussian mode
	struct pi_device *ram 		// ram handler
){
	// temporary variables
	uint32_t hyper_buff;
	uint32_t model_bytes;
	uint8_t *buff;

	// alpha is computed as 1/hystory below: reject a zero or negative
	// history before it turns into a division by zero / bogus learning rate.
	if (hystory <= 0)
	{
		printf("invalid GMM history !\n\r");
		pmsis_exit(-2);
	}

	// settings block, kept alive through pulpGMMop->GMMconf
	GMM_Settings * GMMconf = (GMM_Settings *) pmsis_l2_malloc(sizeof(GMM_Settings));
	if (GMMconf == NULL)
	{
		printf("buff alloc failed !\n\r");
		pmsis_exit(-1);
	}
	pulpGMMop->GMMconf = GMMconf;
	pulpGMMop->ram = ram;


	//Parameter Initialization
	GMMconf->ImgWidth = W_IMG;
	GMMconf->ImgHeight = H_IMG;
	GMMconf->ImgSize = (W_IMG*H_IMG);
	GMMconf->MaxModes = Modes;
	// algorithm parameters
	GMMconf->Threshold = 16;
	GMMconf->variance = INT2FIXEDU(15);
	GMMconf->max_variance = INT2FIXEDU(45);
	GMMconf->min_variance = INT2FIXEDU(4);
	GMMconf->complexity_prior = FLOAT2FIXEDU(0.05f);
	GMMconf->BG_Th = FLOAT2FIXEDU(0.9f);

	// learning rate alpha = 1/hystory, prune = alpha * complexity_prior
	GMMconf->alpha = FIXEDU_DIV(FIXEDU_ONE, INT2FIXEDU(hystory));
	GMMconf->prune = __builtin_pulp_muluRN(GMMconf->alpha, GMMconf->complexity_prior, QBITS, 1<<(QBITS-1));	//operands are positive and < 1 (Q16.16)
	GMMconf->one_minus_alpha = (FIXEDU_ONE - GMMconf->alpha);

	PRINTF("prune=%d\n", GMMconf->prune);
	PRINTF("alphs=%d,one_minus_alpha=%d\n",GMMconf->alpha,GMMconf->one_minus_alpha);

	// Init hyper at cold boot: one GMM entry per pixel and per mode
	model_bytes = (uint32_t) GMMconf->ImgSize*GMMconf->MaxModes*sizeof(GMM);
	if (pi_ram_alloc(ram, &hyper_buff, model_bytes))
	{
		PRINTF("Ram malloc failed !\n\r");
		pmsis_exit(-4);
	}

	PRINTF("Ram allocated : %lx %ld.\n\r", hyper_buff, model_bytes);
	pulpGMMop->GMModel = hyper_buff;

	/* Init model: clear the RAM region using a zero-filled L2 staging buffer */
	buff = (uint8_t *) pmsis_l2_malloc((uint32_t) GMMconf->ImgSize);
	if (buff == NULL)
	{
		PRINTF("buff alloc failed !\n\r");
		pmsis_exit(-1);
	}
	memset(buff,0,GMMconf->ImgSize);
	PRINTF("GMM INIT: ");
	// MaxModes*sizeof(GMM) writes of ImgSize bytes each cover the whole model
	for(int i = 0; i < (int)(((int) GMMconf->MaxModes)*sizeof(GMM)); i++){
		pi_ram_write(ram, hyper_buff, (uint8_t *)buff, (uint32_t) GMMconf->ImgSize);
		hyper_buff += GMMconf->ImgSize;
		PRINTF(".");
	}
	PRINTF("\n\r");
	pmsis_l2_malloc_free(buff, (uint32_t) GMMconf->ImgSize);

	PRINTF("GMM_OK\n\r");
}



/*
 * Run GMM background subtraction over one frame on the fabric controller.
 *
 * The frame is walked in tiles of ImgWidth pixels. For each tile the matching
 * slice of the model is read from external RAM into an L2 staging buffer,
 * SubtractPixel() is applied to every pixel of the tile (its result is stored
 * in output_frame) and the possibly updated model slice is written back.
 * When PERF is enabled the active cycles spent are printed, in total and per
 * pixel. Exits through pmsis_exit(-1) if the staging buffer cannot be
 * allocated.
 */
void pulpCV_fc_GMMSubtract(
	pulp_GMM_Operator * pulpGMMop
){
	// GMM settings and external RAM handle
	GMM_Settings * GMMconf = pulpGMMop->GMMconf;
	struct pi_device *ram = pulpGMMop->ram;
	const uint32_t model_base = pulpGMMop->GMModel;
	const int total_pixels = GMMconf->ImgSize;
	const int modes = GMMconf->MaxModes;

	// TEMP --- FIXME!!! the tile is hard-wired to one image row
	const int tile_pixels = GMMconf->ImgWidth;
	const uint32_t tile_bytes = (uint32_t)(sizeof(GMM)*modes*tile_pixels);

#if PERF
	uint32_t tt;
	pi_perf_conf(1 << PI_PERF_CYCLES | 1 << PI_PERF_ACTIVE_CYCLES);
	pi_perf_start();
	tt = pi_perf_read(PI_PERF_ACTIVE_CYCLES);
#endif

	PRINTF("---------------\n");

	// staging buffer holding the model of one tile
	GMM *tile_model = (GMM *) pi_l2_malloc(tile_bytes);
	if (tile_model == NULL)
	{
		PRINTF("buff alloc failed !\n");
		pmsis_exit(-1);
	}

	// iterate on tiles (image blocks)
	for(int base = 0; base < total_pixels; base += tile_pixels){
		const uint8_t *src = pulpGMMop->input_frame + base;
		uint8_t *dst = pulpGMMop->output_frame + base;
		uint32_t tile_addr = model_base + base*sizeof(GMM)*modes;

		// fetch the tile's model from external RAM
		pi_ram_read(ram, tile_addr, (uint8_t *) tile_model, tile_bytes);

		// classify every pixel of the tile; the output is the binary mask
		for(int k = 0; k < tile_pixels; k++){
			uint8_t px = src[k];
			dst[k] = SubtractPixel(px, k, tile_model, GMMconf);
		}
		PRINTF("\n");

		// push the model of this tile back to external RAM
		pi_ram_write(ram, tile_addr, (uint8_t *) tile_model, tile_bytes);
	}

	pi_l2_free(tile_model, tile_bytes);

#if PERF
	pi_perf_stop();
	printf("GMM Cycles %d \n",(int)(pi_perf_read(PI_PERF_ACTIVE_CYCLES)-tt));
	printf("GMM cpp %d \n",((int)(pi_perf_read(PI_PERF_ACTIVE_CYCLES)-tt))/total_pixels);
#endif
}



/*
 * Per-core worker of the cluster GMM background subtraction.
 *
 * Each row of ImgWidth pixels is split into ChunkSize(ImgWidth)-wide column
 * slices, one per core. For every image row the core reads the model of its
 * slice from external RAM into a private L2 buffer, runs SubtractPixel() on
 * the slice's pixels (storing the result in output_frame) and writes the
 * model slice back.
 *
 * When the width does not fill every core (e.g. fewer columns than cores),
 * the trailing cores get an empty slice: they skip allocation and processing
 * but still join the final team barrier, so the team stays in step.
 *
 * Exits through pmsis_exit(-1) if the L2 buffer cannot be allocated.
 */
void __pulpCV_cl_GMMSubtract(
	pulp_GMM_Operator * pulpGMMop
)
{
	uint32_t CoreId = pi_core_id();

	uint8_t	* input_frame   = pulpGMMop->input_frame;
	uint8_t	* output_frame 	= pulpGMMop->output_frame;
	uint32_t GMModel 		= pulpGMMop->GMModel;
	struct pi_device *ram = pulpGMMop->ram;
	pi_cl_ram_req_t read_req, write_req;
	pi_cl_alloc_req_t cl_alloc_req;
	struct pi_cl_free_req_s cl_free_req;

	// GMM settings
	GMM_Settings * GMMconf = pulpGMMop->GMMconf;
	int ImgHeight = GMMconf->ImgHeight;
	int ImgWidth = GMMconf->ImgWidth;
	int MaxModes = GMMconf->MaxModes;

	// chunking: [First, Last) is this core's column range within a row
	int BLOCK_SIZE = ImgWidth;
	int Chunk  = ChunkSize(BLOCK_SIZE);
	int First  = CoreId*Chunk;
	// Clamp the start as well as the end: without this a core whose start
	// lies past the row end would compute a negative slice width.
	if (First > BLOCK_SIZE)
	{
		First = BLOCK_SIZE;
	}
	int Last   = (First+Chunk > BLOCK_SIZE) ? (BLOCK_SIZE) : (First+Chunk);
	int ParBlocksize = Last - First;

	// cores with an empty slice have nothing to allocate or process
	if (ParBlocksize > 0)
	{
		uint32_t slice_bytes = (uint32_t)(sizeof(GMM)*MaxModes*ParBlocksize);

		// allocate temporary per-core memory
		pi_cl_l2_malloc(slice_bytes, &cl_alloc_req);
		GMM * Gauss = (GMM *) pi_cl_l2_malloc_wait(&cl_alloc_req);

		if (Gauss == NULL)
		{
			PRINTF("buff alloc failed !\n");
			pmsis_exit(-1);
		}

		// iterate on image rows
		for(int i = 0; i < ImgHeight; i++){

			int pIdx = i*ImgWidth+First;
			// read data and parameters to local memory
			uint32_t hyper_p = GMModel + pIdx*sizeof(GMM)*MaxModes;
			pi_cl_ram_read(ram, hyper_p, (uint8_t *) Gauss, slice_bytes, &read_req);
			pi_cl_ram_read_wait(&read_req);

			// compute GMM over the slice's pixels
			for(int j = 0; j < ParBlocksize; j++){
				uint8_t pixel = input_frame[pIdx+j];
				// compute the binary output
				pixel = SubtractPixel(pixel, j, Gauss, GMMconf);
				output_frame[pIdx+j] = pixel;
			}

			PRINTF("\n");

			// store data back to memory
			pi_cl_ram_write(ram, hyper_p, (uint8_t *) Gauss, slice_bytes, &write_req);
			pi_cl_ram_write_wait(&write_req);
		}

		pi_cl_l2_free(Gauss, slice_bytes, &cl_free_req);
		pi_cl_l2_free_wait(&cl_free_req);
	}

	// every core of the team must reach the barrier, idle ones included
	pi_cl_team_barrier();

}

/*
 * Cluster entry point of the GMM background subtraction.
 *
 * Forks __pulpCV_cl_GMMSubtract on pi_cl_cluster_nb_cores() cores, handing
 * each of them the operator descriptor. When PERF is enabled the active
 * cycles measured around the fork are printed, in total and divided by the
 * number of pixels.
 */
void pulpCV_cl_GMMSubtract(
	pulp_GMM_Operator * pulpGMMop
)
{
	GMM_Settings * GMMconf = pulpGMMop->GMMconf;
	int ImgSize = GMMconf->ImgSize;

#if PERF
	pi_perf_conf(1 << PI_PERF_CYCLES | 1 << PI_PERF_ACTIVE_CYCLES);
	pi_perf_start();
	uint32_t tt = pi_perf_read(PI_PERF_ACTIVE_CYCLES);
#endif

	// run the per-core worker on the whole cluster
	pi_cl_team_fork(
		pi_cl_cluster_nb_cores(),
		(void *)__pulpCV_cl_GMMSubtract, (void *) pulpGMMop);

#if PERF
	pi_perf_stop();

	int total_cycles = (int)(pi_perf_read(PI_PERF_ACTIVE_CYCLES)-tt);
	printf("GMM Cycles %d \n", total_cycles);

	int cycles_again = (int)(pi_perf_read(PI_PERF_ACTIVE_CYCLES)-tt);
	printf("GMM cpp %d \n", cycles_again/ImgSize);
#endif
}
