/*
 * pulp_gmm.c
 * Manuele Rusci - manuele.rusci@unibo.it
 * Tommaso Polonelli - 
 *
 * Copyright (C) 2020 University of Bologna, Greenwaves Technologies
 *
 * This software may be modified and distributed under the terms
 * of the MIT license.  See the LICENSE file for details.
 *
 * Created on: September 24, 2020
 *
 */
#include "pmsis.h"
#include "pulp_framediff.h"
#include "setup.h"

static struct pi_device dmacpy;


/*
 * ChunkSize - number of work items each core of the current team handles
 * when X items are split across the team.
 *
 * The result is X shifted right by __builtin_pulp_fl1(team size), plus one
 * when X has any bit set under the mask (team size - 1), i.e. the quotient
 * is rounded up.
 * NOTE(review): this presumably relies on the team size being a power of
 * two, with the builtin yielding its log2 - confirm against the PULP
 * toolchain documentation.
 */
inline static unsigned int __attribute__((always_inline)) ChunkSize(unsigned int X)
{
    unsigned int n_cores = pi_cl_team_nb_cores();
    unsigned int shift = __builtin_pulp_fl1(n_cores);
    unsigned int share = X >> shift;

    /* Round up when X does not divide evenly among the cores. */
    if ((X & (n_cores - 1)) != 0)
    {
        share += 1;
    }

    return share;
}

/*
 * pulpCV_fc_FDInit - allocate the frame-difference settings and attach them
 * to the operator.
 *
 * An FD_Settings structure is allocated with pmsis_l2_malloc(), filled from
 * the arguments and stored in pulpFDop->FDconf. The block size is set equal
 * to the image width. If the allocation fails, a message is printed and
 * pmsis_exit(-1) is called.
 *
 * Nothing in this function releases the allocated structure.
 */
void pulpCV_fc_FDInit(
	pulpCV_framediff_Operator * pulpFDop,	// operator that receives the new FD settings
	int W_IMG, 								// image width
	int H_IMG, 								// image height
	int FD_mode, 							// FD mode
	uint8_t FD_threshold 					// binarization threshold, if any
){
	FD_Settings * settings = pmsis_l2_malloc(sizeof *settings);

	if (!settings)
	{
		printf("buff alloc failed !\n\r");
		pmsis_exit(-1);
	}

	// Processing parameters
	settings->Mode = FD_mode;
	settings->Threshold = FD_threshold;

	// Image geometry; one block is as large as the image width
	settings->ImgWidth = W_IMG;
	settings->ImgHeight = H_IMG;
	settings->BlockSize = W_IMG;

	// Hand the filled-in settings over to the operator
	pulpFDop->FDconf = settings;
}



/*
 * pulpCV_fc_FrameDiff - run FrameDifference() over a whole image, one block
 * at a time, using two FC L1 scratch buffers.
 *
 * pulpFDop supplies the input, background and output frame pointers and the
 * FD_Settings (FDconf). The image (ImgWidth * ImgHeight bytes) is walked in
 * steps of FDconf->BlockSize bytes. For every block:
 *   1. the input and background data are copied into the L1 buffers
 *      through the dmacpy device (PI_DMACPY_L2_FC_L1);
 *   2. FrameDifference() is called with the input buffer as both source
 *      and destination (in-place);
 *   3. the background buffer is copied back over bckg_frame and the input
 *      buffer is copied to output_frame (PI_DMACPY_FC_L1_L2).
 *
 * Every full BlockSize is copied on each step, so the image size is expected
 * to be a multiple of BlockSize (pulpCV_fc_FDInit sets BlockSize to the image
 * width, which satisfies this).
 *
 * Errors are fatal: a message is printed and pmsis_exit() is called with
 * -1 (dmacpy open), -2 (L1 allocation), -3 (copy to L1) or -4 (copy back).
 *
 * The dmacpy device opened here is closed again before returning, so
 * repeated calls do not accumulate open handles.
 */
void pulpCV_fc_FrameDiff(
	pulpCV_framediff_Operator * pulpFDop
){

	uint8_t	* input_frame = pulpFDop->input_frame;
	uint8_t	* output_frame = pulpFDop->output_frame;
	uint8_t	* bckg_frame = pulpFDop->bckg_frame;

	// FD settings
	FD_Settings * FDconf = pulpFDop->FDconf;
	int BlockSize 		= FDconf->BlockSize;
	int ImgSize 		= FDconf->ImgWidth * FDconf->ImgHeight;

	/* Init & open dmacpy. */
	int errcpy, errcpy2;
	struct pi_dmacpy_conf dmacpy_conf = {0};
	pi_dmacpy_conf_init(&dmacpy_conf);
	pi_open_from_conf(&dmacpy, &dmacpy_conf);
	errcpy = pi_dmacpy_open(&dmacpy);
	if (errcpy)
	{
		// errcpy is an int: print it with %d
		printf("Error dmacpy open : %d !\n", errcpy);
		pmsis_exit(-1);
	}

#if PERF
	uint32_t tt;
	pi_perf_conf(1 << PI_PERF_CYCLES | 1 << PI_PERF_ACTIVE_CYCLES);
	pi_perf_start();
	tt = pi_perf_read(PI_PERF_ACTIVE_CYCLES);
#endif

	// scratch buffers, one block each, for the input and the background data
	uint8_t *  input_block_L1 = (uint8_t *) pi_fc_l1_malloc(BlockSize);
	uint8_t *   bckg_block_L1 = (uint8_t *) pi_fc_l1_malloc(BlockSize);

	if ((input_block_L1 == NULL)||(bckg_block_L1 == NULL))
	{
		printf("FC L1 buffer alloc failed !\n");
		pmsis_exit(-2);
	}

	PRINTF("Going to process %d data splitted by %d sized blocks!\n", ImgSize,BlockSize );

	// iterate on tiles (image blocks)
	for(int i = 0; i < ImgSize; i += BlockSize){

		PRINTF("Start %d transfer\n", i);

		// read block data from L2 to L1
		errcpy = pi_dmacpy_copy( &dmacpy,
					(uint8_t *) input_frame + i, (uint8_t *) input_block_L1,
					BlockSize, PI_DMACPY_L2_FC_L1);

		errcpy2 = pi_dmacpy_copy( &dmacpy,
					(uint8_t *) bckg_frame + i, (uint8_t *) bckg_block_L1,
					BlockSize, PI_DMACPY_L2_FC_L1);

		if (errcpy || errcpy2)
		{
			printf("Copy from L2 to FC failed : %d - %d\n", errcpy, errcpy2);
			pmsis_exit(-3);
		}

		// compute FD over the tile's pixels - inplace
		PRINTF("FrameDifference\n");

		FrameDifference(
				input_block_L1,		// input frame in L1 memory
				bckg_block_L1,		// bckgnd frame in L1 memory
				input_block_L1, 	// output frame in L1 memory - inplace
				FDconf				// FD configuration structure
		);

		PRINTF("\n");

		// store data back to memory: the background block goes back to
		// bckg_frame, the in-place result goes to output_frame
		errcpy = pi_dmacpy_copy(&dmacpy,
				(uint8_t *) bckg_block_L1, (uint8_t *) bckg_frame + i,
				BlockSize, PI_DMACPY_FC_L1_L2);

		errcpy2 = pi_dmacpy_copy(&dmacpy,
				(uint8_t *) input_block_L1, (uint8_t *) output_frame + i,
				BlockSize, PI_DMACPY_FC_L1_L2);
		if (errcpy || errcpy2)
		{
			printf("Copy from FC to L2 failed : %d - %d\n", errcpy, errcpy2);
			pmsis_exit(-4);
		}
	}

	pi_fc_l1_free(input_block_L1, BlockSize);
	pi_fc_l1_free(bckg_block_L1,  BlockSize);


#if PERF
	pi_perf_stop();
	printf("GMM Cycles %d \n",(int)(pi_perf_read(PI_PERF_ACTIVE_CYCLES)-tt));
	printf("GMM cpp %d \n",((int)(pi_perf_read(PI_PERF_ACTIVE_CYCLES)-tt))/((int)(ImgSize)));
#endif

	// release the dmacpy device opened at the top of this function
	pi_dmacpy_close(&dmacpy);

}



//void __pulpCV_cl_GMMSubtract(
//	pulp_GMM_Operator * pulpGMMop
//)
//{
//	uint32_t CoreId = pi_core_id();
//
//	uint8_t	* input_frame   = pulpGMMop->input_frame;
//	uint8_t	* output_frame 	= pulpGMMop->output_frame;
//	uint32_t GMModel 		= pulpGMMop->GMModel;
//	struct pi_device *ram = pulpGMMop->ram;
//    pi_cl_ram_req_t buff_req,buff_req2;
// 	pi_cl_alloc_req_t cl_alloc_req;
//
//	// GMM settings
//	GMM_Settings * GMMconf = pulpGMMop->GMMconf;
//	int ImgHeight = GMMconf->ImgHeight;
//	int ImgWidth = GMMconf->ImgWidth;
//	int ImgSize = GMMconf->ImgSize;
//	int MaxModes = GMMconf->MaxModes;
//
//	// temporary memory to buffer data from L3
//	uint32_t hyper_p;
//
//	// chuncking
//	int BlockSize = ImgWidth;
//    int Chunk  = ChunkSize(BlockSize);
//    int First  = CoreId*Chunk;
//    int Last   = (First+Chunk > BlockSize) ? (BlockSize) : (First+Chunk);
//    int ParBlocksize = Last - First;
//
////	printf("----- From %d to %d----------\n", First, Last);
//
//	// allocate temporary per-core memory
//	pi_cl_l2_malloc((uint32_t) sizeof(GMM)*MaxModes*ParBlocksize, &cl_alloc_req);
//	GMM * Gauss = (GMM *) pi_cl_l2_malloc_wait(&cl_alloc_req);
//
//	if (Gauss == NULL)
//	{
//		PRINTF("buff alloc failed !\n");
//		pmsis_exit(-1);
//	}
//
////	printf("(%d)%x\n", CoreId, Gauss);
//
//	// iterate on tiles (image blocks)
//	for(int i = 0; i < ImgHeight; i++){		
//		
//		int pIdx= i*ImgWidth+First;
//		// read data and parameters to local memory
//		hyper_p = GMModel + pIdx*sizeof(GMM)*MaxModes;
//		pi_cl_ram_read(ram, hyper_p, (uint8_t *) Gauss, 
//			(uint32_t)(sizeof(GMM)*MaxModes*ParBlocksize), &buff_req);
//		pi_cl_ram_read_wait(&buff_req);
//
//		// compute GMM over the tile's pixels
//		for(int j = 0; j < ParBlocksize; j++){
//			uint8_t pixel = input_frame[pIdx+j];
//			// compute the binary output
//			pixel = SubtractPixel(pixel, j, Gauss, GMMconf);
//			output_frame[pIdx+j] = pixel;
//
//		}
//
//		PRINTF("\n");
//
//		// store data back to memory
//		pi_cl_ram_write(ram, hyper_p, (uint8_t *) Gauss, 
//			(uint32_t)(sizeof(GMM)*MaxModes*ParBlocksize), &buff_req2);
//		pi_cl_ram_write_wait(&buff_req2);
//
//	}
//
//
//	pi_cl_l2_free(Gauss, (uint32_t) sizeof(GMM)*MaxModes*ParBlocksize, &cl_alloc_req);
//	pi_cl_l2_free_wait(&cl_alloc_req);
//
//	pi_cl_team_barrier(); 	
//
//}
//
//void pulpCV_cl_GMMSubtract(	
//	pulp_GMM_Operator * pulpGMMop
//)
//{
//	GMM_Settings * GMMconf = pulpGMMop->GMMconf;
//	int ImgSize = GMMconf->ImgSize;
//
//#if PERF
//	uint32_t tt;
//	pi_perf_conf(1 << PI_PERF_CYCLES | 1 << PI_PERF_ACTIVE_CYCLES);
//	pi_perf_start();
//	tt = pi_perf_read(PI_PERF_ACTIVE_CYCLES);
//#endif
//
//	pi_cl_team_fork( 	
//		pi_cl_cluster_nb_cores(), 
//		(void *)__pulpCV_cl_GMMSubtract, (void *) pulpGMMop);
//
//#if PERF
//	pi_perf_stop();
//	printf("GMM Cycles %d \n",(int)(pi_perf_read(PI_PERF_ACTIVE_CYCLES)-tt));
//	printf("GMM cpp %d \n",((int)(pi_perf_read(PI_PERF_ACTIVE_CYCLES)-tt))/((int)(ImgSize)));
//#endif
//}