/*
 * pulp_gmm.c
 * Manuele Rusci - manuele.rusci@unibo.it
 * Tommaso Polonelli - tommaso.polonelli2@unibo.it
 *
 * Copyright (C) 2020 University of Bologna, Greenwaves Technologies
 *
 * This software may be modified and distributed under the terms
 * of the MIT license.  See the LICENSE file for details.
 *
 * Created on: January 24, 2021
 *
 */

#include "pmsis.h"
#include "equalize_hist.h"


/*********** Constants  ****************/

/* Operation block size (number of pixels per operation):
 * this impacts the L1 memory usage but decreases the number of transfers L1 -> L3.
 * NOTE(review): no block-size constant is defined at this point of the file;
 * the cluster worker derives its block size from the image width at run time.
 */

/* Set to a non-zero value to compile in the sections guarded by `#if PERF`
 * in the entry points of this file (performance-counter reads and the
 * printf reports of the measured cycles).
 */
#define PERF		(0)

/*********** Structures *************/

/* Default and maximum number of histogram / LUT entries used by the
 * equalization entry points; it must be equal to the color depth
 * (a uint8_t pixel can take 256 distinct values).
 */
enum { HIST_SZ = 256 };

/*
 * Blocking DMA read: external buffer -> cluster-local buffer.
 *
 *  l2_in      source address, programmed as the `ext` side of the transfer
 *  l1_buffer  destination address, programmed as the `loc` side
 *  sz         number of bytes; it is narrowed to uint16_t before being
 *             stored in the descriptor
 *
 * The descriptor is submitted with pi_cl_dma_memcpy() and waited on with
 * pi_cl_dma_wait() before returning.
 */
static inline void __attribute__((always_inline)) cl_in(uint8_t *l2_in, uint8_t *l1_buffer, int sz){

	pi_cl_dma_copy_t xfer;

	/* endpoints of the transfer */
	xfer.ext = (uint32_t) l2_in;
	xfer.loc = (uint32_t) l1_buffer;

	/* direction and length */
	xfer.dir = PI_CL_DMA_DIR_EXT2LOC;
	xfer.size = (uint16_t) sz;

	/* remaining descriptor fields */
	xfer.merge = 0;
	xfer.id = 0;

	/* submit, then wait for this same descriptor */
	pi_cl_dma_memcpy(&xfer);
	pi_cl_dma_wait(&xfer);

}

/*
 * Blocking DMA write: cluster-local buffer -> external buffer.
 *
 *  l2_out     destination address, programmed as the `ext` side of the transfer
 *  l1_buffer  source address, programmed as the `loc` side
 *  sz         number of bytes; it is narrowed to uint16_t before being
 *             stored in the descriptor
 *
 * The descriptor is submitted with pi_cl_dma_memcpy() and waited on with
 * pi_cl_dma_wait() before returning.
 */
static inline void __attribute__((always_inline)) cl_out(uint8_t *l2_out, uint8_t *l1_buffer, int sz){

	pi_cl_dma_copy_t xfer;

	/* endpoints of the transfer */
	xfer.ext = (uint32_t)l2_out;
	xfer.loc = (uint32_t)l1_buffer;

	/* direction and length */
	xfer.dir = PI_CL_DMA_DIR_LOC2EXT;
	xfer.size = (uint16_t) sz;

	/* remaining descriptor fields */
	xfer.merge = 0;
	xfer.id = 0;

	/* submit, then wait for this same descriptor */
	pi_cl_dma_memcpy(&xfer);
	pi_cl_dma_wait(&xfer);

}


/*
 * Histogram equalization of one frame, single-core variant using buffers
 * obtained from pi_fc_l1_malloc().
 *
 * Fields read from pulpHistop:
 *   input_frame / output_frame  source and destination pixel buffers
 *                               (they may be the same buffer)
 *   W_IMG, H_IMG                frame size; W_IMG * H_IMG bytes are processed
 *   hist_sz                     number of histogram/LUT entries; values that
 *                               are <= 0 or > HIST_SZ fall back to HIST_SZ
 *
 * Steps: build the histogram with EqualizeHistCalcHist_Run(), look for the
 * first non-empty bin, derive the LUT with EqualizeScale_Run() and apply it
 * with EqualizeHistLut_Run(). Those helpers are defined elsewhere; their
 * roles are taken from their names and call sites here.
 *
 * When the whole frame falls into one bin (or no bin is populated, e.g. an
 * empty frame) the input is passed through unchanged: it is copied to
 * output_frame when the two buffers differ.
 *
 * The two scratch buffers are released on every path. On allocation failure
 * a message is printed and pmsis_exit(-1) is called.
 */
void pulpCV_fc_Hist_Eq(
		pulp_Hist_Operator * pulpHistop
){

	uint8_t	* input_frame = pulpHistop->input_frame;
	uint8_t	* output_frame = pulpHistop->output_frame;
	int W_IMG 	= pulpHistop->W_IMG;
	int H_IMG	= pulpHistop->H_IMG;
	int hist_sz = pulpHistop->hist_sz;

	// Init
	int *hist;
	int *lut;
	int i = 0;
	int total = W_IMG * H_IMG;


#if PERF
	uint32_t tt;
	int ImgSize = pulpHistop->W_IMG * pulpHistop->H_IMG;
	pi_perf_conf(1 << PI_PERF_CYCLES | 1 << PI_PERF_ACTIVE_CYCLES);
	pi_perf_start();
	tt = pi_perf_read(PI_PERF_ACTIVE_CYCLES);
	printf("---------------\n");
#endif

	/* init color depth */
	if ((hist_sz <= 0) || (hist_sz > HIST_SZ)){
		/* default color depth (also covers negative values, which would
		 * otherwise turn into a huge allocation size) */
		hist_sz = HIST_SZ;
	}

	// allocate temporary memory
	/* Init hist in L1 */
	hist = (int *) pi_fc_l1_malloc((uint32_t) sizeof(int)*hist_sz);
	if (hist == NULL)
	{
		printf("buff alloc failed !\n");
		pmsis_exit(-1);
	}
	/* Init lut in L1 */
	lut = (int *) pi_fc_l1_malloc((uint32_t) sizeof(int)*hist_sz);
	if (lut == NULL)
	{
		printf("buff alloc failed !\n");
		pmsis_exit(-1);
	}

	/* run body calc */
	memset(hist,0,sizeof(int)*hist_sz);
	EqualizeHistCalcHist_Run(input_frame, hist, total);

	/* check Hist */
	/* first populated bin; the scan is bounded so that an all-zero
	 * histogram cannot run past the end of the buffer */
	while ((i < hist_sz) && !hist[i]) ++i;

	if ((i >= hist_sz) || (hist[i] == total))
	{
		/* the image has a single color (or no counted pixel at all):
		 * nothing to equalize, pass the input through. A plain memcpy
		 * moves every byte, so no further copy is needed. */
		if ((output_frame != input_frame) && (total > 0)){
			memcpy(output_frame, input_frame, total);
		}
	}
	else
	{
		EqualizeScale_Run(lut, hist, hist_sz, total, i);

		EqualizeHistLut_Run(lut, input_frame, output_frame, total);
	}

	/* common exit: the scratch buffers are released on both paths */
	pi_fc_l1_free(hist, (uint32_t) sizeof(int)*hist_sz);
	pi_fc_l1_free(lut, (uint32_t) sizeof(int)*hist_sz);

#if PERF
	pi_perf_stop();
	printf("Hist Cycles %d \n",(int)(pi_perf_read(PI_PERF_ACTIVE_CYCLES)-tt));
	printf("Hist cpp %d \n",((int)(pi_perf_read(PI_PERF_ACTIVE_CYCLES)-tt))/((int)(ImgSize)));
#endif

}


/*
 * Bring `n` bytes from `ext` into `l1_buffer`.
 * The largest multiple-of-4 prefix goes through cl_in(); the remaining
 * (n % 4) bytes are copied with plain loads so that no byte outside
 * ext[0..n-1] is read.
 * NOTE(review): assumes the calling core may load from `ext` directly -
 * confirm for the target memory map.
 */
static void hist_eq_fetch_tail(uint8_t *ext, uint8_t *l1_buffer, int n)
{
	int bulk = n - (n % 4);
	int k;

	if (bulk > 0){
		cl_in(ext, l1_buffer, bulk);
	}
	for (k = bulk; k < n; k++){
		l1_buffer[k] = ext[k];
	}
}

/*
 * Write `n` bytes from `l1_buffer` to `ext`.
 * The largest multiple-of-4 prefix goes through cl_out(); the remaining
 * (n % 4) bytes are written with plain stores so that they are not dropped.
 * NOTE(review): assumes the calling core may store to `ext` directly -
 * confirm for the target memory map.
 */
static void hist_eq_store_tail(uint8_t *ext, uint8_t *l1_buffer, int n)
{
	int bulk = n - (n % 4);
	int k;

	if (bulk > 0){
		cl_out(ext, l1_buffer, bulk);
	}
	for (k = bulk; k < n; k++){
		ext[k] = l1_buffer[k];
	}
}

/*
 * Per-core worker of the multi-core histogram equalization; every core of
 * the team runs this function with the same pulpHistop.
 *
 * Fields read from pulpHistop: input_frame, output_frame, W_IMG, H_IMG,
 * hist_sz, hist_tot (shared histogram, accumulated into) and lut_tot
 * (shared LUT, written by core 0). hist_tot is expected to be zeroed and
 * hist_sz to be a valid entry count before the team is forked.
 *
 * The frame is cut into `blocks` chunks of BLOCK_SIZE bytes (image width
 * rounded down to a multiple of 4, at least 4) distributed round-robin over
 * the cores, plus one tail of `tail` = total - blocks*BLOCK_SIZE bytes.
 *
 *  1. each core builds a private histogram of its blocks (the tail is taken
 *     by core 1, or by the only core of a one-core team);
 *  2. private histograms are summed into hist_tot inside a critical section;
 *  3. after a barrier core 0 derives lut_tot with EqualizeScale_Run();
 *  4. after a second barrier each core maps its blocks through lut_tot and
 *     writes them to output_frame (core 0 also handles the tail).
 *
 * If the whole frame falls into one bin (or no bin is populated) there is
 * nothing to equalize: the blocks are copied unchanged to output_frame when
 * it differs from input_frame, and left untouched when it does not.
 *
 * Every core executes the same three barriers on every path and releases
 * its private buffers with the sizes they were allocated with.
 */
void __pulpCV_cl_Hist_Eq(
		pulp_Hist_Operator * pulpHistop
){

	uint32_t CoreId = pi_core_id();
	unsigned int NCore = pi_cl_team_nb_cores();

	uint8_t	* input_frame = pulpHistop->input_frame;
	uint8_t	* output_frame = pulpHistop->output_frame;

	// Init
	int W_IMG 	= pulpHistop->W_IMG;
	int H_IMG	= pulpHistop->H_IMG;
	int hist_sz = pulpHistop->hist_sz;
	int *hist_tot = pulpHistop->hist_tot;
	int *lut_tot = pulpHistop->lut_tot;
	int *hist;
	uint8_t * img_l1;
	int i;
	int degenerate;
	int total = W_IMG * H_IMG;

	/* Nothing to process. All cores read the same fields, so they all
	 * leave here, before the first barrier. */
	if ((W_IMG <= 0) || (H_IMG <= 0)){
		return;
	}

	// chunking
	int BLOCK_SIZE = (W_IMG) - ((W_IMG) % 4);
	if (BLOCK_SIZE == 0){
		/* W_IMG is 1..3: keep a valid, 4-byte-multiple block size instead
		 * of dividing by zero below */
		BLOCK_SIZE = 4;
	}
	int blocks = total/BLOCK_SIZE;
	/* bytes not covered by the full blocks; always < BLOCK_SIZE, so the
	 * tail fits in img_l1 and is never counted twice */
	int tail = total - (blocks*BLOCK_SIZE);
	/* one size used for both the allocation and the release of img_l1 */
	uint32_t img_l1_sz = (uint32_t) BLOCK_SIZE+4;

	//allocate memory
	pi_cl_team_critical_enter();
	/* Init hist in L1 */
	hist = (int *) pmsis_l1_malloc((uint32_t) sizeof(int)*hist_sz);
	if (hist == NULL)
	{
		printf("buff alloc failed !\n");
		pmsis_exit(-1);
	}
	/* Init image buffer in L1 */
	img_l1 = (uint8_t *) pmsis_l1_malloc(img_l1_sz);
	if (img_l1 == NULL)
	{
		printf("buff alloc failed !\n");
		pmsis_exit(-1);
	}
	pi_cl_team_critical_exit();

	/* run body calc */
	memset(hist,0,sizeof(int)*hist_sz);
	for(i = CoreId; i < blocks; i=i+NCore){
		uint8_t	* in = input_frame + (BLOCK_SIZE*i);
		cl_in(in, img_l1, BLOCK_SIZE);
		EqualizeHistCalcHist_Run(img_l1, hist, BLOCK_SIZE);
	}
	/* Asymmetric part */
	/* first free core enter and execute */
	if (((NCore == 1)) || ((CoreId == 1) && (NCore > 1))){
		if (tail){
			uint8_t	* in = input_frame + total - tail;
			hist_eq_fetch_tail(in, img_l1, tail);
			EqualizeHistCalcHist_Run(img_l1, hist, tail);
		}
	}

	/* global hist update */
	pi_cl_team_critical_enter();
	for(i = 0; i < hist_sz; i++){
		hist_tot[i] += hist[i];
	}
	pi_cl_team_critical_exit();

	pi_cl_team_barrier();

	/* check Hist */
	/* first populated bin; bounded so that an all-zero histogram cannot run
	 * past the end of hist_tot */
	i = 0;
	while ((i < hist_sz) && !hist_tot[i]) ++i;
	/* single color image (or no counted pixel): no LUT can be derived */
	degenerate = (i >= hist_sz) || (hist_tot[i] == total);

	/* only core 0 */
	if ((CoreId == 0) && !degenerate){
		EqualizeScale_Run(lut_tot, hist_tot, hist_sz, total, i);
	}

	pi_cl_team_barrier();

	/* run body calc */
	/* skipped only when the frame is degenerate AND processed in place:
	 * the output already holds the right pixels in that case */
	if (!degenerate || (output_frame != input_frame)){
		for(i = CoreId; i < blocks; i=i+NCore){
			uint8_t	* in = input_frame + (BLOCK_SIZE*i);
			uint8_t	* out = output_frame + (BLOCK_SIZE*i);
			cl_in(in, img_l1, BLOCK_SIZE);
			if (!degenerate){
				EqualizeHistLut_Run(lut_tot, img_l1, img_l1, BLOCK_SIZE);
			}
			cl_out(out, img_l1, BLOCK_SIZE);
		}
		/* Asymmetric part */
		/* first free core enter and execute */
		if ((CoreId == 0) && tail){
			uint8_t	* in = input_frame + total - tail;
			/* the tail result belongs to the output frame */
			uint8_t	* out = output_frame + total - tail;
			hist_eq_fetch_tail(in, img_l1, tail);
			if (!degenerate){
				EqualizeHistLut_Run(lut_tot, img_l1, img_l1, tail);
			}
			hist_eq_store_tail(out, img_l1, tail);
		}
	}

	// free memory
	pi_cl_team_critical_enter();
	pmsis_l1_malloc_free(hist, (uint32_t) sizeof(int)*hist_sz);
	pmsis_l1_malloc_free(img_l1, img_l1_sz);
	pi_cl_team_critical_exit();

	pi_cl_team_barrier();

	return;

}

/*
 * Multi-core histogram equalization entry point.
 *
 * Normalizes pulpHistop->hist_sz (values <= 0 or > HIST_SZ become HIST_SZ),
 * allocates the shared histogram (zeroed) and the shared LUT with
 * pmsis_l1_malloc(), stores them in pulpHistop->hist_tot / lut_tot, forks
 * __pulpCV_cl_Hist_Eq() on pi_cl_cluster_nb_cores() cores and releases both
 * buffers once pi_cl_team_fork() has returned.
 *
 * pulpHistop is modified: hist_sz may be rewritten, and hist_tot / lut_tot
 * point to released memory after the call.
 *
 * On allocation failure a message is printed and pmsis_exit(-1) is called.
 */
void pulpCV_cl_Hist_Eq(
		pulp_Hist_Operator * pulpHistop
){

#if PERF
	uint32_t tt;
	int ImgSize = pulpHistop->W_IMG * pulpHistop->H_IMG;
	pi_perf_conf(1 << PI_PERF_CYCLES | 1 << PI_PERF_ACTIVE_CYCLES);
	printf("---------------\n");
	printf("Core controller: %d \n",pi_core_id());
	pi_perf_start();
	tt = pi_perf_read(PI_PERF_ACTIVE_CYCLES);
#endif

	/* init color depth */
	if ((pulpHistop->hist_sz <= 0) || (pulpHistop->hist_sz > HIST_SZ)){
		/* default color depth (also covers negative values, which would
		 * otherwise turn into a huge allocation size) */
		pulpHistop->hist_sz = HIST_SZ;
	}

	/* Init hist in L1 */
	pulpHistop->hist_tot = (int *) pmsis_l1_malloc((uint32_t) sizeof(int)*pulpHistop->hist_sz);
	if (pulpHistop->hist_tot == NULL)
	{
		printf("buff alloc failed !\n");
		pmsis_exit(-1);
	}
	/* the workers accumulate into hist_tot, so it must start from zero */
	memset(pulpHistop->hist_tot,0,sizeof(int)*pulpHistop->hist_sz);
	/* Init lut in L1 */
	pulpHistop->lut_tot = (int *) pmsis_l1_malloc((uint32_t) sizeof(int)*pulpHistop->hist_sz);
	if (pulpHistop->lut_tot == NULL)
	{
		/* same policy as every other allocation in this file */
		printf("buff alloc failed !\n");
		pmsis_exit(-1);
	}

	pi_cl_team_fork(
			pi_cl_cluster_nb_cores(),
			(void *)__pulpCV_cl_Hist_Eq, (void *) pulpHistop);

	pmsis_l1_malloc_free(pulpHistop->hist_tot, (uint32_t) sizeof(int)*pulpHistop->hist_sz);
	pmsis_l1_malloc_free(pulpHistop->lut_tot, (uint32_t) sizeof(int)*pulpHistop->hist_sz);


#if PERF
	pi_perf_stop();
	printf("Hist Cycles %d \n",(int)(pi_perf_read(PI_PERF_ACTIVE_CYCLES)-tt));
	printf("Hist cpp %d \n",((int)(pi_perf_read(PI_PERF_ACTIVE_CYCLES)-tt))/((int)(ImgSize)));
#endif

}


