library(data.table)
library(formattable)
library(dplyr)
library(tidyr)
library(partykit)
library(fitdistrplus)
library(basefun)
library(diffpriv)
library(caret)
library(wakefield)
library(ggplot2)
library(plyr)
library(ineq)
library(vegan)
#library(fmsb)
#library(plotly)
library(rlist)
library(qwraps2)

windows.options(record=TRUE)
customGreen0 = "#DeF7E9"

customGreen = "#71CA97"

customRed = "#ff7f7f"

set.seed(7)
setwd("C:\\Users\\Flavio\\Documenti\\Tesi\\raccomandatore\\dataset")

source("util.R")

hr=read.table("student-por.csv",sep=";",header=TRUE)
hr$G1 <- NULL
hr$G2 <- NULL
f = G3 ~ .
fit <- ctree(f , data = hr,control = ctree_control(teststat = c("quad"), testtype = c("Bonferroni")))


plot(fit)

# stampa le circostanze per ogni tipo
c = partykit:::.list.rules.party(fit)

dfc <- data.frame(c)

colnames(dfc) <- c('Circumstances')
dfc$Type <- row.names(dfc)
dfc$Type<-unlist(lapply(dfc$Type,mapptype))
row.names(dfc)<-1:nrow(dfc)

formattable(dfc,
            align =c("l","r"), 
            list(`Type` = formatter(
              "span", style = ~ style(color = "black",font.weight = "bold")) 
            ))

# stampa statistiche per ogni tipo 
m = with(fitted(fit), tapply(`(response)`, `(fitted)`, mean))
s = with(fitted(fit), tapply(`(response)`, `(fitted)`, sd))
v = with(fitted(fit), tapply(`(response)`, `(fitted)`, var))


hr$node_placement <- predict(fit,newdata = hr,type = "node")

tipi = unique(hr$node_placement)

outcome <- "G3"

### DATASET 2
poveri <- hr[(hr$G3 < 10),]
#poveri_padrilow <- poveri[poveri$Fedu <= 2,]
#tanti_poveri_padristupidi <- sample_n(poveri_padristupidi, replace = TRUE, size = 300)
#hr10 <-rbind(hr, tanti_poveri_padristupidi)

### poveri bravi
poveri_bravi <- poveri[poveri$schoolsup == "no",]
hr10 <-rbind(hr, poveri_bravi)

#poveri_bravi <- sample_n(poveri_bravi, replace = TRUE, size = 300)

### DATASET 1
#hr <- sample_n(hr, size=1000, replace=TRUE)

############################ ANALISI DA FARE ############################
# - 1. stima tipi/effort
# - 2. ranking secondo outcome std
# - 3. calcolo ineq. con Gini + grafico Lorentz
# - 4. Opportunity loss profile
# - 5. Opportunity Loss Rate
# - 6. Distributive Rate
# - 7. Shannon index (su alcuni attributi)
# - 8. Theil index ( su G3 e outcome std)

# 1
br1 <- metodo_brunori(hr)
br2 <- metodo_brunori(hr10)

br <- br1

df <- br$df
info <- br$info
ineq(df$outcome)

df$node_placement <- unlist(lapply(df$node_placement, mapptype))

info$tipi <-unlist(lapply(info$tipi,mapptype))
####################################################################################################
### GRAFICI ECDF A CONFRONTO CON APPROSSIMAZIONE DI BERNSTEIN
par(mfrow=c(2,4))

for(k in 1:nrow(info)){
  tipo <- info[k,]$tipi
  ss <- df[ df$node_placement == tipo[[1]],]
  
  grado <- info[k,]$grado
  yy <- sort(ss[[outcome]])
  
  ba <- bern_app2(yy, grado)
  plot(ecdf(yy),xlab="outcome", ylab="CDF", main=paste0("CDF estimation for type ",tipo))
  lines(yy,ba, col="blue", type="b", lty=2,pch = 18)
  legend("topleft", legend=c("ECDF", "Bernstein approx."),
         col=c("black", "blue"), lty=1:2, cex=0.8)
}

# 2
### ranking equity
#rl <- ranking_top(df, "outcome")
r.100 <- rank_equity(df,100)
r.250 <- rank_equity(df,250)
r.500 <- rank_equity(df,500)

r.100$Ranking <- "equity-100"
r.250$Ranking <- "equity-250"
r.500$Ranking <- "equity-500"


### ranking equality
re.100 <- rank_equality(df,"sex",100)
re.250 <- rank_equality(df,"sex",250)
re.500 <- rank_equality(df,"sex",500)

re.100$Ranking <- "equality-100"
re.250$Ranking <- "equality-250"
re.500$Ranking <- "equality-500"

### ranking needing
rn.100 <- rank_needing(df,"sex",100)
rn.250 <- rank_needing(df,"sex",250)
rn.500 <- rank_needing(df,"sex",500)

rn.100$Ranking <- "need-100"
rn.250$Ranking <- "need-250"
rn.500$Ranking <- "need-500"

ranking_list <- list("equity-100" = r.100, "equity-250" = r.250, "equity-500" = r.500, "equal-100" = re.100, "equal-250" = re.250,
                     "equal-500"=re.500,"need-100"=rn.100,"need-250"=rn.250,"need-500"=rn.500)
# 3
stampa_ineq(ranking_list)
#calcolo_ineq(df)

# 4
stampa_olp(ranking_list)
# olp100 <- opportunity_lossprofile(rl$top100)
# olp250 <- opportunity_lossprofile(rl$top250)
# olp500 <- opportunity_lossprofile(rl$top500)
# 
# olp_table(olp100)
# olp_table(olp250)
# olp_table(olp500)

# 5
stampa_olr(ranking_list)
#rl$top100 <- opportunity_lossrate(rl$top100)

# 6
stampa_dist_rate(ranking_list)
#rl$top100 <- distributive_rate(rl$top100)

# 7
stampa_shannon(r.100)

df$sample <- "modified"
df2 <- br2$df
df2$sample <- "original"
all <- bind_rows(df2, df)
stampa_shannon(all)
#shannon_diversity(rl$top100)

# 8
stampa_theil(ranking_list)
#theil_entropy(rl$top100)


in.100 <- head(df[order(-df$G3),],100)
in.100$Ranking <- "Initial"
rtry <- bind_rows(re.100,r.100,rn.100, in.100)
rtry$Ranking <- factor(rtry$Ranking,levels = c("Initial", "equity-100", "equality-100", "need-100"))
stampa_tipi(rtry)

in.250 <- head(df[order(-df$G3),],250)
in.250$Ranking <- "Initial"
rtry <- bind_rows(re.250,r.250,rn.250, in.250)
rtry$Ranking <- factor(rtry$Ranking,levels = c("Initial", "equity-250", "equality-250", "need-250"))
stampa_tipi(rtry)

in.500 <- head(df[order(-df$G3),],500)
in.500$Ranking <- "Initial"
rtry <- bind_rows(re.500,r.500,rn.500, in.500)
rtry$Ranking <- factor(rtry$Ranking,levels = c("Initial", "equity-500", "equality-500", "need-500"))
stampa_tipi(rtry)


dft <- Filter(is.numeric, df)
dft$quantile <- NULL
dft$outcome <- NULL
dtt <- data.frame(unclass(summary(dft)))

dtt <- sapply(dft, summary)

df_transpose = data.frame(t(dtt))
df_transpose$Mean <- round(df_transpose$Mean,2)
formattable(df_transpose,
            list(
              'Mean'=color_bar(fun = unit.scale)
            ))



# define the markup language we are working in.
# options(qwraps2_markup = "latex") is also supported.
options(qwraps2_markup = "latex")

our_summary1 <-
  list("Age" =
         list("min" = ~ min(.data$age),
              "max" = ~ max(.data$age),
              "mean (sd)" = ~ qwraps2::mean_sd(.data$age)),
       "Mother's education" =
         list("min" = ~ min(.data$Medu),
              "median" = ~ median(.data$Medu),
              "max" = ~ max(.data$Medu),
              "mean (sd)" = ~ qwraps2::mean_sd(.data$Medu)),
       "Weight (1000 lbs)" =
         list("min" = ~ min(.data$wt),
              "max" = ~ max(.data$wt),
              "mean (sd)" = ~ qwraps2::mean_sd(.data$wt)),
       "Forward Gears" =
         list("Three" = ~ qwraps2::n_perc0(.data$gear == 3),
              "Four"  = ~ qwraps2::n_perc0(.data$gear == 4),
              "Five"  = ~ qwraps2::n_perc0(.data$gear == 5))
  )

whole <- summary_table(dft)
whole





M <- rtry[,c("failures","higher","school","schoolsup","studytime","Fedu","Ranking","node_placement")]
ra <- M$Ranking
must_convert<-sapply(M,is.factor)       # logical vector telling if a variable needs to be displayed as numeric
M2<-sapply(M[,must_convert],unclass)    # data.frame of all categorical variables now displayed as numeric
out<-cbind(M[,!must_convert],M2) 
out$Ranking <- ra
tops <- ddply(out, ~Ranking+node_placement, summarize, failures = mean(failures),
              higher = mean(higher),
              school = mean(school),
              schoolsup = mean(schoolsup),
              studytime = mean(studytime),
              Fedu = mean(Fedu)
              )

ALL <- tops[tops$Ranking=="need-250",]

fig <- plot_ly(
  type = 'scatterpolar',
  fill = 'toself'
)
for(i in seq(1:nrow(ALL))){
  fig <- fig %>%
    add_trace(
      r = as.numeric(as.vector(ALL[i,3:8])),
      theta = names(ALL[,3:8]),
      name = ALL$node_placement[i],
      mode = 'lines+markers'
    )
}

fig <- fig %>%
  layout(
    polar = list(
      radialaxis = list(
        visible = T,
        range = c(0,max(ALL))
      )
    )
  )

fig

ph <- .SavedPlots
for(i in 1:lastplot) {
  png(sprintf("plotsaved-%s.png", i))
  print(ph[i])
  dev.off()
}
