scale_color_manual(values = c("Firebrick","gold"),guide=FALSE)+
ylab("Unfair")+
xlab("Model")+
facet_grid(BalIx~FairIx)+
coord_cartesian(clip = "off")+
annotate("text", x=0.5,y=0,label=TeX("fair"),
parse=TRUE, size=2.5, color="gray40", hjust=0)+
annotate("text", x=1.15,y=56,label=TeX("unfair"),
parse=TRUE, size=2.5, color="gray40",hjust=1)+
#  annotate("text", x=c(0,50),y=c(0,0),label=c("<-- fair","unfair -->"))+
theme_light()+
theme(strip.text.y = element_text(angle=0),
axis.title.y = element_text(angle=0,hjust=0,vjust=1.05,))
ggsave('images/run1 unfairness distrib. risk 33 models, baseline, extensive Ind,TPR,FPR.png', plot = last_plot())
# Extensive analysis: differences among different thresholds
# (box-plot distributions).
# Sheet name handed to readODS::read_ods() by model_df().
run <- 'run1'
# Path prefix of the measures spreadsheets.
# Author's note: base paper measures/  OR 'all attributes' (for no base paper)
base_paper <- 'all attributes measures/'
# Compas and Juvenile removal
# Build the long-format table of balance and unfairness measures for one model.
#
# Reads sheet `run` of "balance measures.ods" and of
# "unfairness measures <model>.ods" (paths are prefixed with `base_paper`;
# `run` and `base_paper` are looked up outside this function), keeps rows
# 10:38 of each sheet, drops the 'Age category' attribute, merges the two
# tables and pivots them so that each row holds one (FairIx, BalIx) pair.
#
# Args:
#   model: model tag that completes the unfairness file name.
#   smote: TRUE reads from "measures smote/", FALSE (default) from "measures/".
#
# Returns a data frame with Dataset, Attribute, m, FairIx / FairValue and
# BalIx / BalValue (values multiplied by 100 and truncated to integer),
# Balance.Class, model and type ('smote' or 'baseline').
model_df <- function(model, smote = FALSE) {
  # "Diff Ind" ,"Diff TPR","Diff FPR"
  # "Diff PPV","Diff NPV", "Diff OAE"
  fairIx <- c("Diff Ind", "Diff TPR", "Diff FPR", "Diff PPV", "Diff NPV", "Diff OAE")

  # Row window and attribute filter shared by both spreadsheets.
  read_measures <- function(path) {
    readODS::read_ods(path, run) %>%
      slice(10:38) %>% # 29 or 38 (38 -> without Compas, juvenile, community and Age category)
      # NOTE(review): 'Age cateogry' presumably matches a misspelling present
      # in the sheets -- verify before removing it.
      filter(Attribute != 'Age category' & Attribute != 'Age cateogry')
  }

  imbalance_measures <- read_measures(paste0(base_paper, "balance measures.ods")) %>%
    select("Dataset -", "Attribute", "Gini", "Shannon", "Simpson", "IIR")

  # `smote` only switches the input folder (author's note: ATTENTION HERE).
  measures_dir <- if (smote) "measures smote/" else "measures/"
  unfairness_measures <- read_measures(
    paste0(base_paper, measures_dir, "unfairness measures ", model, ".ods")
  ) %>%
    # all_of(): fairIx is a character vector defined above, not a column name.
    select("Dataset -", "Attribute", "m", all_of(fairIx))

  d <- merge(imbalance_measures, unfairness_measures) %>%
    rename(Dataset = 1) %>%
    mutate(Dataset = str_replace(Dataset, " *- *", "")) %>%
    pivot_longer(starts_with("Diff"), names_to = "FairIx",
                 names_prefix = "Diff ", values_to = "FairValue") %>%
    # as.integer() truncates toward zero; it does not round.
    transform(FairValue = as.integer(FairValue * 100)) %>%
    pivot_longer(c("Gini", "Shannon", "Simpson", "IIR"), names_to = "BalIx",
                 values_to = "BalValue") %>%
    transform(BalValue = as.integer(BalValue * 100)) %>%
    #mutate(FairIx = fct_relevel(fct_relabel(FairIx, ~
    #if_else(.=="Ind","Independence", paste0("Separation (",.,")"))), "Independence","Separation (TPR)")) %>%
    #if_else(.=="OAE","OAE", paste0("Sufficiency (",.,")"))), "Sufficiency (PPV)","Sufficiency (NPV)")) %>%
    mutate(Balance.Class = fct_rev(cut(BalValue, c(0, 33.3, 66.7, 100),
                                       labels = c("Imbalanced", "Undecided", "Balanced"),
                                       include.lowest = TRUE, ordered_result = TRUE)))

  # rep(..., nrow(d)) keeps these assignments valid for a zero-row table.
  d$model <- rep(model, nrow(d))
  d$type <- rep(if (smote) 'smote' else 'baseline', nrow(d))
  d
}
# Tag a model's measures table with a balance-risk label for one threshold.
#
# Calls model_df(model), adds `Risk` ("Higher risk" where BalValue < risk,
# "Lower risk" where it is not), prints the threshold, and stores the
# threshold as text in `risk_type`.
risk_df <- function(model, risk = 33) {
  measures <- mutate(
    model_df(model),
    Risk = if_else(BalValue < risk, "Higher risk", "Lower risk")
  )
  threshold_label <- as.character(risk)
  print(threshold_label)
  measures$risk_type <- rep(threshold_label, nrow(measures))
  measures
}
model <- 'lr'
# One copy of the model table per balance threshold; risk_df() records the
# threshold of each copy in `risk_type`.
d <- bind_rows(risk_df(model, 25), risk_df(model, 33), risk_df(model, 50),
               risk_df(model, 75)) # , risk_df(model, 80))
# Fixed facet-row order for the fairness indices.
d$FairIx_f <- factor(d$FairIx, levels = c('Ind', 'TPR', 'FPR', 'PPV', 'NPV', 'OAE'))
# risk_type is character: order its levels numerically so the x axis follows
# the threshold values instead of alphabetical order.
d$risk_type <- factor(d$risk_type,
                      levels = as.character(sort(unique(as.numeric(d$risk_type)))))
# Box-Plot
ggplot(d, aes(x = risk_type, y = FairValue, fill = Risk, color = Risk)) +
  geom_boxplot(alpha = 0.5, width = 0.6) +
  # guide = "none" replaces the deprecated guide = FALSE
  scale_fill_manual(values = c("Firebrick", "gold"), guide = "none") +
  scale_color_manual(values = c("Firebrick", "gold"), guide = "none") +
  ylab("Unfair") +
  xlab("Type") +
  facet_grid(FairIx_f ~ BalIx) +
  coord_cartesian(clip = "off") +
  annotate("text", x = 0.5, y = 0, label = TeX("fair"),
           parse = TRUE, size = 2.5, color = "gray40", hjust = 0) +
  annotate("text", x = 1.15, y = 56, label = TeX("unfair"),
           parse = TRUE, size = 2.5, color = "gray40", hjust = 1) +
  #  annotate("text", x=c(0,50),y=c(0,0),label=c("<-- fair","unfair -->"))+
  theme_light() +
  theme(strip.text.y = element_text(angle = 0),
        axis.title.y = element_text(angle = 0, hjust = 0, vjust = 1.05))
ggsave('images/extensive thresholds comparison lr.png', plot = last_plot())
# Extensive analysis: differences among different thresholds
# (box-plot distributions).
# Sheet name handed to readODS::read_ods() by model_df().
run <- 'run1'
# Path prefix of the measures spreadsheets.
# Author's note: base paper measures/  OR 'all attributes' (for no base paper)
base_paper <- 'all attributes measures/'
# Compas and Juvenile removal
# Build the long-format table of balance and unfairness measures for one model.
#
# Reads sheet `run` of "balance measures.ods" and of
# "unfairness measures <model>.ods" (paths are prefixed with `base_paper`;
# `run` and `base_paper` are looked up outside this function), keeps rows
# 10:38 of each sheet, drops the 'Age category' attribute, merges the two
# tables and pivots them so that each row holds one (FairIx, BalIx) pair.
#
# Args:
#   model: model tag that completes the unfairness file name.
#   smote: TRUE reads from "measures smote/", FALSE (default) from "measures/".
#
# Returns a data frame with Dataset, Attribute, m, FairIx / FairValue and
# BalIx / BalValue (values multiplied by 100 and truncated to integer),
# Balance.Class, model and type ('smote' or 'baseline').
model_df <- function(model, smote = FALSE) {
  # "Diff Ind" ,"Diff TPR","Diff FPR"
  # "Diff PPV","Diff NPV", "Diff OAE"
  fairIx <- c("Diff Ind", "Diff TPR", "Diff FPR", "Diff PPV", "Diff NPV", "Diff OAE")

  # Row window and attribute filter shared by both spreadsheets.
  read_measures <- function(path) {
    readODS::read_ods(path, run) %>%
      slice(10:38) %>% # 29 or 38 (38 -> without Compas, juvenile, community and Age category)
      # NOTE(review): 'Age cateogry' presumably matches a misspelling present
      # in the sheets -- verify before removing it.
      filter(Attribute != 'Age category' & Attribute != 'Age cateogry')
  }

  imbalance_measures <- read_measures(paste0(base_paper, "balance measures.ods")) %>%
    select("Dataset -", "Attribute", "Gini", "Shannon", "Simpson", "IIR")

  # `smote` only switches the input folder (author's note: ATTENTION HERE).
  measures_dir <- if (smote) "measures smote/" else "measures/"
  unfairness_measures <- read_measures(
    paste0(base_paper, measures_dir, "unfairness measures ", model, ".ods")
  ) %>%
    # all_of(): fairIx is a character vector defined above, not a column name.
    select("Dataset -", "Attribute", "m", all_of(fairIx))

  d <- merge(imbalance_measures, unfairness_measures) %>%
    rename(Dataset = 1) %>%
    mutate(Dataset = str_replace(Dataset, " *- *", "")) %>%
    pivot_longer(starts_with("Diff"), names_to = "FairIx",
                 names_prefix = "Diff ", values_to = "FairValue") %>%
    # as.integer() truncates toward zero; it does not round.
    transform(FairValue = as.integer(FairValue * 100)) %>%
    pivot_longer(c("Gini", "Shannon", "Simpson", "IIR"), names_to = "BalIx",
                 values_to = "BalValue") %>%
    transform(BalValue = as.integer(BalValue * 100)) %>%
    #mutate(FairIx = fct_relevel(fct_relabel(FairIx, ~
    #if_else(.=="Ind","Independence", paste0("Separation (",.,")"))), "Independence","Separation (TPR)")) %>%
    #if_else(.=="OAE","OAE", paste0("Sufficiency (",.,")"))), "Sufficiency (PPV)","Sufficiency (NPV)")) %>%
    mutate(Balance.Class = fct_rev(cut(BalValue, c(0, 33.3, 66.7, 100),
                                       labels = c("Imbalanced", "Undecided", "Balanced"),
                                       include.lowest = TRUE, ordered_result = TRUE)))

  # rep(..., nrow(d)) keeps these assignments valid for a zero-row table.
  d$model <- rep(model, nrow(d))
  d$type <- rep(if (smote) 'smote' else 'baseline', nrow(d))
  d
}
# Tag a model's measures table with a balance-risk label for one threshold.
#
# Calls model_df(model), adds `Risk` ("Higher risk" where BalValue < risk,
# "Lower risk" where it is not), prints the threshold, and stores the
# threshold as text in `risk_type`.
risk_df <- function(model, risk = 33) {
  measures <- mutate(
    model_df(model),
    Risk = if_else(BalValue < risk, "Higher risk", "Lower risk")
  )
  threshold_label <- as.character(risk)
  print(threshold_label)
  measures$risk_type <- rep(threshold_label, nrow(measures))
  measures
}
model <- 'svm'
# One copy of the model table per balance threshold; risk_df() records the
# threshold of each copy in `risk_type`.
d <- bind_rows(risk_df(model, 25), risk_df(model, 33), risk_df(model, 50),
               risk_df(model, 75)) # , risk_df(model, 80))
# Fixed facet-row order for the fairness indices.
d$FairIx_f <- factor(d$FairIx, levels = c('Ind', 'TPR', 'FPR', 'PPV', 'NPV', 'OAE'))
# risk_type is character: order its levels numerically so the x axis follows
# the threshold values instead of alphabetical order.
d$risk_type <- factor(d$risk_type,
                      levels = as.character(sort(unique(as.numeric(d$risk_type)))))
# Box-Plot
ggplot(d, aes(x = risk_type, y = FairValue, fill = Risk, color = Risk)) +
  geom_boxplot(alpha = 0.5, width = 0.6) +
  # guide = "none" replaces the deprecated guide = FALSE
  scale_fill_manual(values = c("Firebrick", "gold"), guide = "none") +
  scale_color_manual(values = c("Firebrick", "gold"), guide = "none") +
  ylab("Unfair") +
  xlab("Type") +
  facet_grid(FairIx_f ~ BalIx) +
  coord_cartesian(clip = "off") +
  annotate("text", x = 0.5, y = 0, label = TeX("fair"),
           parse = TRUE, size = 2.5, color = "gray40", hjust = 0) +
  annotate("text", x = 1.15, y = 56, label = TeX("unfair"),
           parse = TRUE, size = 2.5, color = "gray40", hjust = 1) +
  #  annotate("text", x=c(0,50),y=c(0,0),label=c("<-- fair","unfair -->"))+
  theme_light() +
  theme(strip.text.y = element_text(angle = 0),
        axis.title.y = element_text(angle = 0, hjust = 0, vjust = 1.05))
ggsave('images/extensive thresholds comparison svm.png', plot = last_plot())
# Extensive analysis: differences among different thresholds
# (box-plot distributions).
# Sheet name handed to readODS::read_ods() by model_df().
run <- 'run1'
# Path prefix of the measures spreadsheets.
# Author's note: base paper measures/  OR 'all attributes' (for no base paper)
base_paper <- 'all attributes measures/'
# Compas and Juvenile removal
# Build the long-format table of balance and unfairness measures for one model.
#
# Reads sheet `run` of "balance measures.ods" and of
# "unfairness measures <model>.ods" (paths are prefixed with `base_paper`;
# `run` and `base_paper` are looked up outside this function), keeps rows
# 10:38 of each sheet, drops the 'Age category' attribute, merges the two
# tables and pivots them so that each row holds one (FairIx, BalIx) pair.
#
# Args:
#   model: model tag that completes the unfairness file name.
#   smote: TRUE reads from "measures smote/", FALSE (default) from "measures/".
#
# Returns a data frame with Dataset, Attribute, m, FairIx / FairValue and
# BalIx / BalValue (values multiplied by 100 and truncated to integer),
# Balance.Class, model and type ('smote' or 'baseline').
model_df <- function(model, smote = FALSE) {
  # "Diff Ind" ,"Diff TPR","Diff FPR"
  # "Diff PPV","Diff NPV", "Diff OAE"
  fairIx <- c("Diff Ind", "Diff TPR", "Diff FPR", "Diff PPV", "Diff NPV", "Diff OAE")

  # Row window and attribute filter shared by both spreadsheets.
  read_measures <- function(path) {
    readODS::read_ods(path, run) %>%
      slice(10:38) %>% # 29 or 38 (38 -> without Compas, juvenile, community and Age category)
      # NOTE(review): 'Age cateogry' presumably matches a misspelling present
      # in the sheets -- verify before removing it.
      filter(Attribute != 'Age category' & Attribute != 'Age cateogry')
  }

  imbalance_measures <- read_measures(paste0(base_paper, "balance measures.ods")) %>%
    select("Dataset -", "Attribute", "Gini", "Shannon", "Simpson", "IIR")

  # `smote` only switches the input folder (author's note: ATTENTION HERE).
  measures_dir <- if (smote) "measures smote/" else "measures/"
  unfairness_measures <- read_measures(
    paste0(base_paper, measures_dir, "unfairness measures ", model, ".ods")
  ) %>%
    # all_of(): fairIx is a character vector defined above, not a column name.
    select("Dataset -", "Attribute", "m", all_of(fairIx))

  d <- merge(imbalance_measures, unfairness_measures) %>%
    rename(Dataset = 1) %>%
    mutate(Dataset = str_replace(Dataset, " *- *", "")) %>%
    pivot_longer(starts_with("Diff"), names_to = "FairIx",
                 names_prefix = "Diff ", values_to = "FairValue") %>%
    # as.integer() truncates toward zero; it does not round.
    transform(FairValue = as.integer(FairValue * 100)) %>%
    pivot_longer(c("Gini", "Shannon", "Simpson", "IIR"), names_to = "BalIx",
                 values_to = "BalValue") %>%
    transform(BalValue = as.integer(BalValue * 100)) %>%
    #mutate(FairIx = fct_relevel(fct_relabel(FairIx, ~
    #if_else(.=="Ind","Independence", paste0("Separation (",.,")"))), "Independence","Separation (TPR)")) %>%
    #if_else(.=="OAE","OAE", paste0("Sufficiency (",.,")"))), "Sufficiency (PPV)","Sufficiency (NPV)")) %>%
    mutate(Balance.Class = fct_rev(cut(BalValue, c(0, 33.3, 66.7, 100),
                                       labels = c("Imbalanced", "Undecided", "Balanced"),
                                       include.lowest = TRUE, ordered_result = TRUE)))

  # rep(..., nrow(d)) keeps these assignments valid for a zero-row table.
  d$model <- rep(model, nrow(d))
  d$type <- rep(if (smote) 'smote' else 'baseline', nrow(d))
  d
}
# Tag a model's measures table with a balance-risk label for one threshold.
#
# Calls model_df(model), adds `Risk` ("Higher risk" where BalValue < risk,
# "Lower risk" where it is not), prints the threshold, and stores the
# threshold as text in `risk_type`.
risk_df <- function(model, risk = 33) {
  measures <- mutate(
    model_df(model),
    Risk = if_else(BalValue < risk, "Higher risk", "Lower risk")
  )
  threshold_label <- as.character(risk)
  print(threshold_label)
  measures$risk_type <- rep(threshold_label, nrow(measures))
  measures
}
model <- 'knn'
# One copy of the model table per balance threshold; risk_df() records the
# threshold of each copy in `risk_type`.
d <- bind_rows(risk_df(model, 25), risk_df(model, 33), risk_df(model, 50),
               risk_df(model, 75)) # , risk_df(model, 80))
# Fixed facet-row order for the fairness indices.
d$FairIx_f <- factor(d$FairIx, levels = c('Ind', 'TPR', 'FPR', 'PPV', 'NPV', 'OAE'))
# risk_type is character: order its levels numerically so the x axis follows
# the threshold values instead of alphabetical order.
d$risk_type <- factor(d$risk_type,
                      levels = as.character(sort(unique(as.numeric(d$risk_type)))))
# Box-Plot
ggplot(d, aes(x = risk_type, y = FairValue, fill = Risk, color = Risk)) +
  geom_boxplot(alpha = 0.5, width = 0.6) +
  # guide = "none" replaces the deprecated guide = FALSE
  scale_fill_manual(values = c("Firebrick", "gold"), guide = "none") +
  scale_color_manual(values = c("Firebrick", "gold"), guide = "none") +
  ylab("Unfair") +
  xlab("Type") +
  facet_grid(FairIx_f ~ BalIx) +
  coord_cartesian(clip = "off") +
  annotate("text", x = 0.5, y = 0, label = TeX("fair"),
           parse = TRUE, size = 2.5, color = "gray40", hjust = 0) +
  annotate("text", x = 1.15, y = 56, label = TeX("unfair"),
           parse = TRUE, size = 2.5, color = "gray40", hjust = 1) +
  #  annotate("text", x=c(0,50),y=c(0,0),label=c("<-- fair","unfair -->"))+
  theme_light() +
  theme(strip.text.y = element_text(angle = 0),
        axis.title.y = element_text(angle = 0, hjust = 0, vjust = 1.05))
ggsave('images/extensive thresholds comparison knn.png', plot = last_plot())
# Extensive analysis: differences among different thresholds
# (box-plot distributions).
# Sheet name handed to readODS::read_ods() by model_df().
run <- 'run1'
# Path prefix of the measures spreadsheets.
# Author's note: base paper measures/  OR 'all attributes' (for no base paper)
base_paper <- 'all attributes measures/'
# Compas and Juvenile removal
# Build the long-format table of balance and unfairness measures for one model.
#
# Reads sheet `run` of "balance measures.ods" and of
# "unfairness measures <model>.ods" (paths are prefixed with `base_paper`;
# `run` and `base_paper` are looked up outside this function), keeps rows
# 10:38 of each sheet, drops the 'Age category' attribute, merges the two
# tables and pivots them so that each row holds one (FairIx, BalIx) pair.
#
# Args:
#   model: model tag that completes the unfairness file name.
#   smote: TRUE reads from "measures smote/", FALSE (default) from "measures/".
#
# Returns a data frame with Dataset, Attribute, m, FairIx / FairValue and
# BalIx / BalValue (values multiplied by 100 and truncated to integer),
# Balance.Class, model and type ('smote' or 'baseline').
model_df <- function(model, smote = FALSE) {
  # "Diff Ind" ,"Diff TPR","Diff FPR"
  # "Diff PPV","Diff NPV", "Diff OAE"
  fairIx <- c("Diff Ind", "Diff TPR", "Diff FPR", "Diff PPV", "Diff NPV", "Diff OAE")

  # Row window and attribute filter shared by both spreadsheets.
  read_measures <- function(path) {
    readODS::read_ods(path, run) %>%
      slice(10:38) %>% # 29 or 38 (38 -> without Compas, juvenile, community and Age category)
      # NOTE(review): 'Age cateogry' presumably matches a misspelling present
      # in the sheets -- verify before removing it.
      filter(Attribute != 'Age category' & Attribute != 'Age cateogry')
  }

  imbalance_measures <- read_measures(paste0(base_paper, "balance measures.ods")) %>%
    select("Dataset -", "Attribute", "Gini", "Shannon", "Simpson", "IIR")

  # `smote` only switches the input folder (author's note: ATTENTION HERE).
  measures_dir <- if (smote) "measures smote/" else "measures/"
  unfairness_measures <- read_measures(
    paste0(base_paper, measures_dir, "unfairness measures ", model, ".ods")
  ) %>%
    # all_of(): fairIx is a character vector defined above, not a column name.
    select("Dataset -", "Attribute", "m", all_of(fairIx))

  d <- merge(imbalance_measures, unfairness_measures) %>%
    rename(Dataset = 1) %>%
    mutate(Dataset = str_replace(Dataset, " *- *", "")) %>%
    pivot_longer(starts_with("Diff"), names_to = "FairIx",
                 names_prefix = "Diff ", values_to = "FairValue") %>%
    # as.integer() truncates toward zero; it does not round.
    transform(FairValue = as.integer(FairValue * 100)) %>%
    pivot_longer(c("Gini", "Shannon", "Simpson", "IIR"), names_to = "BalIx",
                 values_to = "BalValue") %>%
    transform(BalValue = as.integer(BalValue * 100)) %>%
    #mutate(FairIx = fct_relevel(fct_relabel(FairIx, ~
    #if_else(.=="Ind","Independence", paste0("Separation (",.,")"))), "Independence","Separation (TPR)")) %>%
    #if_else(.=="OAE","OAE", paste0("Sufficiency (",.,")"))), "Sufficiency (PPV)","Sufficiency (NPV)")) %>%
    mutate(Balance.Class = fct_rev(cut(BalValue, c(0, 33.3, 66.7, 100),
                                       labels = c("Imbalanced", "Undecided", "Balanced"),
                                       include.lowest = TRUE, ordered_result = TRUE)))

  # rep(..., nrow(d)) keeps these assignments valid for a zero-row table.
  d$model <- rep(model, nrow(d))
  d$type <- rep(if (smote) 'smote' else 'baseline', nrow(d))
  d
}
# Tag a model's measures table with a balance-risk label for one threshold.
#
# Calls model_df(model), adds `Risk` ("Higher risk" where BalValue < risk,
# "Lower risk" where it is not), prints the threshold, and stores the
# threshold as text in `risk_type`.
risk_df <- function(model, risk = 33) {
  measures <- mutate(
    model_df(model),
    Risk = if_else(BalValue < risk, "Higher risk", "Lower risk")
  )
  threshold_label <- as.character(risk)
  print(threshold_label)
  measures$risk_type <- rep(threshold_label, nrow(measures))
  measures
}
model <- 'rf'
# One copy of the model table per balance threshold; risk_df() records the
# threshold of each copy in `risk_type`.
d <- bind_rows(risk_df(model, 25), risk_df(model, 33), risk_df(model, 50),
               risk_df(model, 75)) # , risk_df(model, 80))
# Fixed facet-row order for the fairness indices.
d$FairIx_f <- factor(d$FairIx, levels = c('Ind', 'TPR', 'FPR', 'PPV', 'NPV', 'OAE'))
# risk_type is character: order its levels numerically so the x axis follows
# the threshold values instead of alphabetical order.
d$risk_type <- factor(d$risk_type,
                      levels = as.character(sort(unique(as.numeric(d$risk_type)))))
# Box-Plot
ggplot(d, aes(x = risk_type, y = FairValue, fill = Risk, color = Risk)) +
  geom_boxplot(alpha = 0.5, width = 0.6) +
  # guide = "none" replaces the deprecated guide = FALSE
  scale_fill_manual(values = c("Firebrick", "gold"), guide = "none") +
  scale_color_manual(values = c("Firebrick", "gold"), guide = "none") +
  ylab("Unfair") +
  xlab("Type") +
  facet_grid(FairIx_f ~ BalIx) +
  coord_cartesian(clip = "off") +
  annotate("text", x = 0.5, y = 0, label = TeX("fair"),
           parse = TRUE, size = 2.5, color = "gray40", hjust = 0) +
  annotate("text", x = 1.15, y = 56, label = TeX("unfair"),
           parse = TRUE, size = 2.5, color = "gray40", hjust = 1) +
  #  annotate("text", x=c(0,50),y=c(0,0),label=c("<-- fair","unfair -->"))+
  theme_light() +
  theme(strip.text.y = element_text(angle = 0),
        axis.title.y = element_text(angle = 0, hjust = 0, vjust = 1.05))
# NOTE(review): the other rf threshold plots in this file save to this same
# path, so whichever runs last overwrites the others -- confirm that is intended.
ggsave('images/extensive thresholds comparison rf.png', plot = last_plot())
library(ggplot2)
library(tidyverse)
library(ggrepel)
library(latex2exp)
library(readODS)
knitr::opts_chunk$set(echo = FALSE)
# Extensive analysis: differences among different thresholds
# (box-plot distributions).
# Sheet name handed to readODS::read_ods() by model_df().
run <- 'run1'
# Path prefix of the measures spreadsheets.
# Author's note: base paper measures/  OR 'all attributes' (for no base paper)
base_paper <- 'all attributes measures/'
# Compas and Juvenile removal
# Build the long-format table of balance and unfairness measures for one model.
#
# Reads sheet `run` of "balance measures.ods" and of
# "unfairness measures <model>.ods" (paths are prefixed with `base_paper`;
# `run` and `base_paper` are looked up outside this function), keeps rows
# 10:38 of each sheet, drops the 'Age category' attribute, merges the two
# tables and pivots them so that each row holds one (FairIx, BalIx) pair.
#
# Args:
#   model: model tag that completes the unfairness file name.
#   smote: TRUE reads from "measures smote/", FALSE (default) from "measures/".
#
# Returns a data frame with Dataset, Attribute, m, FairIx / FairValue and
# BalIx / BalValue (values multiplied by 100 and truncated to integer),
# Balance.Class, model and type ('smote' or 'baseline').
model_df <- function(model, smote = FALSE) {
  # "Diff Ind" ,"Diff TPR","Diff FPR"
  # "Diff PPV","Diff NPV", "Diff OAE"
  fairIx <- c("Diff Ind", "Diff TPR", "Diff FPR", "Diff PPV", "Diff NPV", "Diff OAE")

  # Row window and attribute filter shared by both spreadsheets.
  read_measures <- function(path) {
    readODS::read_ods(path, run) %>%
      slice(10:38) %>% # 29 or 38 (38 -> without Compas, juvenile, community and Age category)
      # NOTE(review): 'Age cateogry' presumably matches a misspelling present
      # in the sheets -- verify before removing it.
      filter(Attribute != 'Age category' & Attribute != 'Age cateogry')
  }

  imbalance_measures <- read_measures(paste0(base_paper, "balance measures.ods")) %>%
    select("Dataset -", "Attribute", "Gini", "Shannon", "Simpson", "IIR")

  # `smote` only switches the input folder (author's note: ATTENTION HERE).
  measures_dir <- if (smote) "measures smote/" else "measures/"
  unfairness_measures <- read_measures(
    paste0(base_paper, measures_dir, "unfairness measures ", model, ".ods")
  ) %>%
    # all_of(): fairIx is a character vector defined above, not a column name.
    select("Dataset -", "Attribute", "m", all_of(fairIx))

  d <- merge(imbalance_measures, unfairness_measures) %>%
    rename(Dataset = 1) %>%
    mutate(Dataset = str_replace(Dataset, " *- *", "")) %>%
    pivot_longer(starts_with("Diff"), names_to = "FairIx",
                 names_prefix = "Diff ", values_to = "FairValue") %>%
    # as.integer() truncates toward zero; it does not round.
    transform(FairValue = as.integer(FairValue * 100)) %>%
    pivot_longer(c("Gini", "Shannon", "Simpson", "IIR"), names_to = "BalIx",
                 values_to = "BalValue") %>%
    transform(BalValue = as.integer(BalValue * 100)) %>%
    #mutate(FairIx = fct_relevel(fct_relabel(FairIx, ~
    #if_else(.=="Ind","Independence", paste0("Separation (",.,")"))), "Independence","Separation (TPR)")) %>%
    #if_else(.=="OAE","OAE", paste0("Sufficiency (",.,")"))), "Sufficiency (PPV)","Sufficiency (NPV)")) %>%
    mutate(Balance.Class = fct_rev(cut(BalValue, c(0, 33.3, 66.7, 100),
                                       labels = c("Imbalanced", "Undecided", "Balanced"),
                                       include.lowest = TRUE, ordered_result = TRUE)))

  # rep(..., nrow(d)) keeps these assignments valid for a zero-row table.
  d$model <- rep(model, nrow(d))
  d$type <- rep(if (smote) 'smote' else 'baseline', nrow(d))
  d
}
# Tag a model's measures table with a balance-risk label for one threshold.
#
# Calls model_df(model), adds `Risk` ("Higher risk" where BalValue < risk,
# "Lower risk" where it is not), prints the threshold, and stores the
# threshold as text in `risk_type`.
risk_df <- function(model, risk = 33) {
  measures <- mutate(
    model_df(model),
    Risk = if_else(BalValue < risk, "Higher risk", "Lower risk")
  )
  threshold_label <- as.character(risk)
  print(threshold_label)
  measures$risk_type <- rep(threshold_label, nrow(measures))
  measures
}
model <- 'rf'
# One copy of the model table per balance threshold; risk_df() records the
# threshold of each copy in `risk_type`.
d <- bind_rows(risk_df(model, 5), risk_df(model, 10), risk_df(model, 15),
               risk_df(model, 25), risk_df(model, 33), risk_df(model, 50),
               risk_df(model, 75)) # , risk_df(model, 80))
# Fixed facet-row order for the fairness indices.
d$FairIx_f <- factor(d$FairIx, levels = c('Ind', 'TPR', 'FPR', 'PPV', 'NPV', 'OAE'))
# risk_type is character, so a discrete axis would sort it alphabetically
# ("10", "15", "25", "33", "5", "50", "75"). Order the levels numerically so the
# x axis follows the threshold values.
d$risk_type <- factor(d$risk_type,
                      levels = as.character(sort(unique(as.numeric(d$risk_type)))))
# Box-Plot
ggplot(d, aes(x = risk_type, y = FairValue, fill = Risk, color = Risk)) +
  geom_boxplot(alpha = 0.5, width = 0.6) +
  # guide = "none" replaces the deprecated guide = FALSE
  scale_fill_manual(values = c("Firebrick", "gold"), guide = "none") +
  scale_color_manual(values = c("Firebrick", "gold"), guide = "none") +
  ylab("Unfair") +
  xlab("Type") +
  facet_grid(FairIx_f ~ BalIx) +
  coord_cartesian(clip = "off") +
  annotate("text", x = 0.5, y = 0, label = TeX("fair"),
           parse = TRUE, size = 2.5, color = "gray40", hjust = 0) +
  annotate("text", x = 1.15, y = 56, label = TeX("unfair"),
           parse = TRUE, size = 2.5, color = "gray40", hjust = 1) +
  #  annotate("text", x=c(0,50),y=c(0,0),label=c("<-- fair","unfair -->"))+
  theme_light() +
  theme(strip.text.y = element_text(angle = 0),
        axis.title.y = element_text(angle = 0, hjust = 0, vjust = 1.05))
# NOTE(review): the other rf threshold plots in this file save to this same
# path, so whichever runs last overwrites the others -- confirm that is intended.
ggsave('images/extensive thresholds comparison rf.png', plot = last_plot())
# Extensive analysis: differences among different thresholds
# (box-plot distributions).
# Sheet name handed to readODS::read_ods() by model_df().
run <- 'run1'
# Path prefix of the measures spreadsheets.
# Author's note: base paper measures/  OR 'all attributes' (for no base paper)
base_paper <- 'all attributes measures/'
# Compas and Juvenile removal
# Build the long-format table of balance and unfairness measures for one model.
#
# Reads sheet `run` of "balance measures.ods" and of
# "unfairness measures <model>.ods" (paths are prefixed with `base_paper`;
# `run` and `base_paper` are looked up outside this function), keeps rows
# 10:38 of each sheet, drops the 'Age category' attribute, merges the two
# tables and pivots them so that each row holds one (FairIx, BalIx) pair.
#
# Args:
#   model: model tag that completes the unfairness file name.
#   smote: TRUE reads from "measures smote/", FALSE (default) from "measures/".
#
# Returns a data frame with Dataset, Attribute, m, FairIx / FairValue and
# BalIx / BalValue (values multiplied by 100 and truncated to integer),
# Balance.Class, model and type ('smote' or 'baseline').
model_df <- function(model, smote = FALSE) {
  # "Diff Ind" ,"Diff TPR","Diff FPR"
  # "Diff PPV","Diff NPV", "Diff OAE"
  fairIx <- c("Diff Ind", "Diff TPR", "Diff FPR", "Diff PPV", "Diff NPV", "Diff OAE")

  # Row window and attribute filter shared by both spreadsheets.
  read_measures <- function(path) {
    readODS::read_ods(path, run) %>%
      slice(10:38) %>% # 29 or 38 (38 -> without Compas, juvenile, community and Age category)
      # NOTE(review): 'Age cateogry' presumably matches a misspelling present
      # in the sheets -- verify before removing it.
      filter(Attribute != 'Age category' & Attribute != 'Age cateogry')
  }

  imbalance_measures <- read_measures(paste0(base_paper, "balance measures.ods")) %>%
    select("Dataset -", "Attribute", "Gini", "Shannon", "Simpson", "IIR")

  # `smote` only switches the input folder (author's note: ATTENTION HERE).
  measures_dir <- if (smote) "measures smote/" else "measures/"
  unfairness_measures <- read_measures(
    paste0(base_paper, measures_dir, "unfairness measures ", model, ".ods")
  ) %>%
    # all_of(): fairIx is a character vector defined above, not a column name.
    select("Dataset -", "Attribute", "m", all_of(fairIx))

  d <- merge(imbalance_measures, unfairness_measures) %>%
    rename(Dataset = 1) %>%
    mutate(Dataset = str_replace(Dataset, " *- *", "")) %>%
    pivot_longer(starts_with("Diff"), names_to = "FairIx",
                 names_prefix = "Diff ", values_to = "FairValue") %>%
    # as.integer() truncates toward zero; it does not round.
    transform(FairValue = as.integer(FairValue * 100)) %>%
    pivot_longer(c("Gini", "Shannon", "Simpson", "IIR"), names_to = "BalIx",
                 values_to = "BalValue") %>%
    transform(BalValue = as.integer(BalValue * 100)) %>%
    #mutate(FairIx = fct_relevel(fct_relabel(FairIx, ~
    #if_else(.=="Ind","Independence", paste0("Separation (",.,")"))), "Independence","Separation (TPR)")) %>%
    #if_else(.=="OAE","OAE", paste0("Sufficiency (",.,")"))), "Sufficiency (PPV)","Sufficiency (NPV)")) %>%
    mutate(Balance.Class = fct_rev(cut(BalValue, c(0, 33.3, 66.7, 100),
                                       labels = c("Imbalanced", "Undecided", "Balanced"),
                                       include.lowest = TRUE, ordered_result = TRUE)))

  # rep(..., nrow(d)) keeps these assignments valid for a zero-row table.
  d$model <- rep(model, nrow(d))
  d$type <- rep(if (smote) 'smote' else 'baseline', nrow(d))
  d
}
# Tag a model's measures table with a balance-risk label for one threshold.
#
# Calls model_df(model), adds `Risk` ("Higher risk" where BalValue < risk,
# "Lower risk" where it is not), prints the threshold, and stores the
# threshold as text in `risk_type`.
risk_df <- function(model, risk = 33) {
  measures <- mutate(
    model_df(model),
    Risk = if_else(BalValue < risk, "Higher risk", "Lower risk")
  )
  threshold_label <- as.character(risk)
  print(threshold_label)
  measures$risk_type <- rep(threshold_label, nrow(measures))
  measures
}
model <- 'rf'
# One copy of the model table per balance threshold; risk_df() records the
# threshold of each copy in `risk_type`.
d <- bind_rows(risk_df(model, 18), risk_df(model, 25), risk_df(model, 33),
               risk_df(model, 50), risk_df(model, 75)) # , risk_df(model, 80))
# Fixed facet-row order for the fairness indices.
d$FairIx_f <- factor(d$FairIx, levels = c('Ind', 'TPR', 'FPR', 'PPV', 'NPV', 'OAE'))
# risk_type is character: order its levels numerically so the x axis follows
# the threshold values instead of alphabetical order.
d$risk_type <- factor(d$risk_type,
                      levels = as.character(sort(unique(as.numeric(d$risk_type)))))
# Box-Plot
ggplot(d, aes(x = risk_type, y = FairValue, fill = Risk, color = Risk)) +
  geom_boxplot(alpha = 0.5, width = 0.6) +
  # guide = "none" replaces the deprecated guide = FALSE
  scale_fill_manual(values = c("Firebrick", "gold"), guide = "none") +
  scale_color_manual(values = c("Firebrick", "gold"), guide = "none") +
  ylab("Unfair") +
  xlab("Type") +
  facet_grid(FairIx_f ~ BalIx) +
  coord_cartesian(clip = "off") +
  annotate("text", x = 0.5, y = 0, label = TeX("fair"),
           parse = TRUE, size = 2.5, color = "gray40", hjust = 0) +
  annotate("text", x = 1.15, y = 56, label = TeX("unfair"),
           parse = TRUE, size = 2.5, color = "gray40", hjust = 1) +
  #  annotate("text", x=c(0,50),y=c(0,0),label=c("<-- fair","unfair -->"))+
  theme_light() +
  theme(strip.text.y = element_text(angle = 0),
        axis.title.y = element_text(angle = 0, hjust = 0, vjust = 1.05))
# NOTE(review): the other rf threshold plots in this file save to this same
# path, so whichever runs last overwrites the others -- confirm that is intended.
ggsave('images/extensive thresholds comparison rf.png', plot = last_plot())
