import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['figure.dpi']= 100
import time
import csv
df_instituciones = pd.read_csv('Acronimos Organismos.csv', index_col=0, encoding="ISO-8859-1", delimiter =';')
region_codes = {'Region Metropolitana de Santiago':13,
'Region de Antofagasta':2,
'Region de Arica y Parinacota':15,
'Region de Atacama':3,
'Region de Aysen':11,
'Region de Coquimbo':4,
'Region de La Araucania':9,
'Region de Los Lagos':10,
'Region de Los Rios':14,
'Region de Magallanes':12,
'Region de Nuble':16,
'Region de OHiggins':6,
'Region de Tarapaca':1,
'Region de Valparaiso':5,
'Region del Biobio':8,
'Region del Maule':7}
df_estate = pd.read_csv('Results/Results - EstateResult.csv')
df_estate[['Estate', 'Region']] = df_estate['Estates by region'].str.split(pat=',', n=1, expand=True)
df_estate = df_estate[df_estate['Current Distint Names']>0]
##Summary
print(df_estate['LABEL'].value_counts().to_latex())
\begin{tabular}{lr} \toprule {} & LABEL \\ \midrule na & 44 \\ s(***) & 40 \\ n(***) & 28 \\ n(**) & 15 \\ s(*) & 9 \\ n(*) & 9 \\ s(**) & 3 \\ \bottomrule \end{tabular}
df_organism = pd.read_csv('Results/Results - OrganismResult.csv')
df_organism = df_organism[df_organism['Current Distint Names']>0]
##1 SUMMARY TOTAL
print(df_organism['LABEL'].value_counts().to_latex())
\begin{tabular}{lr} \toprule {} & LABEL \\ \midrule na & 173 \\ n(***) & 162 \\ s(***) & 112 \\ n(**) & 68 \\ n(*) & 37 \\ s(**) & 13 \\ s(*) & 12 \\ \bottomrule \end{tabular}
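# Hedged helper (not part of the original pipeline): per the significance legend used in the
# Full Summary figure further below, 'na' means non-significant (p > 0.1), '***' p < 0.01,
# '**' 0.01 < p < 0.05, '*' 0.05 < p < 0.1, with 'n' = nepotism and 's' = overrepresentation.
def describe_label(label):
    """Expand a LABEL code such as 'n(**)' into a (risk type, p-value band) tuple."""
    bands = {'(***)': 'p < 0.01', '(**)': '0.01 < p < 0.05', '(*)': '0.05 < p < 0.1'}
    if label == 'na':
        return ('non-significant', 'p > 0.1')
    kind = 'nepotism' if label.startswith('n') else 'overrepresentation'
    return (kind, bands[label[1:]])
# Example: describe_label('n(**)') -> ('nepotism', '0.01 < p < 0.05')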
#2 BY REGIONS NEPOTISM
target = pd.merge(df_organism, df_instituciones, left_on='Organism', right_on='Organismo', how='inner')
target = target[target['LABEL'].isin(['n(***)','n(**)'])]
# Note: the ranking keys below use the unaccented region spelling from df_estate, so the accented
# names coming from df_instituciones are left unchanged by .replace (see the check further down).
target['Region_ranking'] = target['Region']
target['Region_ranking'] = target.Region_ranking.replace({'Region de Arica y Parinacota': 1,
'Region de Tarapaca': 2,
'Region de Antofagasta': 3,
'Region de Atacama': 4,
'Region de Coquimbo': 5,
'Region de Valparaiso':6,
'Region Metropolitana de Santiago': 7,
'Region de OHiggins': 8,
'Region del Maule': 9,
'Region de Nuble': 10,
'Region del Biobio': 11,
'Region de La Araucania':12,
'Region de Los Rios': 13,
'Region de Los Lagos': 14,
'Region de Aysen': 15,
'Region de Magallanes':16})
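# Sketch (not in the original code): Series.map returns NaN for keys missing from the dictionary,
# unlike .replace, which silently leaves unmatched values unchanged. Mapping against `region_codes`
# (same unaccented spelling as the ranking keys above) flags region strings from df_instituciones
# that the ranking dictionary would not match.
unmatched = target.loc[target['Region'].map(region_codes).isna(), 'Region'].unique()
print('Region spellings without a ranking:', unmatched)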
print(target['Region'].value_counts().sort_index(ascending=True).to_latex())
\begin{tabular}{lr} \toprule {} & Region \\ \midrule Región Metropolitana de Santiago & 61 \\ Región de Antofagasta & 6 \\ Región de Atacama & 2 \\ Región de Aysén del General Carlos Ibáñez del C... & 1 \\ Región de Coquimbo & 6 \\ Región de La Araucanía & 25 \\ Región de Los Lagos & 22 \\ Región de Los Ríos & 11 \\ Región de Magallanes y de la Antártica Chilena & 7 \\ Región de Tarapacá & 3 \\ Región de Valparaíso & 24 \\ Región de Ñuble & 1 \\ Región del Biobío & 21 \\ Región del Libertador General Bernardo OHiggins & 21 \\ Región del Maule & 16 \\ \bottomrule \end{tabular}
target = pd.merge(df_organism, df_instituciones, left_on='Organism', right_on='Organismo', how='inner')
print(target['Region'].value_counts().sort_index(ascending=True).to_latex())
\begin{tabular}{lr} \toprule {} & Region \\ \midrule Región Metropolitana de Santiago & 203 \\ Región de Antofagasta & 10 \\ Región de Arica y Parinacota & 8 \\ Región de Atacama & 10 \\ Región de Aysén del General Carlos Ibáñez del C... & 15 \\ Región de Coquimbo & 20 \\ Región de La Araucanía & 34 \\ Región de Los Lagos & 41 \\ Región de Los Ríos & 16 \\ Región de Magallanes y de la Antártica Chilena & 16 \\ Región de Tarapacá & 10 \\ Región de Valparaíso & 45 \\ Región de Ñuble & 24 \\ Región del Biobío & 41 \\ Región del Libertador General Bernardo OHiggins & 40 \\ Región del Maule & 30 \\ \bottomrule \end{tabular}
#3 BY AREAS NEPOTISM
target = pd.merge(df_organism, df_instituciones, left_on='Organism', right_on='Organismo', how='inner')
target = target[target['LABEL'].isin(['n(***)','n(**)'])]
print(target['Padre_org'].value_counts().to_latex())
\begin{tabular}{lr} \toprule {} & Padre\_org \\ \midrule Salud & 38 \\ Municipios de R. Metropolitana de Santiago & 29 \\ Corporaciones Municipales & 20 \\ Municipios de La Araucanía & 20 \\ Municipios de Valparaíso & 19 \\ Municipios del Libertador General Bernardo OHig... & 18 \\ Municipios del Bíobio & 16 \\ Municipios del Maule & 15 \\ Municipios de Los Lagos & 12 \\ Municipios de Los Ríos & 9 \\ Municipios de Coquimbo & 6 \\ Municipios de Magallanes y de la Antártica Chilena & 6 \\ Educación & 3 \\ Municipios de Antofagasta & 3 \\ Obras Públicas & 2 \\ Municipios de Atacama & 2 \\ Defensa Nacional & 2 \\ Municipios de Tarapacá & 2 \\ Hacienda & 1 \\ Interior y Seguridad Pública & 1 \\ Municipios de Ñuble & 1 \\ Municipios de Aysen del General Carlos Ibáñez d... & 1 \\ Vivienda y Urbanismo & 1 \\ Transportes y Telecomunicaciones & 1 \\ \bottomrule \end{tabular}
#4 BY REGION OVERREPRESENTATION
target = pd.merge(df_organism, df_instituciones, left_on='Organism', right_on='Organismo', how='inner')
target = target[target['LABEL'].isin(['s(***)','s(**)'])]
#print(target['Region'].value_counts().to_latex())
target['Region'].value_counts().sort_index(ascending=True)
Región Metropolitana de Santiago                        55
Región de Antofagasta                                    1
Región de Arica y Parinacota                             3
Región de Aysén del General Carlos Ibáñez del Campo      9
Región de Coquimbo                                       3
Región de La Araucanía                                   2
Región de Los Lagos                                      2
Región de Los Ríos                                       2
Región de Magallanes y de la Antártica Chilena           4
Región de Tarapacá                                       2
Región de Valparaíso                                     4
Región de Ñuble                                         16
Región del Biobío                                        7
Región del Libertador General Bernardo OHiggins          6
Región del Maule                                         2
Name: Region, dtype: int64
#5 BY AREAS OVERREPRESENTATION
target = pd.merge(df_organism, df_instituciones, left_on='Organism', right_on='Organismo', how='inner')
target = target[target['LABEL'].isin(['s(***)','s(**)'])]
print(target['Padre_org'].value_counts().to_latex())
\begin{tabular}{lr} \toprule {} & Padre\_org \\ \midrule Salud & 20 \\ Educación & 11 \\ Municipios de Ñuble & 11 \\ Interior y Seguridad Pública & 9 \\ Vivienda y Urbanismo & 7 \\ Economía Fomento y Turismo & 7 \\ Obras Públicas & 6 \\ Municipios de Aysen del General Carlos Ibáñez d... & 6 \\ Trabajo y Previsión Social & 4 \\ Corporaciones Municipales & 3 \\ Minería & 3 \\ Hacienda & 3 \\ Municipios del Bíobio & 3 \\ Desarrollo Social & 3 \\ Agricultura & 2 \\ Municipios del Libertador General Bernardo OHig... & 2 \\ Justicia & 2 \\ Municipios de Coquimbo & 2 \\ Energía & 1 \\ Superintendencia de Servicios Sanitarios & 1 \\ Universidades & 1 \\ Ciencia Tecnología e Innovación & 1 \\ Municipios del Maule & 1 \\ Municipios de R. Metropolitana de Santiago & 1 \\ Deporte & 1 \\ Transportes y Telecomunicaciones & 1 \\ Municipios de Magallanes y de la Antártica Chilena & 1 \\ Municipios de Arica y Parinacota & 1 \\ Medio Ambiente & 1 \\ Relaciones Exteriores & 1 \\ Defensa Nacional & 1 \\ Secretaría General de la Presidencia & 1 \\ Culturas y las Artes & 1 \\ \bottomrule \end{tabular}
##1 SUMMARY TOTAL
print(df_estate['LABEL'].value_counts().to_latex())
\begin{tabular}{lr} \toprule {} & LABEL \\ \midrule na & 44 \\ s(***) & 40 \\ n(***) & 28 \\ n(**) & 15 \\ s(*) & 9 \\ n(*) & 9 \\ s(**) & 3 \\ \bottomrule \end{tabular}
#2 BY REGIONS NEPOTISM
df_estate = df_estate[df_estate['Current Distint Names'] > 0 ]
target = df_estate[df_estate['LABEL'].isin(['n(***)','n(**)'])]
print(target['Region'].value_counts().sort_index(ascending=True).to_latex())
\begin{tabular}{lr} \toprule {} & Region \\ \midrule Region Metropolitana de Santiago & 3 \\ Region de Antofagasta & 5 \\ Region de Arica y Parinacota & 3 \\ Region de Atacama & 2 \\ Region de Aysen & 4 \\ Region de Coquimbo & 1 \\ Region de La Araucania & 4 \\ Region de Los Lagos & 5 \\ Region de Los Rios & 2 \\ Region de Magallanes & 3 \\ Region de OHiggins & 2 \\ Region de Tarapaca & 4 \\ Region de Valparaiso & 3 \\ Region del Biobio & 2 \\ \bottomrule \end{tabular}
#3 BY HIERARCHY NEPOTISM
target = df_estate[df_estate['LABEL'].isin(['n(***)','n(**)'])]
print(target['Estate'].value_counts().sort_index(ascending=True).to_latex())
\begin{tabular}{lr} \toprule {} & Estate \\ \midrule EstamentoInferior & 12 \\ Tecnicos & 8 \\ EstamentoSuperiorSalud & 6 \\ Alcaldes & 5 \\ EstamentoSuperior & 3 \\ ProfesionalesEducacion & 3 \\ Jefaturas & 3 \\ Profesionales & 2 \\ Fiscalizadores & 1 \\ \bottomrule \end{tabular}
#4 BY REGIONS OVERREPRESENTATION
df_estate = df_estate[df_estate['Current Distint Names'] > 0 ]
target = df_estate[df_estate['LABEL'].isin(['s(***)','s(**)'])]
print(target['Region'].value_counts().sort_index(ascending=True).to_latex())
print(df_estate['Region'].value_counts().sort_index(ascending=True).to_latex())
\begin{tabular}{lr} \toprule {} & Region \\ \midrule Region Metropolitana de Santiago & 3 \\ Region de Atacama & 2 \\ Region de Aysen & 5 \\ Region de Coquimbo & 4 \\ Region de La Araucania & 1 \\ Region de Los Lagos & 3 \\ Region de Los Rios & 1 \\ Region de Magallanes & 2 \\ Region de Nuble & 8 \\ Region de OHiggins & 5 \\ Region de Valparaiso & 2 \\ Region del Biobio & 3 \\ Region del Maule & 4 \\ \bottomrule \end{tabular}
\begin{tabular}{lr} \toprule {} & Region \\ \midrule Region Metropolitana de Santiago & 10 \\ Region de Antofagasta & 9 \\ Region de Arica y Parinacota & 9 \\ Region de Atacama & 9 \\ Region de Aysen & 9 \\ Region de Coquimbo & 9 \\ Region de La Araucania & 9 \\ Region de Los Lagos & 10 \\ Region de Los Rios & 9 \\ Region de Magallanes & 9 \\ Region de Nuble & 9 \\ Region de OHiggins & 10 \\ Region de Tarapaca & 9 \\ Region de Valparaiso & 10 \\ Region del Biobio & 9 \\ Region del Maule & 9 \\ \bottomrule \end{tabular}
#5 BY HIERARCHY OVERREPRESENTATION
target = df_estate[df_estate['LABEL'].isin(['s(***)','s(**)'])]
print(target['Estate'].value_counts().sort_index(ascending=True).to_latex())
\begin{tabular}{lr} \toprule {} & Estate \\ \midrule Directivos & 10 \\ Profesionales & 8 \\ Alcaldes & 6 \\ Fiscalizadores & 6 \\ EstamentoSuperiorSalud & 6 \\ ProfesionalesEducacion & 4 \\ Tecnicos & 1 \\ EstamentoInferior & 1 \\ Jefaturas & 1 \\ \bottomrule \end{tabular}
import geopandas as gpd
import matplotlib.pyplot as plt
#tracts = gpd.GeoDataFrame.from_file('Regiones/Regional.shp')
#print('Observations, Attributes:',tracts.shape)
#tracts = pd.merge(tracts, df_a, left_on='codregion', right_on='code', how='inner')
#tracts.head(4)
#tracts.set_index("code")
#tracts['geometry'][0]
#tracts.plot(column='count', cmap='OrRd', edgecolor='k', categorical=True, legend=True, legend_kwds={'loc': 'upper left'}, figsize=(10, 10))
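# Hedged sketch of the choropleth commented out above: count nepotism-flagged estate clusters per
# region, translate the names into official codes via `region_codes`, and merge on the shapefile's
# `codregion` column (assumes the 'Regiones/Regional.shp' file and columns referenced above exist).
#counts = (df_estate[df_estate['LABEL'].isin(['n(***)', 'n(**)'])]['Region']
#          .value_counts().rename_axis('Region').reset_index(name='count'))
#counts['code'] = counts['Region'].map(region_codes)
#tracts = gpd.GeoDataFrame.from_file('Regiones/Regional.shp')
#tracts = pd.merge(tracts, counts, left_on='codregion', right_on='code', how='left')
#tracts.plot(column='count', cmap='OrRd', edgecolor='k', legend=True, figsize=(10, 10))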
#Very Problematic Organisms
import plotly.graph_objects as go
labels = ['Institutions', 'Ranks']
na = [df_organism[df_organism['LABEL'].isin(['na', 's(*)','n(*)'])]['LABEL'].value_counts().sum()]
p = [df_organism[df_organism['LABEL'].isin(['n(***)', 's(***)'])]['LABEL'].value_counts().sum()]
fig = go.Figure(data=[
go.Bar(name='non-risky', x=labels, y=na, text=na, textposition='auto',),
go.Bar(name='very risky', x=labels, y=p, text=p, textposition='auto',)
])
# Change the bar mode
fig.update_layout(
title='Number of Very Risky and Non-risky Institutions ',
xaxis_tickfont_size=14,
yaxis=dict(
title='Number of Institutions',
titlefont_size=16,
tickfont_size=14,
),
barmode='group',
bargap=0.15, # gap between bars of adjacent location coordinates.
bargroupgap=0.1 # gap between bars of the same location coordinate.
)
fig.show()
#Problematic Organisms
import plotly.graph_objects as go
labels = ['Institutions', 'Ranks']
na = [df_organism[df_organism['LABEL'].isin(['na', 's(*)','n(*)'])]['LABEL'].value_counts().sum()]
p = [df_organism[df_organism['LABEL'].isin(['n(***)', 's(***)', 'n(**)', 's(**)'])]['LABEL'].value_counts().sum()]
fig = go.Figure(data=[
go.Bar(name='non-risky', x=labels, y=na, text=na, textposition='auto',),
go.Bar(name='risky', x=labels, y=p, text=p, textposition='auto',)
])
# Change the bar mode
fig.update_layout(
title='Number of Risky and Non-risky Institutions ',
xaxis_tickfont_size=14,
yaxis=dict(
title='Number of Institutions',
titlefont_size=16,
tickfont_size=14,
),
barmode='group',
bargap=0.15, # gap between bars of adjacent location coordinates.
bargroupgap=0.1 # gap between bars of the same location coordinate.
)
fig.show()
#Very Problematic Estates
import plotly.graph_objects as go
df_estate = df_estate[df_estate['Current Distint Names'] > 0 ]
labels = ['Ranks']
na = [df_estate[df_estate['LABEL'].isin(['na', 'n(*)', 's(*)'])]['LABEL'].value_counts().sum()]
p = [df_estate[df_estate['LABEL'].isin(['n(***)', 's(***)'])]['LABEL'].value_counts().sum()]
fig = go.Figure(data=[
go.Bar(name='non-risky', x=labels, y=na, text=na, textposition='auto',),
go.Bar(name='very risky', x=labels, y=p, text=p, textposition='auto',)
])
# Change the bar mode
fig.update_layout(
title='Number of Very Risky and Non-risky Ranks ',
xaxis_tickfont_size=14,
yaxis=dict(
title='Number of Ranks',
titlefont_size=16,
tickfont_size=14,
),
barmode='group',
bargap=0.15, # gap between bars of adjacent location coordinates.
bargroupgap=0.1 # gap between bars of the same location coordinate.
)
fig.show()
#Problematic Estates
import plotly.graph_objects as go
df_estate = df_estate[df_estate['Current Distint Names'] > 0 ]
labels = ['Ranks']
na = [df_estate[df_estate['LABEL'].isin(['na', 's(*)', 'n(*)'])]['LABEL'].value_counts().sum()]
p = [df_estate[df_estate['LABEL'].isin(['n(***)', 's(***)', 'n(**)', 's(**)'])]['LABEL'].value_counts().sum()]
fig = go.Figure(data=[
go.Bar(name='non-risky', x=labels, y=na, text=na, textposition='auto',),
go.Bar(name='risky', x=labels, y=p, text=p, textposition='auto',)
])
# Change the bar mode
fig.update_layout(
title='Number of Risky and Non-risky Ranks ',
xaxis_tickfont_size=14,
yaxis=dict(
title='Number of Ranks',
titlefont_size=16,
tickfont_size=14,
),
barmode='group',
bargap=0.15, # gap between bars of adjacent location coordinates.
bargroupgap=0.1 # gap between bars of the same location coordinate.
)
fig.show()
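# Refactoring sketch (not in the original notebook): the four bar-chart cells above differ only in
# the source dataframe, the LABEL groups, and the titles, so one helper can build any of them.
def risk_bar(df, groups, x_labels, title, y_title):
    """groups: list of (trace name, list of LABEL codes) pairs."""
    fig = go.Figure()
    for name, codes in groups:
        y = [df[df['LABEL'].isin(codes)]['LABEL'].value_counts().sum()]
        fig.add_trace(go.Bar(name=name, x=x_labels, y=y, text=y, textposition='auto'))
    fig.update_layout(title=title, barmode='group', bargap=0.15, bargroupgap=0.1,
                      xaxis_tickfont_size=14,
                      yaxis=dict(title=y_title, titlefont_size=16, tickfont_size=14))
    return fig
# Example, equivalent to the "Problematic Estates" chart above:
#risk_bar(df_estate,
#         [('non-risky', ['na', 's(*)', 'n(*)']), ('risky', ['n(***)', 's(***)', 'n(**)', 's(**)'])],
#         ['Ranks'], 'Number of Risky and Non-risky Ranks', 'Number of Ranks').show()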
#Organisms Summary
import plotly.graph_objects as go
labels = ['Institutions']
na = [df_organism[df_organism['LABEL'].isin(['na', 'n(*)', 's(*)'])]['LABEL'].value_counts().sum()]
n = [df_organism[df_organism['LABEL'].isin(['n(***)', 'n(**)'])]['LABEL'].value_counts().sum()]
s = [df_organism[df_organism['LABEL'].isin(['s(***)', 's(**)'])]['LABEL'].value_counts().sum()]
fig = go.Figure(data=[
go.Bar(name='non-risky', x=labels, y=na, text=na, textposition='auto',),
go.Bar(name='nepotism risk', x=labels, y=n, text=n, textposition='auto',),
go.Bar(name='elite capture risk', x=labels, y=s, text=s, textposition='auto',)
])
# Change the bar mode
fig.update_layout(
title='Number of Institutions: Non-risky vs Nepotism Risk vs Elite Capture Risk',
xaxis_tickfont_size=14,
yaxis=dict(
title='Number of Institutions',
titlefont_size=16,
tickfont_size=14,
),
barmode='group',
bargap=0.15, # gap between bars of adjacent location coordinates.
bargroupgap=0.1 # gap between bars of the same location coordinate.
)
fig.show()
#Organisms Summary
import plotly.graph_objects as go
target = pd.merge(df_organism, df_instituciones, left_on='Organism', right_on='Organismo', how='inner')
target = target[target['LABEL'].isin(['s(***)','s(**)'])]['Padre_org'].value_counts()
target = target[ target > 1]
labels = ['Organisms']
na = [df_organism[df_organism['LABEL'].isin(['na', 'n(*)', 's(*)'])]['LABEL'].value_counts().sum()]
n = [df_organism[df_organism['LABEL'].isin(['n(***)', 'n(**)'])]['LABEL'].value_counts().sum()]
s = [df_organism[df_organism['LABEL'].isin(['s(***)', 's(**)'])]['LABEL'].value_counts().sum()]
fig = go.Figure(data=[
go.Bar(name='non problematic (p value > 0.05)', x=labels, y=na, text=na, textposition='auto',),
go.Bar(name='nepotistic (p value < 0.05)', x=labels, y=n, text=n, textposition='auto',),
go.Bar(name='overrepresented (p value < 0.05)', x=labels, y=s, text=s, textposition='auto',)
])
# Change the bar mode
fig.update_layout(
title='Number of Organisms: Non-problematic vs Nepotistic vs Overrepresented',
xaxis_tickfont_size=14,
yaxis=dict(
title='Number of Organisms',
titlefont_size=16,
tickfont_size=14,
),
barmode='group',
bargap=0.15, # gap between bars of adjacent location coordinates.
bargroupgap=0.1 # gap between bars of the same location coordinate.
)
fig.show()
target = pd.merge(df_organism, df_instituciones, left_on='Organism', right_on='Organismo', how='inner')
na = target[target['LABEL'].isin(['na','n(*)','s(*)'])]['Padre_org'].value_counts()
ne = target[target['LABEL'].isin(['n(***)','n(**)'])]['Padre_org'].value_counts()
ov = target[target['LABEL'].isin(['s(***)','s(**)'])]['Padre_org'].value_counts()
#target = target[ target > 1]
#target['Padre_org'].value_counts()
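# Sketch (not part of the original): the three Series above can be combined into a single table per
# parent organisation, filling areas that have no flagged organisms with 0.
summary_by_area = (pd.concat({'non-risky': na, 'nepotism': ne, 'overrepresented': ov}, axis=1)
                   .fillna(0).astype(int).sort_values('nepotism', ascending=False))
summary_by_area.head(10)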
#Summary Estate
import plotly.graph_objects as go
a = df_estate[df_estate['LABEL'].isin(['s(***)', 's(**)'])]['LABEL'].value_counts().sum()
b = df_estate[df_estate['LABEL'].isin(['s(*)','na', 'n(*)'])]['LABEL'].value_counts().sum()
c = df_estate[df_estate['LABEL'].isin(['n(***)', 'n(**)'])]['LABEL'].value_counts().sum()
x= ['Estates']
fig = go.Figure(go.Bar(x=x, y=[a], name='overrepresented', text=[a], textposition='auto',))
fig.add_trace(go.Bar(x=x, y=[b], name='na', text=[b], textposition='auto',))
fig.add_trace(go.Bar(x=x, y=[c], name='nepotistic', text=[c], textposition='auto',))
fig.update_layout(barmode='stack', xaxis={'categoryorder':'total descending'})
fig.show()
#Estates Summary
import plotly.graph_objects as go
df_estate = df_estate[df_estate['Current Distint Names'] > 0 ]
labels = ['Ranks']
na = [df_estate[df_estate['LABEL'].isin(['na', 'n(*)', 's(*)'])]['LABEL'].value_counts().sum()]
n = [df_estate[df_estate['LABEL'].isin(['n(***)', 'n(**)'])]['LABEL'].value_counts().sum()]
s = [df_estate[df_estate['LABEL'].isin(['s(***)', 's(**)'])]['LABEL'].value_counts().sum()]
fig = go.Figure(data=[
go.Bar(name='non-risky', x=labels, y=na, text=na, textposition='auto',),
go.Bar(name='nepotism risk', x=labels, y=n, text=n, textposition='auto',),
go.Bar(name='elite capture risk', x=labels, y=s, text=s, textposition='auto',)
])
# Change the bar mode
fig.update_layout(
title='Number of Ranks: Non-risky vs Nepotism Risk vs Elite Capture Risk',
xaxis_tickfont_size=14,
yaxis=dict(
title='Number of Ranks',
titlefont_size=16,
tickfont_size=14,
),
barmode='group',
bargap=0.15, # gap between bars of adjacent location coordinates.
bargroupgap=0.1 # gap between bars of the same location coordinate.
)
fig.show()
#Details by Estate
import plotly.graph_objects as go
df_estate = df_estate[df_estate['Current Distint Names'] > 0 ]
a = df_estate[df_estate['LABEL'].isin(['s(***)', 's(**)'])]['Estate'].value_counts().reindex(
df_estate.Estate.unique(), fill_value=0).sort_index(ascending=True)
b = df_estate[df_estate['LABEL'].isin(['s(*)','na', 'n(*)'])]['Estate'].value_counts().reindex(
df_estate.Estate.unique(), fill_value=0).sort_index(ascending=True)
c = df_estate[df_estate['LABEL'].isin(['n(***)', 'n(**)'])]['Estate'].value_counts().reindex(
df_estate.Estate.unique(), fill_value=0).sort_index(ascending=True)
output = []
for i in range(len(a)):
    output.append([a.iloc[i], b.iloc[i], c.iloc[i]])
output
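# Equivalent sketch (not in the original): aligning the three Series by index avoids the positional
# lookups above and keeps the rank names attached to the counts.
risk_by_rank = pd.concat({'elite capture risk': a, 'non-risky': b, 'nepotism risk': c}, axis=1)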
x= list(df_estate[df_estate['LABEL'].isin(['na'])]['Estate'].value_counts().reindex(
df_estate.Estate.unique(), fill_value=0).sort_index(ascending=True).index)
x=['Mayors', 'Directors', 'Support', 'Superior Rank', 'Health Superior Rank', 'Auditors', 'Managers',
'Professionals', 'Education', 'Technical Roles']
fig = go.Figure(go.Bar(x=x, y=a, name='elite capture risk', marker_color='#00cc96'))
fig.add_trace(go.Bar(x=x, y=b, name='non-risky', marker_color='#636efa'))
fig.add_trace(go.Bar(x=x, y=c, name='nepotism risk', marker_color='#ef553b'))
fig.update_layout(barmode='stack', xaxis={'categoryorder':'total descending'},title = "Types of Risks by Rank",
xaxis_title = 'Ranks', yaxis_title =
'Number of Ranks')
fig.show()
#Director Clusters: Elite Capture Risk
import plotly.graph_objects as go
labels = ['Director Clusters']
directors = df_estate[df_estate['Estate'] == 'Directivos']
na = [directors[directors['LABEL'].isin(['na', 'n(*)', 's(*)'])]['LABEL'].value_counts().sum()]
p = [directors[directors['LABEL'].isin(['s(***)','s(**)'])]['LABEL'].value_counts().sum()]
fig = go.Figure(data=[
go.Bar(name='non-risky', x=labels, y=na, text=na, textposition='auto',),
go.Bar(name='elite capture risk', x=labels, y=p, text=p, textposition='auto', marker_color='#00cc96')
])
# Change the bar mode
fig.update_layout(
title='Number of Risky and Non-risky Director Clusters ',
xaxis_tickfont_size=14,
yaxis=dict(
title='Number of Rank Clusters',
titlefont_size=16,
tickfont_size=14,
),
barmode='group',
bargap=0.15, # gap between bars of adjacent location coordinates.
bargroupgap=0.1 # gap between bars of the same location coordinate.
)
fig.show()
# Full Summary
import plotly.graph_objects as go
labels = ['Organisms', 'Estates']
na = [df_organism[df_organism['LABEL'].isin(['na'])]['LABEL'].value_counts()[0],
df_estate[df_estate['LABEL'].isin(['na'])]['LABEL'].value_counts()[0]]
n3 = [df_organism[df_organism['LABEL'].isin(['n(***)'])]['LABEL'].value_counts()[0],
df_estate[df_estate['LABEL'].isin(['n(***)'])]['LABEL'].value_counts()[0]]
n2 = [df_organism[df_organism['LABEL'].isin(['n(**)'])]['LABEL'].value_counts()[0],
df_estate[df_estate['LABEL'].isin(['n(**)'])]['LABEL'].value_counts()[0]]
n = [df_organism[df_organism['LABEL'].isin(['n(*)'])]['LABEL'].value_counts()[0],
df_estate[df_estate['LABEL'].isin(['n(*)'])]['LABEL'].value_counts()[0]]
s3 = [df_organism[df_organism['LABEL'].isin(['s(***)'])]['LABEL'].value_counts()[0],
df_estate[df_estate['LABEL'].isin(['s(***)'])]['LABEL'].value_counts()[0]]
s2 = [df_organism[df_organism['LABEL'].isin(['s(**)'])]['LABEL'].value_counts()[0],
df_estate[df_estate['LABEL'].isin(['s(**)'])]['LABEL'].value_counts()[0]]
s = [df_organism[df_organism['LABEL'].isin(['s(*)'])]['LABEL'].value_counts()[0],
df_estate[df_estate['LABEL'].isin(['s(*)'])]['LABEL'].value_counts()[0]]
fig = go.Figure(data=[
go.Bar(name='non-significant (p value > 0.1)', x=labels, y=na, text=na, textposition='auto',),
go.Bar(name='nepotism (p value < 0.01)', x=labels, y=n3, text=n3, textposition='auto',),
go.Bar(name='nepotism (0.01 < p value < 0.05)', x=labels, y=n2, text=n2, textposition='auto',),
go.Bar(name='nepotism (0.05 < p value < 0.1)', x=labels, y=n, text=n, textposition='auto',),
go.Bar(name='overrepresented (p value < 0.01)', x=labels, y=s3, text=s3, textposition='auto',),
go.Bar(name='overrepresented (0.01 < p value < 0.05)', x=labels, y=s2, text=s2, textposition='auto',),
go.Bar(name='overrepresented (0.05 < p value < 0.1)', x=labels, y=s, text=s, textposition='auto',)
])
# Change the bar mode
fig.update_layout(
title='Number of Significant and Non-significant Clusters ',
xaxis_tickfont_size=14,
yaxis=dict(
title='Number of clusters',
titlefont_size=16,
tickfont_size=14,
),
barmode='group',
bargap=0.15, # gap between bars of adjacent location coordinates.
bargroupgap=0.1 # gap between bars of the same location coordinate.
)
fig.show()
#Summary Nepotism
import plotly.graph_objects as go
ntotal = [df_organism[df_organism['LABEL'].isin(['n(**)','n(***)'])]['LABEL'].value_counts().sum(),
df_estate[df_estate['LABEL'].isin(['n(**)','n(***)'])]['LABEL'].value_counts().sum()]
x=['Organisms', 'Estates']
fig = go.Figure(go.Bar(x=x, y=na, name='non-significant', text=na, textposition='auto',))
fig.add_trace(go.Bar(x=x, y=ntotal, name='nepotism p value < 0.05', text=ntotal, textposition='auto',))
# Change the bar mode
fig.update_layout(
title='Number of Nepotistic Clusters vs Non-nepotistic Clusters',
xaxis={'categoryorder':'total descending'},
xaxis_tickfont_size=14,
yaxis=dict(
title='Number of clusters',
titlefont_size=16,
tickfont_size=14,
),
barmode='stack',
bargap=0.15, # gap between bars of adjacent location coordinates.
bargroupgap=0.1 # gap between bars of the same location coordinate.
)
fig.show()
#Summary Overrepresentation
import plotly.graph_objects as go
stotal = [df_organism[df_organism['LABEL'].isin(['s(**)','s(***)'])]['LABEL'].value_counts().sum(),
df_estate[df_estate['LABEL'].isin(['s(**)','s(***)'])]['LABEL'].value_counts().sum()]
x=['Organisms', 'Estates']
fig = go.Figure(go.Bar(x=x, y=na, name='non-significant', text=na, textposition='auto',))
fig.add_trace(go.Bar(x=x, y=stotal, name='overrepresented p value < 0.05', text=stotal, textposition='auto',))
fig.update_layout(
title='Number of Overrepresented Clusters vs Non-overrepresented Clusters',
xaxis={'categoryorder':'total descending'},
xaxis_tickfont_size=14,
yaxis=dict(
title='Number of clusters',
titlefont_size=16,
tickfont_size=14,
),
barmode='stack',
bargap=0.15, # gap between bars of adjacent location coordinates.
bargroupgap=0.1 # gap between bars of the same location coordinate.
)
fig.show()
#Summary Nepotism
import plotly.graph_objects as go
labels = ['Organisms', 'Estates']
na = [df_organism[df_organism['LABEL'].isin(['na'])]['LABEL'].value_counts()[0],
df_estate[df_estate['LABEL'].isin(['na'])]['LABEL'].value_counts()[0]]
n3 = [df_organism[df_organism['LABEL'].isin(['n(***)'])]['LABEL'].value_counts()[0],
df_estate[df_estate['LABEL'].isin(['n(***)'])]['LABEL'].value_counts()[0]]
n2 = [df_organism[df_organism['LABEL'].isin(['n(**)'])]['LABEL'].value_counts()[0],
df_estate[df_estate['LABEL'].isin(['n(**)'])]['LABEL'].value_counts()[0]]
n = [df_organism[df_organism['LABEL'].isin(['n(*)'])]['LABEL'].value_counts()[0],
df_estate[df_estate['LABEL'].isin(['n(*)'])]['LABEL'].value_counts()[0]]
fig = go.Figure(data=[
go.Bar(name='non-significant', x=labels, y=na, text=na, textposition='auto',),
go.Bar(name='nepotism p value < 0.01', x=labels, y=n3, text=n3, textposition='auto',),
go.Bar(name='nepotism 0.01 < p value < 0.05', x=labels, y=n2, text=n2, textposition='auto',),
go.Bar(name='nepotism 0.05 < p value < 0.1', x=labels, y=n, text=n, textposition='auto',),
])
# Change the bar mode
fig.update_layout(
title='Number of Nepotistic Clusters by Significance',
xaxis_tickfont_size=14,
yaxis=dict(
title='Number of clusters',
titlefont_size=16,
tickfont_size=14,
),
barmode='group',
bargap=0.15, # gap between bars of adjacent location coordinates.
bargroupgap=0.1 # gap between bars of the same location coordinate.
)
fig.show()
###Libraries
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import statsmodels.api as sm
import numpy as np
#Get wages
%store -r estates_sueldos
estates_sueldos['cluster_R'] = estates_sueldos['Cluster'] + ',' + estates_sueldos['region']
#Estates + Nepotism
r0 = pd.merge(df_estate, estates_sueldos, left_on='Estates by region', right_on='cluster_R', how='left')
r0 = r0[r0[('remuneracionbruta_mensual', 'mean')].notnull()]
r0['Cluster_ranking'] = r0[('Cluster','')]
r0['Cluster_ranking'] = r0.Cluster_ranking.replace({'EstamentoInferior': 1,
'Tecnicos ': 2,
'ProfesionalesEducacion': 3,
'Jefaturas': 4,
'Profesionales': 5,
'Fiscalizadores':6,
'EstamentoSuperiorSalud': 7,
'Directivos': 8,
'Alcaldes': 9,
'EstamentoSuperior': 10})
r0['Region_ranking'] = r0.Region
r0['Region_ranking'] = r0.Region_ranking.replace({'Region de Arica y Parinacota': 1,
'Region de Tarapaca': 2,
'Region de Antofagasta': 3,
'Region de Atacama': 4,
'Region de Coquimbo': 5,
'Region de Valparaiso':6,
'Region Metropolitana de Santiago': 7,
'Region de OHiggins': 8,
'Region del Maule': 9,
'Region de Nuble': 10,
'Region del Biobio': 11,
'Region de La Araucania':12,
'Region de Los Rios': 13,
'Region de Los Lagos': 14,
'Region de Aysen': 15,
'Region de Magallanes':16})
r0['Label_ranking'] = r0['LABEL']
r0['Label_ranking'] = r0.Label_ranking.replace({'na': 0, 'n(*)': 0, 's(*)': 0,
's(**)':0, 's(***)': 0, 'n(***)': 1, 'n(**)':1})
r0['Wage'] = r0[('remuneracionbruta_mensual', 'mean')]
r0
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/pandas/core/reshape/merge.py:643: UserWarning: merging between different levels can give an unintended result (1 levels on the left,2 on the right)
Estates by region | Avergare Distint Names | Current Distint Names | P(MonteCarlo's Distint Names > Current Distint Names) | P(MonteCarlo's Distint Names < Current Distint Names) | LABEL | Estate | Region | (Cluster, ) | (region, ) | (remuneracionbruta_mensual, mean) | (remuneracionbruta_mensual, count) | (cluster_R, ) | Cluster_ranking | Region_ranking | Label_ranking | Wage | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | ProfesionalesEducacion,Region Metropolitana de... | 2164.08 | 1784 | 1.000000 | 0.000000 | n(***) | ProfesionalesEducacion | Region Metropolitana de Santiago | ProfesionalesEducacion | Region Metropolitana de Santiago | 1.513246e+06 | 9714 | ProfesionalesEducacion,Region Metropolitana de... | 3 | 7 | 1 | 1.513246e+06 |
1 | ProfesionalesEducacion,Region de Antofagasta | 972.66 | 978 | 0.377243 | 0.602563 | na | ProfesionalesEducacion | Region de Antofagasta | ProfesionalesEducacion | Region de Antofagasta | 1.633296e+06 | 3180 | ProfesionalesEducacion,Region de Antofagasta | 3 | 3 | 0 | 1.633296e+06 |
2 | ProfesionalesEducacion,Region de Arica y Parin... | 487.19 | 497 | 0.201798 | 0.774789 | na | ProfesionalesEducacion | Region de Arica y Parinacota | ProfesionalesEducacion | Region de Arica y Parinacota | 1.673599e+06 | 975 | ProfesionalesEducacion,Region de Arica y Parin... | 3 | 1 | 0 | 1.673599e+06 |
3 | ProfesionalesEducacion,Region de Atacama | 351.78 | 354 | 0.398255 | 0.565015 | na | ProfesionalesEducacion | Region de Atacama | ProfesionalesEducacion | Region de Atacama | 1.708386e+06 | 742 | ProfesionalesEducacion,Region de Atacama | 3 | 4 | 0 | 1.708386e+06 |
4 | ProfesionalesEducacion,Region de Aysen | 180.59 | 193 | 0.028397 | 0.960570 | s(**) | ProfesionalesEducacion | Region de Aysen | ProfesionalesEducacion | Region de Aysen | 2.073724e+06 | 268 | ProfesionalesEducacion,Region de Aysen | 3 | 15 | 0 | 2.073724e+06 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
143 | Jefaturas,Region de OHiggins | 100.23 | 95 | 0.861428 | 0.094875 | n(*) | Jefaturas | Region de OHiggins | Jefaturas | Region de OHiggins | 1.598792e+06 | 129 | Jefaturas,Region de OHiggins | 4 | 8 | 0 | 1.598792e+06 |
144 | Jefaturas,Region de Tarapaca | 7.91 | 7 | 0.912475 | 0.003539 | n(***) | Jefaturas | Region de Tarapaca | Jefaturas | Region de Tarapaca | 2.040437e+06 | 8 | Jefaturas,Region de Tarapaca | 4 | 2 | 1 | 2.040437e+06 |
145 | Jefaturas,Region de Valparaiso | 118.30 | 112 | 0.899272 | 68.000000 | na | Jefaturas | Region de Valparaiso | Jefaturas | Region de Valparaiso | 1.718247e+06 | 146 | Jefaturas,Region de Valparaiso | 4 | 6 | 0 | 1.718247e+06 |
146 | Jefaturas,Region del Biobio | 87.19 | 87 | 0.474791 | 0.413868 | na | Jefaturas | Region del Biobio | Jefaturas | Region del Biobio | 1.738453e+06 | 104 | Jefaturas,Region del Biobio | 4 | 11 | 0 | 1.738453e+06 |
147 | Jefaturas,Region del Maule | 72.71 | 72 | 0.531315 | 0.360468 | na | Jefaturas | Region del Maule | Jefaturas | Region del Maule | 1.538980e+06 | 90 | Jefaturas,Region del Maule | 4 | 9 | 0 | 1.538980e+06 |
148 rows × 17 columns
model = sm.GLM.from_formula("Label_ranking ~ Cluster_ranking", family = sm.families.Binomial(), data=r0)
result = model.fit()
result.summary()
Dep. Variable: | Label_ranking | No. Observations: | 148 |
---|---|---|---|
Model: | GLM | Df Residuals: | 146 |
Model Family: | Binomial | Df Model: | 1 |
Link Function: | logit | Scale: | 1.0000 |
Method: | IRLS | Log-Likelihood: | -85.663 |
Date: | Tue, 30 Mar 2021 | Deviance: | 171.33 |
Time: | 22:30:01 | Pearson chi2: | 154. |
No. Iterations: | 4 | ||
Covariance Type: | nonrobust |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 0.0083 | 0.379 | 0.022 | 0.982 | -0.734 | 0.750 |
Cluster_ranking | -0.1854 | 0.072 | -2.580 | 0.010 | -0.326 | -0.045 |
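# Interpretation sketch (not in the original): exponentiating the logit coefficients gives odds
# ratios, e.g. exp(-0.1854) ≈ 0.83 for Cluster_ranking above, i.e. each one-step move up the rank
# ordering multiplies the odds of a nepotism flag by about 0.83.
odds_ratios = pd.DataFrame({'OR': np.exp(result.params),
                            'CI 2.5%': np.exp(result.conf_int()[0]),
                            'CI 97.5%': np.exp(result.conf_int()[1])})
odds_ratios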
model = sm.GLM.from_formula("Label_ranking ~ Cluster_ranking + Wage", family = sm.families.Binomial(), data=r0)
result = model.fit()
result.summary()
Dep. Variable: | Label_ranking | No. Observations: | 148 |
---|---|---|---|
Model: | GLM | Df Residuals: | 145 |
Model Family: | Binomial | Df Model: | 2 |
Link Function: | logit | Scale: | 1.0000 |
Method: | IRLS | Log-Likelihood: | -77.182 |
Date: | Tue, 30 Mar 2021 | Deviance: | 154.36 |
Time: | 22:30:01 | Pearson chi2: | 153. |
No. Iterations: | 5 | ||
Covariance Type: | nonrobust |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 0.0436 | 0.380 | 0.115 | 0.909 | -0.701 | 0.788 |
Cluster_ranking | -0.8958 | 0.214 | -4.192 | 0.000 | -1.315 | -0.477 |
Wage | 1.372e-06 | 3.8e-07 | 3.614 | 0.000 | 6.28e-07 | 2.12e-06 |
model = sm.GLM.from_formula("Label_ranking ~ Cluster_ranking + Wage+ Region_ranking", family = sm.families.Binomial(), data=r0)
result = model.fit()
result.summary()
Dep. Variable: | Label_ranking | No. Observations: | 148 |
---|---|---|---|
Model: | GLM | Df Residuals: | 144 |
Model Family: | Binomial | Df Model: | 3 |
Link Function: | logit | Scale: | 1.0000 |
Method: | IRLS | Log-Likelihood: | -77.112 |
Date: | Tue, 30 Mar 2021 | Deviance: | 154.22 |
Time: | 22:30:01 | Pearson chi2: | 153. |
No. Iterations: | 5 | ||
Covariance Type: | nonrobust |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | -0.0875 | 0.518 | -0.169 | 0.866 | -1.103 | 0.928 |
Cluster_ranking | -0.9011 | 0.215 | -4.199 | 0.000 | -1.322 | -0.481 |
Wage | 1.382e-06 | 3.81e-07 | 3.626 | 0.000 | 6.35e-07 | 2.13e-06 |
Region_ranking | 0.0155 | 0.042 | 0.372 | 0.710 | -0.066 | 0.097 |
#Overrepresentation + Estates
r1 = pd.merge(df_estate, estates_sueldos, left_on='Estates by region', right_on='cluster_R', how='left')
r1 = r1[r1[('remuneracionbruta_mensual', 'mean')].notnull()]
r1['Cluster_ranking'] = r1[('Cluster','')]
r1['Cluster_ranking'] = r1.Cluster_ranking.replace({'EstamentoInferior': 1,
'Tecnicos ': 2,
'ProfesionalesEducacion': 3,
'Jefaturas': 4,
'Profesionales': 5,
'Fiscalizadores':6,
'EstamentoSuperiorSalud': 7,
'Directivos': 8,
'Alcaldes': 9,
'EstamentoSuperior': 10})
r1['Region_ranking'] = r1.Region
r1['Region_ranking'] = r1.Region_ranking.replace({'Region de Arica y Parinacota': 1,
'Region de Tarapaca': 2,
'Region de Antofagasta': 3,
'Region de Atacama': 4,
'Region de Coquimbo': 5,
'Region de Valparaiso':6,
'Region Metropolitana de Santiago': 7,
'Region de OHiggins': 8,
'Region del Maule': 9,
'Region de Nuble': 10,
'Region del Biobio': 11,
'Region de La Araucania':12,
'Region de Los Rios': 13,
'Region de Los Lagos': 14,
'Region de Aysen': 15,
'Region de Magallanes':16})
r1['Label_ranking'] = r1['LABEL']
r1['Label_ranking'] = r1.Label_ranking.replace({'na': 0, 'n(*)': 0, 's(*)': 0,
's(**)':1, 's(***)': 1, 'n(***)': 0, 'n(**)':0})
r1['Wage'] = r1[('remuneracionbruta_mensual', 'mean')]
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/pandas/core/reshape/merge.py:643: UserWarning: merging between different levels can give an unintended result (1 levels on the left,2 on the right)
model = sm.GLM.from_formula("Label_ranking ~ Cluster_ranking", family = sm.families.Binomial(), data=r1)
result = model.fit()
result.summary()
Dep. Variable: | Label_ranking | No. Observations: | 148 |
---|---|---|---|
Model: | GLM | Df Residuals: | 146 |
Model Family: | Binomial | Df Model: | 1 |
Link Function: | logit | Scale: | 1.0000 |
Method: | IRLS | Log-Likelihood: | -83.238 |
Date: | Tue, 30 Mar 2021 | Deviance: | 166.48 |
Time: | 22:30:01 | Pearson chi2: | 143. |
No. Iterations: | 4 | ||
Covariance Type: | nonrobust |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | -2.2373 | 0.473 | -4.730 | 0.000 | -3.164 | -1.310 |
Cluster_ranking | 0.2448 | 0.074 | 3.290 | 0.001 | 0.099 | 0.391 |
model = sm.GLM.from_formula("Label_ranking ~ Cluster_ranking + Wage", family = sm.families.Binomial(), data=r1)
result = model.fit()
result.summary()
Dep. Variable: | Label_ranking | No. Observations: | 148 |
---|---|---|---|
Model: | GLM | Df Residuals: | 145 |
Model Family: | Binomial | Df Model: | 2 |
Link Function: | logit | Scale: | 1.0000 |
Method: | IRLS | Log-Likelihood: | -78.325 |
Date: | Tue, 30 Mar 2021 | Deviance: | 156.65 |
Time: | 22:30:01 | Pearson chi2: | 145. |
No. Iterations: | 5 | ||
Covariance Type: | nonrobust |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | -2.5097 | 0.536 | -4.681 | 0.000 | -3.561 | -1.459 |
Cluster_ranking | 0.7610 | 0.196 | 3.876 | 0.000 | 0.376 | 1.146 |
Wage | -9.719e-07 | 3.42e-07 | -2.842 | 0.004 | -1.64e-06 | -3.02e-07 |
model = sm.GLM.from_formula("Label_ranking ~ Cluster_ranking + Wage + Region_ranking", family = sm.families.Binomial(), data=r1)
result = model.fit()
result.summary()
Dep. Variable: | Label_ranking | No. Observations: | 148 |
---|---|---|---|
Model: | GLM | Df Residuals: | 144 |
Model Family: | Binomial | Df Model: | 3 |
Link Function: | logit | Scale: | 1.0000 |
Method: | IRLS | Log-Likelihood: | -75.427 |
Date: | Tue, 30 Mar 2021 | Deviance: | 150.85 |
Time: | 22:30:01 | Pearson chi2: | 139. |
No. Iterations: | 5 | ||
Covariance Type: | nonrobust |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | -3.5383 | 0.731 | -4.840 | 0.000 | -4.971 | -2.106 |
Cluster_ranking | 0.7891 | 0.204 | 3.865 | 0.000 | 0.389 | 1.189 |
Wage | -1.008e-06 | 3.57e-07 | -2.824 | 0.005 | -1.71e-06 | -3.08e-07 |
Region_ranking | 0.1076 | 0.046 | 2.340 | 0.019 | 0.017 | 0.198 |
#Get wages
%store -r organisms_sueldos
###Nepotism - Organisms
#Nepotism + Organisms
r3 = pd.merge(df_organism, organisms_sueldos, left_on='Organism', right_on='Organismo', how='left')
r3= r3.groupby(['Organism']).first().reset_index()
r3['Region_ranking'] = r3[('region', '')]
r3['Region_ranking'] = r3.Region_ranking.replace({'Region de Arica y Parinacota': 1,
'Region de Tarapaca': 2,
'Region de Antofagasta': 3,
'Region de Atacama': 4,
'Region de Coquimbo': 5,
'Region de Valparaiso':6,
'Region Metropolitana de Santiago': 7,
'Region de OHiggins': 8,
'Region del Maule': 9,
'Region de Nuble': 10,
'Region del Biobio': 11,
'Region de La Araucania':12,
'Region de Los Rios': 13,
'Region de Los Lagos': 14,
'Region de Aysen': 15,
'Region de Magallanes':16})
r3['Label_ranking'] = r3['LABEL']
r3['Label_ranking'] = r3.Label_ranking.replace({'na': 0, 'n(*)': 0, 's(*)': 0,
's(**)':0, 's(***)': 0, 'n(***)': 1, 'n(**)':1})
median = r3[('remuneracionbruta_mensual', 'mean')].median()
median_e = r3[('remuneracionbruta_mensual', 'count')].median()
r3[('remuneracionbruta_mensual', 'mean')] = r3[('remuneracionbruta_mensual', 'mean')].fillna(median)
r3[('remuneracionbruta_mensual', 'count')] = r3[('remuneracionbruta_mensual', 'count')].fillna(median_e)
r3['Wage'] = r3[('remuneracionbruta_mensual', 'mean')]
r3['Employees'] = r3[('remuneracionbruta_mensual', 'count')]
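# Quick check (not in the original; assumes rows with no match in the left merge keep NaN in the
# ('Organismo', '') column): how many organisms had no wage record and therefore received the
# median imputation above.
print('Organisms without wage data:', r3[('Organismo', '')].isna().sum())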
# Chained indexing with inplace=True does not write back into r3 (it only triggers a
# SettingWithCopyWarning); assign through .loc instead.
cmf = (r3['Organism'] == 'Comisión para el Mercado Financiero (CMF ex SVS)') & r3['Region_ranking'].isna()
r3.loc[cmf, 'Region_ranking'] = 7
r3.loc[523,'Region_ranking'] = 6
r3.loc[522,'Region_ranking'] = 6
r3.loc[458,'Region_ranking'] = 6
r3.loc[457,'Region_ranking'] = 2
r3.loc[451,'Region_ranking'] = 5
r3.loc[449,'Region_ranking'] = 4
r3.loc[448,'Region_ranking'] = 1
r3.loc[447,'Region_ranking'] = 3
r3.loc[446,'Region_ranking'] = 7
r3.loc[13,'Region_ranking'] = 7
r3.loc[476,'Region_ranking'] = 7
r3.loc[86,'Region_ranking'] = 6
r3.loc[76,'Region_ranking'] = 7
r3
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/pandas/core/reshape/merge.py:643: UserWarning: merging between different levels can give an unintended result (1 levels on the left,2 on the right)
Organism | Avergare Distint Names | Current Distint Names | P(MonteCarlo's Distint Names > Current Distint Names) | P(MonteCarlo's Distint Names < Current Distint Names) | LABEL | (Organismo, ) | (region, ) | (remuneracionbruta_mensual, mean) | (remuneracionbruta_mensual, count) | Region_ranking | Label_ranking | Wage | Employees | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | AGENCIA CHILENA DE COOPERACIÓN INTERNACIONAL P... | 9.9 | 10 | 0.000000 | 0.135054 | s(***) | AGENCIA CHILENA DE COOPERACIÓN INTERNACIONAL P... | Region Metropolitana de Santiago | 3.208614e+06 | 10.0 | 7.0 | 0 | 3.208614e+06 | 10.0 |
1 | Administradora de los Tribunales Tributarios y... | 1.0 | 1 | 0.000000 | 0.000000 | s(***) | Administradora de los Tribunales Tributarios y... | Region Metropolitana de Santiago | 7.040634e+06 | 1.0 | 7.0 | 0 | 7.040634e+06 | 1.0 |
2 | Agencia de Calidad de la Educación | 6.9 | 7 | 0.000000 | 0.065852 | s(***) | Agencia de Calidad de la Educación | Region Metropolitana de Santiago | 5.319503e+06 | 7.0 | 7.0 | 0 | 5.319503e+06 | 7.0 |
3 | Agencia de Promoción de la Inversión Extranjera | 14.7 | 15 | 0.000000 | 0.283769 | s(***) | Agencia de Promoción de la Inversión Extranjera | Region Metropolitana de Santiago | 5.438907e+06 | 15.0 | 7.0 | 0 | 5.438907e+06 | 15.0 |
4 | Caja de Previsión de la Defensa Nacional (CAPR... | 63.4 | 63 | 0.506435 | 0.338401 | na | Caja de Previsión de la Defensa Nacional (CAPR... | Region Metropolitana de Santiago | 1.806751e+06 | 54.0 | 7.0 | 0 | 1.806751e+06 | 54.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
572 | Superintendencia del Medio Ambiente (SMA) | 5.0 | 5 | 0.000000 | 0.035369 | s(***) | Superintendencia del Medio Ambiente (SMA) | Region Metropolitana de Santiago | 7.896341e+06 | 4.0 | 7.0 | 0 | 7.896341e+06 | 4.0 |
573 | Tesorería General de la República (TGR) | 315.8 | 297 | 0.974456 | 0.019728 | n(**) | Tesorería General de la República (TGR) | Region Metropolitana de Santiago | 1.680884e+06 | 245.0 | 7.0 | 1 | 1.680884e+06 | 245.0 |
574 | Unidad de Análisis Financiero | 4.0 | 4 | 0.000000 | 0.019330 | s(***) | Unidad de Análisis Financiero | Region Metropolitana de Santiago | 7.497702e+06 | 4.0 | 7.0 | 0 | 7.497702e+06 | 4.0 |
575 | Universidad de Aysén | 6.9 | 7 | 0.000000 | 0.089429 | s(***) | Universidad de Aysén | Region de Aysen | 5.332341e+06 | 7.0 | 15.0 | 0 | 5.332341e+06 | 7.0 |
576 | Universidad de Ohiggins | 27.1 | 27 | 0.424515 | 0.294216 | na | Universidad de Ohiggins | Region de OHiggins | 3.772783e+06 | 29.0 | 8.0 | 0 | 3.772783e+06 | 29.0 |
577 rows × 14 columns
model = sm.GLM.from_formula("Label_ranking ~ Employees", family = sm.families.Binomial(), data=r3)
result = model.fit()
result.summary()
Dep. Variable: | Label_ranking | No. Observations: | 577 |
---|---|---|---|
Model: | GLM | Df Residuals: | 575 |
Model Family: | Binomial | Df Model: | 1 |
Link Function: | logit | Scale: | 1.0000 |
Method: | IRLS | Log-Likelihood: | -373.19 |
Date: | Tue, 30 Mar 2021 | Deviance: | 746.38 |
Time: | 22:30:01 | Pearson chi2: | 579. |
No. Iterations: | 4 | ||
Covariance Type: | nonrobust |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | -0.7772 | 0.113 | -6.887 | 0.000 | -0.998 | -0.556 |
Employees | 0.0014 | 0.000 | 4.876 | 0.000 | 0.001 | 0.002 |
model = sm.GLM.from_formula("Label_ranking ~ Employees + Region_ranking", family = sm.families.Binomial(), data=r3)
result = model.fit()
result.summary()
Dep. Variable: | Label_ranking | No. Observations: | 577 |
---|---|---|---|
Model: | GLM | Df Residuals: | 574 |
Model Family: | Binomial | Df Model: | 2 |
Link Function: | logit | Scale: | 1.0000 |
Method: | IRLS | Log-Likelihood: | -367.99 |
Date: | Tue, 30 Mar 2021 | Deviance: | 735.99 |
Time: | 22:30:01 | Pearson chi2: | 579. |
No. Iterations: | 4 | ||
Covariance Type: | nonrobust |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | -1.5158 | 0.261 | -5.805 | 0.000 | -2.028 | -1.004 |
Employees | 0.0014 | 0.000 | 4.942 | 0.000 | 0.001 | 0.002 |
Region_ranking | 0.0846 | 0.026 | 3.201 | 0.001 | 0.033 | 0.136 |
model = sm.GLM.from_formula("Label_ranking ~ Employees + Region_ranking + Wage", family = sm.families.Binomial(), data=r3)
result = model.fit()
result.summary()
Dep. Variable: | Label_ranking | No. Observations: | 577 |
---|---|---|---|
Model: | GLM | Df Residuals: | 573 |
Model Family: | Binomial | Df Model: | 3 |
Link Function: | logit | Scale: | 1.0000 |
Method: | IRLS | Log-Likelihood: | -317.61 |
Date: | Tue, 30 Mar 2021 | Deviance: | 635.22 |
Time: | 22:30:01 | Pearson chi2: | 681. |
No. Iterations: | 7 | ||
Covariance Type: | nonrobust |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 1.8997 | 0.508 | 3.738 | 0.000 | 0.904 | 2.896 |
Employees | 0.0003 | 0.000 | 1.285 | 0.199 | -0.000 | 0.001 |
Region_ranking | 0.0735 | 0.028 | 2.600 | 0.009 | 0.018 | 0.129 |
Wage | -1.785e-06 | 2.74e-07 | -6.507 | 0.000 | -2.32e-06 | -1.25e-06 |
model = sm.GLM.from_formula("Label_ranking ~ Region_ranking + Wage", family = sm.families.Binomial(), data=r3)
result = model.fit()
result.summary()
Dep. Variable: | Label_ranking | No. Observations: | 577 |
---|---|---|---|
Model: | GLM | Df Residuals: | 574 |
Model Family: | Binomial | Df Model: | 2 |
Link Function: | logit | Scale: | 1.0000 |
Method: | IRLS | Log-Likelihood: | -318.46 |
Date: | Tue, 30 Mar 2021 | Deviance: | 636.92 |
Time: | 22:30:01 | Pearson chi2: | 707. |
No. Iterations: | 7 | ||
Covariance Type: | nonrobust |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 2.1713 | 0.467 | 4.649 | 0.000 | 1.256 | 3.087 |
Region_ranking | 0.0707 | 0.028 | 2.517 | 0.012 | 0.016 | 0.126 |
Wage | -1.877e-06 | 2.69e-07 | -6.989 | 0.000 | -2.4e-06 | -1.35e-06 |
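# Comparison sketch (not in the original): each cell above overwrites `result`, so to compare the
# nested specifications side by side one can refit and tabulate AIC and deviance.
specs = ["Label_ranking ~ Employees",
         "Label_ranking ~ Employees + Region_ranking",
         "Label_ranking ~ Employees + Region_ranking + Wage",
         "Label_ranking ~ Region_ranking + Wage"]
fits = {f: sm.GLM.from_formula(f, family=sm.families.Binomial(), data=r3).fit() for f in specs}
pd.DataFrame({'AIC': {f: m.aic for f, m in fits.items()},
              'Deviance': {f: m.deviance for f, m in fits.items()}})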
##Overrepresentation - Organisms
r4 = pd.merge(df_organism, organisms_sueldos, left_on='Organism', right_on='Organismo', how='left')
r4= r4.groupby(['Organism']).first().reset_index()
r4['Region_ranking'] = r4[('region', '')]
r4['Region_ranking'] = r4.Region_ranking.replace({'Region de Arica y Parinacota': 1,
'Region de Tarapaca': 2,
'Region de Antofagasta': 3,
'Region de Atacama': 4,
'Region de Coquimbo': 5,
'Region de Valparaiso':6,
'Region Metropolitana de Santiago': 7,
'Region de OHiggins': 8,
'Region del Maule': 9,
'Region de Nuble': 10,
'Region del Biobio': 11,
'Region de La Araucania':12,
'Region de Los Rios': 13,
'Region de Los Lagos': 14,
'Region de Aysen': 15,
'Region de Magallanes':16})
r4['Label_ranking'] = r4['LABEL']
r4['Label_ranking'] = r4.Label_ranking.replace({'na': 0, 'n(*)': 0, 's(*)': 0,
's(**)':1, 's(***)': 1, 'n(***)': 0, 'n(**)':0})
median = r4[('remuneracionbruta_mensual', 'mean')].median()
median_e = r4[('remuneracionbruta_mensual', 'count')].median()
r4[('remuneracionbruta_mensual', 'mean')] = r4[('remuneracionbruta_mensual', 'mean')].fillna(median)
r4[('remuneracionbruta_mensual', 'count')] = r4[('remuneracionbruta_mensual', 'count')].fillna(median_e)
r4['Wage'] = r4[('remuneracionbruta_mensual', 'mean')]
r4['Employees'] = r4[('remuneracionbruta_mensual', 'count')]
r4.loc[523,'Region_ranking'] = 6
r4.loc[522,'Region_ranking'] = 6
r4.loc[458,'Region_ranking'] = 6
r4.loc[457,'Region_ranking'] = 2
r4.loc[451,'Region_ranking'] = 5
r4.loc[449,'Region_ranking'] = 4
r4.loc[448,'Region_ranking'] = 1
r4.loc[447,'Region_ranking'] = 3
r4.loc[446,'Region_ranking'] = 7
r4.loc[13,'Region_ranking'] = 7
r4.loc[476,'Region_ranking'] = 7
r4.loc[86,'Region_ranking'] = 6
r4.loc[76,'Region_ranking'] = 7
r4
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/pandas/core/reshape/merge.py:643: UserWarning: merging between different levels can give an unintended result (1 levels on the left,2 on the right)
Organism | Avergare Distint Names | Current Distint Names | P(MonteCarlo's Distint Names > Current Distint Names) | P(MonteCarlo's Distint Names < Current Distint Names) | LABEL | (Organismo, ) | (region, ) | (remuneracionbruta_mensual, mean) | (remuneracionbruta_mensual, count) | Region_ranking | Label_ranking | Wage | Employees | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | AGENCIA CHILENA DE COOPERACIÓN INTERNACIONAL P... | 9.9 | 10 | 0.000000 | 0.135054 | s(***) | AGENCIA CHILENA DE COOPERACIÓN INTERNACIONAL P... | Region Metropolitana de Santiago | 3.208614e+06 | 10.0 | 7.0 | 1 | 3.208614e+06 | 10.0 |
1 | Administradora de los Tribunales Tributarios y... | 1.0 | 1 | 0.000000 | 0.000000 | s(***) | Administradora de los Tribunales Tributarios y... | Region Metropolitana de Santiago | 7.040634e+06 | 1.0 | 7.0 | 1 | 7.040634e+06 | 1.0 |
2 | Agencia de Calidad de la Educación | 6.9 | 7 | 0.000000 | 0.065852 | s(***) | Agencia de Calidad de la Educación | Region Metropolitana de Santiago | 5.319503e+06 | 7.0 | 7.0 | 1 | 5.319503e+06 | 7.0 |
3 | Agencia de Promoción de la Inversión Extranjera | 14.7 | 15 | 0.000000 | 0.283769 | s(***) | Agencia de Promoción de la Inversión Extranjera | Region Metropolitana de Santiago | 5.438907e+06 | 15.0 | 7.0 | 1 | 5.438907e+06 | 15.0 |
4 | Caja de Previsión de la Defensa Nacional (CAPR... | 63.4 | 63 | 0.506435 | 0.338401 | na | Caja de Previsión de la Defensa Nacional (CAPR... | Region Metropolitana de Santiago | 1.806751e+06 | 54.0 | 7.0 | 0 | 1.806751e+06 | 54.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
572 | Superintendencia del Medio Ambiente (SMA) | 5.0 | 5 | 0.000000 | 0.035369 | s(***) | Superintendencia del Medio Ambiente (SMA) | Region Metropolitana de Santiago | 7.896341e+06 | 4.0 | 7.0 | 1 | 7.896341e+06 | 4.0 |
573 | Tesorería General de la República (TGR) | 315.8 | 297 | 0.974456 | 0.019728 | n(**) | Tesorería General de la República (TGR) | Region Metropolitana de Santiago | 1.680884e+06 | 245.0 | 7.0 | 0 | 1.680884e+06 | 245.0 |
574 | Unidad de Análisis Financiero | 4.0 | 4 | 0.000000 | 0.019330 | s(***) | Unidad de Análisis Financiero | Region Metropolitana de Santiago | 7.497702e+06 | 4.0 | 7.0 | 1 | 7.497702e+06 | 4.0 |
575 | Universidad de Aysén | 6.9 | 7 | 0.000000 | 0.089429 | s(***) | Universidad de Aysén | Region de Aysen | 5.332341e+06 | 7.0 | 15.0 | 1 | 5.332341e+06 | 7.0 |
576 | Universidad de Ohiggins | 27.1 | 27 | 0.424515 | 0.294216 | na | Universidad de Ohiggins | Region de OHiggins | 3.772783e+06 | 29.0 | 8.0 | 0 | 3.772783e+06 | 29.0 |
577 rows × 14 columns
model = sm.GLM.from_formula("Label_ranking ~ Employees", family = sm.families.Binomial(), data=r4)
result = model.fit()
result.summary()
Dep. Variable: | Label_ranking | No. Observations: | 577 |
---|---|---|---|
Model: | GLM | Df Residuals: | 575 |
Model Family: | Binomial | Df Model: | 1 |
Link Function: | logit | Scale: | 1.0000 |
Method: | IRLS | Log-Likelihood: | -296.49 |
Date: | Tue, 30 Mar 2021 | Deviance: | 592.98 |
Time: | 22:30:01 | Pearson chi2: | 612. |
No. Iterations: | 5 | ||
Covariance Type: | nonrobust |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | -1.0279 | 0.129 | -7.942 | 0.000 | -1.282 | -0.774 |
Employees | -0.0012 | 0.000 | -2.765 | 0.006 | -0.002 | -0.000 |
model = sm.GLM.from_formula("Label_ranking ~ Wage", family = sm.families.Binomial(), data=r4)
result = model.fit()
result.summary()
Dep. Variable: | Label_ranking | No. Observations: | 577 |
---|---|---|---|
Model: | GLM | Df Residuals: | 575 |
Model Family: | Binomial | Df Model: | 1 |
Link Function: | logit | Scale: | 1.0000 |
Method: | IRLS | Log-Likelihood: | -252.99 |
Date: | Tue, 30 Mar 2021 | Deviance: | 505.98 |
Time: | 22:30:01 | Pearson chi2: | 558. |
No. Iterations: | 4 | ||
Covariance Type: | nonrobust |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | -3.0554 | 0.237 | -12.888 | 0.000 | -3.520 | -2.591 |
Wage | 7.994e-07 | 9.27e-08 | 8.621 | 0.000 | 6.18e-07 | 9.81e-07 |
model = sm.GLM.from_formula("Label_ranking ~ Wage + Region_ranking", family = sm.families.Binomial(), data=r4)
result = model.fit()
result.summary()
Dep. Variable: | Label_ranking | No. Observations: | 577 |
---|---|---|---|
Model: | GLM | Df Residuals: | 574 |
Model Family: | Binomial | Df Model: | 2 |
Link Function: | logit | Scale: | 1.0000 |
Method: | IRLS | Log-Likelihood: | -252.64 |
Date: | Tue, 30 Mar 2021 | Deviance: | 505.28 |
Time: | 22:30:01 | Pearson chi2: | 561. |
No. Iterations: | 4 | ||
Covariance Type: | nonrobust |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | -3.3205 | 0.399 | -8.317 | 0.000 | -4.103 | -2.538 |
Wage | 8.103e-07 | 9.39e-08 | 8.626 | 0.000 | 6.26e-07 | 9.94e-07 |
Region_ranking | 0.0281 | 0.033 | 0.841 | 0.400 | -0.037 | 0.093 |