In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
plt.rc("font", size=14)
from sklearn import linear_model, model_selection, metrics, feature_selection, svm, tree
from sklearn.feature_selection import RFECV
from sklearn.svm import SVC
from sklearn_som.som import SOM
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV, GridSearchCV
from imblearn import over_sampling
from imblearn.over_sampling import SMOTE
from pprint import pprint
import seaborn as sns
sns.set (style="white")
sns.set(style="whitegrid", color_codes=True)   
In [2]:
#leggo file csv
data = pd.read_csv ('credit_risk_dataset.csv', sep=',') 
#dimensioni dataset
data.shape
Out[2]:
(32581, 12)
In [3]:
#lista atributi dataset 
list(data.columns)
Out[3]:
['person_age',
 'person_income',
 'person_home_ownership',
 'person_emp_length',
 'loan_intent',
 'loan_grade',
 'loan_amnt',
 'loan_int_rate',
 'loan_status',
 'loan_percent_income',
 'cb_person_default_on_file',
 'cb_person_cred_hist_length']
In [4]:
#osservo le prime righe del dataset 
data.head()
Out[4]:
person_age person_income person_home_ownership person_emp_length loan_intent loan_grade loan_amnt loan_int_rate loan_status loan_percent_income cb_person_default_on_file cb_person_cred_hist_length
0 22 59000 RENT 123.0 PERSONAL D 35000 16.02 1 0.59 Y 3
1 21 9600 OWN 5.0 EDUCATION B 1000 11.14 0 0.10 N 2
2 25 9600 MORTGAGE 1.0 MEDICAL C 5500 12.87 1 0.57 N 3
3 23 65500 RENT 4.0 MEDICAL C 35000 15.23 1 0.53 N 2
4 24 54400 RENT 8.0 MEDICAL C 35000 14.27 1 0.55 Y 4
In [5]:
#conteggi default=0, default=1
data['loan_status'].value_counts()
Out[5]:
0    25473
1     7108
Name: loan_status, dtype: int64
In [6]:
sns.countplot(x='loan_status', data=data)
Out[6]:
<AxesSubplot:xlabel='loan_status', ylabel='count'>
In [7]:
#visualizzo media attributi numerici per ciascuna delle due classi
data.groupby('loan_status').mean()
Out[7]:
person_age person_income person_emp_length loan_amnt loan_int_rate loan_percent_income cb_person_cred_hist_length
loan_status
0 27.807129 70804.361559 4.968745 9237.464178 10.435999 0.148805 5.837475
1 27.474676 49125.652223 4.137562 10850.502954 13.060207 0.246889 5.685003
In [8]:
#visualizzo media attributi numerici per ciascun valore di 'person_home_ownership'
data.groupby('person_home_ownership').mean()
Out[8]:
person_age person_income person_emp_length loan_amnt loan_int_rate loan_status loan_percent_income cb_person_cred_hist_length
person_home_ownership
MORTGAGE 27.980735 81127.121690 5.883907 10574.460726 10.488001 0.125707 0.151328 5.922493
OTHER 26.803738 76387.803738 3.682243 11074.532710 12.030638 0.308411 0.191963 5.327103
OWN 27.698529 57834.812693 5.167635 9029.943885 10.861150 0.074690 0.188777 5.868421
RENT 27.545117 54997.747963 3.849216 8862.331266 11.455334 0.315700 0.182573 5.700535
In [9]:
#alcuni grafici per osservare il numero di default/non default in base ai vari attributi categorici
table=pd.crosstab(data['person_home_ownership'],data['loan_status'])
table.div(table.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True)
#plt.title('Stacked Bar Chart of person_home_ownership vs loan_status')
plt.xlabel('person_home_ownership')
plt.ylabel('Proportion of defaults')
plt.savefig('Figure_2')
In [10]:
#visualizzo media attributi numerici per ciascun valore di 'loan_intent'
data.groupby('loan_intent').mean()
Out[10]:
person_age person_income person_emp_length loan_amnt loan_int_rate loan_status loan_percent_income cb_person_cred_hist_length
loan_intent
DEBTCONSOLIDATION 27.606293 66470.876247 4.764613 9594.886800 10.983268 0.285879 0.170823 5.719302
EDUCATION 26.588099 64135.199132 4.463911 9482.678599 10.950261 0.172168 0.170184 5.160855
HOMEIMPROVEMENT 29.066574 73549.470458 5.134188 10360.520111 11.201152 0.261026 0.165515 6.481831
MEDICAL 27.998023 61437.227145 4.767170 9259.582441 11.060194 0.267007 0.173044 5.941690
PERSONAL 28.208477 67864.141279 4.888061 9573.772867 10.998221 0.198877 0.169230 6.122804
VENTURE 27.568456 66386.574576 4.892549 9583.777758 10.948275 0.148103 0.170540 5.726875
In [11]:
table=pd.crosstab(data['loan_intent'],data['loan_status'])
table.div(table.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True)
#plt.title('Stacked Bar Chart of person_home_ownership vs loan_status')
plt.xlabel('loan_intent')
plt.ylabel('Proportion of defaults')
plt.savefig('Figure_3')
In [12]:
#visualizzo media attributi numerici per ciascun valore di 'loan_grade'
data.groupby('loan_grade').mean()
Out[12]:
person_age person_income person_emp_length loan_amnt loan_int_rate loan_status loan_percent_income cb_person_cred_hist_length
loan_grade
A 27.664099 66568.207201 5.102015 8539.273453 7.327651 0.099564 0.153683 5.743899
B 27.686059 66354.839154 4.754761 9995.483686 10.995555 0.162760 0.175288 5.782126
C 27.800557 64921.936203 4.450087 9213.862651 13.463542 0.207340 0.170084 5.865438
D 27.877551 63663.682019 4.704443 10849.241589 15.361448 0.590458 0.191026 5.898235
E 27.868257 70873.106846 4.377101 12915.845436 17.009455 0.644191 0.205996 5.829876
F 28.352697 77008.730290 4.254237 14717.323651 18.609159 0.705394 0.215643 6.128631
G 28.437500 76773.296875 6.125000 17195.703125 20.251525 0.984375 0.243906 6.453125
In [13]:
table=pd.crosstab(data['loan_grade'],data['loan_status'])
table.div(table.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True)
#plt.title('Stacked Bar Chart of person_home_ownership vs loan_status')
plt.xlabel('loan_grade')
plt.ylabel('Proportion of defaults')
plt.savefig('Figure_4')
In [14]:
#visualizzo media attributi numerici per ciascun valore di 'cb_person_default_on_file'
data.groupby('cb_person_default_on_file').mean()
Out[14]:
person_age person_income person_emp_length loan_amnt loan_int_rate loan_status loan_percent_income cb_person_cred_hist_length
cb_person_default_on_file
N 27.717544 66178.476263 4.843075 9475.055895 10.258913 0.183932 0.168429 5.795648
Y 27.814273 65590.783116 4.542548 10123.359443 14.513800 0.378068 0.178491 5.844212
In [15]:
table=pd.crosstab(data['cb_person_default_on_file'],data['loan_status'])
table.div(table.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True)
#plt.title('Stacked Bar Chart of person_home_ownership vs loan_status')
plt.xlabel('cb_person_default_on_file')
plt.ylabel('Proportion of defaults')
plt.savefig('Figure_5')
In [16]:
#correlation heatmap per features numeriche
corr=data.corr()
cmap = sns.diverging_palette(270, 10, as_cmap=True)
mask= np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, cmap=cmap, mask=mask, annot=True)
plt.savefig('Figure_6')
In [17]:
#boxplot per gli attributi numerici 
ax = sns.boxplot(x=data['loan_status'], y=data['person_age'], data=data)
plt.savefig('Figure_7')
In [18]:
ax = sns.boxplot(x=data['loan_status'], y=data['person_income'], data=data)
plt.savefig('Figure_8')
In [19]:
ax = sns.boxplot(x=data['loan_status'], y=data['person_emp_length'], data=data)
plt.savefig('Figure_9')
In [20]:
ax = sns.boxplot(x=data['loan_status'], y=data['loan_amnt'], data=data)
plt.savefig('Figure_10')
In [21]:
ax = sns.boxplot(x=data['loan_status'], y=data['loan_int_rate'], data=data)
plt.savefig('Figure_11')
In [22]:
ax = sns.boxplot(x=data['loan_status'], y=data['loan_percent_income'], data=data)
plt.savefig('Figure_12')
In [23]:
ax = sns.boxplot(x=data['loan_status'], y=data['cb_person_cred_hist_length'], data=data)
plt.savefig('Figure_13')
In [24]:
data.describe()
Out[24]:
person_age person_income person_emp_length loan_amnt loan_int_rate loan_status loan_percent_income cb_person_cred_hist_length
count 32581.000000 3.258100e+04 31686.000000 32581.000000 29465.000000 32581.000000 32581.000000 32581.000000
mean 27.734600 6.607485e+04 4.789686 9589.371106 11.011695 0.218164 0.170203 5.804211
std 6.348078 6.198312e+04 4.142630 6322.086646 3.240459 0.413006 0.106782 4.055001
min 20.000000 4.000000e+03 0.000000 500.000000 5.420000 0.000000 0.000000 2.000000
25% 23.000000 3.850000e+04 2.000000 5000.000000 7.900000 0.000000 0.090000 3.000000
50% 26.000000 5.500000e+04 4.000000 8000.000000 10.990000 0.000000 0.150000 4.000000
75% 30.000000 7.920000e+04 7.000000 12200.000000 13.470000 0.000000 0.230000 8.000000
max 144.000000 6.000000e+06 123.000000 35000.000000 23.220000 1.000000 0.830000 30.000000
In [17]:
#null values
#percentuale di null values per ogni attributo
Nan_per = data.isnull().sum()/data.shape[0]*100
Nan_per.round(2)
Out[17]:
person_age                    0.00
person_income                 0.00
person_home_ownership         0.00
person_emp_length             2.75
loan_intent                   0.00
loan_grade                    0.00
loan_amnt                     0.00
loan_int_rate                 9.56
loan_status                   0.00
loan_percent_income           0.00
cb_person_default_on_file     0.00
cb_person_cred_hist_length    0.00
dtype: float64
In [18]:
# stampo moda e mediana per i due attributi che contengono missing values
print('person_emp_length mode {}'.format(data['person_emp_length'].mode()[0]))
print('person_emp_length median {}'.format(data['person_emp_length'].median()))
print('loan_int_rate mode {}'.format(data['loan_int_rate'].mode()[0]))
print('loan_int_rate median {}'.format(data['loan_int_rate'].median()))
person_emp_length mode 0.0
person_emp_length median 4.0
loan_int_rate mode 7.51
loan_int_rate median 10.99
In [19]:
#replace missing vlaues
#rimpiazzo i mising values di 'person_emp_length' con la moda e di 'loan_int_rate' con la mediana 
data['person_emp_length'].fillna(data['person_emp_length'].mode()[0], inplace=True)
data['loan_int_rate'].fillna(data['loan_int_rate'].median(), inplace=True)
In [28]:
data.describe()
Out[28]:
person_age person_income person_emp_length loan_amnt loan_int_rate loan_status loan_percent_income cb_person_cred_hist_length
count 32581.000000 3.258100e+04 32581.000000 32581.000000 32581.000000 32581.000000 32581.000000 32581.000000
mean 27.734600 6.607485e+04 4.658114 9589.371106 11.009620 0.218164 0.170203 5.804211
std 6.348078 6.198312e+04 4.159669 6322.086646 3.081611 0.413006 0.106782 4.055001
min 20.000000 4.000000e+03 0.000000 500.000000 5.420000 0.000000 0.000000 2.000000
25% 23.000000 3.850000e+04 2.000000 5000.000000 8.490000 0.000000 0.090000 3.000000
50% 26.000000 5.500000e+04 4.000000 8000.000000 10.990000 0.000000 0.150000 4.000000
75% 30.000000 7.920000e+04 7.000000 12200.000000 13.110000 0.000000 0.230000 8.000000
max 144.000000 6.000000e+06 123.000000 35000.000000 23.220000 1.000000 0.830000 30.000000
In [29]:
#outliers
data['outlier']=np.where((data['person_age']<=110) & (data['person_emp_length']<=120) & (data['person_income']<=5e6), 'no','yes')                                              
sns.scatterplot(data=data, x= 'person_age', y='person_emp_length',hue='outlier', palette = sns.diverging_palette(10, 240, n=2))
plt.savefig('Figure_20')

#noto che è impossbilie che ci siano persone di età superirore ai 120 
#segnalo anche come outliers tutti quei valori sopra alla diagonale del grafico per cui
#gli anni di impiego risultano maggiori dell'età 
In [149]:
sns.scatterplot(data=data, x= 'person_age', y='person_income',hue='outlier', palette = sns.diverging_palette(10, 240, n=2))
plt.savefig('Figure_21')
In [151]:
sns.scatterplot(data=data, x= 'loan_percent_income', y='person_emp_length',hue='outlier', palette = sns.diverging_palette(10, 240, n=2))
plt.savefig('Figure_22')
In [20]:
#numerical data
#seleziono solo la parte numerica del dataset, escludendo gli attributi categorici
num_data= pd.DataFrame(data[data.select_dtypes(include=['float','int']).columns])
num_data.columns
Out[20]:
Index(['person_age', 'person_income', 'person_emp_length', 'loan_amnt',
       'loan_int_rate', 'loan_status', 'loan_percent_income',
       'cb_person_cred_hist_length'],
      dtype='object')
In [32]:
sns.pairplot(num_data)
Out[32]:
<seaborn.axisgrid.PairGrid at 0x1d63c5e8d60>
In [21]:
#eliminate outliers
#in base ai grafici reppresentati di sopra rimuovo gli outliers

#rimuovo le persone che risultano avere più di 110 anni
Data = data[data['person_age']<=110]
#rimuovo le persone i cui anni di impiego sono superiori ai 120
Data = Data[Data['person_emp_length']<=120]
#rimuovo le persone il cui income è superiore a 5e6
Data = Data[Data['person_income']<=5e6]
#Data è il DATASET SENZA OUTLIERS E CON REPLACING MISSING VALUES


#numerical variables
numData = pd.DataFrame(Data[Data.select_dtypes(include=['float','int']).columns])
#categorical variables
catData = pd.DataFrame(Data[Data.select_dtypes(include=['object']).columns])
catData.columns
Out[21]:
Index(['person_home_ownership', 'loan_intent', 'loan_grade',
       'cb_person_default_on_file'],
      dtype='object')
In [22]:
#converto le variabili categoriche in variabili indicatrici 0-1 ----> ottengo così una varibile per ogni livello 
#delle precedenti varibili categoriche
encoded_catdata = pd.get_dummies(catData)
encData = pd.concat([numData, encoded_catdata],axis=1)
encData.head()
Out[22]:
person_age person_income person_emp_length loan_amnt loan_int_rate loan_status loan_percent_income cb_person_cred_hist_length person_home_ownership_MORTGAGE person_home_ownership_OTHER ... loan_intent_VENTURE loan_grade_A loan_grade_B loan_grade_C loan_grade_D loan_grade_E loan_grade_F loan_grade_G cb_person_default_on_file_N cb_person_default_on_file_Y
1 21 9600 5.0 1000 11.14 0 0.10 2 0 0 ... 0 0 1 0 0 0 0 0 1 0
2 25 9600 1.0 5500 12.87 1 0.57 3 1 0 ... 0 0 0 1 0 0 0 0 1 0
3 23 65500 4.0 35000 15.23 1 0.53 2 0 0 ... 0 0 0 1 0 0 0 0 1 0
4 24 54400 8.0 35000 14.27 1 0.55 4 0 0 ... 0 0 0 1 0 0 0 0 0 1
5 21 9900 2.0 2500 7.14 1 0.25 2 0 0 ... 1 1 0 0 0 0 0 0 1 0

5 rows × 27 columns

In [23]:
#correlation heatmap
#faccio la nuova correlation heatmap comprensiva di tutte le nuove variabili
corr=encData.corr().sort_values('loan_status', axis=1, ascending=False)
corr=corr.sort_values('loan_status', axis=0, ascending=True)
cmap = sns.diverging_palette(270, 10, as_cmap=True)
mask=np.zeros_like(corr)
mask[np.triu_indices_from(mask, k=1)]=True
with sns.axes_style("white"):
    f, ax =plt.subplots(figsize=(16,10))
    ax=sns.heatmap(corr, mask=mask, vmin=corr.loan_status.min(),
                  vmax=corr.drop(['loan_status'], axis=0).loan_status.max(),
                  square=True, annot=True, fmt='.2f',
                  center=0, cmap=cmap, annot_kws={"size":10})
In [24]:
#test_train splitting
#divisione del dataset in train e test set. rispettiamente 70 e 30%
label= encData['loan_status']
features= encData.drop('loan_status', axis=1)
x_train, x_test, y_train, y_test = model_selection.train_test_split(features, label, random_state=0, test_size = 0.30)
In [25]:
#conteggio delle due classi nel test set
y_test.value_counts()
Out[25]:
0    7696
1    2077
Name: loan_status, dtype: int64
In [26]:
#conteggio delle due classi nel training set
y_train.value_counts()
# ----> il training set è fortemente sbilanciato e va bilanciato prima  di testare gli algoritmi
Out[26]:
0    17771
1     5030
Name: loan_status, dtype: int64
In [27]:
#balance train dataset
#oversample utilizzando la tecnica SMOTE
oversample = SMOTE (random_state=0)
os_x_train, os_y_train = oversample.fit_resample(x_train, y_train)
os_x_train = pd.DataFrame(data = os_x_train, columns = x_train.columns)
os_y_train = pd.DataFrame(data = os_y_train, columns= ['loan_status'])

print("length of oversampled data is ",len(os_x_train))
print("Number of no default in oversampled data",len(os_y_train[os_y_train['loan_status']==0]))
print("Number of default",len(os_y_train[os_y_train['loan_status']==1]))
print("Proportion of no default data in oversampled data is ",len(os_y_train[os_y_train['loan_status']==0])/len(os_x_train))
print("Proportion of default data in oversampled data is ",len(os_y_train[os_y_train['loan_status']==1])/len(os_x_train))
length of oversampled data is  35542
Number of no default in oversampled data 17771
Number of default 17771
Proportion of no default data in oversampled data is  0.5
Proportion of default data in oversampled data is  0.5
In [ ]:
 
In [38]:
#MODEL : logistic regression (LR)
os_x_train, os_y_train = oversample.fit_resample(x_train, y_train)
#creo il modello di regressione logistica
LR = LogisticRegression()
#lo fitto sul training set
LR.fit(os_x_train, os_y_train)
Out[38]:
LogisticRegression()
In [39]:
#predict 
#previsioni sul training set
p_train = LR.predict(os_x_train)
#previsioni sul test set
p_test = LR.predict(x_test)

print('Accuracy of logistic regression classifier on oversampled train set: {:.2f}'.format(LR.score(os_x_train, os_y_train)))
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(LR.score(x_test, y_test)))
Accuracy of logistic regression classifier on oversampled train set: 0.74
Accuracy of logistic regression classifier on test set: 0.69
In [40]:
#confusion matrix sul training set. Riportata solo per completezza ma non utilizzata per la valutazione del metodo
confusion_matrix_train = confusion_matrix(os_y_train, p_train)
print(confusion_matrix_train)

#confusion matrix sul test set 
confusion_matrix_test = confusion_matrix(y_test, p_test)
print(confusion_matrix_test)
[[12123  5648]
 [ 3585 14186]]
[[5239 2457]
 [ 557 1520]]
In [41]:
#stampo un iepilogo delle misure di performance sul test set del metodo
print(classification_report(y_test, p_test))
              precision    recall  f1-score   support

           0       0.90      0.68      0.78      7696
           1       0.38      0.73      0.50      2077

    accuracy                           0.69      9773
   macro avg       0.64      0.71      0.64      9773
weighted avg       0.79      0.69      0.72      9773

In [ ]:
 
In [28]:
#scalo train set oversampled
#scalo il train set bilanciato 
scaler = preprocessing.StandardScaler().fit(os_x_train)
os_x_train_scaled = scaler.transform(os_x_train)
x_test_scaled = scaler.transform(x_test)
In [43]:
# MODEL: logistic regression with feature selection  (RFECV)
#ripeto un modello di regressione logistica ma questa volta facendo prima una selezione delle features
#creo modello di regressione logistica
LRFS=LogisticRegression()
#per selezionare il sottoinsieme delle features ottimale utilizzo recursive feature 
#elimination with cross validation (RFECV)
rfecv= RFECV(estimator= LRFS, step=1, cv= StratifiedKFold(2), scoring="accuracy", min_features_to_select=1)
rfecv.fit(os_x_train_scaled, os_y_train)
Out[43]:
RFECV(cv=StratifiedKFold(n_splits=2, random_state=None, shuffle=False),
      estimator=LogisticRegression(), scoring='accuracy')
In [67]:
print("optimal num of features: %d" %rfecv.n_features_)
#trovo che il numero ottimale di features da utilizzare è 15
optimal num of features: 15
In [68]:
#plot results RFECV
In [69]:
#trovo quli sono le 15 features da utilizzare
rfecv.support_
Out[69]:
array([False, False, False, False, False,  True, False,  True, False,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False, False, False, False,  True,  True])
In [29]:
#salvo le 15 features in selected_col
selected_col = [ 'loan_percent_income', 
       'person_home_ownership_MORTGAGE', 
       'person_home_ownership_OWN', 'person_home_ownership_RENT',
       'loan_intent_DEBTCONSOLIDATION', 'loan_intent_EDUCATION',
       'loan_intent_HOMEIMPROVEMENT', 'loan_intent_MEDICAL',
       'loan_intent_PERSONAL', 'loan_intent_VENTURE', 'loan_grade_A',
       'loan_grade_B', 'loan_grade_C',  'cb_person_default_on_file_N',
       'cb_person_default_on_file_Y']
In [30]:
#creo il nuovo training set contenente solo le features selezionate
FSos_x_train = os_x_train[selected_col]
FSos_x_train
Out[30]:
loan_percent_income person_home_ownership_MORTGAGE person_home_ownership_OWN person_home_ownership_RENT loan_intent_DEBTCONSOLIDATION loan_intent_EDUCATION loan_intent_HOMEIMPROVEMENT loan_intent_MEDICAL loan_intent_PERSONAL loan_intent_VENTURE loan_grade_A loan_grade_B loan_grade_C cb_person_default_on_file_N cb_person_default_on_file_Y
0 0.160000 1 0 0 0 0 0 0 1 0 0 1 0 1 0
1 0.140000 0 0 1 1 0 0 0 0 0 0 0 1 0 1
2 0.110000 0 0 1 0 0 0 0 0 1 1 0 0 1 0
3 0.070000 0 1 0 0 1 0 0 0 0 0 1 0 1 0
4 0.120000 0 0 1 0 1 0 0 0 0 0 0 1 0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
35537 0.370000 0 0 1 0 0 0 0 0 0 0 0 0 0 0
35538 0.400000 0 0 1 0 0 0 0 0 0 0 1 0 1 0
35539 0.090000 1 0 0 0 0 0 1 0 0 0 1 0 1 0
35540 0.130000 0 0 0 0 0 0 0 0 0 0 0 0 0 1
35541 0.085144 0 0 0 0 0 0 0 0 0 0 0 0 1 0

35542 rows × 15 columns

In [31]:
#e il nuovo set set contenente solo le 15 features selezionate
FSx_test = x_test[selected_col]
FSx_test
Out[31]:
loan_percent_income person_home_ownership_MORTGAGE person_home_ownership_OWN person_home_ownership_RENT loan_intent_DEBTCONSOLIDATION loan_intent_EDUCATION loan_intent_HOMEIMPROVEMENT loan_intent_MEDICAL loan_intent_PERSONAL loan_intent_VENTURE loan_grade_A loan_grade_B loan_grade_C cb_person_default_on_file_N cb_person_default_on_file_Y
8231 0.18 0 1 0 0 0 0 1 0 0 1 0 0 1 0
5444 0.09 0 0 1 0 0 1 0 0 0 0 0 1 0 1
7881 0.05 1 0 0 0 0 0 0 0 1 0 0 1 1 0
13540 0.16 0 0 1 0 0 0 0 0 1 1 0 0 1 0
256 0.41 1 0 0 1 0 0 0 0 0 0 0 1 1 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
28732 0.16 1 0 0 1 0 0 0 0 0 1 0 0 1 0
25828 0.15 0 0 1 0 0 0 0 0 1 0 0 1 0 1
27232 0.03 1 0 0 0 1 0 0 0 0 0 0 1 1 0
11159 0.20 0 0 1 0 1 0 0 0 0 1 0 0 1 0
15315 0.05 1 0 0 0 1 0 0 0 0 1 0 0 1 0

9773 rows × 15 columns

In [47]:
#fitto il modello di regressione sul training set con numero di features ridotte
LRFS = LogisticRegression()
LRFS.fit(FSos_x_train, os_y_train)
Out[47]:
LogisticRegression()
In [48]:
#previsione sul training set
p_trainFS = LRFS.predict(FSos_x_train)
#previsione sul test set
p_testFS = LRFS.predict(FSx_test)

print('Accuracy of logistic regression classifier on oversampled train set: {:.2f}'.format(LRFS.score(FSos_x_train, os_y_train)))
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(LRFS.score(FSx_test, y_test)))
Accuracy of logistic regression classifier on oversampled train set: 0.91
Accuracy of logistic regression classifier on test set: 0.87
In [49]:
#confusion matrix sul train set
confusion_matrix_train = confusion_matrix(os_y_train, p_trainFS)
print(confusion_matrix_train)
#confusion matrix sul test set. Uso questa per giudicare la bontà del modello
confusion_matrix_test = confusion_matrix(y_test, p_testFS)
print(confusion_matrix_test)
[[16758  1013]
 [ 2346 15425]]
[[7272  424]
 [ 887 1190]]
In [50]:
#stampo un riepilogo delle misure di performance 
print(classification_report(y_test, p_testFS))
              precision    recall  f1-score   support

           0       0.89      0.94      0.92      7696
           1       0.74      0.57      0.64      2077

    accuracy                           0.87      9773
   macro avg       0.81      0.76      0.78      9773
weighted avg       0.86      0.87      0.86      9773

In [79]:
#MODEL : SVM (grid search for best param)
#stabilisco una griglia di parametri su cui andrò a testare il modello
param_grid = [
  {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
  {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
 ]
In [55]:
# Grid search for best hyperparameters
# creo il modello base senza ancora i parametri ottimali
SVM = SVC()
#imposto la random search sulla griglia definita precedentemente
SVM_gridsearch = GridSearchCV(SVM, param_grid)
# Fitto il modello random search sul training set
SVM_gridsearch.fit(os_x_train_scaled, os_y_train)
Out[55]:
GridSearchCV(estimator=SVC(),
             param_grid=[{'C': [1, 10, 100, 1000], 'kernel': ['linear']},
                         {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
                          'kernel': ['rbf']}])
In [56]:
#stampo la combinazione di parametri ottimale
SVM_gridsearch.best_params_
Out[56]:
{'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}
In [ ]:
#C: 1000 , gamma : 0.001, kernel 'rbf'
In [51]:
#definisco il modello Support vector classifer utilizzando i parametri appena trovati
SVM = SVC(C=1000, gamma=0.001, kernel='rbf',probability=True)
In [52]:
#fitto il modello sul training set
SVM.fit(os_x_train_scaled, os_y_train)
Out[52]:
SVC(C=1000, gamma=0.001, probability=True)
In [53]:
#previsioni su train e test set
p_trainSVM = SVM.predict(os_x_train_scaled)
p_testSVM = SVM.predict(x_test_scaled)

print('Accuracy of SVM classifier on oversampled train set: {:.2f}'.format(SVM.score(os_x_train_scaled, os_y_train)))
print('Accuracy of SVM classifier on test set: {:.2f}'.format(SVM.score(x_test_scaled, y_test)))
Accuracy of SVM classifier on oversampled train set: 0.94
Accuracy of SVM classifier on test set: 0.91
In [54]:
#confusion matrix su train set
confusion_matrix_train = confusion_matrix(os_y_train, p_trainSVM)
print(confusion_matrix_train)
#confusion matrix su test set
confusion_matrix_test = confusion_matrix(y_test, p_testSVM)
print(confusion_matrix_test)
[[17429   342]
 [ 1778 15993]]
[[7527  169]
 [ 731 1346]]
In [55]:
#stampo riepilogo delle misure di performance
print(classification_report(y_test, p_testSVM))
              precision    recall  f1-score   support

           0       0.91      0.98      0.94      7696
           1       0.89      0.65      0.75      2077

    accuracy                           0.91      9773
   macro avg       0.90      0.81      0.85      9773
weighted avg       0.91      0.91      0.90      9773

In [ ]:
 
In [95]:
#SVM with feature selection
#utilizzo il dataset con le features ridotte trovato prima 
#scalo anche questo dataset
scaler = preprocessing.StandardScaler().fit(FSos_x_train)
FSos_x_train_scaled = scaler.transform(FSos_x_train)
FSx_test_scaled = scaler.transform(FSx_test)
In [96]:
#stabilisco una griglia di parametri su cui andrò a testare il modello
param_grid = [
  {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
  {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
 ]
In [97]:
# Grid search for best hyperparameters
# creo il modello base senza ancora i parametri ottimali
SVMFS = SVC()
#imposto la random search sulla griglia definita precedentemente
SVMFS_gridsearch = GridSearchCV(SVMFS, param_grid)
# Fitto il modello random search sul training set
SVMFS_gridsearch.fit(FSos_x_train_scaled, os_y_train)
Out[97]:
GridSearchCV(estimator=SVC(),
             param_grid=[{'C': [1, 10, 100, 1000], 'kernel': ['linear']},
                         {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
                          'kernel': ['rbf']}])
In [98]:
#stampo la combinazione di parametri ottimale
SVMFS_gridsearch.best_params_
Out[98]:
{'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}
In [99]:
#definisco il modello Support vector classifier
SVMFS = SVC(C=1000, gamma=0.001, kernel='rbf',probability=True)
In [100]:
#fitto il modello sul dataset con features ridotte
SVMFS.fit(FSos_x_train_scaled, os_y_train)
Out[100]:
SVC(C=1000, gamma=0.001, probability=True)
In [102]:
#previsioni sul train e test set
p_trainSVMFS = SVMFS.predict(FSos_x_train_scaled)
p_testSVMFS = SVMFS.predict(FSx_test_scaled)

print('Accuracy of SVM classifier on oversampled train set: {:.2f}'.format(SVMFS.score(FSos_x_train_scaled, os_y_train)))
print('Accuracy of SVM classifier on test set: {:.2f}'.format(SVMFS.score(FSx_test_scaled, y_test)))
#93 89
Accuracy of SVM classifier on oversampled train set: 0.93
Accuracy of SVM classifier on test set: 0.89
In [103]:
#confusion matrix su train set 
confusion_matrix_train = confusion_matrix(os_y_train, p_trainSVMFS)
print(confusion_matrix_train)
#confusion matrix su test set 
confusion_matrix_test = confusion_matrix(y_test, p_testSVMFS)
print(confusion_matrix_test)
[[17234   537]
 [ 2038 15733]]
[[7444  252]
 [ 784 1293]]
In [104]:
#stampo misure di performance
print(classification_report(y_test, p_testSVMFS))
              precision    recall  f1-score   support

           0       0.90      0.97      0.93      7696
           1       0.84      0.62      0.71      2077

    accuracy                           0.89      9773
   macro avg       0.87      0.79      0.82      9773
weighted avg       0.89      0.89      0.89      9773

In [ ]:
 
In [30]:
# MODEL: KNN 
#definisco il modello KNN per ora con paramentri arbitrari
KNN = KNeighborsClassifier(n_neighbors=10, metric= 'minkowski', p=2)
#fitto il modello sul training set completo di tutte le features
KNN.fit(os_x_train_scaled, os_y_train)
In [27]:
#previsioni sul train e test set 
p_trainKNN = KNN.predict(os_x_train_scaled)
p_testKNN = KNN.predict(x_test_scaled)

print('Accuracy of KNN classifier on oversampled train set: {:.2f}'.format(KNN.score(os_x_train_scaled, os_y_train)))
print('Accuracy of KNN classifier on test set: {:.2f}'.format(KNN.score(x_test_scaled, y_test)))
Accuracy of KNN classifier on oversampled train set: 0.94
Accuracy of KNN classifier on test set: 0.89
In [28]:
#confusion matrix su train set
confusion_matrix_train = confusion_matrix(os_y_train, p_trainKNN)
print(confusion_matrix_train)
#confusion matrix sul test set
confusion_matrix_test = confusion_matrix(y_test, p_testKNN)
print(confusion_matrix_test)
[[17541   230]
 [ 2034 15737]]
[[7561  135]
 [ 893 1184]]
In [29]:
#stampo le performance measures del modello
print(classification_report(y_test, p_testKNN))
              precision    recall  f1-score   support

           0       0.89      0.98      0.94      7696
           1       0.90      0.57      0.70      2077

    accuracy                           0.89      9773
   macro avg       0.90      0.78      0.82      9773
weighted avg       0.90      0.89      0.89      9773

In [49]:
#il parametro fondamentale del KNN è il numero di vicini K
#trovo quale sia il K ottimale utilizzando, come per il modello precedente, una random search su una griglia da me definita
#best choice of n_neighbors using grid search
#scelgo come punti della griglia 5, 10, 15,20,...,40,45,50
parameters = {"n_neighbors": range(5,50,5)}
gridsearch = GridSearchCV (KNeighborsClassifier(), parameters)
gridsearch.fit(os_x_train_scaled, os_y_train)
In [45]:
#stampo K ottimale
gridsearch.best_params_
Out[45]:
{'n_neighbors': 5}
In [48]:
#provo a migliorare ulteriormente il risultato
#definisco una nuova griglia intorno a k=5
parameters = {"n_neighbors": range(2,9)}
gridsearch = GridSearchCV (KNeighborsClassifier(), parameters)
gridsearch.fit(os_x_train_scaled, os_y_train)
In [47]:
gridsearch.best_params_
#il k ottimale risulta ancora essere 5
Out[47]:
{'n_neighbors': 5}
In [56]:
#creo modello con K ottimale
KNN = KNeighborsClassifier(n_neighbors=5, metric= 'minkowski', p=2)
#e lo fitto sul train set
KNN.fit(os_x_train_scaled, os_y_train)
Out[56]:
KNeighborsClassifier()
In [57]:
#faccio le previsioni su train e test set
p_trainKNN = KNN.predict(os_x_train_scaled)
p_testKNN = KNN.predict(x_test_scaled)

print('Accuracy of KNN classifier on oversampled train set: {:.2f}'.format(KNN.score(os_x_train_scaled, os_y_train)))
print('Accuracy of KNN classifier on test set: {:.2f}'.format(KNN.score(x_test_scaled, y_test)))
Accuracy of KNN classifier on oversampled train set: 0.95
Accuracy of KNN classifier on test set: 0.89
In [58]:
#confusion matrix su train set
confusion_matrix_train = confusion_matrix(os_y_train, p_trainKNN)
print(confusion_matrix_train)
#confusion matrix su test set
confusion_matrix_test = confusion_matrix(y_test, p_testKNN)
print(confusion_matrix_test)
[[17426   345]
 [ 1556 16215]]
[[7439  257]
 [ 797 1280]]
In [59]:
#riepilogo misure di performance del modello
print(classification_report(y_test, p_testKNN))
              precision    recall  f1-score   support

           0       0.90      0.97      0.93      7696
           1       0.83      0.62      0.71      2077

    accuracy                           0.89      9773
   macro avg       0.87      0.79      0.82      9773
weighted avg       0.89      0.89      0.89      9773

In [64]:
#KNN CON FEATURE SELECTION
#ripeto lo stesso modello ma questa volta sul dataset di sole 15 features
#scalo questo dataset
scaler = preprocessing.StandardScaler().fit(FSos_x_train)
FSos_x_train_scaled = scaler.transform(FSos_x_train)
FSx_test_scaled = scaler.transform(FSx_test)
In [122]:
#definisco una nuova griglia intorno a k=10
parameters = {"n_neighbors": range(6,14)}
gridsearch = GridSearchCV (KNeighborsClassifier(), parameters)
gridsearch.fit(FSos_x_train_scaled, os_y_train)
Out[122]:
GridSearchCV(estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': range(6, 14)})
In [123]:
gridsearch.best_params_
#k=9 optimal value
Out[123]:
{'n_neighbors': 9}
In [65]:
#creo il modello e lo fitto
KNNFS = KNeighborsClassifier(n_neighbors=9, metric= 'minkowski', p=2)
KNNFS.fit(FSos_x_train_scaled, os_y_train)
Out[65]:
KNeighborsClassifier(n_neighbors=9)
In [66]:
#eseguo le previsioni sul train e test set
p_trainKNNFS = KNNFS.predict(FSos_x_train_scaled)
p_testKNNFS = KNNFS.predict(FSx_test_scaled)

print('Accuracy of KNN classifier on oversampled train set: {:.2f}'.format(KNNFS.score(FSos_x_train_scaled, os_y_train)))
print('Accuracy of KNN classifier on test set: {:.2f}'.format(KNNFS.score(FSx_test_scaled, y_test)))
Accuracy of KNN classifier on oversampled train set: 0.93
Accuracy of KNN classifier on test set: 0.89
In [67]:
#confusion matrix su train set
confusion_matrix_train = confusion_matrix(os_y_train, p_trainKNNFS)
print(confusion_matrix_train)
#confusion matrix su test set
confusion_matrix_test = confusion_matrix(y_test, p_testKNNFS)
print(confusion_matrix_test)
[[17355   416]
 [ 1933 15838]]
[[7439  257]
 [ 794 1283]]
In [68]:
#riepilogo misure di performance
print(classification_report(y_test, p_testKNNFS))
              precision    recall  f1-score   support

           0       0.90      0.97      0.93      7696
           1       0.83      0.62      0.71      2077

    accuracy                           0.89      9773
   macro avg       0.87      0.79      0.82      9773
weighted avg       0.89      0.89      0.89      9773

In [ ]:
 
In [77]:
# MODEL: RANDOM FOREST (RF)
#creo modello random forest, setto un seme per ottenere sempre lo stesso risultato essendo questo un metodo non deterministico
RF = RandomForestClassifier(random_state=0)
#per ora ho lasciato i parametri di default
#vado a vedere quali sono tali parametri 
print('Parameters currently in use:\n')
pprint(RF.get_params())
Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': False}
In [49]:
# fitto la random forestcon i parametri di default
RF.fit(os_x_train, os_y_train)
Out[49]:
RandomForestClassifier(random_state=0)
In [50]:
#faccio previsioni su train e test set
p_train_RF = RF.predict(os_x_train)
p_test_RF = RF.predict(x_test)

print('Accuracy of random forest on oversampled train set: {:.2f}'.format(rf.score(os_x_train, os_y_train)))
print('Accuracy of random forest on test set: {:.2f}'.format(rf.score(x_test, y_test)))
#il modello evidentemente, vista l'accuratezza pari a 1 sul train set splitta fino a quando tutti i leaves contengono
#solo un'osservazione ----> i paramentri vanno cambiati
Accuracy of random forest on oversampled train set: 1.00
Accuracy of random forest on test set: 0.93
In [51]:
print(p_train_RF)
[0 0 0 ... 1 1 1]
In [53]:
#confusion matrix su train set
confusion_matrix_train = confusion_matrix(os_y_train, p_train_RF)
print(confusion_matrix_train)
#confusion matrix su test set
confusion_matrix_test = confusion_matrix(y_test, p_test_RF)
print(confusion_matrix_test)
[[17771     0]
 [    0 17771]]
[[7598   98]
 [ 582 1495]]
In [56]:
#riepilogo misure d performance del modello
print(classification_report(y_test, p_test_RF))
              precision    recall  f1-score   support

           0       0.93      0.99      0.96      7696
           1       0.94      0.72      0.81      2077

    accuracy                           0.93      9773
   macro avg       0.93      0.85      0.89      9773
weighted avg       0.93      0.93      0.93      9773

In [ ]:
#cambiamo i parametri del modello
In [79]:
#come per i modelli precedenti troviamo i parametri ottimali eseguendo una random search su una griglia
# definisco la random hyperparameter grid
# numero di alberi nella random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 5)]
# numero di features considerato ad ogni split
max_features = ['auto', 'sqrt']
# massima profondità dell'albero
max_depth = [int(x) for x in np.linspace(10, 80, num = 4)]
max_depth.append(None)
# minimo numero di samples per splittare un nodo
min_samples_split = [2, 5, 10]
# minimo numero di samples richiesto a ogni nodo terminale dell'albero
min_samples_leaf = [ 2, 4]
# metodo per selezionare samples per training di ogni albero
bootstrap = [True]
# creo la random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)
{'bootstrap': [True],
 'max_depth': [10, 33, 56, 80, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 650, 1100, 1550, 2000]}
In [49]:
# uso la random grid per la ricerca dei paramentri ottimali
# prima creo il modello base su cui voglio fare tuning
RF = RandomForestClassifier()
# Random search dei parametri, usando 3 fold cross validation, 
RF_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 50, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fitto il modello random search
RF_random.fit(os_x_train, os_y_train)
Fitting 3 folds for each of 50 candidates, totalling 150 fits
Out[49]:
RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=50,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True],
                                        'max_depth': [10, 33, 56, 80, None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 650, 1100, 1550,
                                                         2000]},
                   random_state=42, verbose=2)
In [50]:
#trovo i best parameters
RF_random.best_params_
Out[50]:
{'n_estimators': 200,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 33,
 'bootstrap': True}
In [69]:
#RANDOM FOREST with best parameters
#definisco il modello con i parametri ottimali
RF = RandomForestClassifier(random_state=0, n_estimators= 200,
min_samples_split= 2,min_samples_leaf= 2,max_features= 'sqrt',
max_depth= 33,bootstrap= True )
In [70]:
#e lo fitto sul train set
RF.fit(os_x_train, os_y_train)
Out[70]:
RandomForestClassifier(max_depth=33, max_features='sqrt', min_samples_leaf=2,
                       n_estimators=200, random_state=0)
In [71]:
#effettuo previsioni su train e test set
p_train_RF = RF.predict(os_x_train)
p_test_RF = RF.predict(x_test)

print('Accuracy of random forest on oversampled train set: {:.2f}'.format(RF.score(os_x_train, os_y_train)))
print('Accuracy of random forest on test set: {:.2f}'.format(RF.score(x_test, y_test)))
Accuracy of random forest on oversampled train set: 0.98
Accuracy of random forest on test set: 0.93
In [72]:
#confusion matrix su train set
confusion_matrix_train = confusion_matrix(os_y_train, p_train_RF)
print(confusion_matrix_train)
#confusion matrix su test set
confusion_matrix_test = confusion_matrix(y_test, p_test_RF)
print(confusion_matrix_test)
[[17753    18]
 [  734 17037]]
[[7629   67]
 [ 597 1480]]
In [73]:
#riepilogo misure di performance
print(classification_report(y_test, p_test_RF))
              precision    recall  f1-score   support

           0       0.93      0.99      0.96      7696
           1       0.96      0.71      0.82      2077

    accuracy                           0.93      9773
   macro avg       0.94      0.85      0.89      9773
weighted avg       0.93      0.93      0.93      9773

In [87]:
#cerco di migliorare ulteriormente i risultati utilizzando una grid search
#baso la mia nuova griglia sui risultati otteniti prima, andando a cercare 'nelle vicinanze' di quello che,
#secondo la grid search di prima, era il miglior modello
#definisco la nuova griglia
param_grid = {
    'bootstrap': [True],
    'max_depth': [25,30,33,35],
    'max_features': ['sqrt'],
    'min_samples_leaf': [2,5],
    'min_samples_split': [2,5],
    'n_estimators': [200]
}
In [88]:
# modello grid search 
grid_search = GridSearchCV(estimator = RandomForestClassifier(), param_grid = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2)
In [89]:
# Fitto la grid search ai dati
grid_search.fit(os_x_train, os_y_train)
#trovo i best parameters
grid_search.best_params_
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Out[89]:
{'bootstrap': True,
 'max_depth': 33,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 200}
In [56]:
#teniamo RF come best parameter model in quanto risulta ancora il miglio modello
In [ ]:
 
In [128]:
#MODEL: RANDOM FOREST con feature selection
#uso di nuovo la random forest ma questa volta sul dataset con features ridotte

#come per i modelli precedenti troviamo i parametri ottimali eseguendo una random search su una griglia
# definisco la random hyperparameter grid
# numero di alberi nella foresta
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 5)]
# numero di features considerate ad ogni split
max_features = ['auto', 'sqrt']
# profondità massima dell'albero
max_depth = [int(x) for x in np.linspace(10, 80, num = 4)]
max_depth.append(None)
# minmimo numero di samples richiesto per lo split di un nodo
min_samples_split = [2, 5, 10]
# minimo numero di samples richiesto a ogni leaf
min_samples_leaf = [ 2, 4]
# metodo per seleionare i samples per il training di ciascun albero
bootstrap = [True]
# Creo la random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)
{'bootstrap': [True],
 'max_depth': [10, 33, 56, 80, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 650, 1100, 1550, 2000]}
In [130]:
#uso la random grid per trovare i best hyperparameters
#creo il modello base su cui fare tuning
RFFS = RandomForestClassifier()
# Random search dei parametri, usando 3 fold cross validation, 
RFFS_random = RandomizedSearchCV(estimator = RFFS, param_distributions = random_grid, n_iter = 50, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fitto il modello random search
RFFS_random.fit(FSos_x_train, os_y_train)
Fitting 3 folds for each of 50 candidates, totalling 150 fits
Out[130]:
RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=50,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True],
                                        'max_depth': [10, 33, 56, 80, None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 650, 1100, 1550,
                                                         2000]},
                   random_state=42, verbose=2)
In [131]:
#trovo i parametri migliori secondo il modello
RFFS_random.best_params_
Out[131]:
{'n_estimators': 1550,
 'min_samples_split': 10,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 80,
 'bootstrap': True}
In [74]:
#definisco il nuovo modello utilizzando i parametri trovati
RFFS = RandomForestClassifier(random_state=0, n_estimators= 1550,
 min_samples_split= 10,min_samples_leaf= 4,max_features= 'sqrt',
 max_depth= 80,bootstrap= True )
In [75]:
#fitto il modello sul train set con numero di features ridotto
RFFS.fit(FSos_x_train, os_y_train)
Out[75]:
RandomForestClassifier(max_depth=80, max_features='sqrt', min_samples_leaf=4,
                       min_samples_split=10, n_estimators=1550, random_state=0)
In [76]:
#eseguo le previsioni
p_train_RFFS = RFFS.predict(FSos_x_train)
p_test_RFFS = RFFS.predict(FSx_test)

print('Accuracy of random forest on oversampled train set: {:.2f}'.format(RFFS.score(FSos_x_train, os_y_train)))
print('Accuracy of random forest on test set: {:.2f}'.format(RFFS.score(FSx_test, y_test)))
#94 98
Accuracy of random forest on oversampled train set: 0.94
Accuracy of random forest on test set: 0.89
In [77]:
#confusion matrix su train set
confusion_matrix_train = confusion_matrix(os_y_train, p_train_RFFS)
print(confusion_matrix_train)
#confusion matrix su test set
confusion_matrix_test = confusion_matrix(y_test, p_test_RFFS)
print(confusion_matrix_test)
#7405   291
#786   1291
[[17391   380]
 [ 1880 15891]]
[[7457  239]
 [ 793 1284]]
In [78]:
#misure di performance
print(classification_report(y_test, p_test_RFFS))
#82  62   71   89
              precision    recall  f1-score   support

           0       0.90      0.97      0.94      7696
           1       0.84      0.62      0.71      2077

    accuracy                           0.89      9773
   macro avg       0.87      0.79      0.82      9773
weighted avg       0.89      0.89      0.89      9773

In [ ]:
 
In [32]:
#MODEL: ADABOOST (AB)
#il metodo ada boost è un metodo ensemble basato su classificatori di base che solitamente sono scelti come 
#alberi con un solo split
#definisco quindi il classificatore di base, settando il seme per ottenere sempre lo stesso risultato e 
#fissando a 1 la profondità massima dell'albero
#base estimator: decision tree
decTree = tree.DecisionTreeClassifier(random_state=0, max_depth=1)
#fitto l'albero sul train set 
decTree.fit(os_x_train, os_y_train)
#rappresentazione dell'albero
tree.plot_tree(decTree)
Out[32]:
[Text(0.5, 0.75, 'X[5] <= 0.3\ngini = 0.5\nsamples = 35542\nvalue = [17771, 17771]'),
 Text(0.25, 0.25, 'gini = 0.476\nsamples = 27833\nvalue = [16974, 10859]'),
 Text(0.75, 0.25, 'gini = 0.185\nsamples = 7709\nvalue = [797, 6912]')]
In [ ]:
#adaboost classifier
#a questo punto posso definire il modello adaboost
AB = AdaBoostClassifier(n_estimators=50, base_estimator=decTree, learning_rate=1)
#fitto il modello sul train set
AB.fit(os_x_train, os_y_train)
In [ ]:
#previsioni su train e test set
p_train_AB = AB.predict(os_x_train)
p_test_AB = AB.predict(x_test)

print('Accuracy of random forest on oversampled train set: {:.2f}'.format(AB.score(os_x_train, os_y_train)))
print('Accuracy of random forest on test set: {:.2f}'.format(AB.score(x_test, y_test)))
In [ ]:
#confusion matrix su train set
confusion_matrix_train = confusion_matrix(os_y_train, p_train_AB)
print(confusion_matrix_train)
#confusion matrix su test set
confusion_matrix_test = confusion_matrix(y_test, p_test_AB)
print(confusion_matrix_test)
In [105]:
#hyperparameter tuning
#cerco parametri ottimali 

#definisco la griglia:
# Number of trees
n_estimators = [int(x) for x in np.linspace(start = 0, stop = 1000, num = 50)]
#learning rate
learning_rate=[0.4,0.5,0.6,0.7,0.8,0.9,1]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'learning_rate': learning_rate}
In [106]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
AB = AdaBoostClassifier()
# Random search of parameters, using 3 fold cross validation, 
AB_random = RandomizedSearchCV(estimator = AB, param_distributions = random_grid, n_iter = 50, cv = 5, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
AB_random.fit(os_x_train, os_y_train)
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Out[106]:
RandomizedSearchCV(cv=5, estimator=AdaBoostClassifier(), n_iter=50, n_jobs=-1,
                   param_distributions={'learning_rate': [0.4, 0.5, 0.6, 0.7,
                                                          0.8, 0.9, 1],
                                        'n_estimators': [0, 20, 40, 61, 81, 102,
                                                         122, 142, 163, 183,
                                                         204, 224, 244, 265,
                                                         285, 306, 326, 346,
                                                         367, 387, 408, 428,
                                                         448, 469, 489, 510,
                                                         530, 551, 571, 591, ...]},
                   random_state=42, verbose=2)
In [107]:
#trovo parametri ottiamli
AB_random.best_params_   
Out[107]:
{'n_estimators': 897, 'learning_rate': 0.9}
In [38]:
#MODEL: ADABOOST with best parameters
#definisco il modello con i best parameters trovati
AB = AdaBoostClassifier(n_estimators=897, base_estimator=decTree, learning_rate=0.9)
AB.fit(os_x_train, os_y_train)
In [34]:
#eseguo le previsioni su train e test set
p_train_AB = AB.predict(os_x_train)
p_test_AB = AB.predict(x_test)

print('Accuracy of random forest on oversampled train set: {:.2f}'.format(AB.score(os_x_train, os_y_train)))
print('Accuracy of random forest on test set: {:.2f}'.format(AB.score(x_test, y_test)))
Accuracy of random forest on oversampled train set: 0.93
Accuracy of random forest on test set: 0.89
In [35]:
#confusion matrix su train set
confusion_matrix_train = confusion_matrix(os_y_train, p_train_AB)
print(confusion_matrix_train)
#confusion matrix su test set
confusion_matrix_test = confusion_matrix(y_test, p_test_AB)
print(confusion_matrix_test)
[[16912   859]
 [ 1627 16144]]
[[7302  394]
 [ 671 1406]]
In [36]:
#riepilogo performance measures
print(classification_report(y_test, p_test_AB))
              precision    recall  f1-score   support

           0       0.92      0.95      0.93      7696
           1       0.78      0.68      0.73      2077

    accuracy                           0.89      9773
   macro avg       0.85      0.81      0.83      9773
weighted avg       0.89      0.89      0.89      9773

In [37]:
#cerco di migliorere ulteriormente adaBoost usando grid search
#la nuova griglia è scelta in modo da essere un intorno del punto ottimale trovato precedentemente
param_grid = {
    'n_estimators':[850,897,900,950],
    'learning_rate':[0.85,0.88,0.9,0.92,0.95]
}
In [114]:
# creo il modello grid search 
grid_search = GridSearchCV(estimator = AdaBoostClassifier(), param_grid = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2)
In [115]:
# Fittto grid search sui dati
grid_search.fit(os_x_train, os_y_train)
#stampo paramentri ottimali
grid_search.best_params_
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Out[115]:
{'learning_rate': 0.92, 'n_estimators': 950}
In [81]:
#definisco modello con i nuovi parametri 
AB = AdaBoostClassifier(n_estimators=950, base_estimator=decTree, learning_rate=0.92)
#e lo fitto sul train set
AB.fit(os_x_train, os_y_train)
Out[81]:
AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1,
                                                         random_state=0),
                   learning_rate=0.92, n_estimators=950)
In [82]:
#eseguo le previsioni
p_train_AB = AB.predict(os_x_train)
p_test_AB = AB.predict(x_test)

print('Accuracy of random forest on oversampled train set: {:.2f}'.format(AB.score(os_x_train, os_y_train)))
print('Accuracy of random forest on test set: {:.2f}'.format(AB.score(x_test, y_test)))
Accuracy of random forest on oversampled train set: 0.93
Accuracy of random forest on test set: 0.89
In [83]:
#confusion matrix su train set
confusion_matrix_train = confusion_matrix(os_y_train, p_train_AB)
print(confusion_matrix_train)
#confusion matrix su test set
confusion_matrix_test = confusion_matrix(y_test, p_test_AB)
print(confusion_matrix_test)
[[16923   848]
 [ 1628 16143]]
[[7297  399]
 [ 678 1399]]
In [84]:
#riepilogo misure di performance
print(classification_report(y_test, p_test_AB))
              precision    recall  f1-score   support

           0       0.91      0.95      0.93      7696
           1       0.78      0.67      0.72      2077

    accuracy                           0.89      9773
   macro avg       0.85      0.81      0.83      9773
weighted avg       0.89      0.89      0.89      9773

In [ ]:
 
In [85]:
#MODEL: ADABOOST con feature selection
#ripeto lo stesso modello ma usando il dataset contente solo 15 features
#definisco lo stimatore base 
#base estimator: decision tree
decTree3 = tree.DecisionTreeClassifier(random_state=0, max_depth=1)

decTree3.fit(FSos_x_train, os_y_train)
tree.plot_tree(decTree)

#hyperparameter tuning
#cerco parametri ottimali 

#definisco la griglia:
# Number of trees
n_estimators = [int(x) for x in np.linspace(start = 0, stop = 1000, num = 50)]
#learning rate
learning_rate=[0.4,0.5,0.6,0.7,0.8,0.9,1]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'learning_rate': learning_rate}
In [86]:
# Uso random grid per cercare i best parameters 
# creo il modello base su cui fare tuning
ABFS = AdaBoostClassifier()
# Random search dei parametri, usando 3 fold cross validation, 
ABFS_random = RandomizedSearchCV(estimator = ABFS, param_distributions = random_grid, n_iter = 50, cv = 5, verbose=2, random_state=42, n_jobs = -1)
# fitto il modello random search
ABFS_random.fit(FSos_x_train, os_y_train)
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Out[86]:
RandomizedSearchCV(cv=5, estimator=AdaBoostClassifier(), n_iter=50, n_jobs=-1,
                   param_distributions={'learning_rate': [0.4, 0.5, 0.6, 0.7,
                                                          0.8, 0.9, 1],
                                        'n_estimators': [0, 20, 40, 61, 81, 102,
                                                         122, 142, 163, 183,
                                                         204, 224, 244, 265,
                                                         285, 306, 326, 346,
                                                         367, 387, 408, 428,
                                                         448, 469, 489, 510,
                                                         530, 551, 571, 591, ...]},
                   random_state=42, verbose=2)
In [87]:
#trovo parametri ottiamli
ABFS_random.best_params_ 
Out[87]:
{'n_estimators': 102, 'learning_rate': 0.4}
In [88]:
#MODEL: ADABOOST with best parameters
#definisco il modello con i best parameters trovati
ABFS = AdaBoostClassifier(n_estimators= 102  , base_estimator=decTree3, learning_rate= 0.4)
ABFS.fit(FSos_x_train, os_y_train)
Out[88]:
AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1,
                                                         random_state=0),
                   learning_rate=0.4, n_estimators=102)
In [89]:
#eseguo le previsioni su train e test set
p_train_ABFS = ABFS.predict(FSos_x_train)
p_test_ABFS = ABFS.predict(FSx_test)

print('Accuracy of random forest on oversampled train set: {:.2f}'.format(ABFS.score(FSos_x_train, os_y_train)))
print('Accuracy of random forest on test set: {:.2f}'.format(ABFS.score(FSx_test, y_test)))
Accuracy of random forest on oversampled train set: 0.92
Accuracy of random forest on test set: 0.88
In [90]:
#confusion matrix su train set
confusion_matrix_train = confusion_matrix(os_y_train, p_train_ABFS)
print(confusion_matrix_train)
#confusion matrix su test set
confusion_matrix_test = confusion_matrix(y_test, p_test_ABFS)
print(confusion_matrix_test)
#7245  451
#748  1329
[[16619  1152]
 [ 1839 15932]]
[[7205  491]
 [ 679 1398]]
In [91]:
#riepilogo misure di performance
print(classification_report(y_test, p_test_ABFS))
#75   64   69  88
              precision    recall  f1-score   support

           0       0.91      0.94      0.92      7696
           1       0.74      0.67      0.70      2077

    accuracy                           0.88      9773
   macro avg       0.83      0.80      0.81      9773
weighted avg       0.88      0.88      0.88      9773

In [92]:
#confronto i metodi usati utilizzando le curve ROC
#confronto per tutte e 5 le tipologie di modello (regressione logistica, SVM, KNN, random forest, AdaBoost) i modelli
#ottenuti con i best parameters e utilizzando tutte le features

#roc (all features)

#creo la figura
fig = plt.figure(figsize=(8,5))
plt.plot([0, 1], [0, 1],'r--')


# LOGISTIC REGRESSION (logreg)
#previsioni delle prob di default e di non default del modello regressione logistica
preds_proba_LR = LR.predict_proba(x_test)
#il risultato è una matrice con due colonne. Ciascuna colonna contiene rispettivamente la prob di default e la 
#prob di non default di quell'inidividuo

#di questa matrice prendo solo la prima colonna, la seconda sarà semplicemente (1-prima colonna)
probsLR = preds_proba_LR[:, 1]
#disegno roc curve per regressione logistica
fpr, tpr, thresh = metrics.roc_curve(y_test, probsLR)
aucLR = roc_auc_score(y_test, probsLR)
plt.plot(fpr, tpr, label=f'LR, AUC = {str(round(aucLR,3))}')


#ripeto gli stessi passaggi per gli altri 4 metodi
# SVM 
preds_proba_SVM = SVM.predict_proba(x_test_scaled)
probsSVM = preds_proba_SVM[:, 1]
fpr, tpr, thresh = metrics.roc_curve(y_test, probsSVM)
auclg = roc_auc_score(y_test, probsSVM)
plt.plot(fpr, tpr, label=f'SVM, AUC = {str(round(auclg,3))}')


# KNN 
preds_proba_KNN = KNN.predict_proba(x_test_scaled)
probsKNN = preds_proba_KNN[:, 1]
fpr, tpr, thresh = metrics.roc_curve(y_test, probsKNN)
auclg = roc_auc_score(y_test, probsKNN)
plt.plot(fpr, tpr, label=f'KNN, AUC = {str(round(auclg,3))}')


#RANDOM FOREST (RF)
preds_proba_RF = RF.predict_proba(x_test)
probsRF = preds_proba_RF[:, 1]
fpr, tpr, thresh = metrics.roc_curve(y_test, probsRF)
auclg = roc_auc_score(y_test, probsRF)
plt.plot(fpr, tpr, label=f'RF, AUC = {str(round(auclg,3))}')


# ADABOOST (AB)
preds_proba_AB = AB.predict_proba(x_test)
probsAB = preds_proba_AB[:, 1]
fpr, tpr, thresh = metrics.roc_curve(y_test, probsAB)
auclg = roc_auc_score(y_test, probsAB)
plt.plot(fpr, tpr, label=f'AB, AUC = {str(round(auclg,3))}')

plt.ylabel("True Positive Rate", fontsize=12)
plt.xlabel("False Positive Rate", fontsize=12)
plt.title("ROC curve")
plt.rcParams['axes.titlesize'] = 16
plt.legend()
plt.show()
In [105]:
#confronto le curve ROC dei modelli basati solo su 15 features (empre considerando i modelli con best parameters)

#roc (feature selection)
fig = plt.figure(figsize=(8,5))
plt.plot([0, 1], [0, 1],'r--')


# RFECV LOGISTIC REGRESSION (LRFS)
preds_proba_LRFS = LRFS.predict_proba(FSx_test)
probsLRFS = preds_proba_LRFS[:, 1]
fpr, tpr, thresh = metrics.roc_curve(y_test, probsLRFS)
auclg = roc_auc_score(y_test, probsLRFS)
plt.plot(fpr, tpr, label=f'LR, AUC = {str(round(auclg,3))}')


# SVM (SVMFS)
preds_proba_SVMFS = SVMFS.predict_proba(FSx_test_scaled)
probsSVMFS = preds_proba_SVMFS[:, 1]
fpr, tpr, thresh = metrics.roc_curve(y_test, probsSVMFS)
auclg = roc_auc_score(y_test, probsSVMFS)
plt.plot(fpr, tpr, label=f'SVM, AUC = {str(round(auclg,3))}')


# KNN   (KNNFS)
preds_proba_KNNFS = KNNFS.predict_proba(FSx_test_scaled)
probsKNNFS = preds_proba_KNNFS[:, 1]
fpr, tpr, thresh = metrics.roc_curve(y_test, probsKNNFS)
auclg = roc_auc_score(y_test, probsKNNFS)
plt.plot(fpr, tpr, label=f'KNN, AUC = {str(round(auclg,3))}')

 
#RANDOM FOREST (RFFS)
preds_proba_RFFS = RFFS.predict_proba(FSx_test)
probsRFFS = preds_proba_RFFS[:, 1]
fpr, tpr, thresh = metrics.roc_curve(y_test, probsRFFS)
auclg = roc_auc_score(y_test, probsRFFS)
plt.plot(fpr, tpr, label=f'RF, AUC = {str(round(auclg,3))}')


# ADABOOST(ABFS)
preds_proba_ABFS = ABFS.predict_proba(FSx_test)
probsABFS = preds_proba_ABFS[:, 1]
fpr, tpr, thresh = metrics.roc_curve(y_test, probsABFS)
auclg = roc_auc_score(y_test, probsABFS)
plt.plot(fpr, tpr, label=f'AB, AUC = {str(round(auclg,3))}')

plt.ylabel("True Positive Rate", fontsize=12)
plt.xlabel("False Positive Rate", fontsize=12)
plt.title("ROC curve")
plt.rcParams['axes.titlesize'] = 16
plt.legend()
plt.show()
In [ ]:
 
In [106]:
# CLUSTER-BASED METHODS
# utilizzo nuovamente gli algoritmi di apprendimento supervisionato visti sopra, ma questa volta li applico
# sui cluster invece che sul dataset completo
In [40]:
#importo delle librerie utili per la divisione in cluster
import matplotlib.pyplot as plt
from kneed import KneeLocator
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
In [41]:
#K-MEANS clustering
#prima di effettuare il K-means clustering standardizzo le features selezionate con RFECV  
scaler = preprocessing.StandardScaler().fit(features[selected_col])
features_scaled = scaler.transform(features[selected_col])
In [43]:
#due possibili metodi per trovare il numero ideale di cluster K:

#PRIMO METODO: elbow method
#scelgo il K che coincide con il 'gomito' del grafico che ha in ascissa il numero di cluster e in ordinata l'SSE
kmeans_kwargs = {
        "init": "random",
        "n_init": 10,
        "max_iter": 300,
        "random_state": 42,
        }
   
# creo una lista contenente i valori degli SSE per ciascun k
sse = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
    kmeans.fit(features_scaled)
    sse.append(kmeans.inertia_)
In [44]:
plt.style.use("fivethirtyeight")
plt.plot(range(1, 11), sse)
plt.xticks(range(1, 11))
plt.xlabel("Number of Clusters")
plt.ylabel("SSE")
plt.show()
In [45]:
#trovo il k ottimale in corrispondenza del gomito tramite 'kneelocator'
kl = KneeLocator( range(1, 11), sse, curve="convex", direction="decreasing")
kl.elbow
#trovo che il numero ottimale di cluster è 4
Out[45]:
4
In [253]:
#SECONDO METODO: silhouette coefficient
#sceglo il K che massimizzi la silhouette
silhouette_coefficients = []

for k in range(2, 6):
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
    kmeans.fit(features_scaled)
    score = silhouette_score(features_scaled, kmeans.labels_)
    silhouette_coefficients.append(score)
In [255]:
plt.style.use("fivethirtyeight")
plt.plot(range(2, 6), silhouette_coefficients)
plt.xticks(range(2, 6))
plt.xlabel("Number of Clusters")
plt.ylabel("Silhouette Coefficient")
plt.show()
In [46]:
#4 è un massimo locale per il coefficiente di silhouette
#unendo i due approcci visti, il numero ottimale di cluster sembrerebbe essere k=4

#divisione del dataset nei cluster 
#applico algoritmo k_means
kmeans = KMeans(n_clusters=4, random_state=0).fit(features_scaled)
#vedo i labels attribuiti a ciascuna osservazione (label 0 per il primo cluster, label 1 per il secondo,...)
kmeans.labels_
Out[46]:
array([0, 1, 2, ..., 0, 0, 0])
In [47]:
#test_train splitting
#dovendo di nuovo 'riaddestrare' gli algoritmi supervisionati, splitto ciascun cluster in train e test set
encData['cluster']=kmeans.labels_
encData1 = encData[encData['cluster']==0]
encData2 = encData[encData['cluster']==1]
encData3 = encData[encData['cluster']==2]
encData4 = encData[encData['cluster']==3]
label1= encData1['loan_status']
features1= encData1.drop('loan_status', axis=1)
features1=features1.drop('cluster',axis=1)
label2= encData2['loan_status']
features2= encData2.drop('loan_status', axis=1)
features2= features2.drop('cluster', axis=1)
label3= encData3['loan_status']
features3= encData3.drop('loan_status', axis=1)
features3=features3.drop('cluster',axis=1)
label4= encData4['loan_status']
features4= encData4.drop('loan_status', axis=1)
features4=features4.drop('cluster',axis=1)
In [48]:
#vedo conteggi di default = 0 e default =1 per il primo cluster
label1.value_counts()
Out[48]:
0    8747
1    1701
Name: loan_status, dtype: int64
In [49]:
#vedo conteggi di default = 0 e default =1 per il secondo cluster
label2.value_counts()
Out[49]:
0    6617
1     794
Name: loan_status, dtype: int64
In [50]:
#vedo conteggi di default = 0 e default =1 per il terzo cluster
label3.value_counts()
Out[50]:
0    6530
1    2441
Name: loan_status, dtype: int64
In [51]:
#vedo conteggi di default = 0 e default =1 per il quarto cluster
label4.value_counts()
Out[51]:
0    3573
1    2171
Name: loan_status, dtype: int64
In [52]:
#divido il primo cluster in train e test set (rispettivamente 70 e 30%)
x_train1, x_test1, y_train1, y_test1 = model_selection.train_test_split(features1, label1, random_state=0, test_size = 0.30)
#divido il secondo cluster in train e test set (rispettivamente 70 e 30%)
x_train2, x_test2, y_train2, y_test2 = model_selection.train_test_split(features2, label2, random_state=0, test_size = 0.30)
#divido il terzo cluster in train e test set (rispettivamente 70 e 30%)
x_train3, x_test3, y_train3, y_test3 = model_selection.train_test_split(features3, label3, random_state=0, test_size = 0.30)
#divido il quarto cluster in train e test set (rispettivamente 70 e 30%)
x_train4, x_test4, y_train4, y_test4 = model_selection.train_test_split(features4, label4, random_state=0, test_size = 0.30)
In [53]:
#bilancio i quattro train set 
oversample = SMOTE (random_state=0)
os_x_train1, os_y_train1 = oversample.fit_resample(x_train1, y_train1)
os_x_train1 = pd.DataFrame(data = os_x_train1, columns = x_train1.columns)
os_y_train1 = pd.DataFrame(data = os_y_train1, columns= ['loan_status'])

os_x_train2, os_y_train2 = oversample.fit_resample(x_train2, y_train2)
os_x_train2 = pd.DataFrame(data = os_x_train2, columns = x_train2.columns)
os_y_train2 = pd.DataFrame(data = os_y_train2, columns= ['loan_status'])

os_x_train3, os_y_train3 = oversample.fit_resample(x_train3, y_train3)
os_x_train3 = pd.DataFrame(data = os_x_train3, columns = x_train3.columns)
os_y_train3 = pd.DataFrame(data = os_y_train3, columns= ['loan_status'])

os_x_train4, os_y_train4 = oversample.fit_resample(x_train4, y_train4)
os_x_train4 = pd.DataFrame(data = os_x_train4, columns = x_train4.columns)
os_y_train4 = pd.DataFrame(data = os_y_train4, columns= ['loan_status'])


#concateno i label dei quattro train set e dei quattro test set(servirà dopo per valutare le performances dei metodi post-cluster)
os_y_trainT = pd.concat([os_y_train1,os_y_train2,os_y_train3,os_y_train4], ignore_index=True)
y_testT = pd.concat([y_test1,y_test2, y_test3, y_test4], ignore_index=True)
In [54]:
print("length of oversampled data is ",len(os_x_train1))
print("Number of no default in oversampled data",len(os_y_train1[os_y_train1['loan_status']==0]))
print("Number of default",len(os_y_train1[os_y_train1['loan_status']==1]))
print("Proportion of no default data in oversampled data is ",len(os_y_train1[os_y_train1['loan_status']==0])/len(os_x_train1))
print("Proportion of default data in oversampled data is ",len(os_y_train1[os_y_train1['loan_status']==1])/len(os_x_train1))
length of oversampled data is  12226
Number of no default in oversampled data 6113
Number of default 6113
Proportion of no default data in oversampled data is  0.5
Proportion of default data in oversampled data is  0.5
In [55]:
print("length of oversampled data is ",len(os_x_train2))
print("Number of no default in oversampled data",len(os_y_train2[os_y_train2['loan_status']==0]))
print("Number of default",len(os_y_train2[os_y_train2['loan_status']==1]))
print("Proportion of no default data in oversampled data is ",len(os_y_train2[os_y_train2['loan_status']==0])/len(os_x_train2))
print("Proportion of default data in oversampled data is ",len(os_y_train2[os_y_train2['loan_status']==1])/len(os_x_train2))
length of oversampled data is  9304
Number of no default in oversampled data 4652
Number of default 4652
Proportion of no default data in oversampled data is  0.5
Proportion of default data in oversampled data is  0.5
In [56]:
#scalo i train set
scaler = preprocessing.StandardScaler().fit(os_x_train1)
os_x_train1_scaled = scaler.transform(os_x_train1)
x_test1_scaled = scaler.transform(x_test1)

scaler = preprocessing.StandardScaler().fit(os_x_train2)
os_x_train2_scaled = scaler.transform(os_x_train2)
x_test2_scaled = scaler.transform(x_test2)

scaler = preprocessing.StandardScaler().fit(os_x_train3)
os_x_train3_scaled = scaler.transform(os_x_train3)
x_test3_scaled = scaler.transform(x_test3)

scaler = preprocessing.StandardScaler().fit(os_x_train4)
os_x_train4_scaled = scaler.transform(os_x_train4)
x_test4_scaled = scaler.transform(x_test4)
In [57]:
#salvo il vettore di features selezionate dalla feature selection
selected_col = [ 'loan_percent_income', 
       'person_home_ownership_MORTGAGE', 
       'person_home_ownership_OWN', 'person_home_ownership_RENT',
       'loan_intent_DEBTCONSOLIDATION', 'loan_intent_EDUCATION',
       'loan_intent_HOMEIMPROVEMENT', 'loan_intent_MEDICAL',
       'loan_intent_PERSONAL', 'loan_intent_VENTURE', 'loan_grade_A',
       'loan_grade_B', 'loan_grade_C',  'cb_person_default_on_file_N',
       'cb_person_default_on_file_Y']
In [58]:
#salvo il dataset (train) ridotto alle sole features selezionate dalla REFCV
FSos_x_train1 = os_x_train1[selected_col]
FSos_x_train2 = os_x_train2[selected_col]
FSos_x_train3 = os_x_train3[selected_col]
FSos_x_train4 = os_x_train4[selected_col]
In [59]:
#salvo il dataset (test) ridotto alle sole features selezionate dalla REFCV
FSx_test1 = x_test1[selected_col]
FSx_test2 = x_test2[selected_col]
FSx_test3 = x_test3[selected_col]
FSx_test4 = x_test4[selected_col]
In [32]:
#MODELLO: REGRESSIONE LGOISTICA
#sul primo cluster
LR1 = LogisticRegression()
LR1.fit(FSos_x_train1, os_y_train1)
In [33]:
#sul secondo cluster
LR2 = LogisticRegression()
LR2.fit(FSos_x_train2, os_y_train2)
In [34]:
#sul terzo cluster
LR3 = LogisticRegression()
LR3.fit(FSos_x_train3, os_y_train3)
In [35]:
#sul quarto cluster
LR4 = LogisticRegression()
LR4.fit(FSos_x_train4, os_y_train4)
In [64]:
#previsioni su train e test set del primo cluster
p_trainLR1 = LR1.predict(FSos_x_train1)
p_testLR1 = LR1.predict(FSx_test1)

print('Accuracy of logistic regression classifier on oversampled train set: {:.2f}'.format(LR1.score(FSos_x_train1, os_y_train1)))
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(LR1.score(FSx_test1, y_test1)))
Accuracy of logistic regression classifier on oversampled train set: 0.91
Accuracy of logistic regression classifier on test set: 0.89
In [65]:
#previsioni su train e test set del secondo cluster
p_trainLR2 = LR2.predict(FSos_x_train2)
p_testLR2 = LR2.predict(FSx_test2)

print('Accuracy of logistic regression classifier on oversampled train set: {:.2f}'.format(LR2.score(FSos_x_train2, os_y_train2)))
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(LR2.score(FSx_test2, y_test2)))
Accuracy of logistic regression classifier on oversampled train set: 0.93
Accuracy of logistic regression classifier on test set: 0.90
In [66]:
#previsioni su train e test set del terzo cluster
p_trainLR3 = LR3.predict(FSos_x_train3)
p_testLR3 = LR3.predict(FSx_test3)

print('Accuracy of logistic regression classifier on oversampled train set: {:.2f}'.format(LR3.score(FSos_x_train3, os_y_train3)))
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(LR3.score(FSx_test3, y_test3)))
Accuracy of logistic regression classifier on oversampled train set: 0.90
Accuracy of logistic regression classifier on test set: 0.88
In [67]:
#previsioni su train e test set del quarto cluster
p_trainLR4 = LR4.predict(FSos_x_train4)
p_testLR4 = LR4.predict(FSx_test4)

print('Accuracy of logistic regression classifier on oversampled train set: {:.2f}'.format(LR4.score(FSos_x_train4, os_y_train4)))
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(LR4.score(FSx_test4, y_test4)))
Accuracy of logistic regression classifier on oversampled train set: 0.82
Accuracy of logistic regression classifier on test set: 0.81
In [68]:
#concateno le previsioni sui test set in modo da poter costruire la matrice di confusione
p_testLR_T = np.concatenate((p_testLR1, p_testLR2,p_testLR3, p_testLR4), axis=0)
In [69]:
#matrice di confusione sul totale dei test set 
confusion_matrix_test = confusion_matrix(y_testT, p_testLR_T)
print(confusion_matrix_test)
[[7163  465]
 [ 766 1381]]
In [71]:
#misure di performance  
print(classification_report(y_testT, p_testLR_T))
              precision    recall  f1-score   support

           0       0.90      0.94      0.92      7628
           1       0.75      0.64      0.69      2147

    accuracy                           0.87      9775
   macro avg       0.83      0.79      0.81      9775
weighted avg       0.87      0.87      0.87      9775

In [72]:
#noto migliormanenti rispetto al metodo applicato sul dataset completo
In [ ]:
 
In [73]:
#SVM 
#scalo i training sets prima di applicare SVM
scaler = preprocessing.StandardScaler().fit(FSos_x_train1)
FSos_x_train1_scaled = scaler.transform(FSos_x_train1)
FSx_test1_scaled = scaler.transform(FSx_test1)

scaler = preprocessing.StandardScaler().fit(FSos_x_train2)
FSos_x_train2_scaled = scaler.transform(FSos_x_train2)
FSx_test2_scaled = scaler.transform(FSx_test2)

scaler = preprocessing.StandardScaler().fit(FSos_x_train3)
FSos_x_train3_scaled = scaler.transform(FSos_x_train3)
FSx_test3_scaled = scaler.transform(FSx_test3)

scaler = preprocessing.StandardScaler().fit(FSos_x_train4)
FSos_x_train4_scaled = scaler.transform(FSos_x_train4)
FSx_test4_scaled = scaler.transform(FSx_test4)
In [126]:
#grid search per la ricerca dei best hyperparameters
#stabilisco una griglia di parametri su cui andrò a testare il modello
param_grid = [
  {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
  {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
 ]
In [36]:
# creo il modello base su cui applicare il tuning
SVM = SVC()
#imposto la random search sulla griglia definita precedentemente
SVM_gridsearch1 = GridSearchCV(SVM, param_grid)
SVM_gridsearch2 = GridSearchCV(SVM, param_grid)
SVM_gridsearch3 = GridSearchCV(SVM, param_grid)
SVM_gridsearch4 = GridSearchCV(SVM, param_grid)
# Fitto il modello random search sul training set
SVM_gridsearch1.fit(FSos_x_train1_scaled, os_y_train1)
SVM_gridsearch2.fit(FSos_x_train2_scaled, os_y_train2)
SVM_gridsearch3.fit(FSos_x_train3_scaled, os_y_train3)
SVM_gridsearch4.fit(FSos_x_train4_scaled, os_y_train4)
In [37]:
#stampo la combinazione di parametri ottimale
SVM_gridsearch1.best_params_
In [38]:
SVM_gridsearch2.best_params_
In [39]:
SVM_gridsearch3.best_params_
In [40]:
SVM_gridsearch4.best_params_
In [41]:
#definisco il modello Support vector classifer utilizzando i parametri appena trovati
SVM1 = SVC(C= 1000  , gamma= 0.001 , kernel='rbf',probability= True  )
SVM2 = SVC(C= 1000  , gamma= 0.001  , kernel='rbf',probability= True  )
SVM3 = SVC(C= 1000  , gamma= 0.001 , kernel='rbf',probability= True  )
SVM4 = SVC(C= 1000  , gamma= 0.001  , kernel='rbf',probability= True  )

#fitto il modello sul training set
SVM1.fit(FSos_x_train1_scaled, os_y_train1)
SVM2.fit(FSos_x_train2_scaled, os_y_train2)
SVM3.fit(FSos_x_train3_scaled, os_y_train3)
SVM4.fit(FSos_x_train4_scaled, os_y_train4)
In [138]:
#previsioni su train e test set numero 1
p_trainSVM1 = SVM1.predict(FSos_x_train1_scaled)
p_testSVM1 = SVM1.predict(FSx_test1_scaled)

print('Accuracy of SVM classifier on oversampled train set: {:.2f}'.format(SVM1.score(FSos_x_train1_scaled, os_y_train1)))
print('Accuracy of SVM classifier on test set: {:.2f}'.format(SVM1.score(FSx_test1_scaled, y_test1)))
Accuracy of SVM classifier on oversampled train set: 0.91
Accuracy of SVM classifier on test set: 0.90
In [139]:
#previsioni su train e test set numero 2
p_trainSVM2 = SVM2.predict(FSos_x_train2_scaled)
p_testSVM2 = SVM2.predict(FSx_test2_scaled)

print('Accuracy of SVM classifier on oversampled train set: {:.2f}'.format(SVM2.score(FSos_x_train2_scaled, os_y_train2)))
print('Accuracy of SVM classifier on test set: {:.2f}'.format(SVM2.score(FSx_test2_scaled, y_test2)))
Accuracy of SVM classifier on oversampled train set: 0.94
Accuracy of SVM classifier on test set: 0.91
In [140]:
#previsioni su train e test set numero 3
p_trainSVM3 = SVM3.predict(FSos_x_train3_scaled)
p_testSVM3 = SVM3.predict(FSx_test3_scaled)

print('Accuracy of SVM classifier on oversampled train set: {:.2f}'.format(SVM3.score(FSos_x_train3_scaled, os_y_train3)))
print('Accuracy of SVM classifier on test set: {:.2f}'.format(SVM3.score(FSx_test3_scaled, y_test3)))
Accuracy of SVM classifier on oversampled train set: 0.91
Accuracy of SVM classifier on test set: 0.89
In [141]:
#previsioni su train e test set numero 4
p_trainSVM4 = SVM4.predict(FSos_x_train4_scaled)
p_testSVM4 = SVM4.predict(FSx_test4_scaled)

print('Accuracy of SVM classifier on oversampled train set: {:.2f}'.format(SVM4.score(FSos_x_train4_scaled, os_y_train4)))
print('Accuracy of SVM classifier on test set: {:.2f}'.format(SVM4.score(FSx_test4_scaled, y_test4)))
Accuracy of SVM classifier on oversampled train set: 0.86
Accuracy of SVM classifier on test set: 0.83
In [142]:
#concateno le previsioni sui 4 test sets
p_testSVM_T = np.concatenate((p_testSVM1, p_testSVM2,p_testSVM3, p_testSVM4), axis=0)
In [143]:
#costruisco la matrice di onfusione utilizzando il vettore p_testSVM_T appena costruito
confusion_matrix_test = confusion_matrix(y_testT, p_testSVM_T)
print(confusion_matrix_test)
[[7276  352]
 [ 737 1410]]
In [144]:
#misure di performance
print(classification_report(y_testT, p_testSVM_T))
              precision    recall  f1-score   support

           0       0.91      0.95      0.93      7628
           1       0.80      0.66      0.72      2147

    accuracy                           0.89      9775
   macro avg       0.85      0.81      0.83      9775
weighted avg       0.88      0.89      0.88      9775

In [ ]:
 
In [74]:
#KNN
#scalo i train set
scaler = preprocessing.StandardScaler().fit(FSos_x_train1)
FSos_x_train1_scaled = scaler.transform(FSos_x_train1)
FSx_test1_scaled = scaler.transform(FSx_test1)

scaler = preprocessing.StandardScaler().fit(FSos_x_train2)
FSos_x_train2_scaled = scaler.transform(FSos_x_train2)
FSx_test2_scaled = scaler.transform(FSx_test2)

scaler = preprocessing.StandardScaler().fit(FSos_x_train3)
FSos_x_train3_scaled = scaler.transform(FSos_x_train3)
FSx_test3_scaled = scaler.transform(FSx_test3)

scaler = preprocessing.StandardScaler().fit(FSos_x_train4)
FSos_x_train4_scaled = scaler.transform(FSos_x_train4)
FSx_test4_scaled = scaler.transform(FSx_test4)
In [82]:
#miglior scelta di n_neighbors utilizzando grid search

#scelgo, per ciascuno dei clusters, il K ottimale 

#definisco griglia su cui effettuare ricerca
parameters = {"n_neighbors": range(5,30,2)}
gridsearch1 = GridSearchCV (KNeighborsClassifier(), parameters)
gridsearch1.fit(os_x_train1_scaled, os_y_train1)

gridsearch2 = GridSearchCV (KNeighborsClassifier(), parameters)
gridsearch2.fit(os_x_train2_scaled, os_y_train2)

gridsearch3 = GridSearchCV (KNeighborsClassifier(), parameters)
gridsearch3.fit(os_x_train3_scaled, os_y_train3)

gridsearch4 = GridSearchCV (KNeighborsClassifier(), parameters)
gridsearch4.fit(os_x_train4_scaled, os_y_train4)
In [78]:
#k ottimo per il primo cluster
gridsearch1.best_params_
Out[78]:
{'n_neighbors': 7}
In [79]:
#k ottimo per il secondo cluster
gridsearch2.best_params_
Out[79]:
{'n_neighbors': 5}
In [80]:
#k ottimo per il terzo cluster
gridsearch3.best_params_
Out[80]:
{'n_neighbors': 5}
In [81]:
#k ottimo per il quarto cluster
gridsearch4.best_params_
Out[81]:
{'n_neighbors': 11}
In [42]:
#definisco i 4 KNN e li fitto sui train set
KNN1 = KNeighborsClassifier(n_neighbors=7, metric= 'minkowski', p=2)
KNN2 = KNeighborsClassifier(n_neighbors=5, metric= 'minkowski', p=2)
KNN3 = KNeighborsClassifier(n_neighbors=5, metric= 'minkowski', p=2)
KNN4 = KNeighborsClassifier(n_neighbors=11, metric= 'minkowski', p=2)
KNN1.fit(FSos_x_train1_scaled, os_y_train1)
KNN2.fit(FSos_x_train2_scaled, os_y_train2)
KNN3.fit(FSos_x_train3_scaled, os_y_train3)
KNN4.fit(FSos_x_train4_scaled, os_y_train4)
In [223]:
#prediction su train e test set primo cluster
p_trainKNN1 = KNN1.predict(FSos_x_train1_scaled)
p_testKNN1 = KNN1.predict(FSx_test1_scaled)

print('Accuracy of KNN classifier on oversampled train set: {:.2f}'.format(KNN1.score(FSos_x_train1_scaled, os_y_train1)))
print('Accuracy of KNN classifier on test set: {:.2f}'.format(KNN1.score(FSx_test1_scaled, y_test1)))
Accuracy of KNN classifier on oversampled train set: 0.93
Accuracy of KNN classifier on test set: 0.92
In [224]:
#prediction su train e test set secondo cluster
p_trainKNN2 = KNN2.predict(FSos_x_train2_scaled)
p_testKNN2 = KNN2.predict(FSx_test2_scaled)

print('Accuracy of KNN classifier on oversampled train set: {:.2f}'.format(KNN2.score(FSos_x_train2_scaled, os_y_train2)))
print('Accuracy of KNN classifier on test set: {:.2f}'.format(KNN2.score(FSx_test2_scaled, y_test2)))
Accuracy of KNN classifier on oversampled train set: 0.95
Accuracy of KNN classifier on test set: 0.91
In [225]:
#prediction su train e test set terzo cluster
p_trainKNN3 = KNN3.predict(FSos_x_train3_scaled)
p_testKNN3 = KNN3.predict(FSx_test3_scaled)

print('Accuracy of KNN classifier on oversampled train set: {:.2f}'.format(KNN3.score(FSos_x_train3_scaled, os_y_train3)))
print('Accuracy of KNN classifier on test set: {:.2f}'.format(KNN3.score(FSx_test3_scaled, y_test3)))
Accuracy of KNN classifier on oversampled train set: 0.92
Accuracy of KNN classifier on test set: 0.89
In [226]:
#prediction su train e test set quarto cluster
p_trainKNN4 = KNN4.predict(FSos_x_train4_scaled)
p_testKNN4 = KNN4.predict(FSx_test4_scaled)

print('Accuracy of KNN classifier on oversampled train set: {:.2f}'.format(KNN4.score(FSos_x_train4_scaled, os_y_train4)))
print('Accuracy of KNN classifier on test set: {:.2f}'.format(KNN4.score(FSx_test4_scaled, y_test4)))
Accuracy of KNN classifier on oversampled train set: 0.87
Accuracy of KNN classifier on test set: 0.82
In [227]:
#concateno le prediction sui test set in modo da poter costruire la matrice di confusione
p_testKNN_T = np.concatenate((p_testKNN1, p_testKNN2,p_testKNN3, p_testKNN4), axis=0)
In [228]:
#confusion matrix 
confusion_matrix_test = confusion_matrix(y_testT, p_testKNN_T)
print(confusion_matrix_test)
[[7355  273]
 [ 796 1351]]
In [229]:
#misure di performance
print(classification_report(y_testT, p_testKNN_T))
              precision    recall  f1-score   support

           0       0.90      0.96      0.93      7628
           1       0.83      0.63      0.72      2147

    accuracy                           0.89      9775
   macro avg       0.87      0.80      0.82      9775
weighted avg       0.89      0.89      0.88      9775

In [ ]:
 
In [75]:
#MODEL: RANDOM FOREST
# definisco la random hyperparameter grid
# numero di alberi nella forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 5)]
# numero di features considerate ad ogni split
max_features = ['auto', 'sqrt']
# massima profondità dell'albero
max_depth = [int(x) for x in np.linspace(10, 80, num = 4)]
max_depth.append(None)
# numero minimo di samples richiesto ad ogni split di un nodo
min_samples_split = [2, 5, 10]
# minimo numero di samples richiesto in ogni leaf
min_samples_leaf = [ 2, 4]
# metodo per selezionare samples per training degli alberi
bootstrap = [True]
# Creo la random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)
{'bootstrap': [True],
 'max_depth': [10, 33, 56, 80, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 650, 1100, 1550, 2000]}
In [43]:
# uso la random grid per cercare i best parameters
# creo il modello base su cui fare tuning
RF1 = RandomForestClassifier()
RF2 = RandomForestClassifier()
RF3 = RandomForestClassifier()
RF4 = RandomForestClassifier()

# Random search dei parametri, usando 3 fold cross validation 
RF1_random = RandomizedSearchCV(estimator = RF1, param_distributions = random_grid, n_iter = 50, cv = 3, verbose=2, random_state=42, n_jobs = -1)
RF2_random = RandomizedSearchCV(estimator = RF2, param_distributions = random_grid, n_iter = 50, cv = 3, verbose=2, random_state=42, n_jobs = -1)
RF3_random = RandomizedSearchCV(estimator = RF3, param_distributions = random_grid, n_iter = 50, cv = 3, verbose=2, random_state=42, n_jobs = -1)
RF4_random = RandomizedSearchCV(estimator = RF4, param_distributions = random_grid, n_iter = 50, cv = 3, verbose=2, random_state=42, n_jobs = -1)

# Fitto il modello random search
RF1_random.fit(FSos_x_train1, os_y_train1)
RF2_random.fit(FSos_x_train2, os_y_train2)
RF3_random.fit(FSos_x_train3, os_y_train3)
RF4_random.fit(FSos_x_train4, os_y_train4)
In [93]:
#trovo best parameters per il primo modello (riferito al cluster 1) 
RF1_random.best_params_
Out[93]:
{'n_estimators': 1100,
 'min_samples_split': 10,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 10,
 'bootstrap': True}
In [94]:
#trovo best parameter per il secondo modello (riferito al cluster 2)
RF2_random.best_params_
Out[94]:
{'n_estimators': 2000,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'auto',
 'max_depth': None,
 'bootstrap': True}
In [95]:
#trovo best parameter per il terzo modello (riferito al cluster 3)
RF3_random.best_params_
Out[95]:
{'n_estimators': 650,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 80,
 'bootstrap': True}
In [96]:
#trovo best parameter per il quarto modello (riferito al cluster 4)
RF4_random.best_params_
Out[96]:
{'n_estimators': 200,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'auto',
 'max_depth': 10,
 'bootstrap': True}
In [76]:
#cerco di migliorare ulteriormente i risultati utilizzando una grid search
#baso la mia nuova griglia sui risultati otteniti prima, andando a cercare 'nelle vicinanze' di quello che,
#secondo la grid search di prima, era il miglior modello
#definisco la nuova griglia
param_grid1 = {
    'bootstrap': [True],
   'max_depth': [8,10,12],
  'max_features': ['sqrt'],
     'min_samples_leaf': [2,4,6],
   'min_samples_split': [10,15],
   'n_estimators': [1000,1100,1200]
}
param_grid2 = {
    'bootstrap': [True],
    'max_depth': [None],
    'max_features': ['auto'],
    'min_samples_leaf': [2,4],
    'min_samples_split': [4,5,6],
    'n_estimators': [1900,2000,2100]
}
param_grid3 = {
    'bootstrap': [True],
    'max_depth': [70,80,90],
    'max_features': ['sqrt'],
    'min_samples_leaf': [2,4],
    'min_samples_split': [2,3,4],
    'n_estimators': [600,650,700]
}
param_grid4 = {
    'bootstrap': [True],
    'max_depth': [8,10,12],
    'max_features': ['auto'],
    'min_samples_leaf': [2,4],
    'min_samples_split': [4,5,6],
    'n_estimators': [150,200,250]
}
In [44]:
# Costruisco i modelli grid search
grid_search1 = GridSearchCV(estimator = RandomForestClassifier() , param_grid = param_grid1, 
                          cv = 5, n_jobs = -1, verbose = 2)
grid_search2 = GridSearchCV(estimator = RandomForestClassifier(), param_grid = param_grid2, 
                          cv = 5, n_jobs = -1, verbose = 2)
grid_search3 = GridSearchCV(estimator = RandomForestClassifier() , param_grid = param_grid3, 
                          cv = 5, n_jobs = -1, verbose = 2)
grid_search4 = GridSearchCV(estimator = RandomForestClassifier(), param_grid = param_grid4, 
                          cv = 5, n_jobs = -1, verbose = 2)
# li fitto ai dati 
grid_search1.fit(FSos_x_train1, os_y_train1)
grid_search2.fit(FSos_x_train2, os_y_train2)
grid_search3.fit(FSos_x_train3, os_y_train3)
grid_search4.fit(FSos_x_train4, os_y_train4)
In [78]:
# trovo i best parameters per il modello riferito al cluster 1
grid_search1.best_params_
Out[78]:
{'bootstrap': True,
 'max_depth': 12,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 15,
 'n_estimators': 1200}
In [79]:
# trovo i best parameters per il modello riferito al cluster 2
grid_search2.best_params_
#true 10 sqrt leaf: 2 split: 4 1550
Out[79]:
{'bootstrap': True,
 'max_depth': None,
 'max_features': 'auto',
 'min_samples_leaf': 2,
 'min_samples_split': 6,
 'n_estimators': 2000}
In [80]:
# trovo i best parameters per il modello riferito al cluster 3 
grid_search3.best_params_
Out[80]:
{'bootstrap': True,
 'max_depth': 70,
 'max_features': 'sqrt',
 'min_samples_leaf': 4,
 'min_samples_split': 4,
 'n_estimators': 600}
In [81]:
#trovo i best parameter per il modello riferito al cluster 4
grid_search4.best_params_
Out[81]:
{'bootstrap': True,
 'max_depth': 8,
 'max_features': 'auto',
 'min_samples_leaf': 2,
 'min_samples_split': 4,
 'n_estimators': 200}
In [45]:
#costruisco le 4 random forest 
RF1 = RandomForestClassifier(random_state=0, n_estimators= 1200,
 min_samples_split= 15,min_samples_leaf= 2,max_features= 'sqrt',
 max_depth=12, bootstrap= True )

RF2 = RandomForestClassifier(random_state=0, n_estimators= 2000,
 min_samples_split= 6,min_samples_leaf= 2,max_features= 'auto',
 max_depth= None,bootstrap= True )

RF3 = RandomForestClassifier(random_state=0, n_estimators= 600,
 min_samples_split= 4, min_samples_leaf= 4,max_features= 'sqrt',
 max_depth= 70,bootstrap= True )

RF4 = RandomForestClassifier(random_state=0, n_estimators= 200,
 min_samples_split= 4,min_samples_leaf= 2,max_features= 'auto',
 max_depth= 8,bootstrap= True )

#e le fitto sui traing set 
RF1.fit(FSos_x_train1, os_y_train1)
RF2.fit(FSos_x_train2, os_y_train2)
RF3.fit(FSos_x_train3, os_y_train3)
RF4.fit(FSos_x_train4, os_y_train4)
In [83]:
#previsioni sul primo cluster
p_trainRF1 = RF1.predict(FSos_x_train1)
p_testRF1 = RF1.predict(FSx_test1)

print('Accuracy of RF classifier on oversampled train set: {:.2f}'.format(RF1.score(FSos_x_train1, os_y_train1)))
print('Accuracy of RF classifier on test set: {:.2f}'.format(RF1.score(FSx_test1, y_test1))) 
Accuracy of RF classifier on oversampled train set: 0.93
Accuracy of RF classifier on test set: 0.92
In [84]:
#previsioni sul secondo cluster
p_trainRF2 = RF2.predict(FSos_x_train2)
p_testRF2 = RF2.predict(FSx_test2)

print('Accuracy of RF classifier on oversampled train set: {:.2f}'.format(RF2.score(FSos_x_train2, os_y_train2)))
print('Accuracy of RF classifier on test set: {:.2f}'.format(RF2.score(FSx_test2, y_test2)))
Accuracy of RF classifier on oversampled train set: 0.96
Accuracy of RF classifier on test set: 0.91
In [85]:
#previsioni sul terzo cluster
p_trainRF3 = RF3.predict(FSos_x_train3)
p_testRF3 = RF3.predict(FSx_test3)

print('Accuracy of RF classifier on oversampled train set: {:.2f}'.format(RF3.score(FSos_x_train3, os_y_train3)))
print('Accuracy of RF classifier on test set: {:.2f}'.format(RF3.score(FSx_test3, y_test3)))
Accuracy of RF classifier on oversampled train set: 0.93
Accuracy of RF classifier on test set: 0.89
In [86]:
#previsioni sul quarto cluster
p_trainRF4 = RF4.predict(FSos_x_train4)
p_testRF4 = RF4.predict(FSx_test4)

print('Accuracy of RF classifier on oversampled train set: {:.2f}'.format(RF4.score(FSos_x_train4, os_y_train4)))
print('Accuracy of RF classifier on test set: {:.2f}'.format(RF4.score(FSx_test4, y_test4)))
Accuracy of RF classifier on oversampled train set: 0.87
Accuracy of RF classifier on test set: 0.84
In [87]:
#concateno le prediction sui 4 test set
p_testRF_T = np.concatenate((p_testRF1, p_testRF2,p_testRF3, p_testRF4), axis=0)
In [88]:
#matric di confusione 
confusion_matrix_test = confusion_matrix(y_testT, p_testRF_T)
print(confusion_matrix_test)
[[7369  259]
 [ 760 1387]]
In [89]:
#misure di performance 
print(classification_report(y_testT, p_testRF_T))
              precision    recall  f1-score   support

           0       0.91      0.97      0.94      7628
           1       0.84      0.65      0.73      2147

    accuracy                           0.90      9775
   macro avg       0.87      0.81      0.83      9775
weighted avg       0.89      0.90      0.89      9775

In [ ]:
 
In [90]:
#MODEL: ADABOOST
#creo come stimatori basi per i  modelli degli alberi decisionali formati da un solo split (uno per ogni cluster)
decTree1 = tree.DecisionTreeClassifier(random_state=0, max_depth=1)
decTree2 = tree.DecisionTreeClassifier(random_state=0, max_depth=1)
decTree3 = tree.DecisionTreeClassifier(random_state=0, max_depth=1)
decTree4 = tree.DecisionTreeClassifier(random_state=0, max_depth=1)
decTree1.fit(FSos_x_train1, os_y_train1)
decTree2.fit(FSos_x_train2, os_y_train2)
decTree3.fit(FSos_x_train3, os_y_train3)
decTree4.fit(FSos_x_train4, os_y_train4)
Out[90]:
DecisionTreeClassifier(max_depth=1, random_state=0)
In [91]:
#albero stimatore base per il modello sul primo cluster
tree.plot_tree(decTree1)
Out[91]:
[Text(0.5, 0.75, 'X[0] <= 0.3\ngini = 0.5\nsamples = 12226\nvalue = [6113, 6113]'),
 Text(0.25, 0.25, 'gini = 0.445\nsamples = 8689\nvalue = [5791, 2898]'),
 Text(0.75, 0.25, 'gini = 0.165\nsamples = 3537\nvalue = [322, 3215]')]
In [92]:
#albero stimatore base per il modello sul secondo cluster
tree.plot_tree(decTree2)
Out[92]:
[Text(0.5, 0.75, 'X[10] <= 0.5\ngini = 0.5\nsamples = 9304\nvalue = [4652, 4652]'),
 Text(0.25, 0.25, 'gini = 0.311\nsamples = 4999\nvalue = [964, 4035]'),
 Text(0.75, 0.25, 'gini = 0.246\nsamples = 4305\nvalue = [3688, 617]')]
In [108]:
#hyperparameter tuning
#cerco parametri ottimali 

#definisco la griglia:
# numero di alberi
n_estimators = [int(x) for x in np.linspace(start = 0, stop = 1000, num = 50)]
#learning rate
learning_rate=[0.4,0.5,0.6,0.7,0.8,0.9,1]
# creo la ranodm grid
random_grid = {'n_estimators': n_estimators,
               'learning_rate': learning_rate}
In [46]:
# uso la random grid per cercare i best hyperparameters
# creo il modello base su cui fare tuning
abc = AdaBoostClassifier()
# Random search dei parametri, usando 3 fold cross validation 
AB1_random = RandomizedSearchCV(estimator = abc, param_distributions = random_grid, n_iter = 50, cv = 5, verbose=2, random_state=42, n_jobs = -1)
AB2_random = RandomizedSearchCV(estimator = abc, param_distributions = random_grid, n_iter = 50, cv = 5, verbose=2, random_state=42, n_jobs = -1)
AB3_random = RandomizedSearchCV(estimator = abc, param_distributions = random_grid, n_iter = 50, cv = 5, verbose=2, random_state=42, n_jobs = -1)
AB4_random = RandomizedSearchCV(estimator = abc, param_distributions = random_grid, n_iter = 50, cv = 5, verbose=2, random_state=42, n_jobs = -1)

# fitto il modello random search
AB1_random.fit(FSos_x_train1, os_y_train1)
AB2_random.fit(FSos_x_train2, os_y_train2)
AB3_random.fit(FSos_x_train3, os_y_train3)
AB4_random.fit(FSos_x_train4, os_y_train4)
In [110]:
#migliori parametri per il modello sul primo cluster
AB1_random.best_params_
Out[110]:
{'n_estimators': 775, 'learning_rate': 0.9}
In [111]:
#migliori parametri per il modello sul secondo cluster
AB2_random.best_params_
Out[111]:
{'n_estimators': 959, 'learning_rate': 0.9}
In [112]:
#migliori parametri per il modello sul terzo cluster
AB3_random.best_params_
Out[112]:
{'n_estimators': 142, 'learning_rate': 0.7}
In [113]:
#migliori parametri per il modello sul quarto cluster
AB4_random.best_params_
Out[113]:
{'n_estimators': 81, 'learning_rate': 1}
In [94]:
#cerco di migliorare adaBoost usando grid search
#la nuova griglia è scelta in modo da essere un intorno del punto ottimale trovato precedentemente
#creo 4 griglie diverse negli intorni dei 4 punti ottimali per i vari modelli che ho trovato prima
param_grid1 = {
    'n_estimators':[760,770,775,780,790],
    'learning_rate':[0.85,0.88,0.9,0.92,0.95]
}
param_grid2 = {
    'n_estimators':[950,955,959,960,970],
    'learning_rate':[0.85,0.88,0.9,0.92,0.95]
}
param_grid3 = {
    'n_estimators':[135,140,142,145,150],
    'learning_rate':[0.65,0.68,0.70,0.72,0.74]
}
param_grid4 = {
    'n_estimators':[75,80,81,83,85,90],
    'learning_rate':[0.95,0.98,0.99,1]
}
In [95]:
# costruisco il modello grid search
grid_search1 = GridSearchCV(estimator = AdaBoostClassifier(), param_grid = param_grid1, 
                          cv = 5, n_jobs = -1, verbose = 2)
grid_search2 = GridSearchCV(estimator = AdaBoostClassifier(), param_grid = param_grid2, 
                          cv = 5, n_jobs = -1, verbose = 2)
grid_search3 = GridSearchCV(estimator = AdaBoostClassifier(), param_grid = param_grid3, 
                          cv = 5, n_jobs = -1, verbose = 2)
grid_search4 = GridSearchCV(estimator = AdaBoostClassifier(), param_grid = param_grid4, 
                          cv = 5, n_jobs = -1, verbose = 2)
In [47]:
# Fitto grid search ai dati
grid_search1.fit(FSos_x_train1, os_y_train1)
grid_search2.fit(FSos_x_train2, os_y_train2)
grid_search3.fit(FSos_x_train3, os_y_train3)
grid_search4.fit(FSos_x_train4, os_y_train4)
In [97]:
#stampo parametri ottimali per modello sul primo cluster
grid_search1.best_params_
Out[97]:
{'learning_rate': 0.9, 'n_estimators': 780}
In [98]:
#stampo paramentri ottimali per modello sul secondo cluster
grid_search2.best_params_
Out[98]:
{'learning_rate': 0.9, 'n_estimators': 970}
In [99]:
#stampo paramentri ottimali per modello sul terzo cluster
grid_search3.best_params_
Out[99]:
{'learning_rate': 0.74, 'n_estimators': 150}
In [100]:
#stampo paramentri ottimali per modello sul quarto cluster
grid_search4.best_params_
Out[100]:
{'learning_rate': 0.99, 'n_estimators': 75}
In [48]:
#definisco modelli con i nuovi parametri 
AB1 = AdaBoostClassifier(n_estimators= 780  , base_estimator=decTree1, learning_rate= 0.9 )
AB2 = AdaBoostClassifier(n_estimators= 970  , base_estimator=decTree2, learning_rate= 0.9 )
AB3 = AdaBoostClassifier(n_estimators= 150 , base_estimator=decTree1, learning_rate= 0.74 )
AB4 = AdaBoostClassifier(n_estimators= 75  , base_estimator=decTree2, learning_rate= 0.99  )
#e li fitto sui train sets
AB1.fit(FSos_x_train1, os_y_train1)
AB2.fit(FSos_x_train2, os_y_train2)
AB3.fit(FSos_x_train3, os_y_train3)
AB4.fit(FSos_x_train4, os_y_train4)
In [102]:
#previsioni su train e test set primo cluster
p_trainAB1 = RF1.predict(FSos_x_train1)
p_testAB1 = RF1.predict(FSx_test1)

print('Accuracy of AB classifier on oversampled train set: {:.2f}'.format(AB1.score(FSos_x_train1, os_y_train1)))
print('Accuracy of AB classifier on test set: {:.2f}'.format(AB1.score(FSx_test1, y_test1)))
Accuracy of AB classifier on oversampled train set: 0.91
Accuracy of AB classifier on test set: 0.89
In [103]:
#previsioni su train e test set secondo cluster
p_trainAB2 = AB2.predict(FSos_x_train2)
p_testAB2 = AB2.predict(FSx_test2)

print('Accuracy of AB classifier on oversampled train set: {:.2f}'.format(AB2.score(FSos_x_train2, os_y_train2)))
print('Accuracy of AB classifier on test set: {:.2f}'.format(RF2.score(FSx_test2, y_test2)))
Accuracy of AB classifier on oversampled train set: 0.94
Accuracy of AB classifier on test set: 0.91
In [104]:
#previsioni su train e test set terzo cluster
p_trainAB3 = AB3.predict(FSos_x_train3)
p_testAB3 = AB3.predict(FSx_test3)

print('Accuracy of AB classifier on oversampled train set: {:.2f}'.format(AB3.score(FSos_x_train3, os_y_train3)))
print('Accuracy of AB classifier on test set: {:.2f}'.format(RF3.score(FSx_test3, y_test3)))
Accuracy of AB classifier on oversampled train set: 0.91
Accuracy of AB classifier on test set: 0.89
In [105]:
#previsioni su train e test set quarto cluster
p_trainAB4 = AB4.predict(FSos_x_train4)
p_testAB4 = AB4.predict(FSx_test4)

print('Accuracy of AB classifier on oversampled train set: {:.2f}'.format(AB4.score(FSos_x_train4, os_y_train4)))
print('Accuracy of AB classifier on test set: {:.2f}'.format(RF4.score(FSx_test4, y_test4)))
#0.85  0.83
Accuracy of AB classifier on oversampled train set: 0.85
Accuracy of AB classifier on test set: 0.84
In [106]:
#concateno le prediction sui test sets
p_testAB_T = np.concatenate((p_testAB1, p_testAB2,p_testAB3, p_testAB4), axis=0)
In [107]:
#confusion matrix sul tot dei test sets
confusion_matrix_test = confusion_matrix(y_testT, p_testAB_T)
print(confusion_matrix_test)
[[7227  401]
 [ 669 1478]]
In [108]:
#performance measures
print(classification_report(y_testT, p_testAB_T))
              precision    recall  f1-score   support

           0       0.92      0.95      0.93      7628
           1       0.79      0.69      0.73      2147

    accuracy                           0.89      9775
   macro avg       0.85      0.82      0.83      9775
weighted avg       0.89      0.89      0.89      9775

In [249]:
#AUC cluster based methods
fig = plt.figure(figsize=(8,5))
plt.plot([0, 1], [0, 1],'r--')


# LOGISTIC REGRESSION
preds_proba_LR1 = LR1.predict_proba(FSx_test1)
probsLR1= preds_proba_LR1[:, 1]
preds_proba_LR2 = LR2.predict_proba(FSx_test2)
probsLR2= preds_proba_LR2[:, 1]
preds_proba_LR3 = LR3.predict_proba(FSx_test3)
probsLR3= preds_proba_LR3[:, 1]
preds_proba_LR4 = LR4.predict_proba(FSx_test4)
probsLR4= preds_proba_LR4[:, 1]
probsLR = np.concatenate((probsLR1,probsLR2,probsLR3,probsLR4), axis=0)
fpr, tpr, thresh = metrics.roc_curve(y_testT, probsLR)
auclg = roc_auc_score(y_testT, probsLR)
plt.plot(fpr, tpr, label=f'LR, AUC = {str(round(auclg,3))}')

#SVM
preds_proba_SVM1 = SVM1.predict_proba(FSx_test1_scaled)
probsSVM1= preds_proba_SVM1[:, 1]
preds_proba_SVM2 = SVM2.predict_proba(FSx_test2_scaled)
probsSVM2= preds_proba_SVM2[:, 1]
preds_proba_SVM3 = SVM3.predict_proba(FSx_test3_scaled)
probsSVM3= preds_proba_SVM3[:, 1]
preds_proba_SVM4 = SVM4.predict_proba(FSx_test4_scaled)
probsSVM4= preds_proba_SVM4[:, 1]
probsSVM = np.concatenate((probsSVM1,probsSVM2,probsSVM3,probsSVM4), axis=0)
fpr, tpr, thresh = metrics.roc_curve(y_testT, probsSVM)
auclg = roc_auc_score(y_testT, probsSVM)
plt.plot(fpr, tpr, label=f'SVM, AUC = {str(round(auclg,3))}')


#KNN
preds_proba_KNN1 = KNN1.predict_proba(FSx_test1_scaled)
probsKNN1= preds_proba_KNN1[:, 1]
preds_proba_KNN2 = KNN2.predict_proba(FSx_test2_scaled)
probsKNN2= preds_proba_KNN2[:, 1]
preds_proba_KNN3 = KNN3.predict_proba(FSx_test3_scaled)
probsKNN3= preds_proba_KNN3[:, 1]
preds_proba_KNN4 = KNN4.predict_proba(FSx_test4_scaled)
probsKNN4= preds_proba_KNN4[:, 1]
probsKNN = np.concatenate((probsKNN1,probsKNN2,probsKNN3,probsKNN4), axis=0)
fpr, tpr, thresh = metrics.roc_curve(y_testT, probsKNN)
auclg = roc_auc_score(y_testT, probsKNN)
plt.plot(fpr, tpr, label=f'KNN, AUC = {str(round(auclg,3))}')


#RF
preds_proba_RF1 = RF1.predict_proba(FSx_test1)
probsRF1= preds_proba_RF1[:, 1]
preds_proba_RF2 = RF2.predict_proba(FSx_test2)
probsRF2= preds_proba_RF2[:, 1]
preds_proba_RF3 = RF3.predict_proba(FSx_test3)
probsRF3= preds_proba_RF3[:, 1]
preds_proba_RF4 = RF4.predict_proba(FSx_test4)
probsRF4= preds_proba_RF4[:, 1]
probsRF = np.concatenate((probsRF1,probsRF2,probsRF3,probsRF4), axis=0)
fpr, tpr, thresh = metrics.roc_curve(y_testT, probsRF)
auclg = roc_auc_score(y_testT, probsRF)
plt.plot(fpr, tpr, label=f'RF, AUC = {str(round(auclg,3))}')



#AB
preds_proba_AB1 = AB1.predict_proba(FSx_test1)
probsAB1= preds_proba_AB1[:, 1]
preds_proba_AB2 = AB2.predict_proba(FSx_test2)
probsAB2= preds_proba_AB2[:, 1]
preds_proba_AB3 = AB3.predict_proba(FSx_test3)
probsAB3= preds_proba_AB3[:, 1]
preds_proba_AB4 = AB4.predict_proba(FSx_test4)
probsAB4= preds_proba_AB4[:, 1]
probsAB = np.concatenate((probsAB1,probsAB2,probsAB3,probsAB4), axis=0)
fpr, tpr, thresh = metrics.roc_curve(y_testT, probsAB)
auclg = roc_auc_score(y_testT, probsAB)
plt.plot(fpr, tpr, label=f'AB, AUC = {str(round(auclg,3))}')



plt.ylabel("True Positive Rate", fontsize=12)
plt.xlabel("False Positive Rate", fontsize=12)
plt.title("ROC curve")
plt.rcParams['axes.titlesize'] = 16
plt.legend()
plt.show()

¶