import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
plt.rc("font", size=14)
from sklearn import linear_model, model_selection, metrics, feature_selection, svm, tree
from sklearn.feature_selection import RFECV
from sklearn.svm import SVC
from sklearn_som.som import SOM
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV, GridSearchCV
from imblearn import over_sampling
from imblearn.over_sampling import SMOTE
from pprint import pprint
import seaborn as sns
# Configure the seaborn theme once. The original called sns.set() twice in a
# row; the second call replaced the style chosen by the first, so only the
# "whitegrid" configuration ever took effect.
sns.set(style="whitegrid", color_codes=True)
# Load the credit-risk dataset (comma is the default separator of read_csv).
data = pd.read_csv('credit_risk_dataset.csv')
# Dataset dimensions as (rows, columns)
data.shape
(32581, 12)
# List of the dataset's attribute (column) names
list(data.columns)
['person_age', 'person_income', 'person_home_ownership', 'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt', 'loan_int_rate', 'loan_status', 'loan_percent_income', 'cb_person_default_on_file', 'cb_person_cred_hist_length']
# Look at the first rows of the dataset
data.head()
person_age | person_income | person_home_ownership | person_emp_length | loan_intent | loan_grade | loan_amnt | loan_int_rate | loan_status | loan_percent_income | cb_person_default_on_file | cb_person_cred_hist_length | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22 | 59000 | RENT | 123.0 | PERSONAL | D | 35000 | 16.02 | 1 | 0.59 | Y | 3 |
1 | 21 | 9600 | OWN | 5.0 | EDUCATION | B | 1000 | 11.14 | 0 | 0.10 | N | 2 |
2 | 25 | 9600 | MORTGAGE | 1.0 | MEDICAL | C | 5500 | 12.87 | 1 | 0.57 | N | 3 |
3 | 23 | 65500 | RENT | 4.0 | MEDICAL | C | 35000 | 15.23 | 1 | 0.53 | N | 2 |
4 | 24 | 54400 | RENT | 8.0 | MEDICAL | C | 35000 | 14.27 | 1 | 0.55 | Y | 4 |
# Number of rows per target value: loan_status=0 (no default) and loan_status=1 (default)
data['loan_status'].value_counts()
0 25473 1 7108 Name: loan_status, dtype: int64
# Bar chart of the number of rows for each loan_status value
sns.countplot(x='loan_status', data=data)
<AxesSubplot:xlabel='loan_status', ylabel='count'>
# Mean of every numeric attribute for each of the two loan_status classes.
# numeric_only=True keeps the string-valued columns out of the aggregation;
# recent pandas versions raise on them instead of silently dropping them.
data.groupby('loan_status').mean(numeric_only=True)
person_age | person_income | person_emp_length | loan_amnt | loan_int_rate | loan_percent_income | cb_person_cred_hist_length | |
---|---|---|---|---|---|---|---|
loan_status | |||||||
0 | 27.807129 | 70804.361559 | 4.968745 | 9237.464178 | 10.435999 | 0.148805 | 5.837475 |
1 | 27.474676 | 49125.652223 | 4.137562 | 10850.502954 | 13.060207 | 0.246889 | 5.685003 |
# Mean of every numeric attribute for each value of 'person_home_ownership'.
# numeric_only=True keeps the other string-valued columns out of the
# aggregation; recent pandas versions raise on them instead of dropping them.
data.groupby('person_home_ownership').mean(numeric_only=True)
person_age | person_income | person_emp_length | loan_amnt | loan_int_rate | loan_status | loan_percent_income | cb_person_cred_hist_length | |
---|---|---|---|---|---|---|---|---|
person_home_ownership | ||||||||
MORTGAGE | 27.980735 | 81127.121690 | 5.883907 | 10574.460726 | 10.488001 | 0.125707 | 0.151328 | 5.922493 |
OTHER | 26.803738 | 76387.803738 | 3.682243 | 11074.532710 | 12.030638 | 0.308411 | 0.191963 | 5.327103 |
OWN | 27.698529 | 57834.812693 | 5.167635 | 9029.943885 | 10.861150 | 0.074690 | 0.188777 | 5.868421 |
RENT | 27.545117 | 54997.747963 | 3.849216 | 8862.331266 | 11.455334 | 0.315700 | 0.182573 | 5.700535 |
# Stacked bar charts showing the loan_status mix for each categorical attribute.
# This one: share of each loan_status value within every home-ownership category.
table = pd.crosstab(data['person_home_ownership'], data['loan_status'])
row_totals = table.sum(axis=1).astype(float)
table.div(row_totals, axis=0).plot(kind='bar', stacked=True)
plt.xlabel('person_home_ownership')
plt.ylabel('Proportion of defaults')
plt.savefig('Figure_2')
# Mean of every numeric attribute for each value of 'loan_intent'.
# numeric_only=True keeps the other string-valued columns out of the
# aggregation; recent pandas versions raise on them instead of dropping them.
data.groupby('loan_intent').mean(numeric_only=True)
person_age | person_income | person_emp_length | loan_amnt | loan_int_rate | loan_status | loan_percent_income | cb_person_cred_hist_length | |
---|---|---|---|---|---|---|---|---|
loan_intent | ||||||||
DEBTCONSOLIDATION | 27.606293 | 66470.876247 | 4.764613 | 9594.886800 | 10.983268 | 0.285879 | 0.170823 | 5.719302 |
EDUCATION | 26.588099 | 64135.199132 | 4.463911 | 9482.678599 | 10.950261 | 0.172168 | 0.170184 | 5.160855 |
HOMEIMPROVEMENT | 29.066574 | 73549.470458 | 5.134188 | 10360.520111 | 11.201152 | 0.261026 | 0.165515 | 6.481831 |
MEDICAL | 27.998023 | 61437.227145 | 4.767170 | 9259.582441 | 11.060194 | 0.267007 | 0.173044 | 5.941690 |
PERSONAL | 28.208477 | 67864.141279 | 4.888061 | 9573.772867 | 10.998221 | 0.198877 | 0.169230 | 6.122804 |
VENTURE | 27.568456 | 66386.574576 | 4.892549 | 9583.777758 | 10.948275 | 0.148103 | 0.170540 | 5.726875 |
# Share of each loan_status value within every loan_intent category,
# drawn as a stacked bar chart.
table = pd.crosstab(data['loan_intent'], data['loan_status'])
row_totals = table.sum(axis=1).astype(float)
table.div(row_totals, axis=0).plot(kind='bar', stacked=True)
plt.xlabel('loan_intent')
plt.ylabel('Proportion of defaults')
plt.savefig('Figure_3')
# Mean of every numeric attribute for each value of 'loan_grade'.
# numeric_only=True keeps the other string-valued columns out of the
# aggregation; recent pandas versions raise on them instead of dropping them.
data.groupby('loan_grade').mean(numeric_only=True)
person_age | person_income | person_emp_length | loan_amnt | loan_int_rate | loan_status | loan_percent_income | cb_person_cred_hist_length | |
---|---|---|---|---|---|---|---|---|
loan_grade | ||||||||
A | 27.664099 | 66568.207201 | 5.102015 | 8539.273453 | 7.327651 | 0.099564 | 0.153683 | 5.743899 |
B | 27.686059 | 66354.839154 | 4.754761 | 9995.483686 | 10.995555 | 0.162760 | 0.175288 | 5.782126 |
C | 27.800557 | 64921.936203 | 4.450087 | 9213.862651 | 13.463542 | 0.207340 | 0.170084 | 5.865438 |
D | 27.877551 | 63663.682019 | 4.704443 | 10849.241589 | 15.361448 | 0.590458 | 0.191026 | 5.898235 |
E | 27.868257 | 70873.106846 | 4.377101 | 12915.845436 | 17.009455 | 0.644191 | 0.205996 | 5.829876 |
F | 28.352697 | 77008.730290 | 4.254237 | 14717.323651 | 18.609159 | 0.705394 | 0.215643 | 6.128631 |
G | 28.437500 | 76773.296875 | 6.125000 | 17195.703125 | 20.251525 | 0.984375 | 0.243906 | 6.453125 |
# Share of each loan_status value within every loan_grade category,
# drawn as a stacked bar chart.
table = pd.crosstab(data['loan_grade'], data['loan_status'])
row_totals = table.sum(axis=1).astype(float)
table.div(row_totals, axis=0).plot(kind='bar', stacked=True)
plt.xlabel('loan_grade')
plt.ylabel('Proportion of defaults')
plt.savefig('Figure_4')
# Mean of every numeric attribute for each value of 'cb_person_default_on_file'.
# numeric_only=True keeps the other string-valued columns out of the
# aggregation; recent pandas versions raise on them instead of dropping them.
data.groupby('cb_person_default_on_file').mean(numeric_only=True)
person_age | person_income | person_emp_length | loan_amnt | loan_int_rate | loan_status | loan_percent_income | cb_person_cred_hist_length | |
---|---|---|---|---|---|---|---|---|
cb_person_default_on_file | ||||||||
N | 27.717544 | 66178.476263 | 4.843075 | 9475.055895 | 10.258913 | 0.183932 | 0.168429 | 5.795648 |
Y | 27.814273 | 65590.783116 | 4.542548 | 10123.359443 | 14.513800 | 0.378068 | 0.178491 | 5.844212 |
# Share of each loan_status value within every cb_person_default_on_file
# category, drawn as a stacked bar chart.
table = pd.crosstab(data['cb_person_default_on_file'], data['loan_status'])
row_totals = table.sum(axis=1).astype(float)
table.div(row_totals, axis=0).plot(kind='bar', stacked=True)
plt.xlabel('cb_person_default_on_file')
plt.ylabel('Proportion of defaults')
plt.savefig('Figure_5')
# Correlation heatmap for the numeric features.
# Restrict to numeric columns explicitly: DataFrame.corr() on a frame that
# still contains string columns raises in recent pandas versions.
corr = data.select_dtypes(include='number').corr()
cmap = sns.diverging_palette(270, 10, as_cmap=True)
# Mask the upper triangle (diagonal included) so each pair is shown once.
mask = np.triu(np.ones_like(corr, dtype=bool))
# Start a new figure so the heatmap is not drawn on top of the previous chart
# when the code runs as a plain script.
plt.figure()
sns.heatmap(corr, cmap=cmap, mask=mask, annot=True)
plt.savefig('Figure_6')
# Box plots of every numeric attribute, split by loan_status.
# Each plot is drawn on its own figure: without plt.figure() successive
# sns.boxplot calls reuse the current axes, so every saved image would also
# contain the boxes of the plots drawn before it.
boxplot_targets = (
    ('person_age', 'Figure_7'),
    ('person_income', 'Figure_8'),
    ('person_emp_length', 'Figure_9'),
    ('loan_amnt', 'Figure_10'),
    ('loan_int_rate', 'Figure_11'),
    ('loan_percent_income', 'Figure_12'),
    ('cb_person_cred_hist_length', 'Figure_13'),
)
for column_name, figure_name in boxplot_targets:
    plt.figure()
    ax = sns.boxplot(x=data['loan_status'], y=data[column_name], data=data)
    plt.savefig(figure_name)
# Summary statistics of the numeric columns (before any cleaning)
data.describe()
person_age | person_income | person_emp_length | loan_amnt | loan_int_rate | loan_status | loan_percent_income | cb_person_cred_hist_length | |
---|---|---|---|---|---|---|---|---|
count | 32581.000000 | 3.258100e+04 | 31686.000000 | 32581.000000 | 29465.000000 | 32581.000000 | 32581.000000 | 32581.000000 |
mean | 27.734600 | 6.607485e+04 | 4.789686 | 9589.371106 | 11.011695 | 0.218164 | 0.170203 | 5.804211 |
std | 6.348078 | 6.198312e+04 | 4.142630 | 6322.086646 | 3.240459 | 0.413006 | 0.106782 | 4.055001 |
min | 20.000000 | 4.000000e+03 | 0.000000 | 500.000000 | 5.420000 | 0.000000 | 0.000000 | 2.000000 |
25% | 23.000000 | 3.850000e+04 | 2.000000 | 5000.000000 | 7.900000 | 0.000000 | 0.090000 | 3.000000 |
50% | 26.000000 | 5.500000e+04 | 4.000000 | 8000.000000 | 10.990000 | 0.000000 | 0.150000 | 4.000000 |
75% | 30.000000 | 7.920000e+04 | 7.000000 | 12200.000000 | 13.470000 | 0.000000 | 0.230000 | 8.000000 |
max | 144.000000 | 6.000000e+06 | 123.000000 | 35000.000000 | 23.220000 | 1.000000 | 0.830000 | 30.000000 |
# Null values
# Percentage of missing values in each attribute
Nan_per = data.isna().sum() / len(data) * 100
Nan_per.round(2)
person_age 0.00 person_income 0.00 person_home_ownership 0.00 person_emp_length 2.75 loan_intent 0.00 loan_grade 0.00 loan_amnt 0.00 loan_int_rate 9.56 loan_status 0.00 loan_percent_income 0.00 cb_person_default_on_file 0.00 cb_person_cred_hist_length 0.00 dtype: float64
# Print mode and median of the two attributes that contain missing values
print(f"person_emp_length mode {data['person_emp_length'].mode()[0]}")
print(f"person_emp_length median {data['person_emp_length'].median()}")
print(f"loan_int_rate mode {data['loan_int_rate'].mode()[0]}")
print(f"loan_int_rate median {data['loan_int_rate'].median()}")
person_emp_length mode 0.0 person_emp_length median 4.0 loan_int_rate mode 7.51 loan_int_rate median 10.99
# Replace missing values:
# 'person_emp_length' is filled with its mode, 'loan_int_rate' with its median.
# The filled column is assigned back to the frame. Calling
# fillna(inplace=True) on a column selection is chained in-place assignment,
# which pandas deprecates and which leaves `data` unchanged when
# copy-on-write is active.
data['person_emp_length'] = data['person_emp_length'].fillna(data['person_emp_length'].mode()[0])
data['loan_int_rate'] = data['loan_int_rate'].fillna(data['loan_int_rate'].median())
data.describe()
person_age | person_income | person_emp_length | loan_amnt | loan_int_rate | loan_status | loan_percent_income | cb_person_cred_hist_length | |
---|---|---|---|---|---|---|---|---|
count | 32581.000000 | 3.258100e+04 | 32581.000000 | 32581.000000 | 32581.000000 | 32581.000000 | 32581.000000 | 32581.000000 |
mean | 27.734600 | 6.607485e+04 | 4.658114 | 9589.371106 | 11.009620 | 0.218164 | 0.170203 | 5.804211 |
std | 6.348078 | 6.198312e+04 | 4.159669 | 6322.086646 | 3.081611 | 0.413006 | 0.106782 | 4.055001 |
min | 20.000000 | 4.000000e+03 | 0.000000 | 500.000000 | 5.420000 | 0.000000 | 0.000000 | 2.000000 |
25% | 23.000000 | 3.850000e+04 | 2.000000 | 5000.000000 | 8.490000 | 0.000000 | 0.090000 | 3.000000 |
50% | 26.000000 | 5.500000e+04 | 4.000000 | 8000.000000 | 10.990000 | 0.000000 | 0.150000 | 4.000000 |
75% | 30.000000 | 7.920000e+04 | 7.000000 | 12200.000000 | 13.110000 | 0.000000 | 0.230000 | 8.000000 |
max | 144.000000 | 6.000000e+06 | 123.000000 | 35000.000000 | 23.220000 | 1.000000 | 0.830000 | 30.000000 |
# Outliers
# Flag rows with implausible values: age above 110, employment length above
# 120 years, or income above 5e6. The flag is used as the hue of the scatter
# plots below.
is_plausible = (
    (data['person_age'] <= 110)
    & (data['person_emp_length'] <= 120)
    & (data['person_income'] <= 5e6)
)
data['outlier'] = np.where(is_plausible, 'no', 'yes')
outlier_palette = sns.diverging_palette(10, 240, n=2)

# NOTE(review): the original notes also intended to flag points above the
# diagonal of the first plot (employment length greater than age). The
# condition above does not test that relation directly; such rows are only
# caught when they exceed one of the fixed thresholds. Confirm whether an
# explicit `person_emp_length > person_age` rule is wanted.

# Each scatter plot is drawn on its own figure so that the saved images do not
# accumulate the points of the plots drawn before them.
scatter_targets = (
    ('person_age', 'person_emp_length', 'Figure_20'),
    ('person_age', 'person_income', 'Figure_21'),
    ('loan_percent_income', 'person_emp_length', 'Figure_22'),
)
for x_column, y_column, figure_name in scatter_targets:
    plt.figure()
    sns.scatterplot(data=data, x=x_column, y=y_column, hue='outlier', palette=outlier_palette)
    plt.savefig(figure_name)
# Numerical data
# Keep only the numeric part of the dataset, leaving out the categorical attributes
numeric_columns = data.select_dtypes(include=['float', 'int']).columns
num_data = pd.DataFrame(data[numeric_columns])
num_data.columns
Index(['person_age', 'person_income', 'person_emp_length', 'loan_amnt', 'loan_int_rate', 'loan_status', 'loan_percent_income', 'cb_person_cred_hist_length'], dtype='object')
# Pairwise scatter plots (and per-variable distributions) of the numeric attributes
sns.pairplot(num_data)
<seaborn.axisgrid.PairGrid at 0x1d63c5e8d60>
# Eliminate outliers, based on the plots drawn above.
# Drop people recorded as older than 110
Data = data[data['person_age']<=110]
# Drop people whose employment length exceeds 120 years
Data = Data[Data['person_emp_length']<=120]
# Drop people whose income exceeds 5e6
Data = Data[Data['person_income']<=5e6]
# Data is now the dataset WITHOUT OUTLIERS and WITH MISSING VALUES REPLACED
# Numerical variables
numData = pd.DataFrame(Data[Data.select_dtypes(include=['float','int']).columns])
# Categorical variables.
# The plotting helper column 'outlier' is string-typed, so selecting by object
# dtype would treat it as a categorical feature and it would be one-hot
# encoded downstream. Exclude it (errors='ignore' keeps this working when the
# helper column was never created).
catData = Data.select_dtypes(include=['object']).drop(columns=['outlier'], errors='ignore')
catData.columns
Index(['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file'], dtype='object')
# Convert the categorical variables into 0/1 indicator variables: one new
# column for every level of each original categorical variable.
# The dtype is pinned to an unsigned integer so the indicators are numeric
# 0/1 columns regardless of the pandas version (newer versions default to
# bool), which keeps the downstream resampling and models working on the same
# kind of data as the recorded run.
encoded_catdata = pd.get_dummies(catData, dtype=np.uint8)
encData = pd.concat([numData, encoded_catdata], axis=1)
encData.head()
person_age | person_income | person_emp_length | loan_amnt | loan_int_rate | loan_status | loan_percent_income | cb_person_cred_hist_length | person_home_ownership_MORTGAGE | person_home_ownership_OTHER | ... | loan_intent_VENTURE | loan_grade_A | loan_grade_B | loan_grade_C | loan_grade_D | loan_grade_E | loan_grade_F | loan_grade_G | cb_person_default_on_file_N | cb_person_default_on_file_Y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 21 | 9600 | 5.0 | 1000 | 11.14 | 0 | 0.10 | 2 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
2 | 25 | 9600 | 1.0 | 5500 | 12.87 | 1 | 0.57 | 3 | 1 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
3 | 23 | 65500 | 4.0 | 35000 | 15.23 | 1 | 0.53 | 2 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
4 | 24 | 54400 | 8.0 | 35000 | 14.27 | 1 | 0.55 | 4 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
5 | 21 | 9900 | 2.0 | 2500 | 7.14 | 1 | 0.25 | 2 | 0 | 0 | ... | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
5 rows × 27 columns
# Correlation heatmap
# New heatmap that includes all the freshly created indicator variables.
# Columns are ordered by decreasing correlation with loan_status, rows by
# increasing correlation with loan_status.
corr = encData.corr().sort_values('loan_status', axis=1, ascending=False)
corr = corr.sort_values('loan_status', axis=0, ascending=True)
cmap = sns.diverging_palette(270, 10, as_cmap=True)
# Mask everything strictly above the main diagonal.
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask, k=1)] = True
# The body of the `with` statement must be indented; the flattened original
# was not valid Python.
with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(16, 10))
    # The colour scale spans the loan_status correlations, leaving out
    # loan_status' correlation with itself for the upper bound.
    ax = sns.heatmap(corr, mask=mask, vmin=corr.loan_status.min(),
                     vmax=corr.drop(['loan_status'], axis=0).loan_status.max(),
                     square=True, annot=True, fmt='.2f',
                     center=0, cmap=cmap, annot_kws={"size": 10})
# Train/test splitting
# Split the dataset into training and test sets (70% and 30% respectively).
label = encData['loan_status']
features = encData.drop(columns='loan_status')
x_train, x_test, y_train, y_test = train_test_split(
    features, label, test_size=0.30, random_state=0
)
# Class counts in the test set
y_test.value_counts()
0 7696 1 2077 Name: loan_status, dtype: int64
# Class counts in the training set
y_train.value_counts()
# ----> the training set is strongly imbalanced and must be balanced before testing the algorithms
0 17771 1 5030 Name: loan_status, dtype: int64
# Balance the training dataset
# Oversample the minority class with the SMOTE technique
oversample = SMOTE(random_state=0)
os_x_train, os_y_train = oversample.fit_resample(x_train, y_train)
os_x_train = pd.DataFrame(data=os_x_train, columns=x_train.columns)
os_y_train = pd.DataFrame(data=os_y_train, columns=['loan_status'])

# Report the size and class balance of the oversampled training set
n_oversampled = len(os_x_train)
n_no_default = len(os_y_train[os_y_train['loan_status'] == 0])
n_default = len(os_y_train[os_y_train['loan_status'] == 1])
print("length of oversampled data is ", n_oversampled)
print("Number of no default in oversampled data", n_no_default)
print("Number of default", n_default)
print("Proportion of no default data in oversampled data is ", n_no_default / n_oversampled)
print("Proportion of default data in oversampled data is ", n_default / n_oversampled)
length of oversampled data is 35542 Number of no default in oversampled data 17771 Number of default 17771 Proportion of no default data in oversampled data is 0.5 Proportion of default data in oversampled data is 0.5
# MODEL: logistic regression (LR)
# Re-run the SMOTE resampling on the training split. This rebinds os_x_train
# and os_y_train to the raw fit_resample outputs, replacing the DataFrame
# wrappers built for the report above.
os_x_train, os_y_train = oversample.fit_resample(x_train, y_train)
# Create the logistic regression model
LR = LogisticRegression()
# Fit it on the (oversampled) training set
# NOTE(review): the features are not standardized at this point (scaling is
# only introduced later); the default solver may not converge well on
# unscaled data -- confirm whether this baseline is intentional.
LR.fit(os_x_train, os_y_train)
LogisticRegression()
# Predict
# Predictions on the oversampled training set and on the test set
p_train = LR.predict(os_x_train)
p_test = LR.predict(x_test)
print(f'Accuracy of logistic regression classifier on oversampled train set: {LR.score(os_x_train, os_y_train):.2f}')
print(f'Accuracy of logistic regression classifier on test set: {LR.score(x_test, y_test):.2f}')
Accuracy of logistic regression classifier on oversampled train set: 0.74 Accuracy of logistic regression classifier on test set: 0.69
# Confusion matrix on the training set. Reported for completeness only; it is not used to evaluate the method
confusion_matrix_train = confusion_matrix(os_y_train, p_train)
print(confusion_matrix_train)
# Confusion matrix on the test set
confusion_matrix_test = confusion_matrix(y_test, p_test)
print(confusion_matrix_test)
[[12123 5648] [ 3585 14186]] [[5239 2457] [ 557 1520]]
# Print a summary of the method's performance measures on the test set
print(classification_report(y_test, p_test))
precision recall f1-score support 0 0.90 0.68 0.78 7696 1 0.38 0.73 0.50 2077 accuracy 0.69 9773 macro avg 0.64 0.71 0.64 9773 weighted avg 0.79 0.69 0.72 9773
# Scale the balanced (oversampled) training set.
# The scaler is fitted on the training data only and then applied unchanged
# to the test set.
scaler = preprocessing.StandardScaler()
os_x_train_scaled = scaler.fit_transform(os_x_train)
x_test_scaled = scaler.transform(x_test)
# MODEL: logistic regression with feature selection (RFECV)
# Repeat the logistic regression model, this time selecting the features first.
LRFS = LogisticRegression()
# The optimal feature subset is chosen with recursive feature elimination
# with cross-validation (RFECV): one feature removed per step, 2 stratified
# folds, accuracy as the selection score.
rfecv = RFECV(
    estimator=LRFS,
    step=1,
    min_features_to_select=1,
    cv=StratifiedKFold(n_splits=2),
    scoring="accuracy",
)
rfecv.fit(os_x_train_scaled, os_y_train)
RFECV(cv=StratifiedKFold(n_splits=2, random_state=None, shuffle=False), estimator=LogisticRegression(), scoring='accuracy')
# Report how many features RFECV selected (15 in the recorded run)
print(f"optimal num of features: {rfecv.n_features_:d}")
optimal num of features: 15
# RFECV results
# Boolean mask telling which features were selected (True = kept)
rfecv.support_
array([False, False, False, False, False, True, False, True, False, True, True, True, True, True, True, True, True, True, True, True, False, False, False, False, True, True])
# Store the names of the features selected by RFECV in selected_col.
# The names are read from the RFECV support mask rather than copied by hand,
# so the list cannot drift out of sync with the fitted selector. The mask is
# positionally aligned with os_x_train's columns because the selector was
# fitted on the scaled version of that same frame.
selected_col = list(os_x_train.columns[rfecv.support_])
# New training set containing only the selected features
FSos_x_train = os_x_train[selected_col]
FSos_x_train
loan_percent_income | person_home_ownership_MORTGAGE | person_home_ownership_OWN | person_home_ownership_RENT | loan_intent_DEBTCONSOLIDATION | loan_intent_EDUCATION | loan_intent_HOMEIMPROVEMENT | loan_intent_MEDICAL | loan_intent_PERSONAL | loan_intent_VENTURE | loan_grade_A | loan_grade_B | loan_grade_C | cb_person_default_on_file_N | cb_person_default_on_file_Y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.160000 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 |
1 | 0.140000 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 |
2 | 0.110000 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 |
3 | 0.070000 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
4 | 0.120000 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
35537 | 0.370000 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
35538 | 0.400000 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
35539 | 0.090000 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
35540 | 0.130000 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
35541 | 0.085144 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
35542 rows × 15 columns
# ...and the new test set containing only the selected features
FSx_test = x_test[selected_col]
FSx_test
loan_percent_income | person_home_ownership_MORTGAGE | person_home_ownership_OWN | person_home_ownership_RENT | loan_intent_DEBTCONSOLIDATION | loan_intent_EDUCATION | loan_intent_HOMEIMPROVEMENT | loan_intent_MEDICAL | loan_intent_PERSONAL | loan_intent_VENTURE | loan_grade_A | loan_grade_B | loan_grade_C | cb_person_default_on_file_N | cb_person_default_on_file_Y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
8231 | 0.18 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
5444 | 0.09 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 |
7881 | 0.05 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 |
13540 | 0.16 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 |
256 | 0.41 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
28732 | 0.16 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
25828 | 0.15 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 |
27232 | 0.03 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 |
11159 | 0.20 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
15315 | 0.05 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
9773 rows × 15 columns
# Fit the logistic regression model on the training set with the reduced number of features
LRFS = LogisticRegression()
LRFS.fit(FSos_x_train, os_y_train)
LogisticRegression()
# Predictions of the feature-selected logistic regression on the training
# set and on the test set
p_trainFS = LRFS.predict(FSos_x_train)
p_testFS = LRFS.predict(FSx_test)
print(f'Accuracy of logistic regression classifier on oversampled train set: {LRFS.score(FSos_x_train, os_y_train):.2f}')
print(f'Accuracy of logistic regression classifier on test set: {LRFS.score(FSx_test, y_test):.2f}')
Accuracy of logistic regression classifier on oversampled train set: 0.91 Accuracy of logistic regression classifier on test set: 0.87
# Confusion matrix on the training set
confusion_matrix_train = confusion_matrix(os_y_train, p_trainFS)
print(confusion_matrix_train)
# Confusion matrix on the test set. This is the one used to judge the quality of the model
confusion_matrix_test = confusion_matrix(y_test, p_testFS)
print(confusion_matrix_test)
[[16758 1013] [ 2346 15425]] [[7272 424] [ 887 1190]]
# Print a summary of the performance measures on the test set
print(classification_report(y_test, p_testFS))
precision recall f1-score support 0 0.89 0.94 0.92 7696 1 0.74 0.57 0.64 2077 accuracy 0.87 9773 macro avg 0.81 0.76 0.78 9773 weighted avg 0.86 0.87 0.86 9773
# MODEL: SVM (grid search for the best parameters)
# Parameter grid on which the model will be evaluated: a linear kernel over
# several C values, and an RBF kernel over the same C values and two gammas.
_c_values = [1, 10, 100, 1000]
param_grid = [
    dict(C=list(_c_values), kernel=['linear']),
    dict(C=list(_c_values), gamma=[0.001, 0.0001], kernel=['rbf']),
]
# Grid search for best hyperparameters
# Create the base model, without the optimal parameters yet
SVM = SVC()
# Set up the exhaustive grid search over the grid defined above
# (GridSearchCV tries every combination; it is not a randomized search)
SVM_gridsearch = GridSearchCV(SVM, param_grid)
# Fit the grid search on the scaled, oversampled training set
SVM_gridsearch.fit(os_x_train_scaled, os_y_train)
GridSearchCV(estimator=SVC(), param_grid=[{'C': [1, 10, 100, 1000], 'kernel': ['linear']}, {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']}])
# Show the best parameter combination found by the grid search
SVM_gridsearch.best_params_
{'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}
# Best combination in the recorded run: C=1000, gamma=0.001, kernel='rbf'
# Define the support vector classifier with the parameters just found
# (values are hard-coded from the grid-search output; probability=True also
# enables probability estimates)
SVM = SVC(C=1000, gamma=0.001, kernel='rbf',probability=True)
# Fit the model on the scaled, oversampled training set
SVM.fit(os_x_train_scaled, os_y_train)
SVC(C=1000, gamma=0.001, probability=True)
# SVM predictions on the training set and on the test set (scaled features)
p_trainSVM = SVM.predict(os_x_train_scaled)
p_testSVM = SVM.predict(x_test_scaled)
print(f'Accuracy of SVM classifier on oversampled train set: {SVM.score(os_x_train_scaled, os_y_train):.2f}')
print(f'Accuracy of SVM classifier on test set: {SVM.score(x_test_scaled, y_test):.2f}')
Accuracy of SVM classifier on oversampled train set: 0.94 Accuracy of SVM classifier on test set: 0.91
# Confusion matrix on the training set
confusion_matrix_train = confusion_matrix(os_y_train, p_trainSVM)
print(confusion_matrix_train)
# Confusion matrix on the test set
confusion_matrix_test = confusion_matrix(y_test, p_testSVM)
print(confusion_matrix_test)
[[17429 342] [ 1778 15993]] [[7527 169] [ 731 1346]]
# Print a summary of the performance measures on the test set
print(classification_report(y_test, p_testSVM))
precision recall f1-score support 0 0.91 0.98 0.94 7696 1 0.89 0.65 0.75 2077 accuracy 0.91 9773 macro avg 0.90 0.81 0.85 9773 weighted avg 0.91 0.91 0.90 9773
# SVM with feature selection
# Use the reduced-feature dataset built earlier, and scale it as well.
# Note that this rebinds `scaler` to a scaler fitted on the reduced features.
scaler = preprocessing.StandardScaler()
FSos_x_train_scaled = scaler.fit_transform(FSos_x_train)
FSx_test_scaled = scaler.transform(FSx_test)
# Parameter grid on which the model will be evaluated: a linear kernel over
# several C values, and an RBF kernel over the same C values and two gammas.
_c_values = [1, 10, 100, 1000]
param_grid = [
    dict(C=list(_c_values), kernel=['linear']),
    dict(C=list(_c_values), gamma=[0.001, 0.0001], kernel=['rbf']),
]
# Grid search for best hyperparameters
# Create the base model, without the optimal parameters yet
SVMFS = SVC()
# Set up the exhaustive grid search over the grid defined above
# (GridSearchCV tries every combination; it is not a randomized search)
SVMFS_gridsearch = GridSearchCV(SVMFS, param_grid)
# Fit the grid search on the scaled, reduced-feature training set
SVMFS_gridsearch.fit(FSos_x_train_scaled, os_y_train)
GridSearchCV(estimator=SVC(), param_grid=[{'C': [1, 10, 100, 1000], 'kernel': ['linear']}, {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']}])
# Show the best parameter combination found by the grid search
SVMFS_gridsearch.best_params_
{'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}
# Define the support vector classifier
# (parameters are hard-coded from the grid-search output of the recorded run)
SVMFS = SVC(C=1000, gamma=0.001, kernel='rbf',probability=True)
# Fit the model on the reduced-feature dataset
SVMFS.fit(FSos_x_train_scaled, os_y_train)
SVC(C=1000, gamma=0.001, probability=True)
# Predictions of the feature-selected SVM on the training and test sets
p_trainSVMFS = SVMFS.predict(FSos_x_train_scaled)
p_testSVMFS = SVMFS.predict(FSx_test_scaled)
print(f'Accuracy of SVM classifier on oversampled train set: {SVMFS.score(FSos_x_train_scaled, os_y_train):.2f}')
print(f'Accuracy of SVM classifier on test set: {SVMFS.score(FSx_test_scaled, y_test):.2f}')
# Recorded run: 0.93 on train, 0.89 on test
Accuracy of SVM classifier on oversampled train set: 0.93 Accuracy of SVM classifier on test set: 0.89
# Confusion matrix on the training set
confusion_matrix_train = confusion_matrix(os_y_train, p_trainSVMFS)
print(confusion_matrix_train)
# Confusion matrix on the test set
confusion_matrix_test = confusion_matrix(y_test, p_testSVMFS)
print(confusion_matrix_test)
[[17234 537] [ 2038 15733]] [[7444 252] [ 784 1293]]
# Print the performance measures on the test set
print(classification_report(y_test, p_testSVMFS))
precision recall f1-score support 0 0.90 0.97 0.93 7696 1 0.84 0.62 0.71 2077 accuracy 0.89 9773 macro avg 0.87 0.79 0.82 9773 weighted avg 0.89 0.89 0.89 9773
# MODEL: KNN
# Define the KNN model, for now with arbitrarily chosen parameters
# (10 neighbours, Minkowski metric with p=2).
KNN = KNeighborsClassifier(n_neighbors=10, metric='minkowski', p=2)
# Fit it on the scaled training set with all the features
KNN.fit(os_x_train_scaled, os_y_train)
# Predictions on the training and test sets
p_trainKNN = KNN.predict(os_x_train_scaled)
p_testKNN = KNN.predict(x_test_scaled)
print(f'Accuracy of KNN classifier on oversampled train set: {KNN.score(os_x_train_scaled, os_y_train):.2f}')
print(f'Accuracy of KNN classifier on test set: {KNN.score(x_test_scaled, y_test):.2f}')
Accuracy of KNN classifier on oversampled train set: 0.94 Accuracy of KNN classifier on test set: 0.89
# Confusion matrix on the training set
confusion_matrix_train = confusion_matrix(os_y_train, p_trainKNN)
print(confusion_matrix_train)
# Confusion matrix on the test set
confusion_matrix_test = confusion_matrix(y_test, p_testKNN)
print(confusion_matrix_test)
[[17541 230] [ 2034 15737]] [[7561 135] [ 893 1184]]
# Print the model's performance measures on the test set
print(classification_report(y_test, p_testKNN))
precision recall f1-score support 0 0.89 0.98 0.94 7696 1 0.90 0.57 0.70 2077 accuracy 0.89 9773 macro avg 0.90 0.78 0.82 9773 weighted avg 0.90 0.89 0.89 9773
# The key parameter of KNN is the number of neighbours K.
# The optimal K is looked for, as for the previous model, with a search over
# a hand-defined grid.
# Best choice of n_neighbors using grid search
# Grid points: 5, 10, 15, 20, ..., 40, 45, 50.
# range() excludes its stop value, so the stop must be 51 for 50 to be part of
# the grid (the original range(5,50,5) ended at 45).
parameters = {"n_neighbors": range(5, 51, 5)}
# Exhaustive grid search over the candidate numbers of neighbours,
# fitted on the scaled, oversampled training set
gridsearch = GridSearchCV (KNeighborsClassifier(), parameters)
gridsearch.fit(os_x_train_scaled, os_y_train)
# Show the optimal K
gridsearch.best_params_
{'n_neighbors': 5}
# Try to improve the result further:
# define a new, finer grid around k=5 (values 2 through 8)
parameters = {"n_neighbors": range(2,9)}
gridsearch = GridSearchCV (KNeighborsClassifier(), parameters)
gridsearch.fit(os_x_train_scaled, os_y_train)
gridsearch.best_params_
# The optimal k still turns out to be 5
{'n_neighbors': 5}
# Create the model with the optimal K (hard-coded from the grid-search output)
KNN = KNeighborsClassifier(n_neighbors=5, metric= 'minkowski', p=2)
# ...and fit it on the scaled, oversampled training set
KNN.fit(os_x_train_scaled, os_y_train)
KNeighborsClassifier()
# Predictions of the tuned KNN model on the training and test sets
p_trainKNN = KNN.predict(os_x_train_scaled)
p_testKNN = KNN.predict(x_test_scaled)
print(f'Accuracy of KNN classifier on oversampled train set: {KNN.score(os_x_train_scaled, os_y_train):.2f}')
print(f'Accuracy of KNN classifier on test set: {KNN.score(x_test_scaled, y_test):.2f}')
Accuracy of KNN classifier on oversampled train set: 0.95 Accuracy of KNN classifier on test set: 0.89
# Confusion matrix on the training set
confusion_matrix_train = confusion_matrix(os_y_train, p_trainKNN)
print(confusion_matrix_train)
# Confusion matrix on the test set
confusion_matrix_test = confusion_matrix(y_test, p_testKNN)
print(confusion_matrix_test)
[[17426 345] [ 1556 16215]] [[7439 257] [ 797 1280]]
# Summary of the model's performance measures on the test set
print(classification_report(y_test, p_testKNN))
precision recall f1-score support 0 0.90 0.97 0.93 7696 1 0.83 0.62 0.71 2077 accuracy 0.89 9773 macro avg 0.87 0.79 0.82 9773 weighted avg 0.89 0.89 0.89 9773
# KNN WITH FEATURE SELECTION
# Same model as above, this time on the data set restricted to the 15 selected features.
# Standardise it: the scaler is fitted on the oversampled train set only and
# then reused, unchanged, on the test set.
scaler = preprocessing.StandardScaler()
FSos_x_train_scaled = scaler.fit_transform(FSos_x_train)
FSx_test_scaled = scaler.transform(FSx_test)
# grid of candidate k values around k=10 (k = 6, ..., 13)
parameters = dict(n_neighbors=range(6, 14))
gridsearch = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=parameters)
gridsearch.fit(FSos_x_train_scaled, os_y_train)
GridSearchCV(estimator=KNeighborsClassifier(), param_grid={'n_neighbors': range(6, 14)})
# best k found by the grid search on the reduced feature set
gridsearch.best_params_
#k=9 optimal value
{'n_neighbors': 9}
# build the model with k=9 and fit it on the scaled, oversampled train set
KNNFS = KNeighborsClassifier(n_neighbors=9, metric= 'minkowski', p=2)
KNNFS.fit(FSos_x_train_scaled, os_y_train)
KNeighborsClassifier(n_neighbors=9)
# predict on the train and test sets
p_trainKNNFS = KNNFS.predict(FSos_x_train_scaled)
p_testKNNFS = KNNFS.predict(FSx_test_scaled)
print('Accuracy of KNN classifier on oversampled train set: {:.2f}'.format(KNNFS.score(FSos_x_train_scaled, os_y_train)))
print('Accuracy of KNN classifier on test set: {:.2f}'.format(KNNFS.score(FSx_test_scaled, y_test)))
Accuracy of KNN classifier on oversampled train set: 0.93 Accuracy of KNN classifier on test set: 0.89
# confusion matrix on the train set
confusion_matrix_train = confusion_matrix(os_y_train, p_trainKNNFS)
print(confusion_matrix_train)
# confusion matrix on the test set
confusion_matrix_test = confusion_matrix(y_test, p_testKNNFS)
print(confusion_matrix_test)
[[17355 416] [ 1933 15838]] [[7439 257] [ 794 1283]]
# summary of the performance measures on the test set
print(classification_report(y_test, p_testKNNFS))
precision recall f1-score support 0 0.90 0.97 0.93 7696 1 0.83 0.62 0.71 2077 accuracy 0.89 9773 macro avg 0.87 0.79 0.82 9773 weighted avg 0.89 0.89 0.89 9773
# MODEL: RANDOM FOREST (RF)
# create the random forest; the seed is fixed so that the result is reproducible, since the method is not deterministic
RF = RandomForestClassifier(random_state=0)
# every other parameter is left at its default for now
# print which values those parameters take
print('Parameters currently in use:\n')
pprint(RF.get_params())
Parameters currently in use: {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 0, 'verbose': 0, 'warm_start': False}
# fit the random forest with the default parameters on the oversampled train set
RF.fit(os_x_train, os_y_train)
RandomForestClassifier(random_state=0)
# predict on the train and test sets
p_train_RF = RF.predict(os_x_train)
p_test_RF = RF.predict(x_test)
# Score the model that was just fitted. It is bound to `RF`; names are
# case-sensitive and a lowercase `rf` is never assigned in this section.
print('Accuracy of random forest on oversampled train set: {:.2f}'.format(RF.score(os_x_train, os_y_train)))
print('Accuracy of random forest on test set: {:.2f}'.format(RF.score(x_test, y_test)))
# An accuracy of 1 on the train set suggests the trees keep splitting until
# every leaf holds a single observation ----> the parameters must be changed.
Accuracy of random forest on oversampled train set: 1.00 Accuracy of random forest on test set: 0.93
# inspect the predicted labels on the train set
print(p_train_RF)
[0 0 0 ... 1 1 1]
# confusion matrix on the train set
confusion_matrix_train = confusion_matrix(os_y_train, p_train_RF)
print(confusion_matrix_train)
# confusion matrix on the test set
confusion_matrix_test = confusion_matrix(y_test, p_test_RF)
print(confusion_matrix_test)
[[17771 0] [ 0 17771]] [[7598 98] [ 582 1495]]
# summary of the model's performance measures on the test set
print(classification_report(y_test, p_test_RF))
precision recall f1-score support 0 0.93 0.99 0.96 7696 1 0.94 0.72 0.81 2077 accuracy 0.93 9773 macro avg 0.93 0.85 0.89 9773 weighted avg 0.93 0.93 0.93 9773
# Change the model parameters.
# As for the previous models, the best parameters are looked for with a
# randomised search over a grid; this block only defines that grid.

# number of trees in the forest: 5 evenly spaced values in [200, 2000]
n_estimators = np.linspace(start=200, stop=2000, num=5).astype(int).tolist()
# number of features considered at every split
max_features = ['auto', 'sqrt']
# maximum depth of a tree: 4 evenly spaced values in [10, 80], plus "no limit"
max_depth = np.linspace(10, 80, num=4).astype(int).tolist() + [None]
# minimum number of samples needed to split a node
min_samples_split = [2, 5, 10]
# minimum number of samples required in every terminal node
min_samples_leaf = [2, 4]
# how the samples used to train each tree are drawn
bootstrap = [True]

# assemble the random grid
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)
pprint(random_grid)
{'bootstrap': [True], 'max_depth': [10, 33, 56, 80, None], 'max_features': ['auto', 'sqrt'], 'min_samples_leaf': [2, 4], 'min_samples_split': [2, 5, 10], 'n_estimators': [200, 650, 1100, 1550, 2000]}
# Use the random grid to search for the best parameters.
# First create the base model to tune.
RF = RandomForestClassifier()
# Randomised search of the parameters: 50 sampled combinations, 3-fold
# cross-validation, all available cores.
# The estimator is `RF`, the model created just above (a lowercase `rf` is
# never assigned in this section).
RF_random = RandomizedSearchCV(estimator=RF, param_distributions=random_grid, n_iter=50, cv=3, verbose=2, random_state=42, n_jobs=-1)
# fit the random search
RF_random.fit(os_x_train, os_y_train)
Fitting 3 folds for each of 50 candidates, totalling 150 fits
RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=50, n_jobs=-1, param_distributions={'bootstrap': [True], 'max_depth': [10, 33, 56, 80, None], 'max_features': ['auto', 'sqrt'], 'min_samples_leaf': [2, 4], 'min_samples_split': [2, 5, 10], 'n_estimators': [200, 650, 1100, 1550, 2000]}, random_state=42, verbose=2)
# best parameters found by the randomised search
RF_random.best_params_
{'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 33, 'bootstrap': True}
#RANDOM FOREST with best parameters
# define the model with the best parameters (copied by hand from the output above)
RF = RandomForestClassifier(random_state=0, n_estimators= 200,
min_samples_split= 2,min_samples_leaf= 2,max_features= 'sqrt',
max_depth= 33,bootstrap= True )
# and fit it on the oversampled train set
RF.fit(os_x_train, os_y_train)
RandomForestClassifier(max_depth=33, max_features='sqrt', min_samples_leaf=2, n_estimators=200, random_state=0)
# predict on the train and test sets
p_train_RF = RF.predict(os_x_train)
p_test_RF = RF.predict(x_test)
print('Accuracy of random forest on oversampled train set: {:.2f}'.format(RF.score(os_x_train, os_y_train)))
print('Accuracy of random forest on test set: {:.2f}'.format(RF.score(x_test, y_test)))
Accuracy of random forest on oversampled train set: 0.98 Accuracy of random forest on test set: 0.93
# confusion matrix on the train set
confusion_matrix_train = confusion_matrix(os_y_train, p_train_RF)
print(confusion_matrix_train)
# confusion matrix on the test set
confusion_matrix_test = confusion_matrix(y_test, p_test_RF)
print(confusion_matrix_test)
[[17753 18] [ 734 17037]] [[7629 67] [ 597 1480]]
# summary of the performance measures on the test set
print(classification_report(y_test, p_test_RF))
precision recall f1-score support 0 0.93 0.99 0.96 7696 1 0.96 0.71 0.82 2077 accuracy 0.93 9773 macro avg 0.94 0.85 0.89 9773 weighted avg 0.93 0.93 0.93 9773
# Try to improve the results further with an exhaustive grid search.
# The new grid explores the neighbourhood of the model that the previous
# randomised search returned as the best one.
param_grid = dict(
    bootstrap=[True],
    max_depth=[25, 30, 33, 35],
    max_features=['sqrt'],
    min_samples_leaf=[2, 5],
    min_samples_split=[2, 5],
    n_estimators=[200],
)
# grid-search model: 5-fold cross-validation on all available cores
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
# fit the grid search on the data
grid_search.fit(os_x_train, os_y_train)
# show the best parameters
grid_search.best_params_
Fitting 5 folds for each of 16 candidates, totalling 80 fits
{'bootstrap': True, 'max_depth': 33, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
# RF (fitted with the best parameters) is kept, since it is still the best model.

# MODEL: RANDOM FOREST with feature selection
# The random forest is used again, this time on the data set with the reduced
# feature set. As for the previous models, the best parameters are looked for
# with a randomised search over a grid; this block only defines that grid.

# number of trees in the forest: 5 evenly spaced values in [200, 2000]
n_estimators = np.linspace(start=200, stop=2000, num=5).astype(int).tolist()
# number of features considered at every split
max_features = ['auto', 'sqrt']
# maximum depth of a tree: 4 evenly spaced values in [10, 80], plus "no limit"
max_depth = np.linspace(10, 80, num=4).astype(int).tolist() + [None]
# minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# minimum number of samples required in every leaf
min_samples_leaf = [2, 4]
# how the samples used to train each tree are drawn
bootstrap = [True]

# assemble the random grid
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)
pprint(random_grid)
{'bootstrap': [True], 'max_depth': [10, 33, 56, 80, None], 'max_features': ['auto', 'sqrt'], 'min_samples_leaf': [2, 4], 'min_samples_split': [2, 5, 10], 'n_estimators': [200, 650, 1100, 1550, 2000]}
# use the random grid to find the best hyperparameters
# create the base model to tune
RFFS = RandomForestClassifier()
# randomised search of the parameters: 50 sampled combinations, 3-fold cross-validation
RFFS_random = RandomizedSearchCV(estimator = RFFS, param_distributions = random_grid, n_iter = 50, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# fit the random search on the reduced (feature-selected) train set
RFFS_random.fit(FSos_x_train, os_y_train)
Fitting 3 folds for each of 50 candidates, totalling 150 fits
RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=50, n_jobs=-1, param_distributions={'bootstrap': [True], 'max_depth': [10, 33, 56, 80, None], 'max_features': ['auto', 'sqrt'], 'min_samples_leaf': [2, 4], 'min_samples_split': [2, 5, 10], 'n_estimators': [200, 650, 1100, 1550, 2000]}, random_state=42, verbose=2)
# best parameters according to the randomised search
RFFS_random.best_params_
{'n_estimators': 1550, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 80, 'bootstrap': True}
# define the new model with the parameters found (copied by hand from the output above)
RFFS = RandomForestClassifier(random_state=0, n_estimators= 1550,
min_samples_split= 10,min_samples_leaf= 4,max_features= 'sqrt',
max_depth= 80,bootstrap= True )
# fit the model on the train set with the reduced number of features
RFFS.fit(FSos_x_train, os_y_train)
RandomForestClassifier(max_depth=80, max_features='sqrt', min_samples_leaf=4, min_samples_split=10, n_estimators=1550, random_state=0)
# predict on the train and test sets
p_train_RFFS = RFFS.predict(FSos_x_train)
p_test_RFFS = RFFS.predict(FSx_test)
print('Accuracy of random forest on oversampled train set: {:.2f}'.format(RFFS.score(FSos_x_train, os_y_train)))
print('Accuracy of random forest on test set: {:.2f}'.format(RFFS.score(FSx_test, y_test)))
# NOTE(review): "94 98" looks like figures jotted down from an earlier run; they do not match the output below - confirm or remove
Accuracy of random forest on oversampled train set: 0.94 Accuracy of random forest on test set: 0.89
# confusion matrix on the train set
confusion_matrix_train = confusion_matrix(os_y_train, p_train_RFFS)
print(confusion_matrix_train)
# confusion matrix on the test set
confusion_matrix_test = confusion_matrix(y_test, p_test_RFFS)
print(confusion_matrix_test)
# NOTE(review): presumably the test confusion matrix of an earlier run (differs from the output below):
#7405 291
#786 1291
[[17391 380] [ 1880 15891]] [[7457 239] [ 793 1284]]
# performance measures on the test set
print(classification_report(y_test, p_test_RFFS))
# NOTE(review): "82 62 71 89" presumably records precision/recall/f1/accuracy of an earlier run - confirm or remove
precision recall f1-score support 0 0.90 0.97 0.94 7696 1 0.84 0.62 0.71 2077 accuracy 0.89 9773 macro avg 0.87 0.79 0.82 9773 weighted avg 0.89 0.89 0.89 9773
#MODEL: ADABOOST (AB)
# AdaBoost is an ensemble method built on base classifiers that are usually
# chosen to be trees with a single split.
# Define the base classifier, fixing the seed so that the result is always the
# same and setting the maximum depth of the tree to 1.
#base estimator: decision tree
decTree = tree.DecisionTreeClassifier(random_state=0, max_depth=1)
# fit the tree on the oversampled train set
decTree.fit(os_x_train, os_y_train)
# draw the tree
tree.plot_tree(decTree)
[Text(0.5, 0.75, 'X[5] <= 0.3\ngini = 0.5\nsamples = 35542\nvalue = [17771, 17771]'), Text(0.25, 0.25, 'gini = 0.476\nsamples = 27833\nvalue = [16974, 10859]'), Text(0.75, 0.25, 'gini = 0.185\nsamples = 7709\nvalue = [797, 6912]')]
# AdaBoost classifier
# With the base estimator available, define the AdaBoost model
# (50 boosting rounds, learning rate 1).
AB = AdaBoostClassifier(n_estimators=50, base_estimator=decTree, learning_rate=1)
# fit the model on the oversampled train set
AB.fit(os_x_train, os_y_train)
# predict on the train and test sets
p_train_AB = AB.predict(os_x_train)
p_test_AB = AB.predict(x_test)
# The messages name the model that is actually scored here (AdaBoost); they
# previously said "random forest", a leftover from the section they were copied from.
print('Accuracy of AdaBoost on oversampled train set: {:.2f}'.format(AB.score(os_x_train, os_y_train)))
print('Accuracy of AdaBoost on test set: {:.2f}'.format(AB.score(x_test, y_test)))
# confusion matrix on the train set
confusion_matrix_train = confusion_matrix(os_y_train, p_train_AB)
print(confusion_matrix_train)
# confusion matrix on the test set
confusion_matrix_test = confusion_matrix(y_test, p_test_AB)
print(confusion_matrix_test)
# Hyperparameter tuning: look for the best parameters.
# Grid definition.
# Number of boosting rounds: 50 evenly spaced values between 0 and 1000,
# truncated to integers. AdaBoost needs at least one estimator, so the leading
# 0 is dropped; otherwise every candidate that draws it fails to fit.
n_estimators = [n for n in (int(x) for x in np.linspace(start=0, stop=1000, num=50)) if n >= 1]
# learning rate
learning_rate = [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
# create the random grid
random_grid = {'n_estimators': n_estimators,
               'learning_rate': learning_rate}
# Use the random grid to search for the best hyperparameters.
# First create the base model to tune.
AB = AdaBoostClassifier()
# Randomised search of the parameters: 50 sampled combinations, 5-fold cross-validation.
AB_random = RandomizedSearchCV(estimator=AB, param_distributions=random_grid, n_iter=50, cv=5, verbose=2, random_state=42, n_jobs=-1)
# fit the random search
AB_random.fit(os_x_train, os_y_train)
Fitting 5 folds for each of 50 candidates, totalling 250 fits
RandomizedSearchCV(cv=5, estimator=AdaBoostClassifier(), n_iter=50, n_jobs=-1, param_distributions={'learning_rate': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1], 'n_estimators': [0, 20, 40, 61, 81, 102, 122, 142, 163, 183, 204, 224, 244, 265, 285, 306, 326, 346, 367, 387, 408, 428, 448, 469, 489, 510, 530, 551, 571, 591, ...]}, random_state=42, verbose=2)
# retrieve the best parameters found by the randomised search
AB_random.best_params_
{'n_estimators': 897, 'learning_rate': 0.9}
# MODEL: ADABOOST with best parameters
# define the model with the best parameters found by the randomised search
AB = AdaBoostClassifier(n_estimators=897, base_estimator=decTree, learning_rate=0.9)
AB.fit(os_x_train, os_y_train)
# predict on the train and test sets
p_train_AB = AB.predict(os_x_train)
p_test_AB = AB.predict(x_test)
# the messages name the model actually scored (AdaBoost, not random forest)
print('Accuracy of AdaBoost on oversampled train set: {:.2f}'.format(AB.score(os_x_train, os_y_train)))
print('Accuracy of AdaBoost on test set: {:.2f}'.format(AB.score(x_test, y_test)))
Accuracy of random forest on oversampled train set: 0.93 Accuracy of random forest on test set: 0.89
# confusion matrix on the train set
confusion_matrix_train = confusion_matrix(os_y_train, p_train_AB)
print(confusion_matrix_train)
# confusion matrix on the test set
confusion_matrix_test = confusion_matrix(y_test, p_test_AB)
print(confusion_matrix_test)
[[16912 859] [ 1627 16144]] [[7302 394] [ 671 1406]]
# summary of the performance measures on the test set
print(classification_report(y_test, p_test_AB))
precision recall f1-score support 0 0.92 0.95 0.93 7696 1 0.78 0.68 0.73 2077 accuracy 0.89 9773 macro avg 0.85 0.81 0.83 9773 weighted avg 0.89 0.89 0.89 9773
# Try to improve AdaBoost further with an exhaustive grid search.
# The new grid is a neighbourhood of the optimum found by the randomised search.
param_grid = dict(
    n_estimators=[850, 897, 900, 950],
    learning_rate=[0.85, 0.88, 0.9, 0.92, 0.95],
)
# grid-search model: 5-fold cross-validation on all available cores
grid_search = GridSearchCV(
    estimator=AdaBoostClassifier(),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
# fit the grid search on the data
grid_search.fit(os_x_train, os_y_train)
# show the best parameters
grid_search.best_params_
Fitting 5 folds for each of 20 candidates, totalling 100 fits
{'learning_rate': 0.92, 'n_estimators': 950}
# define the model with the new parameters returned by the grid search
AB = AdaBoostClassifier(n_estimators=950, base_estimator=decTree, learning_rate=0.92)
# and fit it on the oversampled train set
AB.fit(os_x_train, os_y_train)
AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1, random_state=0), learning_rate=0.92, n_estimators=950)
# predict on the train and test sets
p_train_AB = AB.predict(os_x_train)
p_test_AB = AB.predict(x_test)
# the messages name the model actually scored (AdaBoost, not random forest)
print('Accuracy of AdaBoost on oversampled train set: {:.2f}'.format(AB.score(os_x_train, os_y_train)))
print('Accuracy of AdaBoost on test set: {:.2f}'.format(AB.score(x_test, y_test)))
Accuracy of random forest on oversampled train set: 0.93 Accuracy of random forest on test set: 0.89
# confusion matrix on the train set
confusion_matrix_train = confusion_matrix(os_y_train, p_train_AB)
print(confusion_matrix_train)
# confusion matrix on the test set
confusion_matrix_test = confusion_matrix(y_test, p_test_AB)
print(confusion_matrix_test)
[[16923 848] [ 1628 16143]] [[7297 399] [ 678 1399]]
# summary of the performance measures on the test set
print(classification_report(y_test, p_test_AB))
precision recall f1-score support 0 0.91 0.95 0.93 7696 1 0.78 0.67 0.72 2077 accuracy 0.89 9773 macro avg 0.85 0.81 0.83 9773 weighted avg 0.89 0.89 0.89 9773
# MODEL: ADABOOST with feature selection
# Same model as before, using the data set that contains only the 15 selected features.
# Base estimator: a decision tree with a single split.
decTree3 = tree.DecisionTreeClassifier(random_state=0, max_depth=1)
decTree3.fit(FSos_x_train, os_y_train)
# Draw the stump fitted on the reduced data set (decTree3), not the one fitted
# earlier on all the features (decTree).
tree.plot_tree(decTree3)
# Hyperparameter tuning: look for the best parameters.
# Grid definition.
# Number of boosting rounds: 50 evenly spaced values between 0 and 1000,
# truncated to integers. AdaBoost needs at least one estimator, so the leading
# 0 is dropped; otherwise every candidate that draws it fails to fit.
n_estimators = [n for n in (int(x) for x in np.linspace(start=0, stop=1000, num=50)) if n >= 1]
# learning rate
learning_rate = [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
# create the random grid
random_grid = {'n_estimators': n_estimators,
               'learning_rate': learning_rate}
# Use the random grid to search for the best parameters.
# Create the base model to tune.
ABFS = AdaBoostClassifier()
# Randomised search of the parameters: 50 sampled combinations, 5-fold cross-validation.
ABFS_random = RandomizedSearchCV(estimator=ABFS, param_distributions=random_grid, n_iter=50, cv=5, verbose=2, random_state=42, n_jobs=-1)
# fit the random search on the reduced (feature-selected) train set
ABFS_random.fit(FSos_x_train, os_y_train)
Fitting 5 folds for each of 50 candidates, totalling 250 fits
RandomizedSearchCV(cv=5, estimator=AdaBoostClassifier(), n_iter=50, n_jobs=-1, param_distributions={'learning_rate': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1], 'n_estimators': [0, 20, 40, 61, 81, 102, 122, 142, 163, 183, 204, 224, 244, 265, 285, 306, 326, 346, 367, 387, 408, 428, 448, 469, 489, 510, 530, 551, 571, 591, ...]}, random_state=42, verbose=2)
# best parameters found by the randomised search
ABFS_random.best_params_
{'n_estimators': 102, 'learning_rate': 0.4}
#MODEL: ADABOOST with best parameters
# define the model with the best parameters found (copied by hand from the output above)
ABFS = AdaBoostClassifier(n_estimators= 102 , base_estimator=decTree3, learning_rate= 0.4)
# fit it on the reduced (feature-selected) train set
ABFS.fit(FSos_x_train, os_y_train)
AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1, random_state=0), learning_rate=0.4, n_estimators=102)
# predict on the train and test sets
p_train_ABFS = ABFS.predict(FSos_x_train)
p_test_ABFS = ABFS.predict(FSx_test)
# the messages name the model actually scored (AdaBoost, not random forest)
print('Accuracy of AdaBoost on oversampled train set: {:.2f}'.format(ABFS.score(FSos_x_train, os_y_train)))
print('Accuracy of AdaBoost on test set: {:.2f}'.format(ABFS.score(FSx_test, y_test)))
Accuracy of random forest on oversampled train set: 0.92 Accuracy of random forest on test set: 0.88
# confusion matrix on the train set
confusion_matrix_train = confusion_matrix(os_y_train, p_train_ABFS)
print(confusion_matrix_train)
# confusion matrix on the test set
confusion_matrix_test = confusion_matrix(y_test, p_test_ABFS)
print(confusion_matrix_test)
# NOTE(review): presumably the test confusion matrix of an earlier run (differs from the output below):
#7245 451
#748 1329
[[16619 1152] [ 1839 15932]] [[7205 491] [ 679 1398]]
# summary of the performance measures on the test set
print(classification_report(y_test, p_test_ABFS))
# NOTE(review): "75 64 69 88" presumably records precision/recall/f1/accuracy of an earlier run - confirm or remove
precision recall f1-score support 0 0.91 0.94 0.92 7696 1 0.74 0.67 0.70 2077 accuracy 0.88 9773 macro avg 0.83 0.80 0.81 9773 weighted avg 0.88 0.88 0.88 9773
# Compare the methods through their ROC curves.
# For each of the 5 model families (logistic regression, SVM, KNN, random
# forest, AdaBoost) use the model obtained with the best parameters and
# trained on all the features.
# ROC (all features)
# create the figure
fig = plt.figure(figsize=(8,5))
# diagonal reference line
plt.plot([0, 1], [0, 1],'r--')
# LOGISTIC REGRESSION (logreg)
# class probabilities predicted by the logistic regression
preds_proba_LR = LR.predict_proba(x_test)
# The result is a matrix with two columns, one per class (0 and 1), holding
# the predicted probability of each class for every individual.
# Keep column 1 only, i.e. the probability of class 1; column 0 is simply
# (1 - column 1).
probsLR = preds_proba_LR[:, 1]
# draw the ROC curve of the logistic regression
fpr, tpr, thresh = metrics.roc_curve(y_test, probsLR)
aucLR = roc_auc_score(y_test, probsLR)
plt.plot(fpr, tpr, label=f'LR, AUC = {str(round(aucLR,3))}')
# the same steps are repeated for the other 4 methods
# SVM (trained on standardised features, hence the scaled test set)
preds_proba_SVM = SVM.predict_proba(x_test_scaled)
probsSVM = preds_proba_SVM[:, 1]
fpr, tpr, thresh = metrics.roc_curve(y_test, probsSVM)
auclg = roc_auc_score(y_test, probsSVM)
plt.plot(fpr, tpr, label=f'SVM, AUC = {str(round(auclg,3))}')
# KNN (trained on standardised features, hence the scaled test set)
preds_proba_KNN = KNN.predict_proba(x_test_scaled)
probsKNN = preds_proba_KNN[:, 1]
fpr, tpr, thresh = metrics.roc_curve(y_test, probsKNN)
auclg = roc_auc_score(y_test, probsKNN)
plt.plot(fpr, tpr, label=f'KNN, AUC = {str(round(auclg,3))}')
# RANDOM FOREST (RF)
preds_proba_RF = RF.predict_proba(x_test)
probsRF = preds_proba_RF[:, 1]
fpr, tpr, thresh = metrics.roc_curve(y_test, probsRF)
auclg = roc_auc_score(y_test, probsRF)
plt.plot(fpr, tpr, label=f'RF, AUC = {str(round(auclg,3))}')
# ADABOOST (AB)
preds_proba_AB = AB.predict_proba(x_test)
probsAB = preds_proba_AB[:, 1]
fpr, tpr, thresh = metrics.roc_curve(y_test, probsAB)
auclg = roc_auc_score(y_test, probsAB)
plt.plot(fpr, tpr, label=f'AB, AUC = {str(round(auclg,3))}')
plt.ylabel("True Positive Rate", fontsize=12)
plt.xlabel("False Positive Rate", fontsize=12)
# The title size is read from rcParams when the title is created, so it has to
# be set before plt.title() to affect this figure.
plt.rcParams['axes.titlesize'] = 16
plt.title("ROC curve")
plt.legend()
plt.show()
# Compare the ROC curves of the models based on the 15 selected features only
# (again using the models fitted with the best parameters).
# ROC (feature selection)
fig = plt.figure(figsize=(8,5))
# diagonal reference line
plt.plot([0, 1], [0, 1],'r--')
# RFECV LOGISTIC REGRESSION (LRFS)
preds_proba_LRFS = LRFS.predict_proba(FSx_test)
# column 1 of predict_proba holds the predicted probability of class 1
probsLRFS = preds_proba_LRFS[:, 1]
fpr, tpr, thresh = metrics.roc_curve(y_test, probsLRFS)
auclg = roc_auc_score(y_test, probsLRFS)
plt.plot(fpr, tpr, label=f'LR, AUC = {str(round(auclg,3))}')
# SVM (SVMFS), evaluated on the standardised test set
preds_proba_SVMFS = SVMFS.predict_proba(FSx_test_scaled)
probsSVMFS = preds_proba_SVMFS[:, 1]
fpr, tpr, thresh = metrics.roc_curve(y_test, probsSVMFS)
auclg = roc_auc_score(y_test, probsSVMFS)
plt.plot(fpr, tpr, label=f'SVM, AUC = {str(round(auclg,3))}')
# KNN (KNNFS), evaluated on the standardised test set
preds_proba_KNNFS = KNNFS.predict_proba(FSx_test_scaled)
probsKNNFS = preds_proba_KNNFS[:, 1]
fpr, tpr, thresh = metrics.roc_curve(y_test, probsKNNFS)
auclg = roc_auc_score(y_test, probsKNNFS)
plt.plot(fpr, tpr, label=f'KNN, AUC = {str(round(auclg,3))}')
# RANDOM FOREST (RFFS)
preds_proba_RFFS = RFFS.predict_proba(FSx_test)
probsRFFS = preds_proba_RFFS[:, 1]
fpr, tpr, thresh = metrics.roc_curve(y_test, probsRFFS)
auclg = roc_auc_score(y_test, probsRFFS)
plt.plot(fpr, tpr, label=f'RF, AUC = {str(round(auclg,3))}')
# ADABOOST (ABFS)
preds_proba_ABFS = ABFS.predict_proba(FSx_test)
probsABFS = preds_proba_ABFS[:, 1]
fpr, tpr, thresh = metrics.roc_curve(y_test, probsABFS)
auclg = roc_auc_score(y_test, probsABFS)
plt.plot(fpr, tpr, label=f'AB, AUC = {str(round(auclg,3))}')
plt.ylabel("True Positive Rate", fontsize=12)
plt.xlabel("False Positive Rate", fontsize=12)
# The title size is read from rcParams when the title is created, so it has to
# be set before plt.title() for this figure not to depend on earlier cells.
plt.rcParams['axes.titlesize'] = 16
plt.title("ROC curve")
plt.legend()
plt.show()
# CLUSTER-BASED METHODS
# The supervised learning algorithms used above are applied again, but this
# time on the individual clusters instead of on the whole data set.
# import the libraries needed to split the data into clusters
import matplotlib.pyplot as plt
from kneed import KneeLocator
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
#K-MEANS clustering
# before running K-means, standardise the features selected with RFECV
scaler = preprocessing.StandardScaler().fit(features[selected_col])
features_scaled = scaler.transform(features[selected_col])
# Two possible ways of finding the ideal number of clusters K.
# FIRST METHOD: elbow method
# Pick the K at the "elbow" of the curve that has the number of clusters on
# the x axis and the SSE on the y axis.
kmeans_kwargs = {
    "init": "random",
    "n_init": 10,
    "max_iter": 300,
    "random_state": 42,
}
# list holding the SSE (inertia) obtained for every k from 1 to 10
sse = []
for k in range(1, 11):
    # the loop body must be indented: fit K-means with k clusters and record its SSE
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
    kmeans.fit(features_scaled)
    sse.append(kmeans.inertia_)
# plot the SSE against the number of clusters
plt.style.use("fivethirtyeight")
plt.plot(range(1, 11), sse)
plt.xticks(range(1, 11))
plt.xlabel("Number of Clusters")
plt.ylabel("SSE")
plt.show()
# locate the elbow, i.e. the optimal k, with KneeLocator
kl = KneeLocator(range(1, 11), sse, curve="convex", direction="decreasing")
kl.elbow
# the optimal number of clusters turns out to be 4
4
# SECOND METHOD: silhouette coefficient
# pick the K that maximises the silhouette
silhouette_coefficients = []
for k in range(2, 6):
    # the loop body must be indented: fit K-means with k clusters and score its labelling
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
    kmeans.fit(features_scaled)
    score = silhouette_score(features_scaled, kmeans.labels_)
    silhouette_coefficients.append(score)
# plot the silhouette coefficient against the number of clusters
plt.style.use("fivethirtyeight")
plt.plot(range(2, 6), silhouette_coefficients)
plt.xticks(range(2, 6))
plt.xlabel("Number of Clusters")
plt.ylabel("Silhouette Coefficient")
plt.show()
# 4 is a local maximum of the silhouette coefficient;
# combining the two approaches, the optimal number of clusters appears to be k=4.
# Split the data set into clusters by running K-means.
# NOTE(review): this final fit uses KMeans defaults with random_state=0, not the
# kmeans_kwargs used while choosing k - confirm this is intended.
kmeans = KMeans(n_clusters=4, random_state=0).fit(features_scaled)
# labels assigned to every observation (0 for the first cluster, 1 for the second, ...)
kmeans.labels_
array([0, 1, 2, ..., 0, 0, 0])
# TRAIN/TEST SPLITTING
# The supervised algorithms have to be trained again, so every cluster is
# going to be split into its own train and test set.
# Attach to every observation the label of the cluster it was assigned to.
encData['cluster'] = kmeans.labels_

# one sub-frame per cluster (labels 0..3)
encData1 = encData.loc[encData['cluster'] == 0]
encData2 = encData.loc[encData['cluster'] == 1]
encData3 = encData.loc[encData['cluster'] == 2]
encData4 = encData.loc[encData['cluster'] == 3]

# For every cluster: the target is 'loan_status'; the features are all the
# remaining columns except the helper 'cluster' column.
label1 = encData1['loan_status']
features1 = encData1.drop(columns=['loan_status', 'cluster'])
label2 = encData2['loan_status']
features2 = encData2.drop(columns=['loan_status', 'cluster'])
label3 = encData3['loan_status']
features3 = encData3.drop(columns=['loan_status', 'cluster'])
label4 = encData4['loan_status']
features4 = encData4.drop(columns=['loan_status', 'cluster'])

# counts of default = 0 and default = 1 in the first cluster
label1.value_counts()
0 8747 1 1701 Name: loan_status, dtype: int64
# counts of default = 0 and default = 1 in the second cluster
label2.value_counts()
0 6617 1 794 Name: loan_status, dtype: int64
# counts of default = 0 and default = 1 in the third cluster
label3.value_counts()
0 6530 1 2441 Name: loan_status, dtype: int64
# counts of default = 0 and default = 1 in the fourth cluster
label4.value_counts()
0 3573 1 2171 Name: loan_status, dtype: int64
# split the first cluster into train and test set (70% and 30% respectively)
x_train1, x_test1, y_train1, y_test1 = model_selection.train_test_split(features1, label1, random_state=0, test_size = 0.30)
# split the second cluster into train and test set (70% and 30% respectively)
x_train2, x_test2, y_train2, y_test2 = model_selection.train_test_split(features2, label2, random_state=0, test_size = 0.30)
# split the third cluster into train and test set (70% and 30% respectively)
x_train3, x_test3, y_train3, y_test3 = model_selection.train_test_split(features3, label3, random_state=0, test_size = 0.30)
# split the fourth cluster into train and test set (70% and 30% respectively)
x_train4, x_test4, y_train4, y_test4 = model_selection.train_test_split(features4, label4, random_state=0, test_size = 0.30)
# balance the four train sets with SMOTE (only the train sets are resampled);
# each result is wrapped in a DataFrame with the original column names
oversample = SMOTE (random_state=0)
os_x_train1, os_y_train1 = oversample.fit_resample(x_train1, y_train1)
os_x_train1 = pd.DataFrame(data = os_x_train1, columns = x_train1.columns)
os_y_train1 = pd.DataFrame(data = os_y_train1, columns= ['loan_status'])
os_x_train2, os_y_train2 = oversample.fit_resample(x_train2, y_train2)
os_x_train2 = pd.DataFrame(data = os_x_train2, columns = x_train2.columns)
os_y_train2 = pd.DataFrame(data = os_y_train2, columns= ['loan_status'])
os_x_train3, os_y_train3 = oversample.fit_resample(x_train3, y_train3)
os_x_train3 = pd.DataFrame(data = os_x_train3, columns = x_train3.columns)
os_y_train3 = pd.DataFrame(data = os_y_train3, columns= ['loan_status'])
os_x_train4, os_y_train4 = oversample.fit_resample(x_train4, y_train4)
os_x_train4 = pd.DataFrame(data = os_x_train4, columns = x_train4.columns)
os_y_train4 = pd.DataFrame(data = os_y_train4, columns= ['loan_status'])
# concatenate the labels of the four train sets and of the four test sets, in
# cluster order (needed later to evaluate the performance of the post-cluster methods)
os_y_trainT = pd.concat([os_y_train1,os_y_train2,os_y_train3,os_y_train4], ignore_index=True)
y_testT = pd.concat([y_test1,y_test2, y_test3, y_test4], ignore_index=True)
# check the class balance of the oversampled train set of the first cluster:
# size, count of each class and proportion of each class
print("length of oversampled data is ",len(os_x_train1))
print("Number of no default in oversampled data",len(os_y_train1[os_y_train1['loan_status']==0]))
print("Number of default",len(os_y_train1[os_y_train1['loan_status']==1]))
print("Proportion of no default data in oversampled data is ",len(os_y_train1[os_y_train1['loan_status']==0])/len(os_x_train1))
print("Proportion of default data in oversampled data is ",len(os_y_train1[os_y_train1['loan_status']==1])/len(os_x_train1))
length of oversampled data is 12226 Number of no default in oversampled data 6113 Number of default 6113 Proportion of no default data in oversampled data is 0.5 Proportion of default data in oversampled data is 0.5
# same class-balance check for the oversampled train set of the second cluster
print("length of oversampled data is ",len(os_x_train2))
print("Number of no default in oversampled data",len(os_y_train2[os_y_train2['loan_status']==0]))
print("Number of default",len(os_y_train2[os_y_train2['loan_status']==1]))
print("Proportion of no default data in oversampled data is ",len(os_y_train2[os_y_train2['loan_status']==0])/len(os_x_train2))
print("Proportion of default data in oversampled data is ",len(os_y_train2[os_y_train2['loan_status']==1])/len(os_x_train2))
length of oversampled data is 9304 Number of no default in oversampled data 4652 Number of default 4652 Proportion of no default data in oversampled data is 0.5 Proportion of default data in oversampled data is 0.5
# Standardise every cluster: the scaler is fitted on the oversampled train set
# only and then applied to both the train set and the matching test set.
scaler = preprocessing.StandardScaler()
scaler.fit(os_x_train1)
os_x_train1_scaled = scaler.transform(os_x_train1)
x_test1_scaled = scaler.transform(x_test1)

scaler = preprocessing.StandardScaler()
scaler.fit(os_x_train2)
os_x_train2_scaled = scaler.transform(os_x_train2)
x_test2_scaled = scaler.transform(x_test2)

scaler = preprocessing.StandardScaler()
scaler.fit(os_x_train3)
os_x_train3_scaled = scaler.transform(os_x_train3)
x_test3_scaled = scaler.transform(x_test3)

scaler = preprocessing.StandardScaler()
scaler.fit(os_x_train4)
os_x_train4_scaled = scaler.transform(os_x_train4)
x_test4_scaled = scaler.transform(x_test4)
# Column names kept by the feature-selection step.
selected_col = [
    'loan_percent_income',
    'person_home_ownership_MORTGAGE',
    'person_home_ownership_OWN',
    'person_home_ownership_RENT',
    'loan_intent_DEBTCONSOLIDATION',
    'loan_intent_EDUCATION',
    'loan_intent_HOMEIMPROVEMENT',
    'loan_intent_MEDICAL',
    'loan_intent_PERSONAL',
    'loan_intent_VENTURE',
    'loan_grade_A',
    'loan_grade_B',
    'loan_grade_C',
    'cb_person_default_on_file_N',
    'cb_person_default_on_file_Y',
]
# Train sets restricted to the selected features only.
FSos_x_train1 = os_x_train1.loc[:, selected_col]
FSos_x_train2 = os_x_train2.loc[:, selected_col]
FSos_x_train3 = os_x_train3.loc[:, selected_col]
FSos_x_train4 = os_x_train4.loc[:, selected_col]

# Test sets restricted to the same selected features.
FSx_test1 = x_test1.loc[:, selected_col]
FSx_test2 = x_test2.loc[:, selected_col]
FSx_test3 = x_test3.loc[:, selected_col]
FSx_test4 = x_test4.loc[:, selected_col]
#MODEL: LOGISTIC REGRESSION
# One default-configured logistic regression per cluster.
LR1 = LogisticRegression()
LR2 = LogisticRegression()
LR3 = LogisticRegression()
LR4 = LogisticRegression()

# Each model is trained on the feature-selected, oversampled train set of its cluster.
LR1.fit(FSos_x_train1, os_y_train1)
LR2.fit(FSos_x_train2, os_y_train2)
LR3.fit(FSos_x_train3, os_y_train3)
LR4.fit(FSos_x_train4, os_y_train4)
# Predictions and accuracy on the train and test sets of the first cluster
p_trainLR1 = LR1.predict(FSos_x_train1)
p_testLR1 = LR1.predict(FSx_test1)
_lr1_train_acc = LR1.score(FSos_x_train1, os_y_train1)
_lr1_test_acc = LR1.score(FSx_test1, y_test1)
print('Accuracy of logistic regression classifier on oversampled train set: {:.2f}'.format(_lr1_train_acc))
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(_lr1_test_acc))
Accuracy of logistic regression classifier on oversampled train set: 0.91 Accuracy of logistic regression classifier on test set: 0.89
# Predictions and accuracy on the train and test sets of the second cluster
p_trainLR2 = LR2.predict(FSos_x_train2)
p_testLR2 = LR2.predict(FSx_test2)
_lr2_train_acc = LR2.score(FSos_x_train2, os_y_train2)
_lr2_test_acc = LR2.score(FSx_test2, y_test2)
print('Accuracy of logistic regression classifier on oversampled train set: {:.2f}'.format(_lr2_train_acc))
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(_lr2_test_acc))
Accuracy of logistic regression classifier on oversampled train set: 0.93 Accuracy of logistic regression classifier on test set: 0.90
# Predictions and accuracy on the train and test sets of the third cluster
p_trainLR3 = LR3.predict(FSos_x_train3)
p_testLR3 = LR3.predict(FSx_test3)
_lr3_train_acc = LR3.score(FSos_x_train3, os_y_train3)
_lr3_test_acc = LR3.score(FSx_test3, y_test3)
print('Accuracy of logistic regression classifier on oversampled train set: {:.2f}'.format(_lr3_train_acc))
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(_lr3_test_acc))
Accuracy of logistic regression classifier on oversampled train set: 0.90 Accuracy of logistic regression classifier on test set: 0.88
# Predictions and accuracy on the train and test sets of the fourth cluster
p_trainLR4 = LR4.predict(FSos_x_train4)
p_testLR4 = LR4.predict(FSx_test4)
_lr4_train_acc = LR4.score(FSos_x_train4, os_y_train4)
_lr4_test_acc = LR4.score(FSx_test4, y_test4)
print('Accuracy of logistic regression classifier on oversampled train set: {:.2f}'.format(_lr4_train_acc))
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(_lr4_test_acc))
Accuracy of logistic regression classifier on oversampled train set: 0.82 Accuracy of logistic regression classifier on test set: 0.81
# Join the per-cluster test predictions (same cluster order as y_testT)
# so that a single confusion matrix can be built over all test sets.
p_testLR_T = np.concatenate([p_testLR1, p_testLR2, p_testLR3, p_testLR4])
confusion_matrix_test = confusion_matrix(y_true=y_testT, y_pred=p_testLR_T)
print(confusion_matrix_test)
[[7163 465] [ 766 1381]]
# Performance measures over the combined test sets (logistic regression)
_lr_report = classification_report(y_testT, p_testLR_T)
print(_lr_report)
precision recall f1-score support 0 0.90 0.94 0.92 7628 1 0.75 0.64 0.69 2147 accuracy 0.87 9775 macro avg 0.83 0.79 0.81 9775 weighted avg 0.87 0.87 0.87 9775
#Improvements are observed compared with the same method applied to the full dataset
#SVM
# Standardise the feature-selected train sets before fitting the SVMs; the
# matching test set is transformed with the scaler fitted on its train set.
scaler = preprocessing.StandardScaler()
FSos_x_train1_scaled = scaler.fit_transform(FSos_x_train1)
FSx_test1_scaled = scaler.transform(FSx_test1)

scaler = preprocessing.StandardScaler()
FSos_x_train2_scaled = scaler.fit_transform(FSos_x_train2)
FSx_test2_scaled = scaler.transform(FSx_test2)

scaler = preprocessing.StandardScaler()
FSos_x_train3_scaled = scaler.fit_transform(FSos_x_train3)
FSx_test3_scaled = scaler.transform(FSx_test3)

scaler = preprocessing.StandardScaler()
FSos_x_train4_scaled = scaler.fit_transform(FSos_x_train4)
FSx_test4_scaled = scaler.transform(FSx_test4)
# Grid search for the best SVM hyper-parameters.
# Candidate settings: a linear kernel over C, and an RBF kernel over C x gamma.
_svm_c_values = [1, 10, 100, 1000]
param_grid = [
    dict(C=list(_svm_c_values), kernel=['linear']),
    dict(C=list(_svm_c_values), gamma=[0.001, 0.0001], kernel=['rbf']),
]
# Base estimator that the searches tune.
SVM = SVC()

# One exhaustive grid search per cluster over the grid defined above.
SVM_gridsearch1 = GridSearchCV(estimator=SVM, param_grid=param_grid)
SVM_gridsearch2 = GridSearchCV(estimator=SVM, param_grid=param_grid)
SVM_gridsearch3 = GridSearchCV(estimator=SVM, param_grid=param_grid)
SVM_gridsearch4 = GridSearchCV(estimator=SVM, param_grid=param_grid)

# Fit each search on the scaled, feature-selected train set of its cluster.
SVM_gridsearch1.fit(FSos_x_train1_scaled, os_y_train1)
SVM_gridsearch2.fit(FSos_x_train2_scaled, os_y_train2)
SVM_gridsearch3.fit(FSos_x_train3_scaled, os_y_train3)
SVM_gridsearch4.fit(FSos_x_train4_scaled, os_y_train4)

# Best parameter combination found for each cluster.
SVM_gridsearch1.best_params_
SVM_gridsearch2.best_params_
SVM_gridsearch3.best_params_
SVM_gridsearch4.best_params_
# Support vector classifiers with hard-coded tuned parameters; all four
# clusters use the same setting. probability=True enables predict_proba.
_svm_kwargs = dict(C=1000, gamma=0.001, kernel='rbf', probability=True)
SVM1 = SVC(**_svm_kwargs)
SVM2 = SVC(**_svm_kwargs)
SVM3 = SVC(**_svm_kwargs)
SVM4 = SVC(**_svm_kwargs)

# Train each model on the scaled train set of its cluster.
SVM1.fit(FSos_x_train1_scaled, os_y_train1)
SVM2.fit(FSos_x_train2_scaled, os_y_train2)
SVM3.fit(FSos_x_train3_scaled, os_y_train3)
SVM4.fit(FSos_x_train4_scaled, os_y_train4)
# Predictions and accuracy on train and test set number 1
p_trainSVM1 = SVM1.predict(FSos_x_train1_scaled)
p_testSVM1 = SVM1.predict(FSx_test1_scaled)
_svm1_train_acc = SVM1.score(FSos_x_train1_scaled, os_y_train1)
_svm1_test_acc = SVM1.score(FSx_test1_scaled, y_test1)
print('Accuracy of SVM classifier on oversampled train set: {:.2f}'.format(_svm1_train_acc))
print('Accuracy of SVM classifier on test set: {:.2f}'.format(_svm1_test_acc))
Accuracy of SVM classifier on oversampled train set: 0.91 Accuracy of SVM classifier on test set: 0.90
# Predictions and accuracy on train and test set number 2
p_trainSVM2 = SVM2.predict(FSos_x_train2_scaled)
p_testSVM2 = SVM2.predict(FSx_test2_scaled)
_svm2_train_acc = SVM2.score(FSos_x_train2_scaled, os_y_train2)
_svm2_test_acc = SVM2.score(FSx_test2_scaled, y_test2)
print('Accuracy of SVM classifier on oversampled train set: {:.2f}'.format(_svm2_train_acc))
print('Accuracy of SVM classifier on test set: {:.2f}'.format(_svm2_test_acc))
Accuracy of SVM classifier on oversampled train set: 0.94 Accuracy of SVM classifier on test set: 0.91
# Predictions and accuracy on train and test set number 3
p_trainSVM3 = SVM3.predict(FSos_x_train3_scaled)
p_testSVM3 = SVM3.predict(FSx_test3_scaled)
_svm3_train_acc = SVM3.score(FSos_x_train3_scaled, os_y_train3)
_svm3_test_acc = SVM3.score(FSx_test3_scaled, y_test3)
print('Accuracy of SVM classifier on oversampled train set: {:.2f}'.format(_svm3_train_acc))
print('Accuracy of SVM classifier on test set: {:.2f}'.format(_svm3_test_acc))
Accuracy of SVM classifier on oversampled train set: 0.91 Accuracy of SVM classifier on test set: 0.89
# Predictions and accuracy on train and test set number 4
p_trainSVM4 = SVM4.predict(FSos_x_train4_scaled)
p_testSVM4 = SVM4.predict(FSx_test4_scaled)
_svm4_train_acc = SVM4.score(FSos_x_train4_scaled, os_y_train4)
_svm4_test_acc = SVM4.score(FSx_test4_scaled, y_test4)
print('Accuracy of SVM classifier on oversampled train set: {:.2f}'.format(_svm4_train_acc))
print('Accuracy of SVM classifier on test set: {:.2f}'.format(_svm4_test_acc))
Accuracy of SVM classifier on oversampled train set: 0.86 Accuracy of SVM classifier on test set: 0.83
# Join the predictions on the four test sets (same cluster order as y_testT)
# and build the confusion matrix from the combined vector p_testSVM_T.
p_testSVM_T = np.concatenate([p_testSVM1, p_testSVM2, p_testSVM3, p_testSVM4])
confusion_matrix_test = confusion_matrix(y_true=y_testT, y_pred=p_testSVM_T)
print(confusion_matrix_test)
[[7276 352] [ 737 1410]]
# Performance measures over the combined test sets (SVM)
_svm_report = classification_report(y_testT, p_testSVM_T)
print(_svm_report)
precision recall f1-score support 0 0.91 0.95 0.93 7628 1 0.80 0.66 0.72 2147 accuracy 0.89 9775 macro avg 0.85 0.81 0.83 9775 weighted avg 0.88 0.89 0.88 9775
#KNN
# Standardise the feature-selected train/test sets for KNN. These statements
# recompute the same arrays that were produced before the SVM step.
scaler = preprocessing.StandardScaler()
scaler.fit(FSos_x_train1)
FSos_x_train1_scaled = scaler.transform(FSos_x_train1)
FSx_test1_scaled = scaler.transform(FSx_test1)

scaler = preprocessing.StandardScaler()
scaler.fit(FSos_x_train2)
FSos_x_train2_scaled = scaler.transform(FSos_x_train2)
FSx_test2_scaled = scaler.transform(FSx_test2)

scaler = preprocessing.StandardScaler()
scaler.fit(FSos_x_train3)
FSos_x_train3_scaled = scaler.transform(FSos_x_train3)
FSx_test3_scaled = scaler.transform(FSx_test3)

scaler = preprocessing.StandardScaler()
scaler.fit(FSos_x_train4)
FSos_x_train4_scaled = scaler.transform(FSos_x_train4)
FSx_test4_scaled = scaler.transform(FSx_test4)
# Choose the best n_neighbors for each cluster with a grid search.
# Candidate values: odd k from 5 to 29.
parameters = {"n_neighbors": range(5,30,2)}
# The search runs on the scaled *feature-selected* train sets, i.e. the same
# matrices the final KNN models are fitted on; tuning k on the full-feature
# matrices (os_x_trainN_scaled) would optimise it for a different input space.
gridsearch1 = GridSearchCV (KNeighborsClassifier(), parameters)
gridsearch1.fit(FSos_x_train1_scaled, os_y_train1)
gridsearch2 = GridSearchCV (KNeighborsClassifier(), parameters)
gridsearch2.fit(FSos_x_train2_scaled, os_y_train2)
gridsearch3 = GridSearchCV (KNeighborsClassifier(), parameters)
gridsearch3.fit(FSos_x_train3_scaled, os_y_train3)
gridsearch4 = GridSearchCV (KNeighborsClassifier(), parameters)
gridsearch4.fit(FSos_x_train4_scaled, os_y_train4)
# Best k found by the grid search for the first cluster
gridsearch1.best_params_
{'n_neighbors': 7}
# Best k found by the grid search for the second cluster
gridsearch2.best_params_
{'n_neighbors': 5}
# Best k found by the grid search for the third cluster
gridsearch3.best_params_
{'n_neighbors': 5}
# Best k found by the grid search for the fourth cluster
gridsearch4.best_params_
{'n_neighbors': 11}
# Define the four KNN classifiers (Minkowski distance with p=2) using a
# hard-coded k per cluster, then fit each on its cluster's scaled train set.
_knn_distance = dict(metric='minkowski', p=2)
KNN1 = KNeighborsClassifier(n_neighbors=7, **_knn_distance)
KNN2 = KNeighborsClassifier(n_neighbors=5, **_knn_distance)
KNN3 = KNeighborsClassifier(n_neighbors=5, **_knn_distance)
KNN4 = KNeighborsClassifier(n_neighbors=11, **_knn_distance)

KNN1.fit(FSos_x_train1_scaled, os_y_train1)
KNN2.fit(FSos_x_train2_scaled, os_y_train2)
KNN3.fit(FSos_x_train3_scaled, os_y_train3)
KNN4.fit(FSos_x_train4_scaled, os_y_train4)
# Predictions and accuracy on the train and test sets of the first cluster
p_trainKNN1 = KNN1.predict(FSos_x_train1_scaled)
p_testKNN1 = KNN1.predict(FSx_test1_scaled)
_knn1_train_acc = KNN1.score(FSos_x_train1_scaled, os_y_train1)
_knn1_test_acc = KNN1.score(FSx_test1_scaled, y_test1)
print('Accuracy of KNN classifier on oversampled train set: {:.2f}'.format(_knn1_train_acc))
print('Accuracy of KNN classifier on test set: {:.2f}'.format(_knn1_test_acc))
Accuracy of KNN classifier on oversampled train set: 0.93 Accuracy of KNN classifier on test set: 0.92
# Predictions and accuracy on the train and test sets of the second cluster
p_trainKNN2 = KNN2.predict(FSos_x_train2_scaled)
p_testKNN2 = KNN2.predict(FSx_test2_scaled)
_knn2_train_acc = KNN2.score(FSos_x_train2_scaled, os_y_train2)
_knn2_test_acc = KNN2.score(FSx_test2_scaled, y_test2)
print('Accuracy of KNN classifier on oversampled train set: {:.2f}'.format(_knn2_train_acc))
print('Accuracy of KNN classifier on test set: {:.2f}'.format(_knn2_test_acc))
Accuracy of KNN classifier on oversampled train set: 0.95 Accuracy of KNN classifier on test set: 0.91
# Predictions and accuracy on the train and test sets of the third cluster
p_trainKNN3 = KNN3.predict(FSos_x_train3_scaled)
p_testKNN3 = KNN3.predict(FSx_test3_scaled)
_knn3_train_acc = KNN3.score(FSos_x_train3_scaled, os_y_train3)
_knn3_test_acc = KNN3.score(FSx_test3_scaled, y_test3)
print('Accuracy of KNN classifier on oversampled train set: {:.2f}'.format(_knn3_train_acc))
print('Accuracy of KNN classifier on test set: {:.2f}'.format(_knn3_test_acc))
Accuracy of KNN classifier on oversampled train set: 0.92 Accuracy of KNN classifier on test set: 0.89
# Predictions and accuracy on the train and test sets of the fourth cluster
p_trainKNN4 = KNN4.predict(FSos_x_train4_scaled)
p_testKNN4 = KNN4.predict(FSx_test4_scaled)
_knn4_train_acc = KNN4.score(FSos_x_train4_scaled, os_y_train4)
_knn4_test_acc = KNN4.score(FSx_test4_scaled, y_test4)
print('Accuracy of KNN classifier on oversampled train set: {:.2f}'.format(_knn4_train_acc))
print('Accuracy of KNN classifier on test set: {:.2f}'.format(_knn4_test_acc))
Accuracy of KNN classifier on oversampled train set: 0.87 Accuracy of KNN classifier on test set: 0.82
# Join the per-cluster test predictions (same cluster order as y_testT)
# and build the confusion matrix over all test sets.
p_testKNN_T = np.concatenate([p_testKNN1, p_testKNN2, p_testKNN3, p_testKNN4])
confusion_matrix_test = confusion_matrix(y_true=y_testT, y_pred=p_testKNN_T)
print(confusion_matrix_test)
[[7355 273] [ 796 1351]]
# Performance measures over the combined test sets (KNN)
_knn_report = classification_report(y_testT, p_testKNN_T)
print(_knn_report)
precision recall f1-score support 0 0.90 0.96 0.93 7628 1 0.83 0.63 0.72 2147 accuracy 0.89 9775 macro avg 0.87 0.80 0.82 9775 weighted avg 0.89 0.89 0.88 9775
#MODEL: RANDOM FOREST
# Search space for the randomized hyper-parameter search.

# Number of trees in the forest: 5 evenly spaced values from 200 to 2000.
n_estimators = np.linspace(start=200, stop=2000, num=5).astype(int).tolist()
# Number of features considered at each split.
# NOTE(review): 'auto' is believed to be a deprecated alias of 'sqrt' for
# classifiers in scikit-learn - confirm against the installed version.
max_features = ['auto', 'sqrt']
# Maximum tree depth: 4 evenly spaced values from 10 to 80, plus None.
max_depth = np.linspace(10, 80, num=4).astype(int).tolist() + [None]
# Minimum number of samples required to split a node.
min_samples_split = [2, 5, 10]
# Minimum number of samples required in each leaf.
min_samples_leaf = [2, 4]
# How samples are selected for training each tree.
bootstrap = [True]

# Assemble the random grid and show it.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)
pprint(random_grid)
{'bootstrap': [True], 'max_depth': [10, 33, 56, 80, None], 'max_features': ['auto', 'sqrt'], 'min_samples_leaf': [2, 4], 'min_samples_split': [2, 5, 10], 'n_estimators': [200, 650, 1100, 1550, 2000]}
# Use the random grid to look for the best parameters.
# Base models to tune, one per cluster.
RF1 = RandomForestClassifier()
RF2 = RandomForestClassifier()
RF3 = RandomForestClassifier()
RF4 = RandomForestClassifier()

# Randomized search settings shared by all clusters:
# 50 sampled candidates, 3-fold cross validation, all cores.
_rf_random_kwargs = dict(param_distributions=random_grid, n_iter=50, cv=3,
                         verbose=2, random_state=42, n_jobs=-1)
RF1_random = RandomizedSearchCV(estimator=RF1, **_rf_random_kwargs)
RF2_random = RandomizedSearchCV(estimator=RF2, **_rf_random_kwargs)
RF3_random = RandomizedSearchCV(estimator=RF3, **_rf_random_kwargs)
RF4_random = RandomizedSearchCV(estimator=RF4, **_rf_random_kwargs)

# Fit each randomized search on its cluster's feature-selected train set.
RF1_random.fit(FSos_x_train1, os_y_train1)
RF2_random.fit(FSos_x_train2, os_y_train2)
RF3_random.fit(FSos_x_train3, os_y_train3)
RF4_random.fit(FSos_x_train4, os_y_train4)
# Best parameters from the randomized search for the first model (cluster 1)
RF1_random.best_params_
{'n_estimators': 1100, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 10, 'bootstrap': True}
# Best parameters from the randomized search for the second model (cluster 2)
RF2_random.best_params_
{'n_estimators': 2000, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': None, 'bootstrap': True}
# Best parameters from the randomized search for the third model (cluster 3)
RF3_random.best_params_
{'n_estimators': 650, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 80, 'bootstrap': True}
# Best parameters from the randomized search for the fourth model (cluster 4)
RF4_random.best_params_
{'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 10, 'bootstrap': True}
# Try to improve the results further with an exhaustive grid search.
# The grids below are hand-written, one per cluster, and are meant to explore
# the neighbourhood of the best model found by the randomized search.
param_grid1 = dict(
    bootstrap=[True],
    max_depth=[8, 10, 12],
    max_features=['sqrt'],
    min_samples_leaf=[2, 4, 6],
    min_samples_split=[10, 15],
    n_estimators=[1000, 1100, 1200],
)
param_grid2 = dict(
    bootstrap=[True],
    max_depth=[None],
    max_features=['auto'],
    min_samples_leaf=[2, 4],
    min_samples_split=[4, 5, 6],
    n_estimators=[1900, 2000, 2100],
)
param_grid3 = dict(
    bootstrap=[True],
    max_depth=[70, 80, 90],
    max_features=['sqrt'],
    min_samples_leaf=[2, 4],
    min_samples_split=[2, 3, 4],
    n_estimators=[600, 650, 700],
)
param_grid4 = dict(
    bootstrap=[True],
    max_depth=[8, 10, 12],
    max_features=['auto'],
    min_samples_leaf=[2, 4],
    min_samples_split=[4, 5, 6],
    n_estimators=[150, 200, 250],
)
# Build the grid-search models: 5-fold cross validation, all cores, verbose log.
_rf_grid_kwargs = dict(cv=5, n_jobs=-1, verbose=2)
grid_search1 = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid1, **_rf_grid_kwargs)
grid_search2 = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid2, **_rf_grid_kwargs)
grid_search3 = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid3, **_rf_grid_kwargs)
grid_search4 = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid4, **_rf_grid_kwargs)

# Fit them on the per-cluster feature-selected train sets.
grid_search1.fit(FSos_x_train1, os_y_train1)
grid_search2.fit(FSos_x_train2, os_y_train2)
grid_search3.fit(FSos_x_train3, os_y_train3)
grid_search4.fit(FSos_x_train4, os_y_train4)
# Best parameters from the grid search for the cluster-1 model
grid_search1.best_params_
{'bootstrap': True, 'max_depth': 12, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 15, 'n_estimators': 1200}
# Best parameters from the grid search for the cluster-2 model
grid_search2.best_params_
#true 10 sqrt leaf: 2 split: 4 1550
{'bootstrap': True, 'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 6, 'n_estimators': 2000}
# Best parameters from the grid search for the cluster-3 model
grid_search3.best_params_
{'bootstrap': True, 'max_depth': 70, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 4, 'n_estimators': 600}
# Best parameters from the grid search for the cluster-4 model
grid_search4.best_params_
{'bootstrap': True, 'max_depth': 8, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 4, 'n_estimators': 200}
# Build the four random forests with hard-coded tuned hyper-parameters
# (fixed random_state=0 for all of them).
RF1 = RandomForestClassifier(
    random_state=0,
    n_estimators=1200,
    min_samples_split=15,
    min_samples_leaf=2,
    max_features='sqrt',
    max_depth=12,
    bootstrap=True,
)
RF2 = RandomForestClassifier(
    random_state=0,
    n_estimators=2000,
    min_samples_split=6,
    min_samples_leaf=2,
    max_features='auto',
    max_depth=None,
    bootstrap=True,
)
RF3 = RandomForestClassifier(
    random_state=0,
    n_estimators=600,
    min_samples_split=4,
    min_samples_leaf=4,
    max_features='sqrt',
    max_depth=70,
    bootstrap=True,
)
RF4 = RandomForestClassifier(
    random_state=0,
    n_estimators=200,
    min_samples_split=4,
    min_samples_leaf=2,
    max_features='auto',
    max_depth=8,
    bootstrap=True,
)

# Fit each forest on its cluster's feature-selected train set.
RF1.fit(FSos_x_train1, os_y_train1)
RF2.fit(FSos_x_train2, os_y_train2)
RF3.fit(FSos_x_train3, os_y_train3)
RF4.fit(FSos_x_train4, os_y_train4)
# Predictions and accuracy on the first cluster
p_trainRF1 = RF1.predict(FSos_x_train1)
p_testRF1 = RF1.predict(FSx_test1)
_rf1_train_acc = RF1.score(FSos_x_train1, os_y_train1)
_rf1_test_acc = RF1.score(FSx_test1, y_test1)
print('Accuracy of RF classifier on oversampled train set: {:.2f}'.format(_rf1_train_acc))
print('Accuracy of RF classifier on test set: {:.2f}'.format(_rf1_test_acc))
Accuracy of RF classifier on oversampled train set: 0.93 Accuracy of RF classifier on test set: 0.92
# Predictions and accuracy on the second cluster
p_trainRF2 = RF2.predict(FSos_x_train2)
p_testRF2 = RF2.predict(FSx_test2)
_rf2_train_acc = RF2.score(FSos_x_train2, os_y_train2)
_rf2_test_acc = RF2.score(FSx_test2, y_test2)
print('Accuracy of RF classifier on oversampled train set: {:.2f}'.format(_rf2_train_acc))
print('Accuracy of RF classifier on test set: {:.2f}'.format(_rf2_test_acc))
Accuracy of RF classifier on oversampled train set: 0.96 Accuracy of RF classifier on test set: 0.91
# Predictions and accuracy on the third cluster
p_trainRF3 = RF3.predict(FSos_x_train3)
p_testRF3 = RF3.predict(FSx_test3)
_rf3_train_acc = RF3.score(FSos_x_train3, os_y_train3)
_rf3_test_acc = RF3.score(FSx_test3, y_test3)
print('Accuracy of RF classifier on oversampled train set: {:.2f}'.format(_rf3_train_acc))
print('Accuracy of RF classifier on test set: {:.2f}'.format(_rf3_test_acc))
Accuracy of RF classifier on oversampled train set: 0.93 Accuracy of RF classifier on test set: 0.89
# Predictions and accuracy on the fourth cluster
p_trainRF4 = RF4.predict(FSos_x_train4)
p_testRF4 = RF4.predict(FSx_test4)
_rf4_train_acc = RF4.score(FSos_x_train4, os_y_train4)
_rf4_test_acc = RF4.score(FSx_test4, y_test4)
print('Accuracy of RF classifier on oversampled train set: {:.2f}'.format(_rf4_train_acc))
print('Accuracy of RF classifier on test set: {:.2f}'.format(_rf4_test_acc))
Accuracy of RF classifier on oversampled train set: 0.87 Accuracy of RF classifier on test set: 0.84
# Join the predictions on the four test sets (same cluster order as y_testT)
# and build the confusion matrix.
p_testRF_T = np.concatenate([p_testRF1, p_testRF2, p_testRF3, p_testRF4])
confusion_matrix_test = confusion_matrix(y_true=y_testT, y_pred=p_testRF_T)
print(confusion_matrix_test)
[[7369 259] [ 760 1387]]
# Performance measures over the combined test sets (random forest)
_rf_report = classification_report(y_testT, p_testRF_T)
print(_rf_report)
precision recall f1-score support 0 0.91 0.97 0.94 7628 1 0.84 0.65 0.73 2147 accuracy 0.90 9775 macro avg 0.87 0.81 0.83 9775 weighted avg 0.89 0.90 0.89 9775
#MODEL: ADABOOST
# Base estimators for the boosted models: decision trees limited to a single
# split (max_depth=1), one per cluster.
_stump_kwargs = dict(random_state=0, max_depth=1)
decTree1 = tree.DecisionTreeClassifier(**_stump_kwargs)
decTree2 = tree.DecisionTreeClassifier(**_stump_kwargs)
decTree3 = tree.DecisionTreeClassifier(**_stump_kwargs)
decTree4 = tree.DecisionTreeClassifier(**_stump_kwargs)

decTree1.fit(FSos_x_train1, os_y_train1)
decTree2.fit(FSos_x_train2, os_y_train2)
decTree3.fit(FSos_x_train3, os_y_train3)
decTree4.fit(FSos_x_train4, os_y_train4)
DecisionTreeClassifier(max_depth=1, random_state=0)
# Plot the base-estimator tree for the first-cluster model
tree.plot_tree(decTree1)
[Text(0.5, 0.75, 'X[0] <= 0.3\ngini = 0.5\nsamples = 12226\nvalue = [6113, 6113]'), Text(0.25, 0.25, 'gini = 0.445\nsamples = 8689\nvalue = [5791, 2898]'), Text(0.75, 0.25, 'gini = 0.165\nsamples = 3537\nvalue = [322, 3215]')]
# Plot the base-estimator tree for the second-cluster model
tree.plot_tree(decTree2)
[Text(0.5, 0.75, 'X[10] <= 0.5\ngini = 0.5\nsamples = 9304\nvalue = [4652, 4652]'), Text(0.25, 0.25, 'gini = 0.311\nsamples = 4999\nvalue = [964, 4035]'), Text(0.75, 0.25, 'gini = 0.246\nsamples = 4305\nvalue = [3688, 617]')]
# Hyper-parameter tuning for AdaBoost: define the random search grid.

# Number of boosting rounds: 50 evenly spaced values between 0 and 1000,
# truncated to int. The leading 0 is dropped because an ensemble needs at
# least one estimator; leaving it in makes every sampled candidate with
# n_estimators=0 fail during the search.
n_estimators = [int(x) for x in np.linspace(start = 0, stop = 1000, num = 50) if int(x) > 0]
# Learning rate candidates.
learning_rate=[0.4,0.5,0.6,0.7,0.8,0.9,1]
# Assemble the random grid.
random_grid = {'n_estimators': n_estimators,
               'learning_rate': learning_rate}
# Use the random grid to look for the best hyper-parameters.
# Base model to tune.
abc = AdaBoostClassifier()

# Randomized search settings shared by all clusters:
# 50 sampled candidates, 5-fold cross validation, all cores.
_ab_random_kwargs = dict(param_distributions=random_grid, n_iter=50, cv=5,
                         verbose=2, random_state=42, n_jobs=-1)
AB1_random = RandomizedSearchCV(estimator=abc, **_ab_random_kwargs)
AB2_random = RandomizedSearchCV(estimator=abc, **_ab_random_kwargs)
AB3_random = RandomizedSearchCV(estimator=abc, **_ab_random_kwargs)
AB4_random = RandomizedSearchCV(estimator=abc, **_ab_random_kwargs)

# Fit each randomized search on its cluster's feature-selected train set.
AB1_random.fit(FSos_x_train1, os_y_train1)
AB2_random.fit(FSos_x_train2, os_y_train2)
AB3_random.fit(FSos_x_train3, os_y_train3)
AB4_random.fit(FSos_x_train4, os_y_train4)
# Best parameters from the randomized search for the first-cluster model
AB1_random.best_params_
{'n_estimators': 775, 'learning_rate': 0.9}
# Best parameters from the randomized search for the second-cluster model
AB2_random.best_params_
{'n_estimators': 959, 'learning_rate': 0.9}
# Best parameters from the randomized search for the third-cluster model
AB3_random.best_params_
{'n_estimators': 142, 'learning_rate': 0.7}
# Best parameters from the randomized search for the fourth-cluster model
AB4_random.best_params_
{'n_estimators': 81, 'learning_rate': 1}
# Try to improve AdaBoost with an exhaustive grid search.
# Four hand-written grids, one per cluster, each meant to cover a
# neighbourhood of the optimum found by the randomized search.
param_grid1 = dict(
    n_estimators=[760, 770, 775, 780, 790],
    learning_rate=[0.85, 0.88, 0.9, 0.92, 0.95],
)
param_grid2 = dict(
    n_estimators=[950, 955, 959, 960, 970],
    learning_rate=[0.85, 0.88, 0.9, 0.92, 0.95],
)
param_grid3 = dict(
    n_estimators=[135, 140, 142, 145, 150],
    learning_rate=[0.65, 0.68, 0.70, 0.72, 0.74],
)
param_grid4 = dict(
    n_estimators=[75, 80, 81, 83, 85, 90],
    learning_rate=[0.95, 0.98, 0.99, 1],
)
# Build the grid-search models: 5-fold cross validation, all cores, verbose log.
_ab_grid_kwargs = dict(cv=5, n_jobs=-1, verbose=2)
grid_search1 = GridSearchCV(estimator=AdaBoostClassifier(), param_grid=param_grid1, **_ab_grid_kwargs)
grid_search2 = GridSearchCV(estimator=AdaBoostClassifier(), param_grid=param_grid2, **_ab_grid_kwargs)
grid_search3 = GridSearchCV(estimator=AdaBoostClassifier(), param_grid=param_grid3, **_ab_grid_kwargs)
grid_search4 = GridSearchCV(estimator=AdaBoostClassifier(), param_grid=param_grid4, **_ab_grid_kwargs)

# Fit the grid searches on the per-cluster feature-selected train sets.
grid_search1.fit(FSos_x_train1, os_y_train1)
grid_search2.fit(FSos_x_train2, os_y_train2)
grid_search3.fit(FSos_x_train3, os_y_train3)
grid_search4.fit(FSos_x_train4, os_y_train4)
# Optimal parameters from the grid search for the first-cluster model
grid_search1.best_params_
{'learning_rate': 0.9, 'n_estimators': 780}
# Optimal parameters from the grid search for the second-cluster model
grid_search2.best_params_
{'learning_rate': 0.9, 'n_estimators': 970}
# Optimal parameters from the grid search for the third-cluster model
grid_search3.best_params_
{'learning_rate': 0.74, 'n_estimators': 150}
# Optimal parameters from the grid search for the fourth-cluster model
grid_search4.best_params_
{'learning_rate': 0.99, 'n_estimators': 75}
# Final AdaBoost models, one per cluster, with hard-coded tuned parameters.
# Each model boosts the depth-1 decision tree defined for its own cluster
# (AB3/AB4 previously pointed at decTree1/decTree2 by copy-paste).
AB1 = AdaBoostClassifier(n_estimators= 780 , base_estimator=decTree1, learning_rate= 0.9 )
AB2 = AdaBoostClassifier(n_estimators= 970 , base_estimator=decTree2, learning_rate= 0.9 )
AB3 = AdaBoostClassifier(n_estimators= 150 , base_estimator=decTree3, learning_rate= 0.74 )
AB4 = AdaBoostClassifier(n_estimators= 75 , base_estimator=decTree4, learning_rate= 0.99 )
# Fit each model on its cluster's feature-selected train set.
AB1.fit(FSos_x_train1, os_y_train1)
AB2.fit(FSos_x_train2, os_y_train2)
AB3.fit(FSos_x_train3, os_y_train3)
AB4.fit(FSos_x_train4, os_y_train4)
# Predictions and accuracy on the train and test sets of the first cluster.
# The predictions must come from AB1: they feed the AdaBoost confusion matrix
# and classification report (they were previously taken from RF1).
p_trainAB1 = AB1.predict(FSos_x_train1)
p_testAB1 = AB1.predict(FSx_test1)
print('Accuracy of AB classifier on oversampled train set: {:.2f}'.format(AB1.score(FSos_x_train1, os_y_train1)))
print('Accuracy of AB classifier on test set: {:.2f}'.format(AB1.score(FSx_test1, y_test1)))
Accuracy of AB classifier on oversampled train set: 0.91 Accuracy of AB classifier on test set: 0.89
# Predictions and accuracy on the train and test sets of the second cluster.
p_trainAB2 = AB2.predict(FSos_x_train2)
p_testAB2 = AB2.predict(FSx_test2)
print('Accuracy of AB classifier on oversampled train set: {:.2f}'.format(AB2.score(FSos_x_train2, os_y_train2)))
# Test accuracy is scored with AB2 (previously RF2, which reported the random forest's accuracy).
print('Accuracy of AB classifier on test set: {:.2f}'.format(AB2.score(FSx_test2, y_test2)))
Accuracy of AB classifier on oversampled train set: 0.94 Accuracy of AB classifier on test set: 0.91
# Predictions and accuracy on the train and test sets of the third cluster.
p_trainAB3 = AB3.predict(FSos_x_train3)
p_testAB3 = AB3.predict(FSx_test3)
print('Accuracy of AB classifier on oversampled train set: {:.2f}'.format(AB3.score(FSos_x_train3, os_y_train3)))
# Test accuracy is scored with AB3 (previously RF3, which reported the random forest's accuracy).
print('Accuracy of AB classifier on test set: {:.2f}'.format(AB3.score(FSx_test3, y_test3)))
Accuracy of AB classifier on oversampled train set: 0.91 Accuracy of AB classifier on test set: 0.89
# Predictions and accuracy on the train and test sets of the fourth cluster.
p_trainAB4 = AB4.predict(FSos_x_train4)
p_testAB4 = AB4.predict(FSx_test4)
print('Accuracy of AB classifier on oversampled train set: {:.2f}'.format(AB4.score(FSos_x_train4, os_y_train4)))
# Test accuracy is scored with AB4 (previously RF4, which reported the random forest's accuracy).
print('Accuracy of AB classifier on test set: {:.2f}'.format(AB4.score(FSx_test4, y_test4)))
#0.85 0.83
Accuracy of AB classifier on oversampled train set: 0.85 Accuracy of AB classifier on test set: 0.84
# Join the predictions on the test sets (same cluster order as y_testT)
# and build the confusion matrix over all test sets.
p_testAB_T = np.concatenate([p_testAB1, p_testAB2, p_testAB3, p_testAB4])
confusion_matrix_test = confusion_matrix(y_true=y_testT, y_pred=p_testAB_T)
print(confusion_matrix_test)
[[7227 401] [ 669 1478]]
# Performance measures over the combined test sets (AdaBoost)
_ab_report = classification_report(y_testT, p_testAB_T)
print(_ab_report)
precision recall f1-score support 0 0.92 0.95 0.93 7628 1 0.79 0.69 0.73 2147 accuracy 0.89 9775 macro avg 0.85 0.82 0.83 9775 weighted avg 0.89 0.89 0.89 9775
# ROC curves and AUC of the cluster-based models.
# For every model family the positive-class probabilities predicted on the
# four test sets are joined (same cluster order as y_testT) and a single ROC
# curve is drawn for the family.
fig = plt.figure(figsize=(8,5))
# Dashed red diagonal from (0, 0) to (1, 1) as a visual reference.
plt.plot([0, 1], [0, 1],'r--')

# LOGISTIC REGRESSION (unscaled feature-selected test sets)
preds_proba_LR1 = LR1.predict_proba(FSx_test1)
preds_proba_LR2 = LR2.predict_proba(FSx_test2)
preds_proba_LR3 = LR3.predict_proba(FSx_test3)
preds_proba_LR4 = LR4.predict_proba(FSx_test4)
probsLR1 = preds_proba_LR1[:, 1]
probsLR2 = preds_proba_LR2[:, 1]
probsLR3 = preds_proba_LR3[:, 1]
probsLR4 = preds_proba_LR4[:, 1]
probsLR = np.concatenate([probsLR1, probsLR2, probsLR3, probsLR4])
fpr, tpr, thresh = metrics.roc_curve(y_testT, probsLR)
auclg = roc_auc_score(y_testT, probsLR)
plt.plot(fpr, tpr, label=f'LR, AUC = {str(round(auclg,3))}')

# SVM (scaled test sets)
preds_proba_SVM1 = SVM1.predict_proba(FSx_test1_scaled)
preds_proba_SVM2 = SVM2.predict_proba(FSx_test2_scaled)
preds_proba_SVM3 = SVM3.predict_proba(FSx_test3_scaled)
preds_proba_SVM4 = SVM4.predict_proba(FSx_test4_scaled)
probsSVM1 = preds_proba_SVM1[:, 1]
probsSVM2 = preds_proba_SVM2[:, 1]
probsSVM3 = preds_proba_SVM3[:, 1]
probsSVM4 = preds_proba_SVM4[:, 1]
probsSVM = np.concatenate([probsSVM1, probsSVM2, probsSVM3, probsSVM4])
fpr, tpr, thresh = metrics.roc_curve(y_testT, probsSVM)
auclg = roc_auc_score(y_testT, probsSVM)
plt.plot(fpr, tpr, label=f'SVM, AUC = {str(round(auclg,3))}')

# KNN (scaled test sets)
preds_proba_KNN1 = KNN1.predict_proba(FSx_test1_scaled)
preds_proba_KNN2 = KNN2.predict_proba(FSx_test2_scaled)
preds_proba_KNN3 = KNN3.predict_proba(FSx_test3_scaled)
preds_proba_KNN4 = KNN4.predict_proba(FSx_test4_scaled)
probsKNN1 = preds_proba_KNN1[:, 1]
probsKNN2 = preds_proba_KNN2[:, 1]
probsKNN3 = preds_proba_KNN3[:, 1]
probsKNN4 = preds_proba_KNN4[:, 1]
probsKNN = np.concatenate([probsKNN1, probsKNN2, probsKNN3, probsKNN4])
fpr, tpr, thresh = metrics.roc_curve(y_testT, probsKNN)
auclg = roc_auc_score(y_testT, probsKNN)
plt.plot(fpr, tpr, label=f'KNN, AUC = {str(round(auclg,3))}')

# RANDOM FOREST (unscaled feature-selected test sets)
preds_proba_RF1 = RF1.predict_proba(FSx_test1)
preds_proba_RF2 = RF2.predict_proba(FSx_test2)
preds_proba_RF3 = RF3.predict_proba(FSx_test3)
preds_proba_RF4 = RF4.predict_proba(FSx_test4)
probsRF1 = preds_proba_RF1[:, 1]
probsRF2 = preds_proba_RF2[:, 1]
probsRF3 = preds_proba_RF3[:, 1]
probsRF4 = preds_proba_RF4[:, 1]
probsRF = np.concatenate([probsRF1, probsRF2, probsRF3, probsRF4])
fpr, tpr, thresh = metrics.roc_curve(y_testT, probsRF)
auclg = roc_auc_score(y_testT, probsRF)
plt.plot(fpr, tpr, label=f'RF, AUC = {str(round(auclg,3))}')

# ADABOOST (unscaled feature-selected test sets)
preds_proba_AB1 = AB1.predict_proba(FSx_test1)
preds_proba_AB2 = AB2.predict_proba(FSx_test2)
preds_proba_AB3 = AB3.predict_proba(FSx_test3)
preds_proba_AB4 = AB4.predict_proba(FSx_test4)
probsAB1 = preds_proba_AB1[:, 1]
probsAB2 = preds_proba_AB2[:, 1]
probsAB3 = preds_proba_AB3[:, 1]
probsAB4 = preds_proba_AB4[:, 1]
probsAB = np.concatenate([probsAB1, probsAB2, probsAB3, probsAB4])
fpr, tpr, thresh = metrics.roc_curve(y_testT, probsAB)
auclg = roc_auc_score(y_testT, probsAB)
plt.plot(fpr, tpr, label=f'AB, AUC = {str(round(auclg,3))}')

# Axis labels, title, legend, and display.
plt.ylabel("True Positive Rate", fontsize=12)
plt.xlabel("False Positive Rate", fontsize=12)
plt.title("ROC curve")
plt.rcParams['axes.titlesize'] = 16
plt.legend()
plt.show()