In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
In [2]:
# Upload the aggregated data 
dataFrame = pd.read_csv('../finalAggrData.csv')

aggregatedData = dataFrame.drop(dataFrame.columns[0], axis=1)
In [3]:
aggregatedData
Out[3]:
skewnees_ambient kurtos_ambient rootMeanSquare_ambient mean_ambient minimum_motorSpeed maximum_motorSpeed skewnees_motorSpeed kurtos_motorSpeed rootMeanSquare_motorSpeed mean_motorSpeed ... skewnees_statorYoke kurtos_statorYoke rootMeanSquare_statorYoke mean_statorYoke minimum_is maximum_is skewnees_is kurtos_is rootMeanSquare_is mean_us
0 1.899346 4.500659 0.839541 -0.824363 -1.222434 2.024125 -6.829433 48.372817 1.997416 1.957918 ... -1.195548 0.104083 1.158354 -1.118858 0.245700 1.061958 -4.833939 32.929966 0.835201 1.683886
1 0.927837 1.363293 0.709375 -0.691958 2.024113 2.024164 7.322111 86.921904 2.024119 2.024119 ... 2.064666 7.440780 0.807498 -0.806944 0.790524 1.417418 2.057164 2.274116 0.907889 1.640992
2 0.089422 -1.221464 0.493933 -0.435459 2.024112 2.024127 0.091408 0.050537 2.024119 2.024119 ... -0.588208 -0.221040 0.384326 -0.373790 1.372053 1.403274 0.973967 -0.095441 1.382427 1.335422
3 0.264213 0.597537 0.329476 -0.206130 -0.140251 2.024138 -0.916830 -1.155746 1.704222 1.393389 ... -1.082580 0.507289 0.266135 -0.105679 0.636603 1.880627 -0.655548 -1.412830 1.616796 1.388083
4 0.729262 2.282428 0.628898 -0.609202 -0.140250 -0.140241 0.203611 -0.164875 0.140247 -0.140247 ... 1.043693 0.046425 1.284657 -1.272790 1.058018 1.058132 0.027496 -1.279040 1.058074 0.965450
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
560 -19.086382 396.240781 0.263500 -0.163568 -1.222436 1.997176 1.226331 -0.073210 1.176275 -0.562050 ... -0.015880 -1.332019 0.800095 -0.783398 0.179256 3.144260 3.044340 14.565252 1.119364 1.391315
561 -21.647223 526.680430 0.172958 -0.152272 -1.222439 1.858156 1.431948 0.660207 1.125211 -0.672799 ... -0.958949 0.497364 1.093059 -1.089670 0.193069 2.468388 0.294261 5.014944 1.066121 1.325685
562 -0.602059 -0.463666 0.135882 -0.135265 -1.222437 1.604025 1.197231 1.452843 0.917033 -0.635737 ... -0.980320 -0.372597 1.054546 -1.044445 0.166680 3.503960 2.337673 5.936040 1.345588 1.257001
563 -0.312943 1.209110 0.124580 -0.124513 -1.222436 1.847017 0.858079 -0.843144 1.087083 -0.448148 ... 1.762694 1.870350 0.899508 -0.848235 0.007740 2.569018 0.899475 1.399187 1.224262 1.391771
564 0.318117 -0.831621 0.070823 -0.049876 -1.222437 1.940564 -0.382442 -1.611694 1.141185 0.256659 ... -0.991372 -0.157321 1.136816 1.077969 0.030285 2.556495 0.323650 2.927493 1.152175 1.558935

565 rows × 29 columns

Regression Algorithms

The first model we want to try is the simplest one.

Linear regression

In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
from sklearn import metrics
In [97]:
X = aggregatedData.drop('mean_us', axis= 1)
y = aggregatedData['mean_us']

The result of one single linear regression model

In [98]:
# Slit the data in train, validation and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=1)
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state=1)

# Add a column of ones for the intercept coefficient
X_train = sm.add_constant(X_train)
X_test = sm.add_constant(X_test)
 
# Fit the linear model
regrModel = sm.OLS(y_train, X_train)
regrModel = regrModel.fit()

print(regrModel.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                mean_us   R-squared:                       0.555
Model:                            OLS   Adj. R-squared:                  0.525
Method:                 Least Squares   F-statistic:                     18.83
Date:                Fri, 19 Feb 2021   Prob (F-statistic):           9.44e-58
Time:                        21:12:02   Log-Likelihood:                 51.696
No. Observations:                 452   AIC:                            -45.39
Df Residuals:                     423   BIC:                             73.90
Df Model:                          28                                         
Covariance Type:            nonrobust                                         
=============================================================================================
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
const                         1.1282      0.071     15.972      0.000       0.989       1.267
skewnees_ambient             -0.0087      0.006     -1.525      0.128      -0.020       0.003
kurtos_ambient               -0.0003      0.000     -1.104      0.270      -0.001       0.000
rootMeanSquare_ambient       -0.0343      0.022     -1.563      0.119      -0.077       0.009
mean_ambient                  0.0301      0.015      1.954      0.051      -0.000       0.060
minimum_motorSpeed            0.0236      0.031      0.758      0.449      -0.038       0.085
maximum_motorSpeed            0.0098      0.027      0.356      0.722      -0.044       0.064
skewnees_motorSpeed           0.0107      0.005      2.134      0.033       0.001       0.021
kurtos_motorSpeed         -1.279e-05      0.000     -0.043      0.966      -0.001       0.001
rootMeanSquare_motorSpeed     0.0432      0.035      1.221      0.223      -0.026       0.113
mean_motorSpeed               0.1543      0.038      4.055      0.000       0.080       0.229
minimum_torque                0.0505      0.026      1.935      0.054      -0.001       0.102
maximum_torque               -0.0714      0.027     -2.677      0.008      -0.124      -0.019
skewnees_torque              -0.0051      0.006     -0.800      0.424      -0.018       0.007
kurtos_torque             -1.007e-05      0.000     -0.028      0.978      -0.001       0.001
mean_torque                  -0.1171      0.043     -2.697      0.007      -0.202      -0.032
skewnees_pm                  -0.0046      0.020     -0.230      0.818      -0.044       0.035
kurtos_pm                    -0.0144      0.012     -1.249      0.212      -0.037       0.008
rootMeanSquare_pm            -0.0067      0.026     -0.261      0.794      -0.057       0.044
mean_pm                       0.0159      0.020      0.775      0.439      -0.024       0.056
skewnees_statorYoke          -0.0016      0.013     -0.124      0.902      -0.026       0.023
kurtos_statorYoke             0.0081      0.005      1.539      0.125      -0.002       0.018
rootMeanSquare_statorYoke    -0.0586      0.030     -1.954      0.051      -0.118       0.000
mean_statorYoke               0.0088      0.020      0.441      0.659      -0.030       0.048
minimum_is                   -0.2471      0.050     -4.924      0.000      -0.346      -0.148
maximum_is                    0.0106      0.043      0.245      0.807      -0.075       0.096
skewnees_is                   0.0181      0.009      2.064      0.040       0.001       0.035
kurtos_is                     0.0014      0.001      1.513      0.131      -0.000       0.003
rootMeanSquare_is             0.3630      0.073      4.966      0.000       0.219       0.507
==============================================================================
Omnibus:                       35.394   Durbin-Watson:                   1.950
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               54.132
Skew:                          -0.553   Prob(JB):                     1.76e-12
Kurtosis:                       4.286   Cond. No.                     1.23e+03
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.23e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
In [96]:
ytest_pred = regrModel.predict(X_test)

fig, (ax1, ax2) = plt.subplots(2, figsize=(10,7))
fig.suptitle('First linear model')

ax1.plot(y_test.values, label= 'true values')
ax1.plot(ytest_pred.values, 'ro', label= 'predicted values')
ax1.legend()
ax1.set_title('True values VS predicted values')
ax1.set(xlabel='', ylabel='mean_us values')

sort_index = np.argsort(y_test.values) 

ax2.plot(y_test.values[sort_index], y_test.values[sort_index])
ax2.plot(y_test.values[sort_index], ytest_pred.values[sort_index], 'ro')
ax2.set(xlabel='true values', ylabel='predicted values')
Out[96]:
[Text(0, 0.5, 'predicted values'), Text(0.5, 0, 'true values')]

Now we want to compute some metrics in order to better understand the results:

  • Mean Absolute Error,
  • Root Mean Square Error,
  • Accuracy.

Moreover, for having a robust metrics for analysing the models, the output will be averaged on the different simulations that will be computed by considering different split of the data.

In [30]:
numSimulations = 100
lengthTrainSet = X_val.shape[0]
numFeatures = X.shape[1]

ytest_pred = np.zeros((numSimulations, lengthTrainSet))
errors = np.zeros((numSimulations, lengthTrainSet))
mape = np.zeros((numSimulations, lengthTrainSet))
accuracy = np.zeros(numSimulations)
mae = np.zeros(numSimulations)
rmse = np.zeros(numSimulations)
aic = np.zeros(numSimulations)

for i in range(0,100):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=i)
    #X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state=i)

    X_train = sm.add_constant(X_train)
    X_test = sm.add_constant(X_test)
                                                  
    regrModel = sm.OLS(y_train, X_train)
    regrModel = regrModel.fit()

    ytest_pred[i] = regrModel.predict(X_test)
    
    # Calculate the absolute errors vector
    errors[i] = abs(y_test - ytest_pred[i])
    # Calculate mean absolute percentage error vector(MAPE)
    mape[i] = 100 * (errors[i] / y_test)
    
    # Calculate the vector metrics
    accuracy[i] = 100 - np.mean(mape[i])
    mae[i] = np.mean(errors[i])
    rmse[i] = np.sqrt(mean_squared_error(y_test, ytest_pred[i]))
    aic[i] = 2*numFeatures - 2*regrModel.llf
    

# Display the metrics
print('The results of the first linear model:\n')
print('Mean Absolute Error: %.2f ' %np.mean(mae))  
print('The Root Mean Square Error: %.2f ' %np.mean(rmse))
print('Accuracy:', round(np.mean(accuracy), 2), '%.')
print('AIC : %.2f' %np.mean(aic) )
The results of the first linear model:

Mean Absolute Error: 0.17 
The Root Mean Square Error: 0.23 
Accuracy: 82.53 %.
AIC : -63.13

Linear regression with only the features having a p-value lower than 5%

It is evident that not all the feature are important. That is, we can't exclude that most of the coefficient are zero. Infact the p-value associated to the null hypotesis that some coefficients beta_i are null, is very large for some of them. Regarding to the hypothesis that the model with only the intercept is equal to the one just computed, It's reasonable to reject it, since the F-static is quite big.

Now we analyze another model by considering only the following variables:

  • mean_ambient
  • skewnees_motorSpeed
  • mean_motorSpeed
  • kurtos_motorSpeed
  • minimum_torque
  • maximum_torque
  • mean_torque
  • rootMeanSquare_is
  • rootMeanSquare_statorYoke
  • minimum_is
  • skewnees_is
In [38]:
# Slit the data in train, validation and test set
X_train, X_test, y_train, y_test = train_test_split(X[['mean_ambient', 'skewnees_motorSpeed', 'mean_motorSpeed', 
                                      'minimum_torque', 'maximum_torque', 'mean_torque', 'rootMeanSquare_is',
                                      'rootMeanSquare_statorYoke', 'minimum_is', 'skewnees_is']], y, test_size = 0.20, random_state=1)
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state=1)

# Add a column of ones for the intercept coefficient
X_train = sm.add_constant(X_train)
X_test = sm.add_constant(X_test)
 
regrModel2 = sm.OLS(y_train, X_train)

regrModel2 = regrModel2.fit()

print(regrModel2.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                mean_us   R-squared:                       0.530
Model:                            OLS   Adj. R-squared:                  0.519
Method:                 Least Squares   F-statistic:                     49.75
Date:                Mon, 15 Feb 2021   Prob (F-statistic):           3.97e-66
Time:                        19:33:22   Log-Likelihood:                 39.463
No. Observations:                 452   AIC:                            -56.93
Df Residuals:                     441   BIC:                            -11.68
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
=============================================================================================
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
const                         1.1914      0.048     24.954      0.000       1.098       1.285
mean_ambient                  0.0515      0.012      4.353      0.000       0.028       0.075
skewnees_motorSpeed           0.0134      0.004      3.244      0.001       0.005       0.022
mean_motorSpeed               0.1877      0.015     12.639      0.000       0.159       0.217
minimum_torque                0.0420      0.022      1.904      0.058      -0.001       0.085
maximum_torque               -0.0745      0.022     -3.409      0.001      -0.117      -0.032
mean_torque                  -0.1114      0.039     -2.832      0.005      -0.189      -0.034
rootMeanSquare_is             0.3420      0.042      8.095      0.000       0.259       0.425
rootMeanSquare_statorYoke    -0.0619      0.022     -2.772      0.006      -0.106      -0.018
minimum_is                   -0.2333      0.042     -5.555      0.000      -0.316      -0.151
skewnees_is                   0.0179      0.007      2.493      0.013       0.004       0.032
==============================================================================
Omnibus:                       48.632   Durbin-Watson:                   1.957
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               74.555
Skew:                          -0.717   Prob(JB):                     6.46e-17
Kurtosis:                       4.379   Cond. No.                         16.1
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [39]:
ytest_pred = regrModel2.predict(X_test)
 
fig, (ax1, ax2) = plt.subplots(2, figsize=(10,7))
fig.suptitle('Linear model more parsimonious')

ax1.plot(y_test.values, label= 'true values')
ax1.plot(ytest_pred.values, 'ro', label= 'predicted values')
ax1.legend()
ax1.set_title('True values VS predicted values')
ax1.set(xlabel='', ylabel='mean_us values')


ax2.plot(y_test.values[sort_index], y_test.values[sort_index])
ax2.plot(y_test.values[sort_index], ytest_pred.values[sort_index], 'ro')
ax2.set(xlabel='true values', ylabel='predicted values')
Out[39]:
[Text(0, 0.5, 'predicted values'), Text(0.5, 0, 'true values')]
In [40]:
X_morePars = X[['mean_ambient', 'skewnees_motorSpeed', 'mean_motorSpeed', 
                                      'minimum_torque', 'maximum_torque', 'mean_torque', 'rootMeanSquare_is',
                                      'rootMeanSquare_statorYoke', 'minimum_is', 'skewnees_is']]
numFeatures = X_morePars.shape[1]

ytest_pred = np.zeros((numSimulations, lengthTrainSet))
error = np.zeros((numSimulations, lengthTrainSet))
mape = np.zeros((numSimulations, lengthTrainSet))
accuracy = np.zeros(numSimulations)
mae = np.zeros(numSimulations)
rmse = np.zeros(numSimulations)
aic = np.zeros(numSimulations)

for i in range(0,numSimulations):
    X_train, X_test, y_train, y_test = train_test_split(X_morePars, y, test_size = 0.20, random_state=i)
    #X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state=i)

    X_train = sm.add_constant(X_train)
    X_test = sm.add_constant(X_test)
                                                  
    regrModel = sm.OLS(y_train, X_train)
    regrModel = regrModel.fit()

    ytest_pred[i] = regrModel.predict(X_test)
    
    # Calculate the absolute errors vector
    errors[i] = abs(y_test - ytest_pred[i])
    # Calculate mean absolute percentage error vector(MAPE)
    mape[i] = 100 * (errors[i] / y_test)
    
    # Calculate the vector metrics
    accuracy[i] = 100 - np.mean(mape[i])
    mae[i] = np.mean(errors[i])
    rmse[i] = np.sqrt(mean_squared_error(y_test, ytest_pred[i]))
    aic[i] = 2*numFeatures - 2*regrModel.llf


# Display the metrics

print('The results of the linear model without the feature with a p-value higher than 0.5:\n')
print('Mean Absolute Error: %.2f' %np.mean(mae))  
print('The Root Mean Square Error: %.2f ' %np.mean(rmse))
print('Accuracy:', round(np.mean(accuracy), 2), '%.')
print('AIC : %.2f' %np.mean(aic) )
The results of the linear model without the feature with a p-value higher than 0.5:

Mean Absolute Error: 0.17
The Root Mean Square Error: 0.23 
Accuracy: 82.23 %.
AIC : -63.82

It seems that the model has not been made worse or improved. Using a p-value cutoff of 0.05, means that if you add 100 features to a model that are pure noise, 5 of them (on average) will still be counted as significant. Moreover, the feature associated to a larger values than 0.5, is few important only if the model is fitted with all the features except itself. But if we remove some feature, the p-value of that features could decrease (or increase)!

Linear regression with optimal features

Now we want to select a subset of the initial features. We do that by taking into account the AIC and RMSE lowest values.

We can't compute all the combination because it would be too computational expensive!! If we suppose that we want to compute all the combinations of the features by taking into account 14 of them, the machine should consider 28 binomial 14= 28!/14!14! = 4010^6 iteration. If we suppose that each iteration lasts 0.08 seconds --> the time needed to complete the code is about 891 hours! Only for one set of variables!!

In [100]:
import time

# I divide the data in train and test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=1)

# Split the train data in train and validation set.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state=1)

import itertools
import numpy as np
import pandas as pd
import statsmodels.api as sm
AICs = {}
RMSEs = {}
rang = [1, 2, 3, 4, 5, 6, 24, 25, 26, 27, 28] # We can't compute all the combination because it would too computational expensive
# range(1,len(X_train.columns)+1, 13)

for k in rang:#
    t = time.time()
    # do stuff
    for variables in itertools.combinations(X_train.columns, k):
        predictorsTrain = X_train[list(variables)]
        predictorsTest = X_val[list(variables)]
        #print(variables)
        #predictors['Intercept'] = 1
        res = sm.OLS(y_train, sm.add_constant(predictorsTrain)).fit()
        yVal_pred = res.predict(sm.add_constant(predictorsTest))
        
        AICs[variables] = 2*(k) - 2*res.llf
        RMSEs[variables] = np.sqrt(mean_squared_error(y_val, yVal_pred))
     
    
    elapsed = time.time() - t
    #print(elapsed)
    
In [101]:
features_minAIC = list(AICs.keys())[list(AICs.values()).index(min(AICs.values()))]

print('The features associated to a minimum AIC are: \n', features_minAIC)
print('\nThe number of features is ', len(features_minAIC), ', and the AIC obtained is equal to %.3f' %min(AICs.values()))
The features associated to a minimum AIC are: 
 ('skewnees_ambient', 'kurtos_ambient', 'rootMeanSquare_ambient', 'mean_ambient', 'minimum_motorSpeed', 'skewnees_motorSpeed', 'rootMeanSquare_motorSpeed', 'mean_motorSpeed', 'minimum_torque', 'maximum_torque', 'skewnees_torque', 'kurtos_torque', 'mean_torque', 'skewnees_pm', 'kurtos_pm', 'rootMeanSquare_pm', 'mean_pm', 'kurtos_statorYoke', 'rootMeanSquare_statorYoke', 'minimum_is', 'maximum_is', 'skewnees_is', 'kurtos_is', 'rootMeanSquare_is')

The number of features is  24 , and the AIC obtained is equal to -56.959
In [103]:
featuresAIC_Excluded = np.setdiff1d(X_train.columns, features_minAIC)
print('The features that have been removed when we were looking for the minimum AIC are: \n', featuresAIC_Excluded)
The features that have been removed when we were looking for the minimum AIC are: 
 ['kurtos_motorSpeed' 'maximum_motorSpeed' 'mean_statorYoke'
 'skewnees_statorYoke']

Now we can fit our last linear model using the features above mentioned!

In [112]:
# I divide the data in train and test set.
X_train, X_test, y_train, y_test = train_test_split(X[['skewnees_ambient', 'kurtos_ambient', 'rootMeanSquare_ambient', 
                                                       'mean_ambient', 'minimum_motorSpeed', 'skewnees_motorSpeed', 
                                                       'rootMeanSquare_motorSpeed', 'mean_motorSpeed', 'minimum_torque',
                                                       'maximum_torque', 'skewnees_torque', 'kurtos_torque', 'mean_torque',
                                                       'skewnees_pm', 'kurtos_pm', 'rootMeanSquare_pm', 'mean_pm', 
                                                       'kurtos_statorYoke', 'rootMeanSquare_statorYoke', 'minimum_is', 
                                                       'maximum_is', 'skewnees_is', 'kurtos_is', 'rootMeanSquare_is']], 
                                                    y, test_size = 0.20, random_state=1)

regrModelFinal = sm.OLS(y_train, sm.add_constant(X_train)).fit()

print(regrModelFinal.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                mean_us   R-squared:                       0.554
Model:                            OLS   Adj. R-squared:                  0.529
Method:                 Least Squares   F-statistic:                     22.14
Date:                Sat, 20 Feb 2021   Prob (F-statistic):           2.51e-60
Time:                        08:48:17   Log-Likelihood:                 51.485
No. Observations:                 452   AIC:                            -52.97
Df Residuals:                     427   BIC:                             49.87
Df Model:                          24                                         
Covariance Type:            nonrobust                                         
=============================================================================================
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
const                         1.1273      0.069     16.429      0.000       0.992       1.262
skewnees_ambient             -0.0087      0.006     -1.534      0.126      -0.020       0.002
kurtos_ambient               -0.0003      0.000     -1.108      0.269      -0.001       0.000
rootMeanSquare_ambient       -0.0368      0.021     -1.722      0.086      -0.079       0.005
mean_ambient                  0.0306      0.015      2.039      0.042       0.001       0.060
minimum_motorSpeed            0.0223      0.031      0.725      0.469      -0.038       0.083
skewnees_motorSpeed           0.0109      0.005      2.380      0.018       0.002       0.020
rootMeanSquare_motorSpeed     0.0457      0.034      1.333      0.183      -0.022       0.113
mean_motorSpeed               0.1622      0.031      5.304      0.000       0.102       0.222
minimum_torque                0.0484      0.026      1.890      0.059      -0.002       0.099
maximum_torque               -0.0699      0.026     -2.659      0.008      -0.122      -0.018
skewnees_torque              -0.0050      0.006     -0.807      0.420      -0.017       0.007
kurtos_torque             -1.213e-05      0.000     -0.035      0.972      -0.001       0.001
mean_torque                  -0.1160      0.043     -2.701      0.007      -0.200      -0.032
skewnees_pm                  -0.0056      0.018     -0.308      0.758      -0.042       0.030
kurtos_pm                    -0.0137      0.011     -1.203      0.230      -0.036       0.009
rootMeanSquare_pm            -0.0082      0.025     -0.327      0.744      -0.057       0.041
mean_pm                       0.0224      0.016      1.443      0.150      -0.008       0.053
kurtos_statorYoke             0.0074      0.005      1.458      0.146      -0.003       0.017
rootMeanSquare_statorYoke    -0.0534      0.027     -1.979      0.048      -0.106      -0.000
minimum_is                   -0.2550      0.047     -5.428      0.000      -0.347      -0.163
maximum_is                    0.0115      0.043      0.269      0.788      -0.073       0.095
skewnees_is                   0.0176      0.009      2.053      0.041       0.001       0.034
kurtos_is                     0.0013      0.001      1.574      0.116      -0.000       0.003
rootMeanSquare_is             0.3663      0.072      5.070      0.000       0.224       0.508
==============================================================================
Omnibus:                       35.727   Durbin-Watson:                   1.947
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               55.425
Skew:                          -0.552   Prob(JB):                     9.22e-13
Kurtosis:                       4.314   Cond. No.                     1.22e+03
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.22e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
In [113]:
ytest_pred = regrModelFinal.predict(sm.add_constant(X_test)) #[list(features_minAIC)]

fig, (ax1, ax2) = plt.subplots(2, figsize=(10,7))
fig.suptitle('Linear model with a subset of the optimal features')

ax1.plot(y_test.values, label= 'true values')
ax1.plot(ytest_pred.values, 'ro', label= 'predicted values')
#plt.ylim((0.4,2.6))
ax1.legend()
ax1.set_title('True VS predicted values')
ax1.set(xlabel='', ylabel='mean_us values')


ax2.plot(y_test.values[sort_index], y_test.values[sort_index])
ax2.plot(y_test.values[sort_index], ytest_pred.values[sort_index], 'ro')
ax2.set(xlabel='true values', ylabel='predicted values')
Out[113]:
[Text(0, 0.5, 'predicted values'), Text(0.5, 0, 'true values')]
In [ ]:
 
In [117]:
X_optFeat_linRegr = X[['skewnees_ambient', 'kurtos_ambient', 'rootMeanSquare_ambient', 
                                                       'mean_ambient', 'minimum_motorSpeed', 'skewnees_motorSpeed', 
                                                       'rootMeanSquare_motorSpeed', 'mean_motorSpeed', 'minimum_torque',
                                                       'maximum_torque', 'skewnees_torque', 'kurtos_torque', 'mean_torque',
                                                       'skewnees_pm', 'kurtos_pm', 'rootMeanSquare_pm', 'mean_pm', 
                                                       'kurtos_statorYoke', 'rootMeanSquare_statorYoke', 'minimum_is', 
                                                       'maximum_is', 'skewnees_is', 'kurtos_is', 'rootMeanSquare_is']]

X_optFeat_linRegr = sm.add_constant(X_optFeat_linRegr)
numSimulations = 100
lengthTrainSet = X_val.shape[0]
numFeatures = X_optFeat_linRegr.shape[1]

ytest_pred = np.zeros((numSimulations, lengthTrainSet))
error = np.zeros((numSimulations, lengthTrainSet))
mape = np.zeros((numSimulations, lengthTrainSet))
accuracy = np.zeros(numSimulations)
mae = np.zeros(numSimulations)
rmse = np.zeros(numSimulations)
aic = np.zeros(numSimulations)

for i in range(0,numSimulations):
    X_train, X_test, y_train, y_test = train_test_split(X_optFeat_linRegr, y, test_size = 0.20, random_state=i)
    #X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state=itest                                                  
    regrModel = sm.OLS(y_train, X_train)
    regrModel = regrModel.fit()

    ytest_pred[i] = regrModel.predict(X_test)
    
    # Calculate the absolute errors vector
    errors[i] = abs(y_test - ytest_pred[i])
    # Calculate mean absolute percentage error vector(MAPE)
    mape[i] = 100 * (errors[i] / y_test)
    
    # Calculate the vector metrics
    accuracy[i] = 100 - np.mean(mape[i])
    mae[i] = np.mean(errors[i])
    rmse[i] = np.sqrt(mean_squared_error(y_test, ytest_pred[i]))
    aic[i] = 2*numFeatures - 2*regrModel.llf
    

# Display the metrics
print('The results of the linear regression model \nwith optimal features are:\n')
print('Mean Absolute Error: %.2f ' %np.mean(mae))  
print('Root Mean Square Error: %.2f ' %np.mean(rmse))
print('Accuracy:', round(np.mean(accuracy), 2), '%.')
print('AIC : %.2f' %np.mean(aic) )
The results of the linear regression model 
with optimal features are:

Mean Absolute Error: 0.17 
Root Mean Square Error: 0.23 
Accuracy: 82.54 %.
AIC : -64.15

Even if we haven't been able to select the best features ever for the linear regression, it is evident that this model doesn't fit good the data: the performance are so far to be good. These metrics can be taken into account as starting point..

XGBoost

I will build the model using Trees as base learners (which are the default base learners) using XGBoost's scikit-learn compatible API. I will also perform, along the way, some of the tuning parameters which XGBoost provides in order to improve the model's performance, and using the root mean squared error (RMSE) performance metric to check the performance of the trained model on the test set.

Now I will convert the dataset into an optimized data structure called Dmatrix that XGBoost supports and gives it acclaimed performance and efficiency gains.

In [46]:
import xgboost as xgb
from matplotlib import pyplot

The next step is to instantiate an XGBoost regressor object by calling the XGBRegressor() class from the XGBoost library with the hyper-parameters passed as arguments. For classification problems, I should have used the XGBClassifier() class.

The most common parameters that we should know are:

  • learning_rate: step size shrinkage used to prevent overfitting. Range is [0,1]
  • max_depth: determines how deeply each tree is allowed to grow during any boosting round.
  • subsample: percentage of samples used per tree. Low value can lead to underfitting.
  • colsample_bytree: percentage of features used per tree. High value can lead to overfitting.
  • n_estimators: number of trees you want to build.
  • min_child_weight is the minimum weight required in order to create a new node in the tree. A smaller min_child_weight allows for more complex trees, but again, more likely to overfit.
  • objective: determines the loss function to be used like reg:squarederror for regression problems, reg:logistic for classification problems with only decision, binary:logistic for classification problems with probability.

XGBoost also supports regularization parameters to penalize models as they become more complex and reduce them to simple (parsimonious) models.

  • gamma: controls whether a given node will split based on the expected reduction in loss after the split. A higher value leads to fewer splits. Supported only for tree-based learners.
  • alpha: L1 regularization on leaf weights. A large value leads to more regularization.
  • lambda: L2 regularization on leaf weights and is smoother than L1 regularization.

First XGBoost model

In [48]:
# Slit the data in train, validation and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=1)
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state=1)

xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', seed = 1)
xg_reg.fit(X_train,y_train)
    
ytest_pred = xg_reg.predict(X_test)

fig, (ax1, ax2) = plt.subplots(2, figsize=(10,7))
fig.suptitle('First XGBoost model')

ax1.plot(y_test.values, label= 'true values')
ax1.plot(ytest_pred, 'ro', label= 'predicted values')
ax1.legend()
ax1.set_title('True values VS predicted values')
ax1.set(xlabel='', label='mean_us values')


ax2.plot(y_test.values[sort_index], y_test.values[sort_index])
ax2.plot(y_test.values[sort_index], ytest_pred[sort_index], 'ro')
ax2.set(xlabel='true values', ylabel='predicted values')
Out[48]:
[Text(0, 0.5, 'predicted values'), Text(0.5, 0, 'true values')]
In [136]:
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', seed = 1)


ytest_pred = np.zeros((numSimulations, lengthTrainSet))
error = np.zeros((numSimulations, lengthTrainSet))
mape = np.zeros((numSimulations, lengthTrainSet))
accuracy = np.zeros(numSimulations)
mae = np.zeros(numSimulations)
rmse = np.zeros(numSimulations)
#aic = np.zeros(numSimulations)

for i in range(0,numSimulations):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=i)
    #X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state=i)
                                              
    xg_reg.fit(X_train,y_train)
    ytest_pred[i] = xg_reg.predict(X_test)
    
    # Calculate the absolute errors vector
    errors[i] = abs(y_test - ytest_pred[i])
    # Calculate mean absolute percentage error vector(MAPE)
    mape[i] = 100 * (errors[i] / y_test)
    
    # Calculate the vector metrics
    accuracy[i] = 100 - np.mean(mape[i])
    mae[i] = np.mean(errors[i])
    rmse[i] = np.sqrt(mean_squared_error(y_test, ytest_pred[i]))
    aic[i] = 2*numFeatures - 2*regrModel.llf
    

# Display the metrics
print('The results of the XGBoost model with \n the parameters chosen by default:\n')
print('Mean Absolute Error: %.2f ' %np.mean(mae))  
print('Root Mean Square Error: %.2f ' %np.mean(rmse))
print('Accuracy:', round(np.mean(accuracy), 2), '%.')
The results of the XGBoost model with 
 the parameters chosen by default:

Mean Absolute Error: 0.07 
Root Mean Square Error: 0.11 
Accuracy: 93.1 %.

These are definetely good results! The predicted values seem to fit the real ones and the performance are very good by reaching an accuracy higher than 90%!

XGBoost model with tuning the parameters

In order to build more robust models, it is common to do a k-fold cross validation for tuning the parameters, where all the entries in the original training dataset are used for both training as well as validation. Also, each entry is used for validation just once. XGBoost supports k-fold cross validation via the cv() method. All we need to do is specify the nfolds parameter, which is the number of cross validation sets I want to build

The first parameter we will look at is not part of the params dictionary, but will be passed as a standalone argument to the training method. This parameter is called num_boost_round and corresponds to the number of boosting rounds or trees to build

In [137]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state=1)


dtrain = xgb.DMatrix(data = X_train,label=y_train)
dval = xgb.DMatrix(data = X_val, label=y_val)
In [138]:
params = {
    # Parameters that we are going to tune.
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'learning_rate': 0.1,
    'subsample': 1,
    'colsample_bytree': 1,
    'alpha': 0,
    # Other parameters
    'objective':'reg:squarederror',
}

XGBoost provides a nice way to find the best number of rounds whilst training. Since trees are built sequentially, instead of fixing the number of rounds at the beginning, we can test our model at each step and see if adding a new tree/round improves performance

First, we need to add the evaluation metric we are interested in to our params dictionary

In [148]:
params['eval_metric'] = "rmse"

We still need to pass a num_boost_round which corresponds to the maximum number of boosting rounds that we allow. We set it to a large value hoping to find the optimal number of rounds before reaching it, if we haven't improved performance on our test dataset in early_stopping_round rounds

In [149]:
num_boost_round = 1000
In [152]:
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dval, "Test")],
    early_stopping_rounds=15
)
print("Best RMSE: {:.2f} with {} rounds".format(
                 model.best_score,
                 model.best_iteration+1))
[0]	Test-rmse:0.84428
[1]	Test-rmse:0.76412
[2]	Test-rmse:0.69321
[3]	Test-rmse:0.63024
[4]	Test-rmse:0.57183
[5]	Test-rmse:0.52069
[6]	Test-rmse:0.47416
[7]	Test-rmse:0.43232
[8]	Test-rmse:0.39470
[9]	Test-rmse:0.36103
[10]	Test-rmse:0.33276
[11]	Test-rmse:0.30706
[12]	Test-rmse:0.28146
[13]	Test-rmse:0.25974
[14]	Test-rmse:0.23986
[15]	Test-rmse:0.22240
[16]	Test-rmse:0.20798
[17]	Test-rmse:0.19449
[18]	Test-rmse:0.18294
[19]	Test-rmse:0.17219
[20]	Test-rmse:0.16313
[21]	Test-rmse:0.15478
[22]	Test-rmse:0.14744
[23]	Test-rmse:0.14179
[24]	Test-rmse:0.13671
[25]	Test-rmse:0.13288
[26]	Test-rmse:0.12950
[27]	Test-rmse:0.12607
[28]	Test-rmse:0.12320
[29]	Test-rmse:0.12077
[30]	Test-rmse:0.11896
[31]	Test-rmse:0.11699
[32]	Test-rmse:0.11530
[33]	Test-rmse:0.11394
[34]	Test-rmse:0.11259
[35]	Test-rmse:0.11165
[36]	Test-rmse:0.11055
[37]	Test-rmse:0.10942
[38]	Test-rmse:0.10887
[39]	Test-rmse:0.10810
[40]	Test-rmse:0.10752
[41]	Test-rmse:0.10697
[42]	Test-rmse:0.10647
[43]	Test-rmse:0.10620
[44]	Test-rmse:0.10580
[45]	Test-rmse:0.10550
[46]	Test-rmse:0.10522
[47]	Test-rmse:0.10508
[48]	Test-rmse:0.10484
[49]	Test-rmse:0.10474
[50]	Test-rmse:0.10464
[51]	Test-rmse:0.10449
[52]	Test-rmse:0.10437
[53]	Test-rmse:0.10416
[54]	Test-rmse:0.10412
[55]	Test-rmse:0.10406
[56]	Test-rmse:0.10393
[57]	Test-rmse:0.10393
[58]	Test-rmse:0.10379
[59]	Test-rmse:0.10379
[60]	Test-rmse:0.10363
[61]	Test-rmse:0.10363
[62]	Test-rmse:0.10357
[63]	Test-rmse:0.10358
[64]	Test-rmse:0.10363
[65]	Test-rmse:0.10364
[66]	Test-rmse:0.10349
[67]	Test-rmse:0.10345
[68]	Test-rmse:0.10344
[69]	Test-rmse:0.10345
[70]	Test-rmse:0.10338
[71]	Test-rmse:0.10333
[72]	Test-rmse:0.10332
[73]	Test-rmse:0.10332
[74]	Test-rmse:0.10336
[75]	Test-rmse:0.10334
[76]	Test-rmse:0.10330
[77]	Test-rmse:0.10325
[78]	Test-rmse:0.10321
[79]	Test-rmse:0.10324
[80]	Test-rmse:0.10320
[81]	Test-rmse:0.10314
[82]	Test-rmse:0.10316
[83]	Test-rmse:0.10315
[84]	Test-rmse:0.10314
[85]	Test-rmse:0.10314
[86]	Test-rmse:0.10314
[87]	Test-rmse:0.10313
[88]	Test-rmse:0.10309
[89]	Test-rmse:0.10309
[90]	Test-rmse:0.10307
[91]	Test-rmse:0.10306
[92]	Test-rmse:0.10304
[93]	Test-rmse:0.10304
[94]	Test-rmse:0.10303
[95]	Test-rmse:0.10301
[96]	Test-rmse:0.10297
[97]	Test-rmse:0.10297
[98]	Test-rmse:0.10296
[99]	Test-rmse:0.10297
[100]	Test-rmse:0.10295
[101]	Test-rmse:0.10293
[102]	Test-rmse:0.10293
[103]	Test-rmse:0.10293
[104]	Test-rmse:0.10293
[105]	Test-rmse:0.10290
[106]	Test-rmse:0.10287
[107]	Test-rmse:0.10287
[108]	Test-rmse:0.10285
[109]	Test-rmse:0.10285
[110]	Test-rmse:0.10285
[111]	Test-rmse:0.10285
[112]	Test-rmse:0.10285
[113]	Test-rmse:0.10283
[114]	Test-rmse:0.10283
[115]	Test-rmse:0.10282
[116]	Test-rmse:0.10278
[117]	Test-rmse:0.10277
[118]	Test-rmse:0.10277
[119]	Test-rmse:0.10277
[120]	Test-rmse:0.10275
[121]	Test-rmse:0.10272
[122]	Test-rmse:0.10271
[123]	Test-rmse:0.10270
[124]	Test-rmse:0.10269
[125]	Test-rmse:0.10268
[126]	Test-rmse:0.10267
[127]	Test-rmse:0.10267
[128]	Test-rmse:0.10266
[129]	Test-rmse:0.10267
[130]	Test-rmse:0.10267
[131]	Test-rmse:0.10267
[132]	Test-rmse:0.10264
[133]	Test-rmse:0.10264
[134]	Test-rmse:0.10264
[135]	Test-rmse:0.10264
[136]	Test-rmse:0.10263
[137]	Test-rmse:0.10261
[138]	Test-rmse:0.10261
[139]	Test-rmse:0.10261
[140]	Test-rmse:0.10260
[141]	Test-rmse:0.10261
[142]	Test-rmse:0.10262
[143]	Test-rmse:0.10261
[144]	Test-rmse:0.10260
[145]	Test-rmse:0.10259
[146]	Test-rmse:0.10259
[147]	Test-rmse:0.10258
[148]	Test-rmse:0.10257
[149]	Test-rmse:0.10257
[150]	Test-rmse:0.10257
[151]	Test-rmse:0.10257
[152]	Test-rmse:0.10256
[153]	Test-rmse:0.10256
[154]	Test-rmse:0.10255
[155]	Test-rmse:0.10255
[156]	Test-rmse:0.10253
[157]	Test-rmse:0.10254
[158]	Test-rmse:0.10254
[159]	Test-rmse:0.10254
[160]	Test-rmse:0.10254
[161]	Test-rmse:0.10254
[162]	Test-rmse:0.10254
[163]	Test-rmse:0.10253
[164]	Test-rmse:0.10253
[165]	Test-rmse:0.10253
[166]	Test-rmse:0.10253
[167]	Test-rmse:0.10253
[168]	Test-rmse:0.10253
[169]	Test-rmse:0.10253
[170]	Test-rmse:0.10253
Best RMSE: 0.10 with 157 rounds

As we can see we stopped before reaching the maximum number of boosting rounds, that’s because after the 85th tree adding more rounds did not lead to improvements of RMSE on the test dataset.

In order to tune the other hyperparameters, we will use the cv function from XGBoost. It allows us to run cross-validation on our training dataset and returns a mean RMSE score. We don’t need to pass a test dataset here. It’s because the cross-validation function is splitting the train dataset into nfolds and iteratively keeps one of the folds for test purposes

In [154]:
# Let’s make a list containing all the combinations of the parameters we want to try.
num_boost_round = 200

learning_rateList = [0.05 ,0.10, 0.15, 0.30 , 0.6, 1.0]
max_depthList = [ 3, 4, 5, 6, 8, 10, 12, 15]
min_child_weightList = [ 1, 2, 3, 4, 5]
gammaList = [ 0.0, 0.1, 0.2, 0.3, 0.4]
colsample_bytreeList = [ 0.5, 0.7, 0.9, 1.0]
alphaList =  [0, 3, 6, 9]

gridsearch_params = [
    (max_depth, min_child_weight, learning_rate, gamma, colsample_bytree, alpha)
    for max_depth in max_depthList
    for min_child_weight in min_child_weightList
    for learning_rate in learning_rateList
    for gamma in gammaList
    for colsample_bytree in colsample_bytreeList
    for alpha in alphaList
]

Let’s run cross validation on each of those pairs.

In [156]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=1)
dtrain = xgb.DMatrix(data = X_train,label=y_train)


# Define initial best params 
min_RMSE = float("Inf")
best_params = None
t = time.time()
    # do 
for max_depth, min_child_weight, learning_rate, gamma, colsample_bytree, alpha in gridsearch_params:
    #print("CV with max_depth={}, min_child_weight={}".format(
    #                         max_depth,
    #                         min_child_weight))
    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    params['learning_rate'] = learning_rate
    params['gamma'] = gamma
    params['colsample_bytree'] = colsample_bytree
    params['alpha'] = alpha
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'rmse'},
        early_stopping_rounds=15
    )
    # Update best RMSE
    mean_RMSE = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin()
    #print("\tMAE {} for {} rounds".format(mean_RMSE, boost_rounds))
    if mean_RMSE < min_RMSE:
        min_RMSE = mean_RMSE
        best_params = (max_depth, min_child_weight, learning_rate, gamma, colsample_bytree, alpha)

elapsed = time.time() - t

print("Best parameters for the XGBoost model are: \n max_depth {}, \n min_child_weight {}, \n learning_rate {}, \n gamma {}, \n colsample_bytree {}, \n alpha{}.\n with a RMSE equal to {}".format(best_params[0], best_params[1], best_params[2], best_params[3], best_params[4], best_params[5], min_RMSE))
print('The time elapsed to tune the parameters of the XGBoost model is', elapsed)
Best parameters for the XGBoost model are: 
 max_depth 3, 
 min_child_weight 3, 
 learning_rate 0.15, 
 gamma0.0, 
 colsample_bytree 0.9, 
 alpha0.
 with a RMSE equal to 0.099184
The time elapsed to tune the parameters of the XGBoost model is 12328.224152326584
In [165]:
# Slit the data in train, validation and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=1)

xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', max_depth = 3, 
                                     min_child_weight = 3, 
                                     learning_rate = 0.15, 
                                     gamma = 0.0, 
                                     colsample_bytree = 0.9, 
                                     alpha = 0, 
                                     n_estimators = 150, 
                                     seed = 1)

xg_reg.fit(X_train,y_train)
ytest_pred = xg_reg.predict(X_test)

fig, (ax1, ax2) = plt.subplots(2, figsize=(10,7))
fig.suptitle('XGBoost model after tuning the parameters')

ax1.plot(y_test.values, label= 'true values')
ax1.plot(ytest_pred, 'ro', label= 'predicted values')
ax1.legend()
ax1.set_title('True values VS predicted values')
ax1.set(xlabel='', ylabel='mean_us values')


ax2.plot(y_test.values[sort_index], y_test.values[sort_index])
ax2.plot(y_test.values[sort_index], ytest_pred[sort_index], 'ro')
ax2.set(xlabel='true values', ylabel='predicted values')
Out[165]:
[Text(0, 0.5, 'predicted values'), Text(0.5, 0, 'true values')]
In [166]:
ytest_pred = np.zeros((numSimulations, lengthTrainSet))
error = np.zeros((numSimulations, lengthTrainSet))
mape = np.zeros((numSimulations, lengthTrainSet))
accuracy = np.zeros(numSimulations)
mae = np.zeros(numSimulations)
rmse = np.zeros(numSimulations)
#aic = np.zeros(numSimulations)
    
for i in range(0,numSimulations):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=i)
    #X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state=i)

                                         
    xg_reg = xg_reg.fit(X_train, y_train)
    ytest_pred[i] = xg_reg.predict(X_test)
    
    # Calculate the absolute errors vector
    errors[i] = abs(y_test - ytest_pred[i])
    # Calculate mean absolute percentage error vector(MAPE)
    mape[i] = 100 * (errors[i] / y_test)
    
    # Calculate the vector metrics
    accuracy[i] = 100 - np.mean(mape[i])
    mae[i] = np.mean(errors[i])
    rmse[i] = np.sqrt(mean_squared_error(y_test, ytest_pred[i]))
    aic[i] = 2*numFeatures - 2*regrModel.llf
    

# Display the metrics
print('The results of the XGBoost model with optimal tuning parameter:\n')
print('Mean Absolute Error: %.2f ' %np.mean(mae))  
print('Root Mean Square Error: %.2f ' %np.mean(rmse))
print('Accuracy:', round(np.mean(accuracy), 2), '%.')
The results of the XGBoost model with optimal tuning parameter:

Mean Absolute Error: 0.07 
Root Mean Square Error: 0.10 
Accuracy: 93.72 %.

Features selection of the XGBoost model

Another method to triyng to improve the performances is removing some useless features by visualizing your XGBoost models is to examine the importance of each feature column in the original dataset within the model.

There are several types of importance in the Xgboost - it can be computed in several different ways. The default type is gain that shows the average gain across all splits where feature was used

In [159]:
sorted_idx = (-xg_reg.feature_importances_).argsort()

plt.figure(figsize = (8,3))
pyplot.bar(range(len(xg_reg.feature_importances_[sorted_idx])), xg_reg.feature_importances_[sorted_idx])
# Tick labels for x axis
plt.xticks(range(len(xg_reg.feature_importances_[sorted_idx])), X_train.columns[sorted_idx], rotation='90')
# Axis labels and title
plt.ylabel('Importance') 
#plt.xlabel('Variable')
plt.title('Variable Importances for the XGBoost model')
pyplot.show()

We want to choose all the feature that give a gain more than 95%

In [160]:
sortImportance = xg_reg.feature_importances_[sorted_idx]
namesOrderFeatures = X_train.columns[sorted_idx]

totalGain = 0
numbImportFeatures = 0
for i in range(0, X.shape[1]):
    if totalGain >0.95:
        n = i
        break;
    else:
        totalGain = totalGain + sortImportance[i]  
        
        
a = 'In order to obtain a total gain of %.2f' %(100*totalGain)
print(a + '%, we will consider only the following ',n , 'features: \n \n', namesOrderFeatures[:n].values)
In order to obtain a total gain of 95.36%, we will consider only the following  14 features: 
 
 ['maximum_motorSpeed' 'mean_motorSpeed' 'minimum_motorSpeed'
 'rootMeanSquare_motorSpeed' 'mean_torque' 'rootMeanSquare_is'
 'minimum_torque' 'maximum_is' 'maximum_torque' 'skewnees_motorSpeed'
 'mean_ambient' 'skewnees_statorYoke' 'kurtos_is' 'minimum_is']
In [167]:
X_optFeat_xgb = X[['maximum_motorSpeed', 'mean_motorSpeed', 'minimum_motorSpeed',
 'rootMeanSquare_motorSpeed', 'mean_torque', 'rootMeanSquare_is',
 'minimum_torque', 'maximum_is', 'maximum_torque', 'skewnees_motorSpeed',
 'mean_ambient', 'skewnees_statorYoke', 'kurtos_is', 'minimum_is']]
                                                    

X_train, X_test, y_train, y_test = train_test_split(X_optFeat_xgb, y, test_size = 0.20, random_state=1)
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state=1)

xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', max_depth = 3, 
                                     min_child_weight = 3, 
                                     learning_rate = 0.15, 
                                     gamma = 0.0, 
                                     colsample_bytree = 0.9, 
                                     alpha = 0, 
                                     n_estimators = 150, 
                                     seed = 1)

xg_reg.fit(X_train,y_train)
ytest_pred = xg_reg.predict(X_test)

fig, (ax1, ax2) = plt.subplots(2, figsize=(10,7))
fig.suptitle('XGBoost model after tuning the parameters')

ax1.plot(y_test.values, label= 'true values')
ax1.plot(ytest_pred, 'ro', label= 'predicted values')
ax1.legend()
ax1.set_title('True values VS predicted values')
ax1.set(xlabel='', ylabel='mean_us values')


ax2.plot(y_test.values[sort_index], y_test.values[sort_index])
ax2.plot(y_test.values[sort_index], ytest_pred[sort_index], 'ro')
ax2.set(xlabel='true values', ylabel='predicted values')
Out[167]:
[Text(0, 0.5, 'predicted values'), Text(0.5, 0, 'true values')]
In [168]:
ytest_pred = np.zeros((numSimulations, lengthTrainSet))
error = np.zeros((numSimulations, lengthTrainSet))
mape = np.zeros((numSimulations, lengthTrainSet))
accuracy = np.zeros(numSimulations)
mae = np.zeros(numSimulations)
rmse = np.zeros(numSimulations)
#aic = np.zeros(numSimulations)

xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', max_depth = 3, 
                                     min_child_weight = 3, 
                                     learning_rate = 0.15, 
                                     gamma = 0.0, 
                                     colsample_bytree = 0.9, 
                                     alpha = 0, 
                                     n_estimators = 150, 
                                     seed = 1)

for i in range(0,numSimulations):
    X_train, X_test, y_train, y_test = train_test_split(X_optFeat_xgb, y, test_size = 0.20, random_state=i)

    #xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', max_depth = 3, 
    #                                 min_child_weight = 5, 
    #                                 learning_rate = 0.15, 
    #                                 gamma = 0.0, 
    #                                 colsample_bytree = 0.7, 
    #                                 alpha = 0, 
    #                                 n_estimators = 1000, 
    #                                 seed = 1)
                                         
    xg_reg = xg_reg.fit(X_train, y_train)
    ytest_pred[i] = xg_reg.predict(X_test)
    
    # Calculate the absolute errors vector
    errors[i] = abs(y_test - ytest_pred[i])
    # Calculate mean absolute percentage error vector(MAPE)
    mape[i] = 100 * (errors[i] / y_test)
    
    # Calculate the vector metrics
    accuracy[i] = 100 - np.mean(mape[i])
    mae[i] = np.mean(errors[i])
    rmse[i] = np.sqrt(mean_squared_error(y_test, ytest_pred[i]))
    #aic[i] = 2*numFeatures - 2*xg_reg.llf
    

# Display the metrics
print('The results of the XGBoost model with optimal tuning parameter:\n')
print('Mean Absolute Error: %.2f ' %np.mean(mae))  
print('Root Mean Square Error: %.2f ' %np.mean(rmse))
print('Accuracy:', round(np.mean(accuracy), 2), '%.')
The results of the XGBoost model with optimal tuning parameter:

Mean Absolute Error: 0.07 
Root Mean Square Error: 0.09 
Accuracy: 93.94 %.
In [ ]:
 

Random Forest regressor

In [63]:
from numpy import mean
from numpy import std
from numpy import arange
from sklearn.ensemble import RandomForestRegressor
import seaborn as sns;

#from sklearn.datasets import make_classification
#from sklearn.model_selection import cross_val_score
#from sklearn.tree import DecisionTreeClassifier
#from sklearn.model_selection import RepeatedStratifiedKFold
#from sklearn.ensemble import BaggingClassifier
In [584]:
 
In [118]:
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(random_state = 0)

ytest_pred = np.zeros((numSimulations, lengthTrainSet))
error = np.zeros((numSimulations, lengthTrainSet))
mape = np.zeros((numSimulations, lengthTrainSet))
accuracy = np.zeros(numSimulations)
mae = np.zeros(numSimulations)
rmse = np.zeros(numSimulations)
#aic = np.zeros(numSimulations)

for i in range(0,numSimulations):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=i)
    #X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state=i)
                                              

    # Train the model on training data
    rf.fit(X_train, y_train);
    # Use the forest's predict method on the test data
    ytest_pred[i] = rf.predict(X_test)

    # Calculate the absolute errors vector
    errors[i] = abs(y_test - ytest_pred[i])
    # Calculate mean absolute percentage error vector(MAPE)
    mape[i] = 100 * (errors[i] / y_test)
    
    # Calculate the vector metrics
    accuracy[i] = 100 - np.mean(mape[i])
    mae[i] = np.mean(errors[i])
    rmse[i] = np.sqrt(mean_squared_error(y_test, ytest_pred[i]))
    aic[i] = 2*numFeatures - 2*regrModel.llf
    

# Display the metrics
print('The results of the first Random Forest model:\n')
print('Mean Absolute Error: %.2f ' %np.mean(mae))  
print('Root Mean Square Error: %.2f ' %np.mean(rmse))
print('Accuracy:', round(np.mean(accuracy), 2), '%.')
The results of the first Random Forest model:

Mean Absolute Error: 0.08 
Root Mean Square Error: 0.11 
Accuracy: 92.69 %.
In [66]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=1)
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state=1)

# Train the model on training data
rf.fit(X_train, y_train);
    
ytest_pred = rf.predict(X_test)

fig, (ax1, ax2) = plt.subplots(2, figsize=(10,7))
fig.suptitle('First Random Forest model')

ax1.plot(y_test.values, label= 'true values')
ax1.plot(ytest_pred, 'ro', label= 'predicted values')
ax1.legend()
ax1.set_title('True values VS predicted values')
ax1.set(xlabel='', ylabel='mean_us values')


ax2.plot(y_test.values[sort_index], y_test.values[sort_index])
ax2.plot(y_test.values[sort_index], ytest_pred[sort_index], 'ro')
ax2.set(xlabel='true values', ylabel='predicted values')
Out[66]:
[Text(0, 0.5, 'predicted values'), Text(0.5, 0, 'true values')]
In [ ]:
 

Random Forest with a Random Search for tuning the parameters.

In [45]:
# Split the data in train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=1)
In [46]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]

# Number of features to consider at every split
numbOfFeat = len(X.columns)
max_features = ['auto', 'sqrt', numbOfFeat/2]

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 100, num = 10)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Max number of samples to take into account at each tree
max_samples = [0.1, 0.3, 0.5,  0.7, 0.9, 1]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'max_samples' : max_samples,
               'bootstrap': bootstrap}
print(random_grid)
{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000], 'max_features': ['auto', 'sqrt', 14.0], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'max_samples': [0.1, 0.3, 0.5, 0.7, 0.9, 1], 'bootstrap': [True, False]}

On each iteration, the algorithm will choose a difference combination of the features. Altogether, there are 10 3 11 3 3* 6 = 17820 settings! However, the benefit of a random search is that we are not trying every combination, but selecting at random to sample a wide range of values

Now, we instantiate the random search and fit it like any Scikit-Learn model. The most important arguments in RandomizedSearchCV are n_iter, which controls the number of different combinations to try, and cv which is the number of folds to use for cross validation (we use 100 and 3 respectively). More iterations will cover a wider search space and more cv folds reduces the chances of overfitting, but raising each will increase the run time. Machine learning is a field of trade-offs, and performance vs time is one of the most fundamental.

In [54]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor(random_state = 42)
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, scoring = 'neg_mean_absolute_error',
                               n_iter = 500, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train) #or X_train
Fitting 3 folds for each of 500 candidates, totalling 1500 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   23.9s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:   55.2s
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 1500 out of 1500 | elapsed:  3.6min finished
Out[54]:
RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(random_state=42),
                   n_iter=500, n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, None],
                                        'max_features': ['auto', 'sqrt', 14.0],
                                        'max_samples': [0.1, 0.3, 0.5, 0.7, 0.9,
                                                        1],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [100, 200, 300, 400,
                                                         500, 600, 700, 800,
                                                         900, 1000]},
                   random_state=42, scoring='neg_mean_absolute_error',
                   verbose=2)
In [55]:
rf_random.best_params_
Out[55]:
{'n_estimators': 500,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_samples': 0.9,
 'max_features': 'auto',
 'max_depth': 60,
 'bootstrap': True}
In [123]:
# Instantiate model with the best parameters that we have found
rf_bestRandParam = RandomForestRegressor(n_estimators = 500,
 min_samples_split = 5,
 min_samples_leaf = 2,
 max_samples = 0.9,
 max_features = 'auto',
 max_depth = 60,
 bootstrap = True,
 random_state = 42)

ytest_pred = np.zeros((numSimulations, lengthTrainSet))
error = np.zeros((numSimulations, lengthTrainSet))
mape = np.zeros((numSimulations, lengthTrainSet))
accuracy = np.zeros(numSimulations)
mae = np.zeros(numSimulations)
rmse = np.zeros(numSimulations)
#aic = np.zeros(numSimulations)

for i in range(0,numSimulations):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=i)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state=i)
                                              

    # Train the model on training data
    rf_bestRandParam.fit(X_train, y_train);
    # Use the forest's predict method on the test data
    ytest_pred[i] = rf_bestRandParam.predict(X_test)

    # Calculate the absolute errors vector
    errors[i] = abs(y_test - ytest_pred[i])
    # Calculate mean absolute percentage error vector(MAPE)
    mape[i] = 100 * (errors[i] / y_test)
    
    # Calculate the vector metrics
    accuracy[i] = 100 - np.mean(mape[i])
    mae[i] = np.mean(errors[i])
    rmse[i] = np.sqrt(mean_squared_error(y_test, ytest_pred[i]))
    #aic[i] = 2*numFeatures - 2*rf_bestParam.llf
    

# Display the metrics
print('The results of the Random Forest model with a Random Search \nfor tuning the parameters :\n')
print('Mean Absolute Error: %.2f ' %np.mean(mae))  
print('Root Mean Square Error: %.2f ' %np.mean(rmse))
print('Accuracy:', round(np.mean(accuracy), 2), '%.')
The results of the Random Forest model with a Random Search 
for tuning the parameters :

Mean Absolute Error: 0.09 
Root Mean Square Error: 0.12 
Accuracy: 91.75 %.
In [125]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 1)
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state = 1)


# Train the model on training data
rf_bestRandParam.fit(X_train, y_train);
    
ytest_pred = rf_bestRandParam.predict(X_test)

fig, (ax1, ax2) = plt.subplots(2, figsize=(10,7))
fig.suptitle('Random Forest model with a Random Search')

ax1.plot(y_test.values, label= 'true values')
ax1.plot(ytest_pred, 'ro', label= 'predicted values')
ax1.legend()
ax1.set_title('True values VS predicted values')
ax1.set(xlabel='', ylabel='mean_us values')


ax2.plot(y_test.values[sort_index], y_test.values[sort_index])
ax2.plot(y_test.values[sort_index], ytest_pred[sort_index], 'ro')
ax2.set(xlabel='true values', ylabel='predicted values')
Out[125]:
[Text(0, 0.5, 'predicted values'), Text(0.5, 0, 'true values')]

We have no improved our model. That means the Random Search has no explorated the default parameter solution, and all of the other ones had worse performance than the first one.

In order to quantify the usefulness of all the variables in the entire random forest, we can look at the relative importances of the variables. The importances returned in Skicit-learn represent how much including a particular variable improves the prediction.

In [126]:
# Get numerical feature importances
importances = list(rf_bestRandParam.feature_importances_)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(X_train.columns, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];
Variable: maximum_motorSpeed   Importance: 0.31
Variable: mean_motorSpeed      Importance: 0.25
Variable: minimum_motorSpeed   Importance: 0.16
Variable: mean_torque          Importance: 0.07
Variable: rootMeanSquare_motorSpeed Importance: 0.04
Variable: rootMeanSquare_is    Importance: 0.04
Variable: minimum_torque       Importance: 0.02
Variable: maximum_torque       Importance: 0.02
Variable: minimum_is           Importance: 0.02
Variable: skewnees_motorSpeed  Importance: 0.01
Variable: kurtos_motorSpeed    Importance: 0.01
Variable: kurtos_torque        Importance: 0.01
Variable: maximum_is           Importance: 0.01
Variable: skewnees_ambient     Importance: 0.0
Variable: kurtos_ambient       Importance: 0.0
Variable: rootMeanSquare_ambient Importance: 0.0
Variable: mean_ambient         Importance: 0.0
Variable: skewnees_torque      Importance: 0.0
Variable: skewnees_pm          Importance: 0.0
Variable: kurtos_pm            Importance: 0.0
Variable: rootMeanSquare_pm    Importance: 0.0
Variable: mean_pm              Importance: 0.0
Variable: skewnees_statorYoke  Importance: 0.0
Variable: kurtos_statorYoke    Importance: 0.0
Variable: rootMeanSquare_statorYoke Importance: 0.0
Variable: mean_statorYoke      Importance: 0.0
Variable: skewnees_is          Importance: 0.0
Variable: kurtos_is            Importance: 0.0

In future implementations of the model, we can remove those variables that have no importance and the performance will not suffer. Additionally, if we are using a different model, let's say a support vector machine, we could use the random forest feature importances as a kind of feature selection method

In [127]:
sorted_idx = (-rf_bestRandParam.feature_importances_).argsort()

plt.figure(figsize = (8,3))
pyplot.bar(range(len(rf_bestRandParam.feature_importances_[sorted_idx])), rf_bestRandParam.feature_importances_[sorted_idx])
# Tick labels for x axis
plt.xticks(range(len(rf_bestRandParam.feature_importances_[sorted_idx])), X.columns[sorted_idx], rotation='90')
# Axis labels and title
plt.ylabel('Importance') 
#plt.xlabel('Variable')
plt.title('Variable Importances for the Random Forest model')
pyplot.show()
In [128]:
sortImportance = rf_bestRandParam.feature_importances_[sorted_idx]
namesOrderFeatures = X_train.columns[sorted_idx]

totalGain = 0
numbImportFeatures = 0
for i in range(0, X.shape[1]):
    if totalGain >0.95:
        n = i
        break;
    else:
        totalGain = totalGain + sortImportance[i]  
        
        
a = 'In order to obtain a total gain of %.2f' %(100*totalGain)
print(a + '%,\n we will consider only the following ',n , 'features: \n \n', namesOrderFeatures[:n].values)
In order to obtain a total gain of 95.25%,
 we will consider only the following  14 features: 
 
 ['maximum_motorSpeed' 'mean_motorSpeed' 'minimum_motorSpeed' 'mean_torque'
 'rootMeanSquare_is' 'rootMeanSquare_motorSpeed' 'maximum_torque'
 'minimum_torque' 'minimum_is' 'kurtos_motorSpeed' 'maximum_is'
 'skewnees_motorSpeed' 'kurtos_torque' 'skewnees_torque']
In [72]:
X_optFeat_rf = X[['mean_motorSpeed', 'maximum_motorSpeed', 'minimum_motorSpeed', 'mean_torque',
 'rootMeanSquare_is', 'rootMeanSquare_motorSpeed', 'maximum_torque',
 'minimum_is', 'minimum_torque', 'maximum_is', 'skewnees_motorSpeed',
 'kurtos_torque', 'kurtos_motorSpeed', 'skewnees_torque']]
                                                    
# New random forest with only the t most important variables


ytest_pred = np.zeros((numSimulations, lengthTrainSet))
error = np.zeros((numSimulations, lengthTrainSet))
mape = np.zeros((numSimulations, lengthTrainSet))
accuracy = np.zeros(numSimulations)
mae = np.zeros(numSimulations)
rmse = np.zeros(numSimulations)
#aic = np.zeros(numSimulations)


for i in range(0,numSimulations):
    X_train, X_test, y_train, y_test = train_test_split(X_optFeat_rf, y, test_size = 0.20, random_state=i)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state=i)

                                    
    rf_bestRandParam = rf_bestRandParam.fit(X_train, y_train)
    ytest_pred[i] = rf_bestRandParam.predict(X_test)
    
    # Calculate the absolute errors vector
    errors[i] = abs(y_test - ytest_pred[i])
    # Calculate mean absolute percentage error vector(MAPE)
    mape[i] = 100 * (errors[i] / y_test)
    
    # Calculate the vector metrics
    accuracy[i] = 100 - np.mean(mape[i])
    mae[i] = np.mean(errors[i])
    rmse[i] = np.sqrt(mean_squared_error(y_test, ytest_pred[i]))
    #aic[i] = 2*numFeatures - 2*rf_mostImportFeat.llf
    

# Display the metrics
print('The results of the Random Forest model with a subset of the features:\n')
print('Mean Absolute Error: %.2f ' %np.mean(mae))  
print('The Root Mean Square Error: %.2f ' %np.mean(rmse))
print('Accuracy:', round(np.mean(accuracy), 2), '%.')
The results of the Random Forest model with a subset of the features:

Mean Absolute Error: 0.08 
The Root Mean Square Error: 0.12 
Accuracy: 92.48 %.
In [74]:
X_train, X_test, y_train, y_test = train_test_split(X_optFeat_rf, y, test_size = 0.20, random_state=1)
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state=1)

# Train the model on training data
rf_bestRandParam.fit(X_train, y_train);
    
ytest_pred = rf_bestRandParam.predict(X_test)

fig, (ax1, ax2) = plt.subplots(2, figsize=(10,7))
fig.suptitle('Random Forest model after features selection')

ax1.plot(y_test.values, label= 'true values')
ax1.plot(ytest_pred, 'ro', label= 'predicted values')
ax1.legend()
ax1.set_title('True values VS predicted values')
ax1.set(xlabel='', ylabel='mean_us values')


ax2.plot(y_test.values[sort_index], y_test.values[sort_index])
ax2.plot(y_test.values[sort_index], ytest_pred[sort_index], 'ro')
ax2.set(xlabel='true values', ylabel='predicted values')
Out[74]:
[Text(0, 0.5, 'predicted values'), Text(0.5, 0, 'true values')]
Random Forest with a Grid Search for tuning the parameters
In [ ]:
from sklearn.model_selection import GridSearchCV

Random search allowed us to narrow down the range for each hyperparameter. Now that we know where to concentrate our search, we can explicitly specify every combination of settings to try. We do this with GridSearchCV, a method that, instead of sampling randomly from a distribution, evaluates all combinations we define. To use Grid Search, we make another grid based on the best values provided by random search

In [83]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 1)

# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [False, True],
    'max_depth': [40, 60, 80, None],
    'max_features': [ 'auto', 'sqrt', len(X.columns)/2],
    #'min_samples_leaf': [1, 2],
    'min_samples_split': [1, 2, 3, 5],
    'max_samples' : [0.2, 0.5, 0,9, 1.0],
    'n_estimators': [80, 100, 200, 500, 700, 900]
}
# Create a based model
rf = RandomForestRegressor(random_state = 42, criterion = 'mae')
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, scoring = 'neg_mean_absolute_error', 
                          cv = 5, n_jobs = -1, verbose = 2)
In [84]:
# Fit the grid search to the data
grid_search.fit(X_train, y_train)
Fitting 5 folds for each of 2880 candidates, totalling 14400 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 476 tasks      | elapsed: 13.1min
[Parallel(n_jobs=-1)]: Done 1055 tasks      | elapsed: 18.8min
[Parallel(n_jobs=-1)]: Done 1965 tasks      | elapsed: 25.4min
[Parallel(n_jobs=-1)]: Done 2763 tasks      | elapsed: 37.7min
[Parallel(n_jobs=-1)]: Done 3825 tasks      | elapsed: 48.7min
[Parallel(n_jobs=-1)]: Done 5058 tasks      | elapsed: 58.8min
[Parallel(n_jobs=-1)]: Done 6488 tasks      | elapsed: 78.0min
[Parallel(n_jobs=-1)]: Done 8018 tasks      | elapsed: 81.7min
[Parallel(n_jobs=-1)]: Done 9735 tasks      | elapsed: 84.7min
[Parallel(n_jobs=-1)]: Done 11616 tasks      | elapsed: 88.3min
[Parallel(n_jobs=-1)]: Done 13903 tasks      | elapsed: 92.0min
[Parallel(n_jobs=-1)]: Done 14400 out of 14400 | elapsed: 92.1min finished
Out[84]:
GridSearchCV(cv=5,
             estimator=RandomForestRegressor(criterion='mae', random_state=42),
             n_jobs=-1,
             param_grid={'bootstrap': [False, True],
                         'max_depth': [40, 60, 80, None],
                         'max_features': ['auto', 'sqrt', 14.0],
                         'max_samples': [0.2, 0.5, 0, 9, 1.0],
                         'min_samples_split': [1, 2, 3, 5],
                         'n_estimators': [80, 100, 200, 500, 700, 900]},
             scoring='neg_mean_absolute_error', verbose=2)
In [85]:
best_grid = grid_search.best_params_
best_grid
Out[85]:
{'bootstrap': False,
 'max_depth': 40,
 'max_features': 'sqrt',
 'max_samples': 0.2,
 'min_samples_split': 3,
 'n_estimators': 500}
In [129]:
# Instantiate model with the best parameters that we have found
rf_bestGridParam = RandomForestRegressor(n_estimators = 500,
                                         min_samples_split = 3,
                                         #min_samples_leaf = 1,
                                         max_samples = 0.2,
                                         max_features = 'sqrt',
                                         max_depth = 40,
                                         bootstrap = False,
                                         #criterion = 'mae',
                                         random_state = 42)

ytest_pred = np.zeros((numSimulations, lengthTrainSet))
error = np.zeros((numSimulations, lengthTrainSet))
mape = np.zeros((numSimulations, lengthTrainSet))
accuracy = np.zeros(numSimulations)
mae = np.zeros(numSimulations)
rmse = np.zeros(numSimulations)
#aic = np.zeros(numSimulations)

for i in range(0,100):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=i)

    # Train the model on training data
    rf_bestGridParam.fit(X_train, y_train);
    # Use the forest's predict method on the test data
    ytest_pred[i] = rf_bestGridParam.predict(X_test)

    # Calculate the absolute errors vector
    errors[i] = abs(y_test - ytest_pred[i])
    # Calculate mean absolute percentage error vector(MAPE)
    mape[i] = 100 * (errors[i] / y_test)
    
    # Calculate the vector metrics
    accuracy[i] = 100 - np.mean(mape[i])
    mae[i] = np.mean(errors[i])
    rmse[i] = np.sqrt(mean_squared_error(y_test, ytest_pred[i]))
    #aic[i] = 2*numFeatures - 2*rf_bestParam.llf
    

# Display the metrics
print('The results of the first Random Forest model:\n')
print('Mean Absolute Error: %.2f ' %np.mean(mae))  
print('The Root Mean Square Error: %.2f ' %np.mean(rmse))
print('Accuracy:', round(np.mean(accuracy), 2), '%.')
The results of the first Random Forest model:

Mean Absolute Error: 0.08 
The Root Mean Square Error: 0.12 
Accuracy: 92.2 %.
In [130]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 1)
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state = 1)


# Train the model on training data
rf_bestGridParam.fit(X_train, y_train);
    
ytest_pred = rf_bestGridParam.predict(X_test)

fig, (ax1, ax2) = plt.subplots(2, figsize=(10,7))
fig.suptitle('Random Forest model with a grid search for tuning the parameter')

ax1.plot(y_test.values, label= 'true values')
ax1.plot(ytest_pred, 'ro', label= 'predicted values')
ax1.legend()
ax1.set_title('True values VS predicted values')
ax1.set(xlabel='', ylabel='mean_us values')


ax2.plot(y_test.values[sort_index], y_test.values[sort_index])
ax2.plot(y_test.values[sort_index], ytest_pred[sort_index], 'ro')
ax2.set(xlabel='true values', ylabel='predicted values')
Out[130]:
[Text(0, 0.5, 'predicted values'), Text(0.5, 0, 'true values')]

Random Forest model with optimal parameter and features selection.

In order to quantify the usefulness of all the variables in the entire random forest, we can look at the relative importances of the variables. The importances returned in Skicit-learn represent how much including a particular variable improves the prediction.

In [131]:
# Get numerical feature importances
importances = list(rf_bestGridParam.feature_importances_)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(X_train, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];
Variable: mean_motorSpeed      Importance: 0.19
Variable: maximum_motorSpeed   Importance: 0.15
Variable: minimum_motorSpeed   Importance: 0.1
Variable: minimum_is           Importance: 0.07
Variable: mean_torque          Importance: 0.06
Variable: rootMeanSquare_motorSpeed Importance: 0.05
Variable: minimum_torque       Importance: 0.05
Variable: maximum_torque       Importance: 0.03
Variable: mean_statorYoke      Importance: 0.03
Variable: maximum_is           Importance: 0.03
Variable: rootMeanSquare_is    Importance: 0.03
Variable: mean_ambient         Importance: 0.02
Variable: skewnees_motorSpeed  Importance: 0.02
Variable: mean_pm              Importance: 0.02
Variable: kurtos_is            Importance: 0.02
Variable: skewnees_ambient     Importance: 0.01
Variable: kurtos_ambient       Importance: 0.01
Variable: rootMeanSquare_ambient Importance: 0.01
Variable: kurtos_motorSpeed    Importance: 0.01
Variable: skewnees_torque      Importance: 0.01
Variable: kurtos_torque        Importance: 0.01
Variable: skewnees_pm          Importance: 0.01
Variable: kurtos_pm            Importance: 0.01
Variable: rootMeanSquare_pm    Importance: 0.01
Variable: skewnees_statorYoke  Importance: 0.01
Variable: kurtos_statorYoke    Importance: 0.01
Variable: rootMeanSquare_statorYoke Importance: 0.01
Variable: skewnees_is          Importance: 0.01

In future implementations of the model, we can remove those variables that have no importance and the performance will not suffer. Additionally, if we are using a different model, say a support vector machine, we could use the random forest feature importances as a kind of feature selection method

In [132]:
sorted_idx = (-rf_bestGridParam.feature_importances_).argsort()

plt.figure(figsize = (8,3))
pyplot.bar(range(len(rf_bestGridParam.feature_importances_[sorted_idx])), rf_bestGridParam.feature_importances_[sorted_idx])
# Tick labels for x axis
plt.xticks(range(len(rf_bestGridParam.feature_importances_[sorted_idx])), X.columns[sorted_idx], rotation='90')
# Axis labels and title
plt.ylabel('Importance') 
#plt.xlabel('Variable')
plt.title('Variable Importances for the Random Forest model')
pyplot.show()
In [133]:
sortImportance = rf_bestGridParam.feature_importances_[sorted_idx]
namesOrderFeatures = X_train.columns[sorted_idx]

totalGain = 0
numbImportFeatures = 0
for i in range(0, X.shape[1]):
    if totalGain >0.95:
        n = i
        break;
    else:
        totalGain = totalGain + sortImportance[i]  
        
        
a = 'In order to obtain a total gain of %.2f' %(100*totalGain)
print(a + '%, we will consider only the following ',n , 'features: \n \n', namesOrderFeatures[:n].values)
In order to obtain a total gain of 95.60%, we will consider only the following  21 features: 
 
 ['mean_motorSpeed' 'maximum_motorSpeed' 'minimum_motorSpeed' 'minimum_is'
 'mean_torque' 'rootMeanSquare_motorSpeed' 'minimum_torque'
 'rootMeanSquare_is' 'maximum_is' 'maximum_torque' 'mean_statorYoke'
 'kurtos_is' 'mean_pm' 'skewnees_motorSpeed' 'mean_ambient'
 'kurtos_torque' 'rootMeanSquare_ambient' 'kurtos_motorSpeed'
 'skewnees_is' 'rootMeanSquare_statorYoke' 'skewnees_torque']
In [134]:
X_optFeat_rf = X[['mean_motorSpeed', 'maximum_motorSpeed', 'minimum_motorSpeed', 'minimum_is',
 'mean_torque', 'rootMeanSquare_motorSpeed', 'minimum_torque',
 'rootMeanSquare_is', 'maximum_is', 'maximum_torque', 'mean_statorYoke',
 'kurtos_is', 'mean_pm', 'skewnees_motorSpeed', 'mean_ambient',
 'kurtos_torque', 'rootMeanSquare_ambient', 'kurtos_motorSpeed',
 'skewnees_is', 'rootMeanSquare_statorYoke', 'skewnees_torque']]
                                                    
# New random forest with only the t most important variables


ytest_pred = np.zeros((numSimulations, lengthTrainSet))
error = np.zeros((numSimulations, lengthTrainSet))
mape = np.zeros((numSimulations, lengthTrainSet))
accuracy = np.zeros(numSimulations)
mae = np.zeros(numSimulations)
rmse = np.zeros(numSimulations)
#aic = np.zeros(numSimulations)


for i in range(0,numSimulations):
    X_train, X_test, y_train, y_test = train_test_split(X_optFeat_rf, y, test_size = 0.20, random_state=i)
    #X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state=i)

                                    
    rf_bestGridParam = rf_bestGridParam.fit(X_train, y_train)
    ytest_pred[i] = rf_bestGridParam.predict(X_test)
    
    # Calculate the absolute errors vector
    errors[i] = abs(y_test - ytest_pred[i])
    # Calculate mean absolute percentage error vector(MAPE)
    mape[i] = 100 * (errors[i] / y_test)
    
    # Calculate the vector metrics
    accuracy[i] = 100 - np.mean(mape[i])
    mae[i] = np.mean(errors[i])
    rmse[i] = np.sqrt(mean_squared_error(y_test, ytest_pred[i]))
    #aic[i] = 2*numFeatures - 2*rf_mostImportFeat.llf
    

# Display the metrics
print('The results of the Random Forest model \n with a Grid Search and a subset of the features:\n')
print('Mean Absolute Error: %.2f ' %np.mean(mae))  
print('The Root Mean Square Error: %.2f ' %np.mean(rmse))
print('Accuracy:', round(np.mean(accuracy), 2), '%.')
The results of the Random Forest model 
 with a Grid Search and a subset of the features:

Mean Absolute Error: 0.07 
The Root Mean Square Error: 0.11 
Accuracy: 92.72 %.

We have obtained a more accurated predictions than the first model bul worse than the one with the parameter selected with a random search!!

In [84]:
X_train, X_test, y_train, y_test = train_test_split(X_optFeat_rf, y, test_size = 0.20, random_state=1)
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state=1)

# Train the model on training data
rf_bestGridParam.fit(X_train, y_train);
    
ytest_pred = rf_bestGridParam.predict(X_test)

fig, (ax1, ax2) = plt.subplots(2, figsize=(10,7))
fig.suptitle('Random Forest model after features selection')

ax1.plot(y_test.values, label= 'true values')
ax1.plot(ytest_pred, 'ro', label= 'predicted values')
ax1.legend()
ax1.set_title('True values VS predicted values')
ax1.set(xlabel='', ylabel='mean_us values')


ax2.plot(y_test.values[sort_index], y_test.values[sort_index])
ax2.plot(y_test.values[sort_index], ytest_pred[sort_index], 'ro')
ax2.set(xlabel='true values', ylabel='predicted values')
Out[84]:
[Text(0, 0.5, 'predicted values'), Text(0.5, 0, 'true values')]
In [ ]: