import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
# Load the aggregated data
dataFrame = pd.read_csv('../finalAggrData.csv')
aggregatedData = dataFrame.drop(dataFrame.columns[0], axis=1)
aggregatedData
The first model we want to try is the simplest one.
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
from sklearn import metrics
X = aggregatedData.drop('mean_us', axis= 1)
y = aggregatedData['mean_us']
First, we look at the results of a single linear regression model fitted on one train/test split.
# Split the data into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=1)
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state=1)
# Add a column of ones for the intercept coefficient
X_train = sm.add_constant(X_train)
X_test = sm.add_constant(X_test)
# Fit the linear model
regrModel = sm.OLS(y_train, X_train)
regrModel = regrModel.fit()
print(regrModel.summary())
ytest_pred = regrModel.predict(X_test)
fig, (ax1, ax2) = plt.subplots(2, figsize=(10,7))
fig.suptitle('First linear model')
ax1.plot(y_test.values, label= 'true values')
ax1.plot(ytest_pred.values, 'ro', label= 'predicted values')
ax1.legend()
ax1.set_title('True values VS predicted values')
ax1.set(xlabel='', ylabel='mean_us values')
sort_index = np.argsort(y_test.values)
ax2.plot(y_test.values[sort_index], y_test.values[sort_index])
ax2.plot(y_test.values[sort_index], ytest_pred.values[sort_index], 'ro')
ax2.set(xlabel='true values', ylabel='predicted values')
Now we want to compute some metrics in order to better understand the results:
Moreover, to obtain robust metrics for analysing the models, the outputs will be averaged over several simulations, each computed on a different random split of the data.
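For reference, these are the formulas implemented in the loops below, where $n$ is the number of test samples, $k$ the number of features and $\hat L$ the maximised likelihood of the fitted model:

$$\text{MAPE} = \frac{100}{n}\sum_{i=1}^{n}\frac{|y_i-\hat y_i|}{y_i}, \qquad \text{Accuracy} = 100 - \text{MAPE}, \qquad \text{MAE} = \frac{1}{n}\sum_{i=1}^{n}|y_i-\hat y_i|$$

$$\text{RMSE} = \sqrt{\frac{1}{n}\sum_{i=1}^{n}(y_i-\hat y_i)^2}, \qquad \text{AIC} = 2k - 2\ln\hat L$$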
numSimulations = 100
lengthTrainSet = X_test.shape[0]  # size of each test split
numFeatures = X.shape[1]
ytest_pred = np.zeros((numSimulations, lengthTrainSet))
errors = np.zeros((numSimulations, lengthTrainSet))
mape = np.zeros((numSimulations, lengthTrainSet))
accuracy = np.zeros(numSimulations)
mae = np.zeros(numSimulations)
rmse = np.zeros(numSimulations)
aic = np.zeros(numSimulations)
for i in range(numSimulations):
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=i)
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state=i)
X_train = sm.add_constant(X_train)
X_test = sm.add_constant(X_test)
regrModel = sm.OLS(y_train, X_train)
regrModel = regrModel.fit()
ytest_pred[i] = regrModel.predict(X_test)
# Calculate the absolute errors vector
errors[i] = abs(y_test - ytest_pred[i])
# Calculate mean absolute percentage error vector(MAPE)
mape[i] = 100 * (errors[i] / y_test)
# Calculate the vector metrics
accuracy[i] = 100 - np.mean(mape[i])
mae[i] = np.mean(errors[i])
rmse[i] = np.sqrt(mean_squared_error(y_test, ytest_pred[i]))
aic[i] = 2*numFeatures - 2*regrModel.llf
# Display the metrics
print('The results of the first linear model:\n')
print('Mean Absolute Error: %.2f ' %np.mean(mae))
print('The Root Mean Square Error: %.2f ' %np.mean(rmse))
print('Accuracy:', round(np.mean(accuracy), 2), '%.')
print('AIC : %.2f' %np.mean(aic) )
It is evident that not all the features are important; that is, we cannot exclude that most of the coefficients are zero. In fact, the p-value associated with the null hypothesis that a coefficient beta_i is zero is very large for some of them. Regarding the hypothesis that the model with only the intercept is equivalent to the one just fitted, it is reasonable to reject it, since the F-statistic is quite large.
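As a quick check, we can list directly from the fitted results the coefficients whose p-values exceed the usual 5% threshold (a small sketch; the 5% cutoff is an arbitrary convention, and the intercept appears under the name 'const'):
# Coefficients that are not significantly different from zero at the 5% level
insignificant = regrModel.pvalues[regrModel.pvalues > 0.05]
print(insignificant.sort_values(ascending=False))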
Now we analyze another model by considering only the following variables:
# Split the data into train and test set
X_train, X_test, y_train, y_test = train_test_split(X[['mean_ambient', 'skewnees_motorSpeed', 'mean_motorSpeed',
'minimum_torque', 'maximum_torque', 'mean_torque', 'rootMeanSquare_is',
'rootMeanSquare_statorYoke', 'minimum_is', 'skewnees_is']], y, test_size = 0.20, random_state=1)
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state=1)
# Add a column of ones for the intercept coefficient
X_train = sm.add_constant(X_train)
X_test = sm.add_constant(X_test)
regrModel2 = sm.OLS(y_train, X_train)
regrModel2 = regrModel2.fit()
print(regrModel2.summary())
ytest_pred = regrModel2.predict(X_test)
fig, (ax1, ax2) = plt.subplots(2, figsize=(10,7))
fig.suptitle('More parsimonious linear model')
ax1.plot(y_test.values, label= 'true values')
ax1.plot(ytest_pred.values, 'ro', label= 'predicted values')
ax1.legend()
ax1.set_title('True values VS predicted values')
ax1.set(xlabel='', ylabel='mean_us values')
ax2.plot(y_test.values[sort_index], y_test.values[sort_index])
ax2.plot(y_test.values[sort_index], ytest_pred.values[sort_index], 'ro')
ax2.set(xlabel='true values', ylabel='predicted values')
X_morePars = X[['mean_ambient', 'skewnees_motorSpeed', 'mean_motorSpeed',
'minimum_torque', 'maximum_torque', 'mean_torque', 'rootMeanSquare_is',
'rootMeanSquare_statorYoke', 'minimum_is', 'skewnees_is']]
numFeatures = X_morePars.shape[1]
ytest_pred = np.zeros((numSimulations, lengthTrainSet))
errors = np.zeros((numSimulations, lengthTrainSet))
mape = np.zeros((numSimulations, lengthTrainSet))
accuracy = np.zeros(numSimulations)
mae = np.zeros(numSimulations)
rmse = np.zeros(numSimulations)
aic = np.zeros(numSimulations)
for i in range(0,numSimulations):
X_train, X_test, y_train, y_test = train_test_split(X_morePars, y, test_size = 0.20, random_state=i)
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state=i)
X_train = sm.add_constant(X_train)
X_test = sm.add_constant(X_test)
regrModel = sm.OLS(y_train, X_train)
regrModel = regrModel.fit()
ytest_pred[i] = regrModel.predict(X_test)
# Calculate the absolute errors vector
errors[i] = abs(y_test - ytest_pred[i])
# Calculate mean absolute percentage error vector(MAPE)
mape[i] = 100 * (errors[i] / y_test)
# Calculate the vector metrics
accuracy[i] = 100 - np.mean(mape[i])
mae[i] = np.mean(errors[i])
rmse[i] = np.sqrt(mean_squared_error(y_test, ytest_pred[i]))
aic[i] = 2*numFeatures - 2*regrModel.llf
# Display the metrics
print('The results of the linear model without the features with a p-value higher than 0.5:\n')
print('Mean Absolute Error: %.2f' %np.mean(mae))
print('The Root Mean Square Error: %.2f ' %np.mean(rmse))
print('Accuracy:', round(np.mean(accuracy), 2), '%.')
print('AIC : %.2f' %np.mean(aic) )
It seems that the model has neither worsened nor improved. Using a p-value cutoff of 0.05 means that if you add 100 features that are pure noise to a model, 5 of them (on average) will still be counted as significant. Moreover, a feature with a large p-value appears unimportant only when the model is fitted with all the other features; if we remove some features, the p-value of that feature could decrease (or increase)!
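A small illustration of this last point (a sketch: 'mean_torque' is dropped purely as an example, any other column would do):
# Dropping one feature can change the p-values of the remaining ones
X_sub = X_morePars.drop(columns=['mean_torque'])
fit_full = sm.OLS(y, sm.add_constant(X_morePars)).fit()
fit_sub = sm.OLS(y, sm.add_constant(X_sub)).fit()
print(pd.concat([fit_full.pvalues, fit_sub.pvalues], axis=1, keys=['full', 'reduced']))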
Now we want to select a subset of the initial features. We do that by looking for the lowest AIC and RMSE values.
We can't evaluate all the combinations of features because it would be too computationally expensive. If, for instance, we wanted to test every combination of 14 features out of the 28 available, the machine would have to consider C(28, 14) = 28!/(14! 14!) ≈ 40 × 10^6 fits. If we suppose that each fit takes 0.08 seconds, the time needed to complete the code is about 891 hours, and that is for a single subset size!
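A quick sanity check of this back-of-the-envelope estimate (a sketch; the 0.08 seconds per fit is the assumption stated above, and math.comb requires Python 3.8+):
import math
n_fits = math.comb(28, 14)            # 40_116_600, i.e. about 40*10**6
hours_needed = n_fits * 0.08 / 3600   # about 891 hours
print(n_fits, round(hours_needed, 1))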
import time
# I divide the data in train and test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=1)
# Split the train data in train and validation set.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state=1)
import itertools
import numpy as np
import pandas as pd
import statsmodels.api as sm
AICs = {}
RMSEs = {}
rang = [1, 2, 3, 4, 5, 6, 24, 25, 26, 27, 28] # We only try some subset sizes: evaluating every size would be too computationally expensive
# range(1,len(X_train.columns)+1, 13)
for k in rang:
t = time.time()
# Exhaustive search over all feature subsets of size k
for variables in itertools.combinations(X_train.columns, k):
predictorsTrain = X_train[list(variables)]
predictorsTest = X_val[list(variables)]
#print(variables)
#predictors['Intercept'] = 1
res = sm.OLS(y_train, sm.add_constant(predictorsTrain)).fit()
yVal_pred = res.predict(sm.add_constant(predictorsTest))
AICs[variables] = 2*(k) - 2*res.llf
RMSEs[variables] = np.sqrt(mean_squared_error(y_val, yVal_pred))
elapsed = time.time() - t
#print(elapsed)
features_minAIC = min(AICs, key=AICs.get)
print('The features associated to a minimum AIC are: \n', features_minAIC)
print('\nThe number of features is ', len(features_minAIC), ', and the AIC obtained is equal to %.3f' %min(AICs.values()))
featuresAIC_Excluded = np.setdiff1d(X_train.columns, features_minAIC)
print('The features that have been removed when we were looking for the minimum AIC are: \n', featuresAIC_Excluded)
Now we can fit our last linear model using the features mentioned above.
# I divide the data in train and test set.
X_train, X_test, y_train, y_test = train_test_split(X[['skewnees_ambient', 'kurtos_ambient', 'rootMeanSquare_ambient',
'mean_ambient', 'minimum_motorSpeed', 'skewnees_motorSpeed',
'rootMeanSquare_motorSpeed', 'mean_motorSpeed', 'minimum_torque',
'maximum_torque', 'skewnees_torque', 'kurtos_torque', 'mean_torque',
'skewnees_pm', 'kurtos_pm', 'rootMeanSquare_pm', 'mean_pm',
'kurtos_statorYoke', 'rootMeanSquare_statorYoke', 'minimum_is',
'maximum_is', 'skewnees_is', 'kurtos_is', 'rootMeanSquare_is']],
y, test_size = 0.20, random_state=1)
regrModelFinal = sm.OLS(y_train, sm.add_constant(X_train)).fit()
print(regrModelFinal.summary())
ytest_pred = regrModelFinal.predict(sm.add_constant(X_test)) #[list(features_minAIC)]
fig, (ax1, ax2) = plt.subplots(2, figsize=(10,7))
fig.suptitle('Linear model with a subset of the optimal features')
ax1.plot(y_test.values, label= 'true values')
ax1.plot(ytest_pred.values, 'ro', label= 'predicted values')
#plt.ylim((0.4,2.6))
ax1.legend()
ax1.set_title('True VS predicted values')
ax1.set(xlabel='', ylabel='mean_us values')
ax2.plot(y_test.values[sort_index], y_test.values[sort_index])
ax2.plot(y_test.values[sort_index], ytest_pred.values[sort_index], 'ro')
ax2.set(xlabel='true values', ylabel='predicted values')
X_optFeat_linRegr = X[['skewnees_ambient', 'kurtos_ambient', 'rootMeanSquare_ambient',
'mean_ambient', 'minimum_motorSpeed', 'skewnees_motorSpeed',
'rootMeanSquare_motorSpeed', 'mean_motorSpeed', 'minimum_torque',
'maximum_torque', 'skewnees_torque', 'kurtos_torque', 'mean_torque',
'skewnees_pm', 'kurtos_pm', 'rootMeanSquare_pm', 'mean_pm',
'kurtos_statorYoke', 'rootMeanSquare_statorYoke', 'minimum_is',
'maximum_is', 'skewnees_is', 'kurtos_is', 'rootMeanSquare_is']]
X_optFeat_linRegr = sm.add_constant(X_optFeat_linRegr)
numSimulations = 100
lengthTrainSet = X_val.shape[0]
numFeatures = X_optFeat_linRegr.shape[1]
ytest_pred = np.zeros((numSimulations, lengthTrainSet))
errors = np.zeros((numSimulations, lengthTrainSet))
mape = np.zeros((numSimulations, lengthTrainSet))
accuracy = np.zeros(numSimulations)
mae = np.zeros(numSimulations)
rmse = np.zeros(numSimulations)
aic = np.zeros(numSimulations)
for i in range(0,numSimulations):
X_train, X_test, y_train, y_test = train_test_split(X_optFeat_linRegr, y, test_size = 0.20, random_state=i)
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state=i)
regrModel = sm.OLS(y_train, X_train)
regrModel = regrModel.fit()
ytest_pred[i] = regrModel.predict(X_test)
# Calculate the absolute errors vector
errors[i] = abs(y_test - ytest_pred[i])
# Calculate mean absolute percentage error vector(MAPE)
mape[i] = 100 * (errors[i] / y_test)
# Calculate the vector metrics
accuracy[i] = 100 - np.mean(mape[i])
mae[i] = np.mean(errors[i])
rmse[i] = np.sqrt(mean_squared_error(y_test, ytest_pred[i]))
aic[i] = 2*numFeatures - 2*regrModel.llf
# Display the metrics
print('The results of the linear regression model \nwith optimal features are:\n')
print('Mean Absolute Error: %.2f ' %np.mean(mae))
print('Root Mean Square Error: %.2f ' %np.mean(rmse))
print('Accuracy:', round(np.mean(accuracy), 2), '%.')
print('AIC : %.2f' %np.mean(aic) )
Even though we have not been able to select the absolute best set of features for the linear regression, it is evident that this model does not fit the data well: the performance is far from good. These metrics can be taken as a starting point.
I will build the model using trees as base learners (which are the default base learners) through XGBoost's scikit-learn compatible API. Along the way, I will also tune some of the parameters which XGBoost provides in order to improve the model's performance, and I will use the root mean squared error (RMSE) metric to check the performance of the trained model on the test set.
Later, I will also convert the dataset into an optimized data structure called DMatrix, which XGBoost supports and which gives it its performance and efficiency gains.
import xgboost as xgb
from matplotlib import pyplot
The next step is to instantiate an XGBoost regressor object by calling the XGBRegressor() class from the XGBoost library, with the hyperparameters passed as arguments. For classification problems, I would use the XGBClassifier() class instead.
The most common parameters to know are the ones tuned below: the number of trees, the maximum tree depth, the minimum child weight, the learning rate (eta), and the row/column subsampling ratios.
XGBoost also supports regularization parameters that penalize models as they become more complex and shrink them towards simpler (parsimonious) models; see the sketch below.
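A sketch of how those parameters would be passed to the constructor (the values below are purely illustrative, not the ones we will end up using; reg_alpha and reg_lambda are the L1 and L2 regularization terms):
xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,       # number of boosting rounds (trees)
    max_depth=6,            # maximum depth of each tree
    learning_rate=0.3,      # shrinkage applied to each new tree (eta)
    min_child_weight=1,     # minimum sum of instance weights needed in a child
    subsample=1.0,          # fraction of rows sampled for each tree
    colsample_bytree=1.0,   # fraction of columns sampled for each tree
    gamma=0.0,              # minimum loss reduction required to make a split
    reg_alpha=0.0,          # L1 regularization on the leaf weights
    reg_lambda=1.0,         # L2 regularization on the leaf weights
    seed=1)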
# Split the data into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=1)
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state=1)
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', seed = 1)
xg_reg.fit(X_train,y_train)
ytest_pred = xg_reg.predict(X_test)
fig, (ax1, ax2) = plt.subplots(2, figsize=(10,7))
fig.suptitle('First XGBoost model')
ax1.plot(y_test.values, label= 'true values')
ax1.plot(ytest_pred, 'ro', label= 'predicted values')
ax1.legend()
ax1.set_title('True values VS predicted values')
ax1.set(xlabel='', ylabel='mean_us values')
ax2.plot(y_test.values[sort_index], y_test.values[sort_index])
ax2.plot(y_test.values[sort_index], ytest_pred[sort_index], 'ro')
ax2.set(xlabel='true values', ylabel='predicted values')
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', seed = 1)
ytest_pred = np.zeros((numSimulations, lengthTrainSet))
errors = np.zeros((numSimulations, lengthTrainSet))
mape = np.zeros((numSimulations, lengthTrainSet))
accuracy = np.zeros(numSimulations)
mae = np.zeros(numSimulations)
rmse = np.zeros(numSimulations)
#aic = np.zeros(numSimulations)
for i in range(0,numSimulations):
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=i)
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state=i)
xg_reg.fit(X_train,y_train)
ytest_pred[i] = xg_reg.predict(X_test)
# Calculate the absolute errors vector
errors[i] = abs(y_test - ytest_pred[i])
# Calculate mean absolute percentage error vector(MAPE)
mape[i] = 100 * (errors[i] / y_test)
# Calculate the vector metrics
accuracy[i] = 100 - np.mean(mape[i])
mae[i] = np.mean(errors[i])
rmse[i] = np.sqrt(mean_squared_error(y_test, ytest_pred[i]))
# AIC is not computed for the XGBoost model
# Display the metrics
print('The results of the XGBoost model with \n the parameters chosen by default:\n')
print('Mean Absolute Error: %.2f ' %np.mean(mae))
print('Root Mean Square Error: %.2f ' %np.mean(rmse))
print('Accuracy:', round(np.mean(accuracy), 2), '%.')
These are definitely good results! The predicted values seem to fit the real ones, and the performance is very good, with an accuracy higher than 90%!
In order to build more robust models, it is common to do a k-fold cross validation when tuning the parameters, where all the entries in the original training dataset are used for both training and validation, and each entry is used for validation just once. XGBoost supports k-fold cross validation via the cv() method. All we need to do is specify the nfold parameter, which is the number of cross validation folds we want to build.
The first parameter we will look at is not part of the params dictionary, but will be passed as a standalone argument to the training method. This parameter is called num_boost_round and corresponds to the number of boosting rounds, i.e. trees, to build.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state=1)
dtrain = xgb.DMatrix(data = X_train,label=y_train)
dval = xgb.DMatrix(data = X_val, label=y_val)
params = {
# Parameters that we are going to tune.
'max_depth':6,
'min_child_weight': 1,
'learning_rate': 0.1, # also known as eta
'subsample': 1,
'colsample_bytree': 1,
'alpha': 0,
# Other parameters
'objective':'reg:squarederror',
}
XGBoost provides a nice way to find the best number of rounds whilst training. Since trees are built sequentially, instead of fixing the number of rounds at the beginning, we can test our model at each step and see whether adding a new tree/round improves performance.
First, we need to add the evaluation metric we are interested in to our params dictionary.
params['eval_metric'] = "rmse"
We still need to pass a num_boost_round, which corresponds to the maximum number of boosting rounds that we allow. We set it to a large value, hoping to find the optimal number of rounds before reaching it: training stops early if performance on the validation set has not improved for early_stopping_rounds rounds.
num_boost_round = 1000
model = xgb.train(
params,
dtrain,
num_boost_round=num_boost_round,
evals=[(dval, "Test")],
early_stopping_rounds=15
)
print("Best RMSE: {:.2f} with {} rounds".format(
model.best_score,
model.best_iteration+1))
As we can see, we stopped before reaching the maximum number of boosting rounds: after the 85th tree, adding more rounds did not lead to any improvement of the RMSE on the validation set.
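If we want predictions that use only the trees up to the best round, recent versions of XGBoost (1.4 or later, an assumption about the installed version) expose this via iteration_range:
# Predict with the trees up to (and including) the best boosting round
bestRound_pred = model.predict(dval, iteration_range=(0, model.best_iteration + 1))
print('RMSE at the best round: %.2f' % np.sqrt(mean_squared_error(y_val, bestRound_pred)))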
In order to tune the other hyperparameters, we will use the cv function from XGBoost. It allows us to run cross-validation on our training dataset and returns a mean RMSE score. We don't need to pass a test dataset here, because the cross-validation function splits the training dataset into nfold folds and iteratively holds one of them out for validation.
# Let’s make a list containing all the combinations of the parameters we want to try.
num_boost_round = 200
learning_rateList = [0.05 ,0.10, 0.15, 0.30 , 0.6, 1.0]
max_depthList = [ 3, 4, 5, 6, 8, 10, 12, 15]
min_child_weightList = [ 1, 2, 3, 4, 5]
gammaList = [ 0.0, 0.1, 0.2, 0.3, 0.4]
colsample_bytreeList = [ 0.5, 0.7, 0.9, 1.0]
alphaList = [0, 3, 6, 9]
gridsearch_params = [
(max_depth, min_child_weight, learning_rate, gamma, colsample_bytree, alpha)
for max_depth in max_depthList
for min_child_weight in min_child_weightList
for learning_rate in learning_rateList
for gamma in gammaList
for colsample_bytree in colsample_bytreeList
for alpha in alphaList
]
Let's run cross validation on each of those combinations.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=1)
dtrain = xgb.DMatrix(data = X_train,label=y_train)
# Define initial best params
min_RMSE = float("Inf")
best_params = None
t = time.time()
# Grid search over all the parameter combinations
for max_depth, min_child_weight, learning_rate, gamma, colsample_bytree, alpha in gridsearch_params:
#print("CV with max_depth={}, min_child_weight={}".format(
# max_depth,
# min_child_weight))
# Update our parameters
params['max_depth'] = max_depth
params['min_child_weight'] = min_child_weight
params['learning_rate'] = learning_rate
params['gamma'] = gamma
params['colsample_bytree'] = colsample_bytree
params['alpha'] = alpha
# Run CV
cv_results = xgb.cv(
params,
dtrain,
num_boost_round=num_boost_round,
seed=42,
nfold=5,
metrics={'rmse'},
early_stopping_rounds=15
)
# Update best RMSE
mean_RMSE = cv_results['test-rmse-mean'].min()
boost_rounds = cv_results['test-rmse-mean'].argmin()
#print("\tMAE {} for {} rounds".format(mean_RMSE, boost_rounds))
if mean_RMSE < min_RMSE:
min_RMSE = mean_RMSE
best_params = (max_depth, min_child_weight, learning_rate, gamma, colsample_bytree, alpha)
elapsed = time.time() - t
print("Best parameters for the XGBoost model are: \n max_depth {}, \n min_child_weight {}, \n learning_rate {}, \n gamma {}, \n colsample_bytree {}, \n alpha{}.\n with a RMSE equal to {}".format(best_params[0], best_params[1], best_params[2], best_params[3], best_params[4], best_params[5], min_RMSE))
print('The time elapsed to tune the parameters of the XGBoost model is', elapsed)
# Split the data into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=1)
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', max_depth = 3,
min_child_weight = 3,
learning_rate = 0.15,
gamma = 0.0,
colsample_bytree = 0.9,
alpha = 0,
n_estimators = 150,
seed = 1)
xg_reg.fit(X_train,y_train)
ytest_pred = xg_reg.predict(X_test)
fig, (ax1, ax2) = plt.subplots(2, figsize=(10,7))
fig.suptitle('XGBoost model after tuning the parameters')
ax1.plot(y_test.values, label= 'true values')
ax1.plot(ytest_pred, 'ro', label= 'predicted values')
ax1.legend()
ax1.set_title('True values VS predicted values')
ax1.set(xlabel='', ylabel='mean_us values')
ax2.plot(y_test.values[sort_index], y_test.values[sort_index])
ax2.plot(y_test.values[sort_index], ytest_pred[sort_index], 'ro')
ax2.set(xlabel='true values', ylabel='predicted values')
ytest_pred = np.zeros((numSimulations, lengthTrainSet))
errors = np.zeros((numSimulations, lengthTrainSet))
mape = np.zeros((numSimulations, lengthTrainSet))
accuracy = np.zeros(numSimulations)
mae = np.zeros(numSimulations)
rmse = np.zeros(numSimulations)
#aic = np.zeros(numSimulations)
for i in range(0,numSimulations):
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=i)
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state=i)
xg_reg = xg_reg.fit(X_train, y_train)
ytest_pred[i] = xg_reg.predict(X_test)
# Calculate the absolute errors vector
errors[i] = abs(y_test - ytest_pred[i])
# Calculate mean absolute percentage error vector(MAPE)
mape[i] = 100 * (errors[i] / y_test)
# Calculate the vector metrics
accuracy[i] = 100 - np.mean(mape[i])
mae[i] = np.mean(errors[i])
rmse[i] = np.sqrt(mean_squared_error(y_test, ytest_pred[i]))
# AIC is not computed for the XGBoost model
# Display the metrics
print('The results of the XGBoost model with the tuned parameters:\n')
print('Mean Absolute Error: %.2f ' %np.mean(mae))
print('Root Mean Square Error: %.2f ' %np.mean(rmse))
print('Accuracy:', round(np.mean(accuracy), 2), '%.')
Another way to try to improve the performance is to remove some useless features. A good way to identify them is to examine the importance of each feature column of the original dataset within the trained XGBoost model.
There are several types of importance in XGBoost, i.e. it can be computed in several different ways. The default type is gain, which shows the average gain across all the splits where the feature was used; see the sketch below for the other types.
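If we want to compare the other importance types, they can be queried from the underlying booster (a quick sketch: 'weight' counts how many times a feature is used for a split, 'cover' is the average number of samples affected by those splits):
booster = xg_reg.get_booster()
for imp_type in ['gain', 'weight', 'cover']:
    scores = booster.get_score(importance_type=imp_type)
    top5 = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:5]
    print(imp_type, top5)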
sorted_idx = (-xg_reg.feature_importances_).argsort()
plt.figure(figsize = (8,3))
pyplot.bar(range(len(xg_reg.feature_importances_[sorted_idx])), xg_reg.feature_importances_[sorted_idx])
# Tick labels for x axis
plt.xticks(range(len(xg_reg.feature_importances_[sorted_idx])), X_train.columns[sorted_idx], rotation='90')
# Axis labels and title
plt.ylabel('Importance')
#plt.xlabel('Variable')
plt.title('Variable Importances for the XGBoost model')
pyplot.show()
We want to keep the features that together account for more than 95% of the total gain.
sortImportance = xg_reg.feature_importances_[sorted_idx]
namesOrderFeatures = X_train.columns[sorted_idx]
totalGain = 0
numbImportFeatures = 0
for i in range(0, X.shape[1]):
if totalGain >0.95:
n = i
break
else:
totalGain = totalGain + sortImportance[i]
a = 'In order to obtain a total gain of %.2f' %(100*totalGain)
print(a + '%, we will consider only the following ',n , 'features: \n \n', namesOrderFeatures[:n].values)
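The same selection can be written more compactly with a cumulative sum (an essentially equivalent, vectorised sketch of the loop above):
cumGain = np.cumsum(sortImportance)
n = np.searchsorted(cumGain, 0.95, side='right') + 1  # smallest n whose cumulative gain exceeds 0.95
print(n, namesOrderFeatures[:n].values)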
X_optFeat_xgb = X[['maximum_motorSpeed', 'mean_motorSpeed', 'minimum_motorSpeed',
'rootMeanSquare_motorSpeed', 'mean_torque', 'rootMeanSquare_is',
'minimum_torque', 'maximum_is', 'maximum_torque', 'skewnees_motorSpeed',
'mean_ambient', 'skewnees_statorYoke', 'kurtos_is', 'minimum_is']]
X_train, X_test, y_train, y_test = train_test_split(X_optFeat_xgb, y, test_size = 0.20, random_state=1)
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state=1)
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', max_depth = 3,
min_child_weight = 3,
learning_rate = 0.15,
gamma = 0.0,
colsample_bytree = 0.9,
alpha = 0,
n_estimators = 150,
seed = 1)
xg_reg.fit(X_train,y_train)
ytest_pred = xg_reg.predict(X_test)
fig, (ax1, ax2) = plt.subplots(2, figsize=(10,7))
fig.suptitle('XGBoost model after feature selection')
ax1.plot(y_test.values, label= 'true values')
ax1.plot(ytest_pred, 'ro', label= 'predicted values')
ax1.legend()
ax1.set_title('True values VS predicted values')
ax1.set(xlabel='', ylabel='mean_us values')
ax2.plot(y_test.values[sort_index], y_test.values[sort_index])
ax2.plot(y_test.values[sort_index], ytest_pred[sort_index], 'ro')
ax2.set(xlabel='true values', ylabel='predicted values')
ytest_pred = np.zeros((numSimulations, lengthTrainSet))
errors = np.zeros((numSimulations, lengthTrainSet))
mape = np.zeros((numSimulations, lengthTrainSet))
accuracy = np.zeros(numSimulations)
mae = np.zeros(numSimulations)
rmse = np.zeros(numSimulations)
#aic = np.zeros(numSimulations)
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', max_depth = 3,
min_child_weight = 3,
learning_rate = 0.15,
gamma = 0.0,
colsample_bytree = 0.9,
alpha = 0,
n_estimators = 150,
seed = 1)
for i in range(0,numSimulations):
X_train, X_test, y_train, y_test = train_test_split(X_optFeat_xgb, y, test_size = 0.20, random_state=i)
#xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', max_depth = 3,
# min_child_weight = 5,
# learning_rate = 0.15,
# gamma = 0.0,
# colsample_bytree = 0.7,
# alpha = 0,
# n_estimators = 1000,
# seed = 1)
xg_reg = xg_reg.fit(X_train, y_train)
ytest_pred[i] = xg_reg.predict(X_test)
# Calculate the absolute errors vector
errors[i] = abs(y_test - ytest_pred[i])
# Calculate mean absolute percentage error vector(MAPE)
mape[i] = 100 * (errors[i] / y_test)
# Calculate the vector metrics
accuracy[i] = 100 - np.mean(mape[i])
mae[i] = np.mean(errors[i])
rmse[i] = np.sqrt(mean_squared_error(y_test, ytest_pred[i]))
#aic[i] = 2*numFeatures - 2*xg_reg.llf
# Display the metrics
print('The results of the XGBoost model with the tuned parameters \nand the selected subset of features:\n')
print('Mean Absolute Error: %.2f ' %np.mean(mae))
print('Root Mean Square Error: %.2f ' %np.mean(rmse))
print('Accuracy:', round(np.mean(accuracy), 2), '%.')
from numpy import mean
from numpy import std
from numpy import arange
from sklearn.ensemble import RandomForestRegressor
import seaborn as sns;
#from sklearn.datasets import make_classification
#from sklearn.model_selection import cross_val_score
#from sklearn.tree import DecisionTreeClassifier
#from sklearn.model_selection import RepeatedStratifiedKFold
#from sklearn.ensemble import BaggingClassifier
# Instantiate a Random Forest model with the default parameters
rf = RandomForestRegressor(random_state = 0)
ytest_pred = np.zeros((numSimulations, lengthTrainSet))
errors = np.zeros((numSimulations, lengthTrainSet))
mape = np.zeros((numSimulations, lengthTrainSet))
accuracy = np.zeros(numSimulations)
mae = np.zeros(numSimulations)
rmse = np.zeros(numSimulations)
#aic = np.zeros(numSimulations)
for i in range(0,numSimulations):
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=i)
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state=i)
# Train the model on training data
rf.fit(X_train, y_train);
# Use the forest's predict method on the test data
ytest_pred[i] = rf.predict(X_test)
# Calculate the absolute errors vector
errors[i] = abs(y_test - ytest_pred[i])
# Calculate mean absolute percentage error vector(MAPE)
mape[i] = 100 * (errors[i] / y_test)
# Calculate the vector metrics
accuracy[i] = 100 - np.mean(mape[i])
mae[i] = np.mean(errors[i])
rmse[i] = np.sqrt(mean_squared_error(y_test, ytest_pred[i]))
# AIC is not computed for the Random Forest model
# Display the metrics
print('The results of the first Random Forest model:\n')
print('Mean Absolute Error: %.2f ' %np.mean(mae))
print('Root Mean Square Error: %.2f ' %np.mean(rmse))
print('Accuracy:', round(np.mean(accuracy), 2), '%.')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=1)
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state=1)
# Train the model on training data
rf.fit(X_train, y_train);
ytest_pred = rf.predict(X_test)
fig, (ax1, ax2) = plt.subplots(2, figsize=(10,7))
fig.suptitle('First Random Forest model')
ax1.plot(y_test.values, label= 'true values')
ax1.plot(ytest_pred, 'ro', label= 'predicted values')
ax1.legend()
ax1.set_title('True values VS predicted values')
ax1.set(xlabel='', ylabel='mean_us values')
ax2.plot(y_test.values[sort_index], y_test.values[sort_index])
ax2.plot(y_test.values[sort_index], ytest_pred[sort_index], 'ro')
ax2.set(xlabel='true values', ylabel='predicted values')
# Split the data in train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=1)
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]
# Number of features to consider at every split
numbOfFeat = len(X.columns)
max_features = ['auto', 'sqrt', numbOfFeat // 2]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 100, num = 10)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Max number of samples to take into account at each tree
max_samples = [0.1, 0.3, 0.5, 0.7, 0.9, 1]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
'max_features': max_features,
'max_depth': max_depth,
'min_samples_split': min_samples_split,
'min_samples_leaf': min_samples_leaf,
'max_samples' : max_samples,
'bootstrap': bootstrap}
print(random_grid)
On each iteration, the algorithm will choose a different combination of the parameters. Altogether, there are 10 × 3 × 11 × 3 × 3 × 6 × 2 = 35,640 settings! However, the benefit of a random search is that we are not trying every combination, but sampling at random from a wide range of values.
Now, we instantiate the random search and fit it like any Scikit-Learn model. The most important arguments in RandomizedSearchCV are n_iter, which controls the number of different combinations to try, and cv, which is the number of folds to use for cross validation (we use 500 and 3 respectively). More iterations cover a wider search space and more cv folds reduce the chances of overfitting, but raising either will increase the run time. Machine learning is a field of trade-offs, and performance versus time is one of the most fundamental.
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor(random_state = 42)
# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, scoring = 'neg_mean_absolute_error',
n_iter = 500, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)
rf_random.best_params_
# Instantiate model with the best parameters that we have found
rf_bestRandParam = RandomForestRegressor(n_estimators = 500,
min_samples_split = 5,
min_samples_leaf = 2,
max_samples = 0.9,
max_features = 'auto',
max_depth = 60,
bootstrap = True,
random_state = 42)
ytest_pred = np.zeros((numSimulations, lengthTrainSet))
errors = np.zeros((numSimulations, lengthTrainSet))
mape = np.zeros((numSimulations, lengthTrainSet))
accuracy = np.zeros(numSimulations)
mae = np.zeros(numSimulations)
rmse = np.zeros(numSimulations)
#aic = np.zeros(numSimulations)
for i in range(0,numSimulations):
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=i)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state=i)
# Train the model on training data
rf_bestRandParam.fit(X_train, y_train);
# Use the forest's predict method on the test data
ytest_pred[i] = rf_bestRandParam.predict(X_test)
# Calculate the absolute errors vector
errors[i] = abs(y_test - ytest_pred[i])
# Calculate mean absolute percentage error vector(MAPE)
mape[i] = 100 * (errors[i] / y_test)
# Calculate the vector metrics
accuracy[i] = 100 - np.mean(mape[i])
mae[i] = np.mean(errors[i])
rmse[i] = np.sqrt(mean_squared_error(y_test, ytest_pred[i]))
#aic[i] = 2*numFeatures - 2*rf_bestParam.llf
# Display the metrics
print('The results of the Random Forest model with a Random Search \nfor tuning the parameters :\n')
print('Mean Absolute Error: %.2f ' %np.mean(mae))
print('Root Mean Square Error: %.2f ' %np.mean(rmse))
print('Accuracy:', round(np.mean(accuracy), 2), '%.')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 1)
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state = 1)
# Train the model on training data
rf_bestRandParam.fit(X_train, y_train);
ytest_pred = rf_bestRandParam.predict(X_test)
fig, (ax1, ax2) = plt.subplots(2, figsize=(10,7))
fig.suptitle('Random Forest model with a Random Search')
ax1.plot(y_test.values, label= 'true values')
ax1.plot(ytest_pred, 'ro', label= 'predicted values')
ax1.legend()
ax1.set_title('True values VS predicted values')
ax1.set(xlabel='', ylabel='mean_us values')
ax2.plot(y_test.values[sort_index], y_test.values[sort_index])
ax2.plot(y_test.values[sort_index], ytest_pred[sort_index], 'ro')
ax2.set(xlabel='true values', ylabel='predicted values')
We have not improved our model. That means the random search did not explore the default parameter configuration, and all the other combinations it tried performed worse than the first one.
In order to quantify the usefulness of all the variables in the entire random forest, we can look at the relative importances of the variables. The importances returned by Scikit-learn represent how much including a particular variable improves the prediction.
# Get numerical feature importances
importances = list(rf_bestRandParam.feature_importances_)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(X_train.columns, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];
In future implementations of the model, we can remove the variables that have no importance and the performance will not suffer. Additionally, if we were using a different model, say a support vector machine, we could use the random forest feature importances as a kind of feature selection method, as sketched below.
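Scikit-learn wraps this idea in SelectFromModel; a minimal sketch, assuming we reuse the forest fitted above and an (arbitrary) median-importance threshold:
from sklearn.feature_selection import SelectFromModel
# Keep only the features whose importance exceeds the median importance
selector = SelectFromModel(rf_bestRandParam, threshold='median', prefit=True)
X_reduced = selector.transform(X)
print(X.columns[selector.get_support()])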
sorted_idx = (-rf_bestRandParam.feature_importances_).argsort()
plt.figure(figsize = (8,3))
pyplot.bar(range(len(rf_bestRandParam.feature_importances_[sorted_idx])), rf_bestRandParam.feature_importances_[sorted_idx])
# Tick labels for x axis
plt.xticks(range(len(rf_bestRandParam.feature_importances_[sorted_idx])), X.columns[sorted_idx], rotation='90')
# Axis labels and title
plt.ylabel('Importance')
#plt.xlabel('Variable')
plt.title('Variable Importances for the Random Forest model')
pyplot.show()
sortImportance = rf_bestRandParam.feature_importances_[sorted_idx]
namesOrderFeatures = X_train.columns[sorted_idx]
totalGain = 0
numbImportFeatures = 0
for i in range(0, X.shape[1]):
if totalGain >0.95:
n = i
break
else:
totalGain = totalGain + sortImportance[i]
a = 'In order to obtain a total gain of %.2f' %(100*totalGain)
print(a + '%,\n we will consider only the following ',n , 'features: \n \n', namesOrderFeatures[:n].values)
X_optFeat_rf = X[['mean_motorSpeed', 'maximum_motorSpeed', 'minimum_motorSpeed', 'mean_torque',
'rootMeanSquare_is', 'rootMeanSquare_motorSpeed', 'maximum_torque',
'minimum_is', 'minimum_torque', 'maximum_is', 'skewnees_motorSpeed',
'kurtos_torque', 'kurtos_motorSpeed', 'skewnees_torque']]
# New random forest with only the t most important variables
ytest_pred = np.zeros((numSimulations, lengthTrainSet))
errors = np.zeros((numSimulations, lengthTrainSet))
mape = np.zeros((numSimulations, lengthTrainSet))
accuracy = np.zeros(numSimulations)
mae = np.zeros(numSimulations)
rmse = np.zeros(numSimulations)
#aic = np.zeros(numSimulations)
for i in range(0,numSimulations):
X_train, X_test, y_train, y_test = train_test_split(X_optFeat_rf, y, test_size = 0.20, random_state=i)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state=i)
rf_bestRandParam = rf_bestRandParam.fit(X_train, y_train)
ytest_pred[i] = rf_bestRandParam.predict(X_test)
# Calculate the absolute errors vector
errors[i] = abs(y_test - ytest_pred[i])
# Calculate mean absolute percentage error vector(MAPE)
mape[i] = 100 * (errors[i] / y_test)
# Calculate the vector metrics
accuracy[i] = 100 - np.mean(mape[i])
mae[i] = np.mean(errors[i])
rmse[i] = np.sqrt(mean_squared_error(y_test, ytest_pred[i]))
#aic[i] = 2*numFeatures - 2*rf_mostImportFeat.llf
# Display the metrics
print('The results of the Random Forest model with a subset of the features:\n')
print('Mean Absolute Error: %.2f ' %np.mean(mae))
print('The Root Mean Square Error: %.2f ' %np.mean(rmse))
print('Accuracy:', round(np.mean(accuracy), 2), '%.')
X_train, X_test, y_train, y_test = train_test_split(X_optFeat_rf, y, test_size = 0.20, random_state=1)
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state=1)
# Train the model on training data
rf_bestRandParam.fit(X_train, y_train);
ytest_pred = rf_bestRandParam.predict(X_test)
fig, (ax1, ax2) = plt.subplots(2, figsize=(10,7))
fig.suptitle('Random Forest model after feature selection')
ax1.plot(y_test.values, label= 'true values')
ax1.plot(ytest_pred, 'ro', label= 'predicted values')
ax1.legend()
ax1.set_title('True values VS predicted values')
ax1.set(xlabel='', ylabel='mean_us values')
ax2.plot(y_test.values[sort_index], y_test.values[sort_index])
ax2.plot(y_test.values[sort_index], ytest_pred[sort_index], 'ro')
ax2.set(xlabel='true values', ylabel='predicted values')
from sklearn.model_selection import GridSearchCV
Random search allowed us to narrow down the range for each hyperparameter. Now that we know where to concentrate our search, we can explicitly specify every combination of settings to try. We do this with GridSearchCV, a method that, instead of sampling randomly from a distribution, evaluates all combinations we define. To use Grid Search, we make another grid based on the best values provided by random search
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 1)
# Create the parameter grid based on the results of random search
param_grid = {
'bootstrap': [False, True],
'max_depth': [40, 60, 80, None],
'max_features': [ 'auto', 'sqrt', len(X.columns)/2],
#'min_samples_leaf': [1, 2],
'min_samples_split': [2, 3, 5],
'max_samples' : [0.2, 0.5, 0.9, 1.0],
'n_estimators': [80, 100, 200, 500, 700, 900]
}
# Create a based model
rf = RandomForestRegressor(random_state = 42, criterion = 'mae')
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, scoring = 'neg_mean_absolute_error',
cv = 5, n_jobs = -1, verbose = 2)
# Fit the grid search to the data
grid_search.fit(X_train, y_train)
best_grid = grid_search.best_params_
best_grid
# Instantiate model with the best parameters that we have found
rf_bestGridParam = RandomForestRegressor(n_estimators = 500,
min_samples_split = 3,
#min_samples_leaf = 1,
max_samples = 0.2,
max_features = 'sqrt',
max_depth = 40,
bootstrap = False,
#criterion = 'mae',
random_state = 42)
ytest_pred = np.zeros((numSimulations, lengthTrainSet))
errors = np.zeros((numSimulations, lengthTrainSet))
mape = np.zeros((numSimulations, lengthTrainSet))
accuracy = np.zeros(numSimulations)
mae = np.zeros(numSimulations)
rmse = np.zeros(numSimulations)
#aic = np.zeros(numSimulations)
for i in range(numSimulations):
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=i)
# Train the model on training data
rf_bestGridParam.fit(X_train, y_train);
# Use the forest's predict method on the test data
ytest_pred[i] = rf_bestGridParam.predict(X_test)
# Calculate the absolute errors vector
errors[i] = abs(y_test - ytest_pred[i])
# Calculate mean absolute percentage error vector(MAPE)
mape[i] = 100 * (errors[i] / y_test)
# Calculate the vector metrics
accuracy[i] = 100 - np.mean(mape[i])
mae[i] = np.mean(errors[i])
rmse[i] = np.sqrt(mean_squared_error(y_test, ytest_pred[i]))
#aic[i] = 2*numFeatures - 2*rf_bestParam.llf
# Display the metrics
print('The results of the Random Forest model with a Grid Search \nfor tuning the parameters:\n')
print('Mean Absolute Error: %.2f ' %np.mean(mae))
print('The Root Mean Square Error: %.2f ' %np.mean(rmse))
print('Accuracy:', round(np.mean(accuracy), 2), '%.')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 1)
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state = 1)
# Train the model on training data
rf_bestGridParam.fit(X_train, y_train);
ytest_pred = rf_bestGridParam.predict(X_test)
fig, (ax1, ax2) = plt.subplots(2, figsize=(10,7))
fig.suptitle('Random Forest model with a grid search for tuning the parameter')
ax1.plot(y_test.values, label= 'true values')
ax1.plot(ytest_pred, 'ro', label= 'predicted values')
ax1.legend()
ax1.set_title('True values VS predicted values')
ax1.set(xlabel='', ylabel='mean_us values')
ax2.plot(y_test.values[sort_index], y_test.values[sort_index])
ax2.plot(y_test.values[sort_index], ytest_pred[sort_index], 'ro')
ax2.set(xlabel='true values', ylabel='predicted values')
In order to quantify the usefulness of all the variables in the entire random forest, we can look at the relative importances of the variables. The importances returned by Scikit-learn represent how much including a particular variable improves the prediction.
# Get numerical feature importances
importances = list(rf_bestGridParam.feature_importances_)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(X_train, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];
In future implementations of the model, we can remove those variables that have no importance and the performance will not suffer. Additionally, if we are using a different model, say a support vector machine, we could use the random forest feature importances as a kind of feature selection method
sorted_idx = (-rf_bestGridParam.feature_importances_).argsort()
plt.figure(figsize = (8,3))
pyplot.bar(range(len(rf_bestGridParam.feature_importances_[sorted_idx])), rf_bestGridParam.feature_importances_[sorted_idx])
# Tick labels for x axis
plt.xticks(range(len(rf_bestGridParam.feature_importances_[sorted_idx])), X.columns[sorted_idx], rotation='90')
# Axis labels and title
plt.ylabel('Importance')
#plt.xlabel('Variable')
plt.title('Variable Importances for the Random Forest model')
pyplot.show()
sortImportance = rf_bestGridParam.feature_importances_[sorted_idx]
namesOrderFeatures = X_train.columns[sorted_idx]
totalGain = 0
numbImportFeatures = 0
for i in range(0, X.shape[1]):
if totalGain >0.95:
n = i
break
else:
totalGain = totalGain + sortImportance[i]
a = 'In order to obtain a total gain of %.2f' %(100*totalGain)
print(a + '%, we will consider only the following ',n , 'features: \n \n', namesOrderFeatures[:n].values)
X_optFeat_rf = X[['mean_motorSpeed', 'maximum_motorSpeed', 'minimum_motorSpeed', 'minimum_is',
'mean_torque', 'rootMeanSquare_motorSpeed', 'minimum_torque',
'rootMeanSquare_is', 'maximum_is', 'maximum_torque', 'mean_statorYoke',
'kurtos_is', 'mean_pm', 'skewnees_motorSpeed', 'mean_ambient',
'kurtos_torque', 'rootMeanSquare_ambient', 'kurtos_motorSpeed',
'skewnees_is', 'rootMeanSquare_statorYoke', 'skewnees_torque']]
# New random forest with only the t most important variables
ytest_pred = np.zeros((numSimulations, lengthTrainSet))
errors = np.zeros((numSimulations, lengthTrainSet))
mape = np.zeros((numSimulations, lengthTrainSet))
accuracy = np.zeros(numSimulations)
mae = np.zeros(numSimulations)
rmse = np.zeros(numSimulations)
#aic = np.zeros(numSimulations)
for i in range(0,numSimulations):
X_train, X_test, y_train, y_test = train_test_split(X_optFeat_rf, y, test_size = 0.20, random_state=i)
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state=i)
rf_bestGridParam = rf_bestGridParam.fit(X_train, y_train)
ytest_pred[i] = rf_bestGridParam.predict(X_test)
# Calculate the absolute errors vector
errors[i] = abs(y_test - ytest_pred[i])
# Calculate mean absolute percentage error vector(MAPE)
mape[i] = 100 * (errors[i] / y_test)
# Calculate the vector metrics
accuracy[i] = 100 - np.mean(mape[i])
mae[i] = np.mean(errors[i])
rmse[i] = np.sqrt(mean_squared_error(y_test, ytest_pred[i]))
#aic[i] = 2*numFeatures - 2*rf_mostImportFeat.llf
# Display the metrics
print('The results of the Random Forest model \n with a Grid Search and a subset of the features:\n')
print('Mean Absolute Error: %.2f ' %np.mean(mae))
print('The Root Mean Square Error: %.2f ' %np.mean(rmse))
print('Accuracy:', round(np.mean(accuracy), 2), '%.')
We have obtained more accurate predictions than with the first model, but worse than with the parameters selected by the random search!
X_train, X_test, y_train, y_test = train_test_split(X_optFeat_rf, y, test_size = 0.20, random_state=1)
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state=1)
# Train the model on training data
rf_bestGridParam.fit(X_train, y_train);
ytest_pred = rf_bestGridParam.predict(X_test)
fig, (ax1, ax2) = plt.subplots(2, figsize=(10,7))
fig.suptitle('Random Forest model after feature selection')
ax1.plot(y_test.values, label= 'true values')
ax1.plot(ytest_pred, 'ro', label= 'predicted values')
ax1.legend()
ax1.set_title('True values VS predicted values')
ax1.set(xlabel='', ylabel='mean_us values')
ax2.plot(y_test.values[sort_index], y_test.values[sort_index])
ax2.plot(y_test.values[sort_index], ytest_pred[sort_index], 'ro')
ax2.set(xlabel='true values', ylabel='predicted values')