import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()  # apply the default seaborn theme to every plot below
# read data (electric-motor temperature measurements, sampled at 2 Hz)
raw_df = pd.read_csv('../pmsm_temperature_data.csv')
pd.options.display.max_columns = None # show all columns when displaying frames
raw_df.head(5)
############################################## Check point #############################################################
# Work on a copy so the raw data stays untouched
df = raw_df.copy()
df.info()
Distances from the core of the motor: pm < stator_tooth < stator_winding < stator_yoke (?).
Each row represents one snapshot of sensor data at a certain time step (0.5 s).
Records with the same profile_id are sorted by time of recording.
# summary statistics, transposed so there is one row per feature
df.describe().T
# Count the number of null values for each column
df.isnull().sum()
# Bar chart of session lengths (number of samples per profile_id), shortest first.
fig = plt.figure(figsize=(18, 6))
grpd = df.groupby(['profile_id'])
session_sizes = grpd.size().sort_values().rename('time').reset_index()
ordered_ids = list(session_sizes.profile_id.values)
sns.barplot(y='time', x='profile_id', data=session_sizes, order=ordered_ids)
# 2 Hz sample rate -> 7200 samples per hour; relabel the y axis in hours
tcks = plt.yticks(2 * 3600 * np.arange(1, 8), [f'{a} hrs' for a in range(1, 8)])
The plot shows that session durations are not the same: they range from roughly 20 minutes to around 6 hours. The short sessions with ids "47" and "46" might not be very representative, as temperatures inside electric motors need time to vary. We can conclude that longer sessions are more reliable and should be properly considered in both training and testing.
len(df.profile_id.unique())
There are 52 independent sessions.
# Derive the norms of the current and voltage vectors from their d/q components.
df_extra_feat = df.copy()
extra_feats = {
    # current vector norm
    'i_s': lambda x: np.sqrt(x['i_d'] ** 2 + x['i_q'] ** 2),
    # voltage vector norm
    'u_s': lambda x: np.sqrt(x['u_d'] ** 2 + x['u_q'] ** 2),
    #'S_el': lambda x: x['i_s']*x['u_s'], # apparent power
    #'P_el': lambda x: x['i_d'] * x['u_d'] + x['i_q'] *x['u_q'], # effective power
    #'i_s_x_w': lambda x: x['i_s']*x['motor_speed'], # Motor speed and current interaction
    #'S_x_w': lambda x: x['S_el']*x['motor_speed'], # Motor speed and power interaction
}
df = df_extra_feat.assign(**extra_feats)
df.head()
Boxplots give a better view and help us understand the range of the data.
# plot the boxplots of all features
fig = plt.figure(figsize=(20, 15))
plt.subplots_adjust(wspace=0.2)
nbr_columns = 4
df1 = df.drop(['profile_id'], axis=1)
nbr_graphs = len(df1.columns)
nbr_rows = int(np.ceil(nbr_graphs / nbr_columns))
columns = list(df1.columns.values)
with sns.axes_style("whitegrid"):
    for i, col in enumerate(columns):
        plt.subplot(nbr_rows, nbr_columns, i + 1)
        ax1 = sns.boxplot(x=col, data=df1, orient="h",
                          color=sns.color_palette("Blues")[3])
# BUG FIX: plt.tight_layout() was previously called BEFORE plt.figure(), so it
# acted on a stale/empty figure; it must run once the axes exist.
plt.tight_layout(pad=0.9)
plt.show()
corr = df1.corr()
# generate mask for the upper triangle
# BUG FIX: np.bool was deprecated in NumPy 1.20 and removed in 1.24 — the
# builtin bool is the correct dtype here.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
# generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)
plt.figure(figsize=(30, 8.8))
_ = sns.heatmap(corr, mask=mask, cmap=cmap, center=0, square=True,
                linewidths=.5, cbar_kws={"shrink": .5}, annot=True)
### bivariate scatterplot
reduced_df = df.drop(['profile_id'], axis=1)  #, 'u_d', 'u_q', 'i_d', 'i_q', 'stator_tooth', 'stator_winding'
plt.rcParams.update({'font.size':25}) # customizing the plt style
# Pairwise relationships on a 4% random sample (the full data is too large to scatter).
sample = reduced_df.sample(frac=0.04)
g = sns.PairGrid(sample)
#g.map_diag(sns.kdeplot) #Plot with a univariate function on each diagonal subplot.
g.map_lower(sns.scatterplot)  # bivariate scatter in the lower triangle
The pair scatter-plots give a somewhat clearer picture of where the linear correlation comes from.
Now we carry out some further analysis, to better understand the correlation within some features.
# Stator temperatures over time for six randomly chosen sessions.
columns = ['stator_yoke','stator_tooth','stator_winding']
profile_id_list = df.profile_id.unique()
profile_id_list = np.random.choice(profile_id_list, size=6, replace=False)
nbr_column = 2
nbr_graph = len(profile_id_list)
nbr_row = int(np.ceil(nbr_graph / nbr_column))
plt.figure(figsize=(30, nbr_row * 5))
with sns.axes_style("whitegrid"):
    for i in range(nbr_graph):
        plt.subplot(nbr_row, nbr_column, i + 1)
        # select one session's rows, restricted to the plotted columns
        temp = df.loc[df['profile_id'] == profile_id_list[i], columns]
        ax1 = sns.lineplot(data=temp, dashes=False,
                           palette=sns.color_palette('Dark2', n_colors=len(columns)))
        ax1.set_title("profile id: {0}".format(profile_id_list[i]))
# BUG FIX: plt.show was referenced but never called (missing parentheses), so
# the figure was not flushed; also dropped the unused `kolomlijst` variable.
plt.show()
The lineplots confirm that all three temperatures follow the same trend. The stator winding temperature shows the biggest variation, followed by the stator tooth and stator yoke temperatures. This is especially noticeable when there is a lot of variation in the stator winding temperature. In that case, the stator tooth and yoke temperatures follow a smoother path than the temperature recorded on the stator winding. In other words, the heat dissipated by the stator windings takes some time to heat up the stator tooth and yoke due to the thermal inertia of both stator parts.
# Torque, q-current and d-voltage over time for eight randomly chosen sessions.
columns = ['torque', 'i_q', 'u_d']
profile_id_list = df.profile_id.unique()
#[57, 71, 60, 61, 75, 77]
profile_id_list = np.random.choice(profile_id_list, size=8, replace=False)
nbr_column = 2
nbr_graph = len(profile_id_list)
nbr_row = int(np.ceil(nbr_graph / nbr_column))
plt.figure(figsize=(30, nbr_row * 5))
with sns.axes_style("whitegrid"):
    for i in range(nbr_graph):
        plt.subplot(nbr_row, nbr_column, i + 1)
        temp = df.loc[df['profile_id'] == profile_id_list[i], columns]
        ax1 = sns.lineplot(data=temp, dashes=False,
                           palette=sns.color_palette('Dark2', n_colors=len(columns)))
        ax1.set_title("profile id: {0}".format(profile_id_list[i]))
# BUG FIX: plt.show was referenced but never called (missing parentheses);
# also dropped the unused `kolomlijst` variable.
plt.show()
The image confirms what we expected: the 'i_q' and 'torque' curves almost overlap, while the 'u_d' curve follows an opposite trend.
# Coolant, stator yoke temperature and voltage norm for six random sessions.
columns = ['coolant', 'stator_yoke', 'u_s']
# BUG FIX: the result of unique() was discarded, so the sample below drew
# from the 8 ids chosen in the previous cell instead of from all sessions.
profile_id_list = df.profile_id.unique()
#profile_id_list = [57, 71, 60, 61, 75, 77]
profile_id_list = np.random.choice(profile_id_list, size=6, replace=False)
nbr_column = 2
nbr_graph = len(profile_id_list)
nbr_row = int(np.ceil(nbr_graph / nbr_column))
plt.figure(figsize=(30, nbr_row * 5))
with sns.axes_style("whitegrid"):
    for i in range(nbr_graph):
        plt.subplot(nbr_row, nbr_column, i + 1)
        temp = df.loc[df['profile_id'] == profile_id_list[i], columns]
        ax1 = sns.lineplot(data=temp, dashes=False,
                           palette=sns.color_palette('Dark2', n_colors=len(columns)))
        ax1.set_title("profile id: {0}".format(profile_id_list[i]))
# BUG FIX: plt.show was referenced but never called (missing parentheses).
plt.show()
We can observe that the coolant and stator_yoke variables follow the same trend. 'Coolant' presents a lot of flat zones, with some sudden changes. The voltage vector (u_s) does not seem to depend on those changes.
# Keep one representative temperature per region plus the vector norms.
reduced_df = df.drop(columns=['u_d', 'u_q', 'i_d', 'i_q',
                              'stator_tooth', 'stator_winding', 'coolant'])
reduced_df.head()
import math as mt
I want to consider first all the sessions with the same duration; let's take the ones lasting almost two hours.
I want to split each session into a certain number of records such that I can explain as much of each signal as possible.
# Compare three equally long sessions, feature group by feature group.
profile_id_1 = 60
profile_id_2 = 61
profile_id_3 = 77 # Three profiles with same duration
feat_plot_1 = ['stator_yoke','pm']
feat_plot_2 = ['u_s']
feat_plot_3 = ['torque','motor_speed']
temp_1 = reduced_df.loc[df['profile_id'] == profile_id_1]
temp_2 = reduced_df.loc[df['profile_id'] == profile_id_2]
temp_3 = reduced_df.loc[df['profile_id'] == profile_id_3]
# fig.add_subplot(ROW, COLUMN, POSITION): rows are feature groups (with a
# dedicated palette each), columns are the three profiles.
with sns.axes_style("whitegrid"):
    fig = plt.figure(figsize=(15, 10))
    feature_rows = [(feat_plot_1, 'Blues'),
                    (feat_plot_2, 'Dark2'),
                    (feat_plot_3, 'YlOrBr')]
    profile_cols = [(temp_1, profile_id_1),
                    (temp_2, profile_id_2),
                    (temp_3, profile_id_3)]
    position = 0
    for feats, palette_name in feature_rows:
        for frame, pid in profile_cols:
            position += 1
            fig.add_subplot(3, 3, position)
            ax = sns.lineplot(data=frame.loc[:, feats], dashes=False,
                              palette=sns.color_palette(palette_name, n_colors=len(feats)),
                              linewidth=0.8)
            if position <= 3:  # one title per column, on the first row only
                ax.set_title("profile id: {0}".format(pid))
plt.show()
By analysing the first row, we can see a similar pattern:
It's interesting to observe that, in the central area, the voltage vector norm u_s, the torque and the motor_speed are constant.
# Same three sessions, now for ambient temperature, voltage norm and current norm.
profile_id_1 = 60
profile_id_2 = 61
profile_id_3 = 77
feat_plot_1 = ['ambient']
feat_plot_2 = ['u_s']
feat_plot_3 = ['i_s']
temp_1 = reduced_df.loc[df['profile_id'] == profile_id_1]
temp_2 = reduced_df.loc[df['profile_id'] == profile_id_2]
temp_3 = reduced_df.loc[df['profile_id'] == profile_id_3]
# fig.add_subplot(ROW, COLUMN, POSITION): rows are feature groups (with a
# dedicated palette each), columns are the three profiles.
with sns.axes_style("whitegrid"):
    fig = plt.figure(figsize=(15, 10))
    feature_rows = [(feat_plot_1, 'Blues'),
                    (feat_plot_2, 'Dark2'),
                    (feat_plot_3, 'YlOrBr')]
    profile_cols = [(temp_1, profile_id_1),
                    (temp_2, profile_id_2),
                    (temp_3, profile_id_3)]
    position = 0
    for feats, palette_name in feature_rows:
        for frame, pid in profile_cols:
            position += 1
            fig.add_subplot(3, 3, position)
            ax = sns.lineplot(data=frame.loc[:, feats], dashes=False,
                              palette=sns.color_palette(palette_name, n_colors=len(feats)),
                              linewidth=0.8)
            if position <= 3:  # one title per column, on the first row only
                ax.set_title("profile id: {0}".format(pid))
plt.show()
We can observe that the ambient temperature presents some outliers that have to be handled.
We want to find an adequate number of rows into which to break up each signal, in order to obtain a similar pattern for each of the areas that will result after the division.
Profile 60
# The first signal that we want to analyse is pm.
pm60 = reduced_df.loc[df['profile_id'] == profile_id_1, 'pm'].values
fig = plt.figure(figsize=(15, 5))
plt.ylabel('values')
plt.xlabel('time')
# 2 Hz sample rate: 2400 samples = 20 minutes
plt.xticks(2 * 1200 * np.arange(1, 10), [f'{a} min' for a in np.arange(20, 200, 20)])
plt.title("Pm signal for the profile id 60")
plt.plot(pm60)
pmZom60 = pm60
fig = plt.figure(figsize=(18, 5))
plt.plot(pmZom60, label='pm')
plt.ylabel('values')
plt.xlabel('time')
plt.xticks(2 * 1200 * np.arange(1, 10), [f'{a} min' for a in np.arange(20, 200, 20)])
# Candidate aggregation windows of 1850 samples (~15 min); dashed lines mark
# the window boundaries, the last one landing at the end of the session.
numVertLines = mt.ceil(len(pmZom60) / 1850)
intervals = np.linspace(1850, len(pmZom60), num=numVertLines)
plt.title("Pm signal for the profile id 60")
plt.vlines(x=intervals, ymin=-2, ymax=1.5, colors='green', ls='--', lw=2, label='split lines')
plt.legend()
After a lot of trials, I chose to aggregate the data every 1850 rows, which is more or less equivalent to 15 minutes. I want to check whether this choice is reasonable for all the signals and for all profiles.
I observe some results.
Profile 61
# Same check on profile 61: the pm signal with the 1850-row split markers.
pm61 = reduced_df.loc[df['profile_id'] == 61, 'pm'].values
fig = plt.figure(figsize=(10, 5))
plt.plot(pm61)
plt.ylabel('values')
plt.xlabel('time')
# 2 Hz sample rate: 2400 samples = 20 minutes
plt.xticks(2 * 1200 * np.arange(1, 7), [f'{a} min' for a in np.arange(20, 130, 20)])
plt.title("Pm signal for the profile id 61")
pmZom61 = pm61
fig = plt.figure(figsize=(10, 5))
plt.plot(pmZom61, label='pm')
plt.ylabel('values')
plt.xlabel('time')
plt.xticks(2 * 1200 * np.arange(1, 10), [f'{a} min' for a in np.arange(20, 200, 20)])
plt.title("Pm signal for the profile id 61")
numVertLines = mt.ceil(len(pmZom61) / 1850)
intervals = np.linspace(1850, len(pmZom61), num=numVertLines)
# dashed vertical lines mark the candidate aggregation windows
plt.vlines(x=intervals, ymin=-0.5, ymax=2, colors='green', ls='--', lw=2, label='split line')
plt.legend()
Profile 66
# Same check on the long session 66: first the full pm signal...
pm66 = reduced_df.loc[df['profile_id'] == 66, 'pm'].values
fig = plt.figure(figsize=(15, 5))
plt.plot(pm66, label='pm')
plt.ylabel('values')
plt.xlabel('time')
# 2 Hz sample rate: 2400 samples = 20 minutes
plt.xticks(2 * 1200 * np.arange(1, 16), [f'{a} min' for a in np.arange(20, 310, 20)])
plt.title("Pm signal for the profile id 66")
plt.legend()
# ...then the first 30000 samples with the split markers overlaid.
pmZom66 = pm66[:30000]
fig = plt.figure(figsize=(12, 5))
plt.plot(pmZom66, label='pm')
plt.ylabel('values')
plt.xlabel('time')
plt.xticks(2 * 1200 * np.arange(1, 16), [f'{a} min' for a in np.arange(20, 310, 20)])
numVertLines = mt.ceil(len(pmZom66) / 1850)
intervals = np.linspace(1850, len(pmZom66), num=numVertLines)
plt.title("Pm signal for the profile id 66")
# dashed vertical lines mark the candidate aggregation windows
plt.vlines(x=intervals, ymin=-1, ymax=2, colors='green', ls='--', lw=2, label='split line')
plt.legend()
# Now the torque signal of profile 60, with and without split markers.
torque = reduced_df.loc[reduced_df['profile_id'] == profile_id_1, 'torque'].values
fig = plt.figure(figsize=(15, 5))
plt.plot(torque[:], label='torque')
plt.ylabel('values')
plt.xlabel('time')
# 2 Hz sample rate: 2400 samples = 20 minutes
plt.xticks(2 * 1200 * np.arange(1, 7), [f'{a} min' for a in np.arange(20, 130, 20)])
plt.title("Torque signal for the profile id 60")
plt.legend()
fig = plt.figure(figsize=(15, 5))
plt.plot(torque, label='torque')
plt.ylabel('values')
plt.xlabel('time')
plt.xticks(2 * 1200 * np.arange(1, 7), [f'{a} min' for a in np.arange(20, 130, 20)])
numVertLines = mt.ceil(len(torque) / 1850)
intervals = np.linspace(1850, len(torque), num=numVertLines)
# dashed vertical lines mark the candidate aggregation windows
plt.vlines(x=intervals, ymin=-3, ymax=2.7, colors='green', ls='--', lw=2, label='split line')
plt.title("Torque signal for the profile id 60")
plt.legend()
Also for the torque, 1850 seems to be a reasonable number for which to divide the signal.
# profile_id_list = df.profile_id.unique()
# profile_id_rand = np.random.choice(profile_id_list, size=1, replace=False)
profile_id_1 = 60
# raw (pre-cleaning) ambient signal of session 60
ambient = df.loc[df['profile_id'] == 60, 'ambient'].values
fig = plt.figure(figsize=(15, 5))
plt.plot(ambient, label='ambient')  #[1035:1050]
plt.ylabel('values')
plt.xlabel('time')
# 2 Hz sample rate: 2400 samples = 20 minutes
plt.xticks(2 * 1200 * np.arange(1, 7), [f'{a} min' for a in np.arange(20, 130, 20)])
plt.ylim(-5, 2)
plt.title("Signal of the ambient feature of the profile id {0}".format(profile_id_1))
plt.legend()
First of all we observe that there are some outliers! It seems reasonable to remove all the values below a certain threshold. We want to check whether this assumption is justified.
# Zoom on two suspicious regions to see whether the spikes are isolated points.
ambient = reduced_df.loc[reduced_df['profile_id'] == 60, 'ambient'].values
fig = plt.figure(figsize=(15, 5))
plt.plot(ambient[1035:1050], label='ambient')
plt.ylabel('values')
plt.title("A zoomed area (after 10 minutes) of the 'ambient' signal for the profile id {0}".format(profile_id_1))
ambient = reduced_df.loc[reduced_df['profile_id'] == 60, 'ambient'].values
fig = plt.figure(figsize=(15, 5))
plt.plot(ambient[4762:4780], label='ambient')  #[1035:1050]
plt.ylabel('values')
plt.title("A zoomed area (at 40 minutes) of the 'ambient' signal for the profile id {0}".format(profile_id_1))
#plt.legend()
By zoomming on the areas where outliers are present, we can state that these points are not due to some kind of errors. Infact the peaks are not isolated points, but their neighborhood points go along with the trend.
We want to see if they are consequences of a strange trend of the rotor and stator's temperatures. It could be possible that, in the neighborhood of these points, the activity of the motor would be very few resulting a less values also for the ambient temperature.
# Compare ambient against stator-yoke and rotor temperatures per session.
columns = ['ambient', 'stator_yoke', 'pm']
#profile_id_list = df.profile_id.unique()
profile_id_list = [57, 71, 60, 61, 75, 77]
#profile_id_list = np.random.choice(profile_id_list, size=12, replace=False)
nbr_column = 3
nbr_graph = len(profile_id_list)
nbr_row = int(np.ceil(nbr_graph / nbr_column))
plt.figure(figsize=(30, nbr_row * 5))
with sns.axes_style("whitegrid"):
    for i in range(nbr_graph):
        plt.subplot(nbr_row, nbr_column, i + 1)
        temp = df.loc[df['profile_id'] == profile_id_list[i], columns]
        ax1 = sns.lineplot(data=temp, dashes=False,
                           palette=sns.color_palette('Dark2', n_colors=len(columns)))
        #plt.xticks(2*1200*np.arange(1, 7), [f'{a} min' for a in np.arange(20, 130, 20)]) # 2Hz sample rate
        ax1.set_title("profile id: {0}".format(profile_id_list[i]))
# BUG FIX: plt.show was referenced but never called (missing parentheses);
# also dropped the unused `kolomlijst` variable.
plt.show()
It's evident that there is no common relationship between these signals. For some sessions and some points, the downward peaks are linked to a downward trend of the stator_yoke and pm features; for others, they are associated with increasing values of both these temperatures.
An idea could be to split the data into groups, take the mean value of each group, and then remove the values that fall outside the interval
(Mean - 0.5; Mean + 0.5).
Finally, we replace these values with the ones obtained using the interpolate function.
We can see the signal for six different profileId
# Ambient signal of six fixed sessions, on a common y range, before cleaning.
columns = ['ambient']
profile_id_list = [57, 71, 60, 61, 75, 77]
nbr_column = 3
nbr_graph = len(profile_id_list)
nbr_row = int(np.ceil(nbr_graph / nbr_column))
plt.figure(figsize=(30, nbr_row * 5))
with sns.axes_style("whitegrid"):
    for i in range(nbr_graph):
        plt.subplot(nbr_row, nbr_column, i + 1)
        temp = df.loc[df['profile_id'] == profile_id_list[i], columns]
        ax1 = sns.lineplot(data=temp, dashes=False,
                           palette=sns.color_palette('Dark2', n_colors=len(columns)))
        ax1.set(ylim=(-5, 2))
        #ax1.xticks(2*1200*np.arange(1, 7), [f'{a} min' for a in np.arange(20, 130, 20)]) # 2Hz sample rate
        ax1.set_title("profile id: {0}".format(profile_id_list[i]))
# BUG FIX: plt.show was referenced but never called (missing parentheses);
# also dropped the unused `kolomlijst` variable.
plt.show()
# Two functions for detecting and removing the outliers
def discoverOutlier(arrayToDiscover, startInd, radius):
    """Return the global indices of samples farther than `radius` from the
    mean of `arrayToDiscover`.

    `startInd` is the global index of the first element of the window, so
    offsets inside the window are translated back to full-column positions.
    """
    center = np.mean(arrayToDiscover)
    return [startInd + offset
            for offset, value in enumerate(arrayToDiscover)
            if abs(value - center) > radius]
def outlierRemoving(columnWithOutlier, outlierIndexes):
    """Return a copy of the series in which the flagged entries have been
    replaced by (forward) linear interpolation; the input is left untouched."""
    cleaned = columnWithOutlier.copy()
    cleaned[outlierIndexes] = np.nan
    return cleaned.interpolate(limit_direction='forward')
First of all, I work on the ambient signal of the profile id 60.
# Run the detector window-by-window over the ambient signal of profile 60.
outlierIndexes = list()
ambientId60 = df.loc[df['profile_id'] == 60]
startIndex = ambientId60.loc[:, 'ambient'].index[0]
ambientId = ambientId60.loc[:, 'ambient'].values
length = len(ambientId)
numIntervals = mt.ceil(length / 1850)
intervals = np.linspace(1850, length, num=numIntervals)
start = 0
for n in intervals:
    n = mt.ceil(n)
    window_outliers = discoverOutlier(ambientId[start:n], startIndex + start, 0.5)
    outlierIndexes.extend(window_outliers)
    start = n
outlierIndexes
ambientTota = df.loc[df['profile_id'] == 60]
ambientTota = ambientTota.loc[:, 'ambient']
ambientNoOutlier = outlierRemoving(ambientTota, outlierIndexes)
ambientNoOutlier.isna().sum()
profile_id_1 = 60
ambient = reduced_df.loc[reduced_df['profile_id'] == 60, 'ambient']  #.values
fig = plt.figure(figsize=(15, 5))
# plot the cleaned series produced in the previous cell
plt.plot(ambientNoOutlier.values, label='ambient')  #[1035:1050]
plt.ylabel('values')
plt.xlabel('time')
# 2 Hz sample rate: 2400 samples = 20 minutes
plt.xticks(2 * 1200 * np.arange(1, 7), [f'{a} min' for a in np.arange(20, 130, 20)])
plt.ylim(-5, 2)
plt.title("Signal of the ambient feature after removing the outliers of the profile id {0}".format(profile_id_1))
plt.legend()
# Apply the same per-window outlier removal to the ambient column of EVERY session.
profile_id_list = df.profile_id.unique().tolist()
outlierIndexes = list()
for ids in profile_id_list:
    session = df.loc[df['profile_id'] == ids]
    # NOTE(review): adding positional offsets to the first label assumes df
    # still has its default RangeIndex — confirm it is never reset/reordered.
    startIndex = session.loc[:, 'ambient'].index[0]
    ambientId = session.loc[:, 'ambient'].values
    length = len(ambientId)
    numIntervals = mt.ceil(length / 1850)
    intervals = np.linspace(1850, length, num=numIntervals)
    start = 0
    for n in intervals:
        n = mt.ceil(n)
        outlierIndexes.extend(discoverOutlier(ambientId[start:n], startIndex + start, 0.5))
        start = n
# PERF FIX: the original df.copy()['ambient'] duplicated the entire frame just
# to extract one column; copying only the column is equivalent and far cheaper.
ambientTotal = df['ambient'].copy()
ambientNoOutlier = outlierRemoving(ambientTotal, outlierIndexes)
reduced_df['ambient'] = ambientNoOutlier
print('Number of outlier detected:')
print(len(outlierIndexes))
print('')
print('Number of null values:')
print(sum(reduced_df['ambient'].isnull().values))
#ambientNoOutlier
#reduced_df['ambient'].interpolate(method='linear', limit_direction='forward')
# Re-plot the six fixed sessions after cleaning to verify the outliers are gone.
columns = ['ambient']
#profile_id_list = df.profile_id.unique()
profile_id_list = [57, 71, 60, 61, 75, 77]
#profile_id_list = np.random.choice(profile_id_list, size=12, replace=False)
nbr_column = 3
nbr_graph = len(profile_id_list)
nbr_row = int(np.ceil(nbr_graph / nbr_column))
plt.figure(figsize=(30, nbr_row * 5))
with sns.axes_style("whitegrid"):
    for i in range(nbr_graph):
        plt.subplot(nbr_row, nbr_column, i + 1)
        temp = reduced_df.loc[reduced_df['profile_id'] == profile_id_list[i], columns]
        ax1 = sns.lineplot(data=temp, dashes=False,
                           palette=sns.color_palette('Dark2', n_colors=len(columns)))
        ax1.set(ylim=(-5, 1.7))
        ax1.set_title("profile id: {0}".format(profile_id_list[i]))
# BUG FIX: plt.show was referenced but never called (missing parentheses);
# also dropped the unused `kolomlijst` variable.
plt.show()
We can state that the algorithm worked: the outliers have been removed and replaced correctly.
# Check the 1850-row split against the cleaned ambient signal of profile 60.
ambient = reduced_df.loc[reduced_df['profile_id'] == 60, 'ambient'].values
fig = plt.figure(figsize=(15, 5))
plt.plot(ambient, label='ambient')
plt.legend()
#plt.xticks(2*1200*np.arange(1, 7), [f'{a} min' for a in np.arange(20, 130, 20)]) # 2Hz sample rate
plt.ylim(-2, 2)
numVertLines = mt.ceil(len(ambient) / 1850)
intervals = np.linspace(1850, len(ambient), num=numVertLines)
# dashed vertical lines mark the aggregation windows
plt.vlines(x=intervals, ymin=-3, ymax=2, colors='green', ls='--', lw=2, label='split line')
profile_id_list = reduced_df.profile_id.unique()
profile_id_rand = np.random.choice(profile_id_list, size=1, replace=False)#58
# Inspect the current-norm signal for session 58 with the split markers.
i_s = reduced_df.loc[reduced_df['profile_id'] == 58, 'i_s'].values  #profile_id_rand[0]
fig = plt.figure(figsize=(15, 5))
plt.plot(i_s, label='i_s')  #[1035:1050]
plt.title("Signal of i_s feature of the profile id 58") #.format(profile_id_rand)
plt.legend()
plt.ylabel('values')
plt.xlabel('time')
plt.xticks(2 * 1200 * np.arange(1, 19), [f'{a} min' for a in np.arange(20, 280, 20)])
numVertLines = mt.ceil(len(i_s) / 1850)
intervals = np.linspace(1850, len(i_s), num=numVertLines)
# dashed vertical lines mark the aggregation windows
plt.vlines(x=intervals, ymin=-3, ymax=3.4, colors='green', ls='--', lw=2, label='vline_multiple - partial height')
import statistics
from scipy.stats import skew, kurtosis
### Distributions
plt.rcParams.update({'font.size':22}) # customizing the plt style
# Pairwise view on a 4% sample: univariate densities on the diagonal,
# bivariate KDE contours in the upper triangle.
sample = reduced_df.drop(['profile_id'], axis=1).sample(frac=0.04)
g = sns.PairGrid(sample)
g.map_diag(sns.kdeplot)
g.map_upper(sns.kdeplot, n_levels=6)
Note, however, that these distributions were calculated on a random sample of 4% of the data, so skews and shifts might occur for the full sample.
reduced_df.head()
# Aggregate every session into fixed windows of 1850 samples (~15 min at 2 Hz),
# computing the same six statistics for each signal.
# REFACTOR: the original maintained 37 parallel hand-written lists and filtered
# the same session seven times per profile; a table-driven loop produces the
# identical DataFrame (same column names, same order, same values).
profile_id_list = reduced_df.profile_id.unique()

# (column name in reduced_df, suffix used in the aggregated column names)
signal_specs = [
    ('ambient', 'ambient'),
    ('motor_speed', 'motorSpeed'),
    ('torque', 'torque'),
    ('pm', 'pm'),
    ('stator_yoke', 'statorYoke'),
    ('i_s', 'is'),
]
# (column-name prefix, statistic); the 'skewnees'/'kurtos' spellings are kept
# because later cells reference these exact column names.
stat_specs = [
    ('minimum', min),
    ('maximum', max),
    ('skewnees', skew),
    ('kurtos', kurtosis),
    ('rootMeanSquare', lambda seg: np.sqrt(np.mean(seg ** 2))),
    ('mean', statistics.mean),
]

rows = []
for pro_Ids in profile_id_list:
    # filter the session once, then pull out every needed column
    session = reduced_df.loc[reduced_df['profile_id'] == pro_Ids]
    signals = {suffix: session.loc[:, col].values for col, suffix in signal_specs}
    u_s = session.loc[:, 'u_s'].values
    length = len(signals['is'])
    numIntervals = mt.ceil(length / 1850)
    intervals = np.linspace(1850, length, num=numIntervals)
    start = 0
    for n in intervals:
        n = mt.ceil(n)
        row = []
        for _, suffix in signal_specs:
            segment = signals[suffix][start:n]
            row.extend(fn(segment) for _, fn in stat_specs)
        row.append(statistics.mean(u_s[start:n]))  # only the mean is kept for u_s
        rows.append(row)
        start = n

# Column order: the six stats per signal, in signal order, with mean_us last.
agg_columns = [f'{stat}_{suffix}' for _, suffix in signal_specs for stat, _ in stat_specs]
agg_columns.append('mean_us')
aggregated_Data = pd.DataFrame(data=rows, columns=agg_columns)
The result obtained by aggregating the data, is the following:
aggregated_Data
Now we can observe the correlation between the new statistics.
corr = aggregated_Data.corr()
# generate mask for the upper triangle
# BUG FIX: np.bool was deprecated in NumPy 1.20 and removed in 1.24 — the
# builtin bool is the correct dtype here.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
# generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)
plt.figure(figsize=(100, 100))
_ = sns.heatmap(corr, mask=mask, cmap=cmap, center=0, square=True,
                linewidths=.5, cbar_kws={"shrink": .5}, annot=True)
It could be opportune to remove the features that have very high correlation. We observe that:
The final dataset containing the aggregated data will then be
# Drop the aggregated features that are almost perfectly correlated with others.
finalAggrData = aggregated_Data.drop(['minimum_pm', 'maximum_pm', 'minimum_statorYoke', 'maximum_statorYoke', 'mean_is', 'rootMeanSquare_torque', 'minimum_ambient', 'maximum_ambient'], axis=1)
finalAggrData
# BUG FIX: removed a dangling `finalAggrData.` line — an incomplete expression
# that raised a SyntaxError when the cell/script ran.
corr = finalAggrData.corr()
# generate mask for the upper triangle
# BUG FIX: np.bool was deprecated in NumPy 1.20 and removed in 1.24 — the
# builtin bool is the correct dtype here.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
# generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)
plt.figure(figsize=(100, 100))
_ = sns.heatmap(corr, mask=mask, cmap=cmap, center=0, square=True,
                linewidths=.5, cbar_kws={"shrink": .5}, annot=True)
As we expected, no more high correlations are visible!
#finalAggrData.to_csv('finalAggrData.csv')