code/statistical_analysis_time_domain.py

import timeit
import numpy as np
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt


def convert_to_csv(device, signal):
    """
    This function converts all the raw data files to CSV and gives each one a proper header and file name.
    :param device: can be either 'phone' or 'watch'
    :param signal: can be either 'accel' or 'gyro'
    :return: saves the raw dataset, with a header and a correct file name, as a CSV file
    """
    subject_counter = 0
    for subject_ID in range(1600, 1651):
        data = pd.read_csv(f'./raw/{device}/{signal}/data_{subject_ID}_{signal}_{device}.txt', sep=",", header=None)
        data.columns = ["subject_ID", "activity_ID", "Timestamp", f"x_{device}_{signal}", f"y_{device}_{signal}",
                        f"z_{device}_{signal}"]
        saving_directory = f'{device}_{signal}/S{subject_counter}_{device}_{signal}.csv'
        data.to_csv(saving_directory)
        subject_counter += 1
        print(subject_counter)


# convert_to_csv(device='watch', signal='gyro')


def zero_crossing(window):
    """
    :param window: specific window of the raw dataset for which the zero-crossing count is calculated
    :return: an integer representing the number of zero crossings in the window
    """
    values = np.asarray(window).ravel()      # flatten the single-column window to a 1-D array
    signs = np.sign(values)
    signs[signs == 0] = -1                   # treat exact zeros as negative so they are not double counted
    crossings = np.where(np.diff(signs))[0]  # indices where the sign changes
    return len(crossings)


def mean_crossing_rate(window):
    """
    :param window: specific window of the raw dataset for which the mean-crossing count is calculated
    :return: an integer representing the number of mean crossings in the window
    """
    values = np.asarray(window).ravel()      # flatten the single-column window to a 1-D array
    signs = np.sign(values - values.mean())  # sign of the mean-subtracted signal
    signs[signs == 0] = -1                   # samples exactly at the mean are treated as below it
    crossings = np.where(np.diff(signs))[0]  # indices where the signal crosses its mean
    return len(crossings)
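

# A minimal sanity-check sketch for the two helpers above, run on a small synthetic window.
# The sample values and the demo function name are illustrative only and are not part of the
# original pipeline; the check relies solely on numpy, which is already imported at the top.
def _demo_crossing_counts():
    window = np.array([1.0, -2.0, 3.0, 4.0, -1.0, -2.0, 5.0])
    # sign changes around zero: 1 -> -2, -2 -> 3, 4 -> -1 and -2 -> 5, i.e. 4 zero crossings
    print('zero crossings:', zero_crossing(window))
    # the window mean is 8/7, so the signal crosses its mean only 3 times here
    print('mean crossings:', mean_crossing_rate(window))


# _demo_crossing_counts()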


def statistical_feature_extraction(window_size, signal, axis, device, subject_ID):
    """
    1) This function first loads the raw data from the directory; if the file cannot be found, it prints an
       error message and returns.
    2) Segments the signal into windows of window_size seconds (10-second windows in this project).
    3) Calculates the features min, max, mean, standard deviation, median, variance, zero_crossing and
       mean_crossing for each window of each signal.
    4) Saves all the extracted features of each signal in a CSV file.
    :param window_size: must be in seconds
    :param signal: can be either 'accel' or 'gyro'
    :param axis: 'x', 'y' or 'z'
    :param device: can be either 'phone' or 'watch'
    :param subject_ID: should be any integer between 0 and 50
    :return:
    """
    start_running = timeit.default_timer()
    directory = f'data/row_data/{device}_{signal}/S{subject_ID}_{device}_{signal}.csv'
    sampling_rate = 20                              # Hz
    window_size = int(sampling_rate * window_size)  # convert the window length from seconds to samples
    try:
        raw_signal = pd.read_csv(directory)
    except FileNotFoundError:
        print('Error! Cannot find such a directory.')
        return
    win_count = 0
    total_win_count = 0
    features_for_all_windows = []
    column_title = f'{axis}_{device}_{signal}'
    # activity IDs are the letters 'A'-'S' excluding 'N' (class labels 1-13 and 15-19)
    for class_label in np.append(range(1, 14), range(15, 20)):
        activity_ID = chr(class_label + 64)
        raw_data_one_activity = np.array(raw_signal.loc[raw_signal['activity_ID'] == activity_ID, [column_title]])
        raw_data_one_activity = pd.DataFrame(raw_data_one_activity)
        for data_point in range(0, len(raw_data_one_activity), window_size):
            win_count += 1
            start = data_point
            end = start + window_size
            time_domain_window = raw_data_one_activity[start:end]
            time_mean = pd.Series(time_domain_window.mean()).rename(f'{axis}_{signal}_mean')
            time_min = pd.Series(time_domain_window.min()).rename(f'{axis}_{signal}_min')
            time_max = pd.Series(time_domain_window.max()).rename(f'{axis}_{signal}_max')
            time_std = pd.Series(time_domain_window.std()).rename(f'{axis}_{signal}_std')
            time_median = pd.Series(time_domain_window.median()).rename(f'{axis}_{signal}_median')
            time_variance = pd.Series(time_domain_window.var()).rename(f'{axis}_{signal}_variance')
            zero_crossing_rate = pd.Series(zero_crossing(time_domain_window)).rename(
                f'{axis}_{signal}_zero_crossing')
            mean_crossing = pd.Series(mean_crossing_rate(time_domain_window)).rename(
                f'{axis}_{signal}_mean_crossing')
            activity_id_ = pd.Series(activity_ID).rename('Activity_ID')
            features_for_one_window = pd.concat(
                [time_mean, time_min, time_max, time_std, time_median, time_variance, zero_crossing_rate,
                 mean_crossing, activity_id_], axis=1)
            features_for_all_windows.append(features_for_one_window)
        print('Window count', win_count)
        total_win_count += win_count
        win_count = 0
    features = pd.concat(features_for_all_windows, ignore_index=False)
    print(features)
    save_as_directory = f'feature_label_tables/feature_{device}_{signal}/feature_S{subject_ID}_{axis}_{device}_{signal}.csv'
    features.to_csv(save_as_directory, encoding='utf-8', index=False)
    finish_running = timeit.default_timer()
    print('Total number of windows: ', total_win_count)
    print('Running time: ', finish_running - start_running)


# statistical_feature_extraction(window_size=10, signal='accel', axis='z', device='phone', subject_ID=0)
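

# A minimal sketch of what one row of the per-window feature table above contains, built from a synthetic
# 10-second window (200 samples at 20 Hz). The axis/signal pair ('z', 'accel'), the random data and the
# demo function name are illustrative assumptions; the real tables come from statistical_feature_extraction.
def _demo_one_window_features():
    rng = np.random.default_rng(0)
    window = pd.Series(rng.normal(size=200))   # stand-in for one window of z-axis accelerometer samples
    one_row = pd.DataFrame({'z_accel_mean': [window.mean()],
                            'z_accel_min': [window.min()],
                            'z_accel_max': [window.max()],
                            'z_accel_std': [window.std()],
                            'z_accel_median': [window.median()],
                            'z_accel_variance': [window.var()],
                            'z_accel_zero_crossing': [zero_crossing(window)],
                            'z_accel_mean_crossing': [mean_crossing_rate(window)],
                            'Activity_ID': ['A']})
    print(one_row)


# _demo_one_window_features()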


def feature_extraction_for_all_subjects():
    device_list = ['phone', 'watch']
    signal_list = ['accel', 'gyro']
    axis_list = ['x', 'y', 'z']
    for device in device_list:
        for signal in signal_list:
            for axis in axis_list:
                for subject_ID in range(0, 51):
                    print('calculating: ', device, signal, axis, subject_ID)
                    print('====================================================='
                          '========================================================')
                    statistical_feature_extraction(window_size=10, signal=signal, axis=axis, device=device,
                                                   subject_ID=subject_ID)


# feature_extraction_for_all_subjects()


def input_features_labels(device, signal, subject_ID):
    """
    This function prepares the feature matrix and the corresponding label vector for the classifiers:
    1) drops empty fields
    2) drops the two less informative features, zero_crossing and mean_crossing
    3) splits the data into train and test sets
    4) finally, normalizes the features with a StandardScaler fitted on the training set
    :param device: 'phone' or 'watch'
    :param signal: 'accel' or 'gyro'
    :param subject_ID: int between 0 and 50
    :return: dataframes containing normalized_feature_train, normalized_feature_test, label_train, label_test,
             normalized_all_feature, all_labels
    """
    directory = f'data/feature_label_tables/feature_{device}_{signal}/feature_S{subject_ID}_all_axis_{device}_{signal}'
    data = pd.read_csv(directory)
    data = data.dropna()
    # since the zero_crossing and mean_crossing values are 0 and 200, respectively, regardless of the signal
    # and the activity, these two features are ignored.
    features = data.drop(columns=[f'x_{signal}_zero_crossing', f'x_{signal}_mean_crossing',
                                  f'y_{signal}_zero_crossing', f'y_{signal}_mean_crossing',
                                  f'z_{signal}_zero_crossing', f'z_{signal}_mean_crossing',
                                  'Activity_ID'])
    all_labels = data[['Activity_ID']]
    feature_train, feature_test, label_train, label_test = train_test_split(
        features, all_labels, test_size=0.2, shuffle=True)
    # feature normalization: fit the scaler on the training set only, then apply it everywhere
    scaler = StandardScaler().fit(feature_train)
    normalized_feature_train = scaler.transform(feature_train)
    normalized_feature_test = scaler.transform(feature_test)
    normalized_all_feature = scaler.transform(features)
    # convert 'numpy.ndarray' back to pandas DataFrames
    normalized_feature_train = pd.DataFrame(normalized_feature_train)
    normalized_feature_test = pd.DataFrame(normalized_feature_test)
    normalized_all_feature = pd.DataFrame(normalized_all_feature)
    return normalized_feature_train, normalized_feature_test, label_train, label_test, normalized_all_feature, all_labels


# input_features_labels(device='phone', signal='accel', subject_ID=10)
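

# A minimal sketch of the normalization convention used in input_features_labels: the StandardScaler is
# fitted on the training split only and then applied to both splits, so no test-set statistics leak into
# the training data. The toy feature matrix, toy labels and demo function name are illustrative assumptions.
def _demo_scaling_convention():
    toy_features = pd.DataFrame({'f1': [1.0, 2.0, 3.0, 4.0], 'f2': [10.0, 20.0, 30.0, 40.0]})
    toy_labels = pd.DataFrame({'Activity_ID': ['A', 'A', 'B', 'B']})
    f_train, f_test, l_train, l_test = train_test_split(toy_features, toy_labels, test_size=0.5, shuffle=True)
    scaler = StandardScaler().fit(f_train)     # the mean and std come from the training rows only
    print(pd.DataFrame(scaler.transform(f_train)))
    print(pd.DataFrame(scaler.transform(f_test)))


# _demo_scaling_convention()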


def plotting_confusion_matrix(conf_matrix, evaluation_mode, subject_ID):
    """
    :param conf_matrix: confusion matrix as returned by sklearn.metrics.confusion_matrix
    :param evaluation_mode: 'personal' for 10-fold cross-validation, 'impersonal' for LOSO cross-validation
    :param subject_ID: int between 0 and 50
    :return:
    """
    df_cm = pd.DataFrame(conf_matrix, index=[i for i in "ABCDEFGHIJKLMOPQRS"],
                         columns=[i for i in "ABCDEFGHIJKLMOPQRS"])
    plt.figure(figsize=(10, 10))
    sns.heatmap(df_cm, annot=True)
    plt.show()
    # plt.savefig(f'figures/{evaluation_mode}/{subject_ID}.jpg')


# plotting_confusion_matrix(conf_matrix)


def accuracy_per_class(conf_matrix, row_index, to_print=True):
    """
    code reference: https://stackoverflow.com/questions/35572000/how-can-i-plot-a-confusion-matrix
    :param conf_matrix: np.array containing the confusion matrix (rows = true labels, columns = predictions)
    :param row_index: integer selecting the class for the one-vs-the-rest calculation
    :param to_print: if True, prints the TP, FN, FP and TN counts
    :return: a float representing the one-vs-the-rest accuracy of the selected class
    """
    TP = conf_matrix[row_index, row_index]     # samples of class i correctly labeled as i
    FN = conf_matrix[row_index, :].sum() - TP  # samples of class i labeled as some other class
    FP = conf_matrix[:, row_index].sum() - TP  # samples of other classes labeled as i
    TN = conf_matrix.sum().sum() - TP - FN - FP
    if to_print:
        print('TP: {}'.format(TP))
        print('FN: {}'.format(FN))
        print('FP: {}'.format(FP))
        print('TN: {}'.format(TN))
    accuracy = (TN + TP) / (TP + FN + FP + TN)
    return accuracy
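

# A minimal worked example for accuracy_per_class on a hand-sized 3x3 confusion matrix (rows = true labels,
# columns = predictions). For class 0: TP = 5, FN = 1 + 0 = 1, FP = 2 + 0 = 2, TN = 12, so the one-vs-rest
# accuracy is (5 + 12) / 20 = 0.85. The matrix values and the demo function name are made up for illustration.
def _demo_accuracy_per_class():
    toy_cm = np.array([[5, 1, 0],
                       [2, 6, 1],
                       [0, 1, 4]])
    for row in range(toy_cm.shape[0]):
        print(f'Accuracy for class {row}: ',
              accuracy_per_class(conf_matrix=toy_cm, row_index=row, to_print=False))


# _demo_accuracy_per_class()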


def personal_model_rf(device, signal, subject_ID):
    """
    This function trains a random forest classifier for one subject, tuning its hyper-parameters with stratified
    10-fold cross-validation, and evaluates it on the held-out test split. Since the model is built and evaluated
    per subject, it is called the personal model.
    :param device: can be either 'phone' or 'watch'
    :param signal: can be either 'accel' or 'gyro'
    :param subject_ID: should be any integer between 0 and 50
    :return:
    """
    # get the normalized features and labels for the train and test sets from the "input_features_labels" function
    feature_train, feature_test, label_train, label_test, _, _ = input_features_labels(device=device, signal=signal,
                                                                                        subject_ID=subject_ID)
    label_train = label_train.values.ravel()
    label_test = label_test.values.ravel()
    # classifier configuration
    cross_validation = StratifiedKFold(n_splits=10, shuffle=True, random_state=50)
    rf = RandomForestClassifier(random_state=0)
    hyperparams = {"n_estimators": [30, 50, 100], "max_depth": [10, 30, 50]}
    clf = GridSearchCV(estimator=rf, param_grid=hyperparams, scoring="accuracy", cv=cross_validation, refit=True,
                       verbose=0)
    clf.fit(feature_train, label_train)
    print('Best parameters: ', clf.best_params_)
    prediction = clf.predict(feature_test)
    report = sklearn.metrics.classification_report(label_test, prediction, digits=3, zero_division=1)
    conf_matrix = confusion_matrix(label_test, prediction)
    print(conf_matrix.shape[0])
    # plotting_confusion_matrix(conf_matrix=conf_matrix, evaluation_mode='personal', subject_ID=subject_ID)
    print(report)
    for row in range(conf_matrix.shape[0]):
        print(f'Accuracy for class {row}: ',
              accuracy_per_class(conf_matrix=conf_matrix, row_index=row, to_print=False))


for subject in range(0, 51):
    print(f'***********************For subject ID ({subject}) ***********************************')
    personal_model_rf(device='phone', signal='accel', subject_ID=subject)
    # personal_model_rf(device='phone', signal='gyro', subject_ID=subject)
    # personal_model_rf(device='watch', signal='accel', subject_ID=subject)
    # personal_model_rf(device='watch', signal='gyro', subject_ID=subject)


def LOSO_cross_validation(signal, device):
    """
    This function evaluates the model with leave-one-subject-out (LOSO) cross-validation, i.e. the impersonal
    model: for each subject, the classifier is trained on the features of all other subjects and tested on the
    left-out subject.
    :param device: can be either 'phone' or 'watch'
    :param signal: can be either 'accel' or 'gyro'
    :return:
    """
    rf = RandomForestClassifier(random_state=0)
    hyperparams = {"n_estimators": [30, 50, 100], "max_depth": [10, 30, 50]}
    # cv=None means the hyper-parameter search uses the default 5-fold cross-validation
    clf = GridSearchCV(estimator=rf, param_grid=hyperparams, scoring="accuracy", cv=None, refit=True, verbose=0)
    all_subjects_but_one = []
    for subject_out in range(0, 51):
        for subject_in in range(0, 51):
            if subject_in == subject_out:
                print(f'Leaving subject {subject_out} out: \n ==================================================')
            else:
                _, _, _, _, normalized_all_feature, all_labels = input_features_labels(device=device,
                                                                                       signal=signal,
                                                                                       subject_ID=subject_in)
                # reset the label index so the scaled features and the labels line up row by row
                feature_label = pd.concat([normalized_all_feature, all_labels.reset_index(drop=True)], axis=1)
                all_subjects_but_one.append(feature_label)
        all_subjects_but_one = pd.concat(all_subjects_but_one, axis=0)
        all_subjects_but_one = all_subjects_but_one.dropna()
        feature_train = all_subjects_but_one.drop(columns=['Activity_ID'])
        label_train = all_subjects_but_one['Activity_ID']
        print(feature_train)
        print(label_train)
        _, _, _, _, feature_test, label_test = input_features_labels(device=device,
                                                                     signal=signal, subject_ID=subject_out)
        print(feature_test)
        print(label_test)
        clf.fit(feature_train, label_train)
        print('Best parameters: ', clf.best_params_)
        prediction = clf.predict(feature_test)
        report = sklearn.metrics.classification_report(label_test, prediction, digits=3, zero_division=1)
        conf_matrix = confusion_matrix(label_test, prediction)
        print(conf_matrix.shape[0])
        # plotting_confusion_matrix(conf_matrix=conf_matrix, evaluation_mode='impersonal', subject_ID=subject_out)
        print(report)
        for row in range(conf_matrix.shape[0]):
            print(f'Accuracy for class {row}: ',
                  accuracy_per_class(conf_matrix=conf_matrix, row_index=row, to_print=False))
        all_subjects_but_one = []


LOSO_cross_validation(signal='accel', device='phone')
LOSO_cross_validation(signal='gyro', device='phone')
LOSO_cross_validation(signal='accel', device='watch')
LOSO_cross_validation(signal='gyro', device='watch')
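

# The LOSO loop above builds the subject-wise train/test partitions by hand. As an illustration only (this
# file does not use it), the same split scheme can be expressed with sklearn's LeaveOneGroupOut, where the
# group labels are the subject IDs. The toy arrays and the demo function name are assumptions.
def _demo_leave_one_group_out():
    from sklearn.model_selection import LeaveOneGroupOut
    X = np.arange(12).reshape(6, 2)               # six windows with two features each
    y = np.array(['A', 'B', 'A', 'B', 'A', 'B'])  # activity labels
    groups = np.array([0, 0, 1, 1, 2, 2])         # subject ID of each window
    for train_idx, test_idx in LeaveOneGroupOut().split(X, y, groups=groups):
        print('train on subjects', np.unique(groups[train_idx]), '- test on subject', np.unique(groups[test_idx]))


# _demo_leave_one_group_out()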