a b/py_version/models_ml.py
1
import numpy as np
2
import matplotlib.pyplot as plt
3
import pandas as pd
4
import seaborn as sn
5
6
# Machine Learning libraries
7
from sklearn.ensemble import RandomForestClassifier
8
from sklearn.svm import SVC
9
from sklearn.linear_model import LogisticRegression
10
from sklearn.neighbors import KNeighborsClassifier
11
12
# Model evaluation libraries
13
from sklearn.model_selection import cross_val_score
14
from sklearn.metrics import accuracy_score
15
from sklearn.metrics import confusion_matrix
16
17
18
19
# Classifiers compared by this script. Each is built with scikit-learn's
# default settings, except logistic regression, which is pinned to the
# 'liblinear' solver.
rf = RandomForestClassifier()                  # random forest
svc = SVC()                                    # support vector classifier
lr = LogisticRegression(solver='liblinear')    # logistic regression
knn = KNeighborsClassifier()                   # k-nearest neighbours
30
31
# Feature matrix and class labels. Both files are read at import time, using
# paths relative to the current working directory.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects, so
# these files must come from a trusted source - confirm whether pickling is
# actually needed for these arrays.
x_data = np.load('featurized_data.npy', allow_pickle=True)
y_data = np.load('labels.npy', allow_pickle=True)
33
34
# Long display names used when printing each model's cross-validation accuracy.
_DISPLAY_NAMES = {
    'RF': 'Random Forest',
    'SVC': 'Support Vector Classifier',
    'LR': 'Logistic Regression',
    'kNN': 'K Nearest Neighbours',
}


def split_per_class(features, labels, train_per_class=16):
    """Split the data into train and test sets class by class.

    For every distinct label, the first ``train_per_class`` rows carrying that
    label (in stored order) go to the training set and the remaining rows go
    to the test set. For data stored as contiguous blocks of 20 rows per
    class this selects the same 16/4 rows per class as slicing each block
    positionally, without hard-coding the number of classes, the block size
    or the feature width.

    Args:
        features: Array-like with one row per sample.
        labels: Array-like of class labels, one per row of ``features``.
        train_per_class: Number of leading rows of each class used for
            training.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` of NumPy arrays.
    """
    features = np.asarray(features)
    labels = np.asarray(labels)
    train_rows = []
    test_rows = []
    for class_id in np.unique(labels):
        rows = np.flatnonzero(labels == class_id)
        train_rows.extend(rows[:train_per_class])
        test_rows.extend(rows[train_per_class:])
    # Explicit integer dtype keeps the indexing valid when a list is empty.
    train_rows = np.array(train_rows, dtype=int)
    test_rows = np.array(test_rows, dtype=int)
    return (features[train_rows], features[test_rows],
            labels[train_rows], labels[test_rows])


def drop_class(features, labels, class_id):
    """Return ``(features, labels)`` without the rows labelled ``class_id``.

    The surviving labels keep their original values: the same boolean mask is
    applied to both arrays, so rows and labels cannot drift apart.
    """
    features = np.asarray(features)
    labels = np.asarray(labels)
    keep = labels != class_id
    return features[keep], labels[keep]


def _cross_validate(models, features, labels, folds=5):
    """Return ``{name: mean accuracy}`` from ``folds``-fold cross-validation.

    Models are evaluated in the iteration order of ``models``.
    """
    return {
        name: np.mean(cross_val_score(model, features, labels, cv=folds))
        for name, model in models.items()
    }


def _report_cv_scores(scores):
    """Show a bar chart of the mean accuracies, then print them as percentages."""
    plt.figure(figsize=(10, 5))
    plt.bar(list(scores), list(scores.values()), color='red', width=0.4)
    plt.xlabel("ML models", fontsize=18)
    plt.ylabel("5 fold accuracy", fontsize=18)
    plt.title("Result", fontsize=18)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.ylim([0, 1])
    plt.show()

    for short_name, mean_accuracy in scores.items():
        long_name = _DISPLAY_NAMES.get(short_name, short_name)
        print(f'{long_name} Accuracy: ', mean_accuracy * 100)


def _holdout_random_forest(features, labels):
    """Train a random forest on a per-class hold-out split and report on it.

    Prints the test accuracy and shows the confusion matrix as a heatmap
    (rows are true labels, columns are predicted labels).
    """
    X_train, X_test, y_train, y_test = split_per_class(features, labels)

    forest = RandomForestClassifier()
    forest.fit(X_train, y_train)
    predictions = forest.predict(X_test)
    # accuracy_score is documented as (y_true, y_pred).
    print('Accuracy: ', accuracy_score(y_test, predictions))

    # Take the class list from the data so the tick labels always line up
    # with the rows/columns of the matrix.
    class_ids = np.unique(np.asarray(labels))
    conf_matrix = confusion_matrix(y_test, predictions, labels=class_ids)
    tick_labels = [str(class_id) for class_id in class_ids]
    df_cm = pd.DataFrame(conf_matrix, index=tick_labels, columns=tick_labels)
    plt.figure(figsize=(10, 7))
    sn.set(font_scale=1.4)
    sn.heatmap(df_cm, annot=True, annot_kws={"size": 16})
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()


def main():
    """Compare the four classifiers, with and without class 4.

    Each round runs 5-fold cross-validation for every model, plots and prints
    the mean accuracies, then trains a random forest on a fixed per-class
    hold-out split (no shuffling) and shows its confusion matrix.
    """
    features, labels = x_data, y_data

    # Round 1: all classes, using the module-level estimators.
    scores = _cross_validate(
        {'RF': rf, 'SVC': svc, 'LR': lr, 'kNN': knn}, features, labels)
    _report_cv_scores(scores)
    _holdout_random_forest(features, labels)

    # Round 2: same evaluation with the class-4 samples removed and freshly
    # constructed estimators.
    features, labels = drop_class(features, labels, 4)
    fresh_models = {
        'RF': RandomForestClassifier(),
        'SVC': SVC(),
        'LR': LogisticRegression(solver='liblinear'),
        'kNN': KNeighborsClassifier(),
    }
    scores = _cross_validate(fresh_models, features, labels)
    _report_cv_scores(scores)
    _holdout_random_forest(features, labels)


if __name__ == "__main__":
    main()
178
179