Diff of /py_version/models_ml.py [000000] .. [39d39d]

--- a
+++ b/py_version/models_ml.py
@@ -0,0 +1,179 @@
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sns
+
+# Machine Learning libraries
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.svm import SVC
+from sklearn.linear_model import LogisticRegression
+from sklearn.neighbors import KNeighborsClassifier
+
+# Model evaluation libraries
+from sklearn.model_selection import cross_val_score
+from sklearn.metrics import accuracy_score
+from sklearn.metrics import confusion_matrix
+
+
+
+### Random Forest Classifier
+rf = RandomForestClassifier()
+
+### Support Vector Classifier
+svc = SVC()
+
+### Logistic Regression
+lr = LogisticRegression(solver='liblinear')
+
+### K Nearest Neighbors
+knn = KNeighborsClassifier()
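+# All four models use scikit-learn defaults (for recent versions: RF with
+# 100 trees, SVC with an RBF kernel and C=1.0, kNN with k=5). None is
+# seeded, so RF scores vary slightly between runs; pass random_state=...
+# to RandomForestClassifier for reproducibility.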
+
+x_data = np.load('featurized_data.npy', allow_pickle=True)
+y_data = np.load('labels.npy', allow_pickle=True)
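+# Sanity check -- the per-class slicing below implies 7 classes of 20
+# samples with 320 features each (an inference from this script, not a
+# documented property of the .npy files):
+assert x_data.shape[0] == 140 and y_data.shape[0] == 140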
+
+if __name__ == "__main__":
+    rf_f_scores = cross_val_score(rf, x_data, y_data, cv=5)
+    rf_f_acc = np.mean(rf_f_scores)
+
+    svc_f_scores = cross_val_score(svc, x_data, y_data, cv=5)
+    svc_f_acc = np.mean(svc_f_scores)
+
+    lr_f_scores = cross_val_score(lr, x_data, y_data, cv=5)
+    lr_f_acc = np.mean(lr_f_scores)
+
+    knn_f_scores = cross_val_score(knn, x_data, y_data, cv=5)
+    knn_f_acc = np.mean(knn_f_scores)
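+    # cross_val_score returns one accuracy per fold; the mean hides the
+    # fold-to-fold spread. Added sketch (not in the original script): print
+    # mean +/- std per model.
+    for name, scores in [('RF', rf_f_scores), ('SVC', svc_f_scores),
+                         ('LR', lr_f_scores), ('kNN', knn_f_scores)]:
+        print(f'{name} 5-fold accuracy: {scores.mean():.3f} +/- {scores.std():.3f}')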
+
+    # Visualize 5-fold CV performance per model
+    data_r = {'RF': rf_f_acc, 'SVC': svc_f_acc, 'LR': lr_f_acc, 'kNN': knn_f_acc}
+    algorithm = list(data_r.keys())
+    accuracy = list(data_r.values())
+    fig = plt.figure(figsize=(10, 5))
+    plt.bar(algorithm, accuracy, color='red', width=0.4)
+    plt.xlabel("ML models", fontsize=18)
+    plt.ylabel("Mean 5-fold CV accuracy", fontsize=18)
+    plt.title("Cross-validation results", fontsize=18)
+    plt.xticks(fontsize=14)
+    plt.yticks(fontsize=14)
+    plt.ylim([0, 1])
+    plt.show()
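+    # plt.show() blocks until the window is closed; on a headless machine,
+    # call plt.savefig('cv_accuracy.png') before plt.show() to write the
+    # figure to disk instead (the filename here is illustrative).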
+
+    print(f'Random Forest accuracy: {rf_f_acc * 100:.2f}%')
+    print(f'Support Vector Classifier accuracy: {svc_f_acc * 100:.2f}%')
+    print(f'Logistic Regression accuracy: {lr_f_acc * 100:.2f}%')
+    print(f'K Nearest Neighbors accuracy: {knn_f_acc * 100:.2f}%')
+
+
+    ### Retraining RF on a fixed per-class train/test split (16 train / 4 test per class)
+    X_train = []
+    X_test = []
+    y_train = []
+    y_test = []
+    for i in range(7):
+        current_class_data = x_data[i*20: i*20 + 20]
+        X_train.append(current_class_data[0: 16])
+        X_test.append(current_class_data[16: ])
+        current_class_labels = y_data[i*20: i*20 + 20]
+        y_train.append(current_class_labels[0: 16])
+        y_test.append(current_class_labels[16: ])
+    X_train = np.array(X_train).reshape(-1, 320)
+    X_test = np.array(X_test).reshape(-1, 320)
+    y_train = np.array(y_train).reshape(-1)
+    y_test = np.array(y_test).reshape(-1)
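+    # The slices above form a deterministic 16/4 per-class split with no
+    # shuffling, flattened to (112, 320) train and (28, 320) test arrays.
+    # A randomized, stratified equivalent (a sketch, not what this script
+    # uses) would be:
+    #   from sklearn.model_selection import train_test_split
+    #   X_train, X_test, y_train, y_test = train_test_split(
+    #       x_data, y_data, test_size=0.2, stratify=y_data, random_state=0)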
+
+    rf = RandomForestClassifier()
+    rf.fit(X_train, y_train)
+    predictions = rf.predict(X_test)
+    accuracy = accuracy_score(y_test, predictions)  # (y_true, y_pred) order
+    print('Accuracy: ', accuracy)
+
+    # Confusion Matrix
+    conf_matrix = confusion_matrix(y_test, predictions)
+    df_cm = pd.DataFrame(conf_matrix, index=list("0123456"), columns=list("0123456"))
+    plt.figure(figsize=(10, 7))
+    sns.set(font_scale=1.4)
+    sns.heatmap(df_cm, annot=True, annot_kws={"size": 16})
+    plt.ylabel('True label')
+    plt.xlabel('Predicted label')
+    plt.show()
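+    # With only 4 test samples per class, raw counts are coarse. Added
+    # sketch (not in the original): per-class precision/recall/F1 via
+    # sklearn's classification_report.
+    from sklearn.metrics import classification_report
+    print(classification_report(y_test, predictions))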
+
+
+    # Dropping class 4 datapoints, then rebuilding labels as contiguous
+    # 0-5 (20 samples per class); new labels 0-5 correspond to original
+    # classes 0, 1, 2, 3, 5, 6, matching the "012356" axis labels below.
+    idx = (y_data != 4)
+    x_data = x_data[idx]
+    y_data = np.array([i for i in range(6) for j in range(20)])
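+    # Sanity check for the relabeling above (again assumes 20 samples per
+    # class in the original data):
+    assert x_data.shape[0] == y_data.shape[0] == 120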
+
+    # Retrain shallow ML algorithms without class 4
+    rf = RandomForestClassifier()
+    rf_f_scores = cross_val_score(rf, x_data, y_data, cv=5)
+    rf_f_acc = np.mean(rf_f_scores)
+
+    svc = SVC()
+    svc_f_scores = cross_val_score(svc, x_data, y_data, cv=5)
+    svc_f_acc = np.mean(svc_f_scores)
+
+    lr = LogisticRegression(solver='liblinear')
+    lr_f_scores = cross_val_score(lr, x_data, y_data, cv=5)
+    lr_f_acc = np.mean(lr_f_scores)
+
+    knn = KNeighborsClassifier()
+    knn_f_scores = cross_val_score(knn, x_data, y_data, cv=5)
+    knn_f_acc = np.mean(knn_f_scores)
+    
+    
+    data_r = {'RF': rf_f_acc, 'SVC': svc_f_acc, 'LR': lr_f_acc, 'kNN': knn_f_acc}
+    algorithm = list(data_r.keys())
+    accuracy = list(data_r.values())
+    fig = plt.figure(figsize=(10, 5))
+    plt.bar(algorithm, accuracy, color='red', width=0.4)
+    plt.xlabel("ML models", fontsize=18)
+    plt.ylabel("Mean 5-fold CV accuracy", fontsize=18)
+    plt.title("Cross-validation results without class 4", fontsize=18)
+    plt.xticks(fontsize=14)
+    plt.yticks(fontsize=14)
+    plt.ylim([0, 1])
+    plt.show()
+
+
+    print(f'Random Forest accuracy: {rf_f_acc * 100:.2f}%')
+    print(f'Support Vector Classifier accuracy: {svc_f_acc * 100:.2f}%')
+    print(f'Logistic Regression accuracy: {lr_f_acc * 100:.2f}%')
+    print(f'K Nearest Neighbors accuracy: {knn_f_acc * 100:.2f}%')
+
+
+
+    # Creating train and test sets without class 4 (same 16/4 per-class split as above)
+    X_train = []
+    X_test = []
+    y_train = []
+    y_test = []
+    for i in range(6):
+        current_class_data = x_data[i*20: i*20 + 20]
+        X_train.append(current_class_data[0: 16])
+        X_test.append(current_class_data[16: ])
+        current_class_labels = y_data[i*20: i*20 + 20]
+        y_train.append(current_class_labels[0: 16])
+        y_test.append(current_class_labels[16: ])
+    X_train = np.array(X_train).reshape(-1, 320)
+    X_test = np.array(X_test).reshape(-1, 320)
+    y_train = np.array(y_train).reshape(-1)
+    y_test = np.array(y_test).reshape(-1)
+
+    # Training the best model (Random Forest)
+    rf = RandomForestClassifier()
+    rf.fit(X_train, y_train)
+    predictions = rf.predict(X_test)
+    accuracy = accuracy_score(y_test, predictions)  # (y_true, y_pred) order
+    print('Accuracy: ', accuracy)
+
+    # Confusion matrix of the retrained model; tick labels "012356" map the
+    # contiguous labels 0-5 back to the original class ids (class 4 dropped)
+    conf_matrix = confusion_matrix(y_test, predictions)
+    df_cm = pd.DataFrame(conf_matrix, index=list("012356"), columns=list("012356"))
+    plt.figure(figsize=(10, 7))
+    sns.set(font_scale=1.4)
+    sns.heatmap(df_cm, annot=True, annot_kws={"size": 16})
+    plt.ylabel('True label')
+    plt.xlabel('Predicted label')
+    plt.show()
\ No newline at end of file