#!/usr/bin/env python
# coding: utf-8

# In[1]:


import pandas as pd
import pickle
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings

# Silence the UndefinedMetricWarning raised when precision/recall is computed
# for a class a model never predicts (see the note in the Logistic Regression
# cell below).
warnings.filterwarnings('ignore')


def metric_table(model_list, name_list, y_data, X_data):
    """Score each fitted model on its own train/test split and return a styled
    DataFrame with the best value in each column highlighted."""
    result = []
    for model, name, (y_train, y_test), (X_train, X_test) in zip(
            model_list, name_list, y_data, X_data):
        # Predict once per split instead of once per metric.
        train_pred = model.predict(X_train)
        test_pred = model.predict(X_test)
        result.append([
            name,
            accuracy_score(y_train, train_pred) * 100,
            accuracy_score(y_test, test_pred) * 100,
            # Weighted averages, so class imbalance is reflected in the scores.
            recall_score(y_test, test_pred, average='weighted') * 100,
            precision_score(y_test, test_pred, average='weighted') * 100,
            f1_score(y_test, test_pred, average='weighted') * 100,
        ])
    df = pd.DataFrame(data=result, columns=['Model', 'Training Accuracy %',
                                            'Testing Accuracy %', 'Testing Recall %',
                                            'Testing Precision %', 'Testing F1 %'])
    df = df.set_index('Model')
    return df.style.highlight_max(color='lightgreen', axis=0)


# In[2]:


# Load the train/test splits saved for each feature-extraction method:
# bag-of-words (bow), TF-IDF (tf), hashing vectorizer (hash) and word2vec (w2v).
X_bow_train = pd.read_csv('X_bow_train.csv')
X_bow_test = pd.read_csv('X_bow_test.csv')
y_bow_train = pd.read_csv('y_bow_train.csv')
y_bow_test = pd.read_csv('y_bow_test.csv')

X_tf_train = pd.read_csv('X_tf_train.csv')
X_tf_test = pd.read_csv('X_tf_test.csv')
y_tf_train = pd.read_csv('y_tf_train.csv')
y_tf_test = pd.read_csv('y_tf_test.csv')

X_hash_train = pd.read_csv('X_hash_train.csv')
X_hash_test = pd.read_csv('X_hash_test.csv')
y_hash_train = pd.read_csv('y_hash_train.csv')
y_hash_test = pd.read_csv('y_hash_test.csv')

X_w2v_train = pd.read_csv('X_w2v_train.csv')
X_w2v_test = pd.read_csv('X_w2v_test.csv')
y_w2v_train = pd.read_csv('y_w2v_train.csv')
y_w2v_test = pd.read_csv('y_w2v_test.csv')
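

# The cells below assume each X/y pair was written out from the same
# train/test split. A minimal sanity-check sketch (an addition, not in the
# original notebook) that asserts the row counts line up:

# In[ ]:


for X_part, y_part in [(X_bow_train, y_bow_train), (X_bow_test, y_bow_test),
                       (X_tf_train, y_tf_train), (X_tf_test, y_tf_test),
                       (X_hash_train, y_hash_train), (X_hash_test, y_hash_test),
                       (X_w2v_train, y_w2v_train), (X_w2v_test, y_w2v_test)]:
    assert len(X_part) == len(y_part), 'features and labels are misaligned'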


# ### Random Forest with the word2vec features is the best of the Random Forest classifiers

# In[3]:


rf_bow = pickle.load(open('rf_bow.pkl', 'rb'))
rf_tf = pickle.load(open('rf_tf.pkl', 'rb'))
rf_hash = pickle.load(open('rf_hash.pkl', 'rb'))
rf_w2v = pickle.load(open('rf_w2v.pkl', 'rb'))

model_list = [rf_bow, rf_tf, rf_hash, rf_w2v]
name_list = ["Random Forest with bow", "Random Forest with tf_idf",
             "Random Forest with hash", "Random Forest with word2vec"]
y_data = [[y_bow_train, y_bow_test], [y_tf_train, y_tf_test],
          [y_hash_train, y_hash_test], [y_w2v_train, y_w2v_test]]
X_data = [[X_bow_train, X_bow_test], [X_tf_train, X_tf_test],
          [X_hash_train, X_hash_test], [X_w2v_train, X_w2v_test]]
metric_table(model_list, name_list, y_data, X_data)
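

# The aggregate scores above hide per-class behaviour. A short sketch (an
# addition, not from the original notebook) of a confusion matrix for the
# strongest Random Forest variant; .values.ravel() flattens the single-column
# label DataFrame into the 1-D array scikit-learn expects.

# In[ ]:


from sklearn.metrics import confusion_matrix
confusion_matrix(y_w2v_test.values.ravel(), rf_w2v.predict(X_w2v_test))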


# ### Logistic Regression with the bag-of-words features is the best of the Logistic Regression classifiers

# In[4]:


# The suppressed warnings indicate that there are some classes these
# classifiers never predict; since the data is imbalanced and those classes
# are rare, the accuracy is barely affected.
lr_bow = pickle.load(open('lr_bow.pkl', 'rb'))
lr_tf = pickle.load(open('lr_tf.pkl', 'rb'))
lr_hash = pickle.load(open('lr_hash.pkl', 'rb'))
lr_w2v = pickle.load(open('lr_w2v.pkl', 'rb'))

model_list = [lr_bow, lr_tf, lr_hash, lr_w2v]
name_list = ["Logistic Regression with bow", "Logistic Regression with tf_idf",
             "Logistic Regression with hash", "Logistic Regression with word2vec"]
y_data = [[y_bow_train, y_bow_test], [y_tf_train, y_tf_test],
          [y_hash_train, y_hash_test], [y_w2v_train, y_w2v_test]]
X_data = [[X_bow_train, X_bow_test], [X_tf_train, X_tf_test],
          [X_hash_train, X_hash_test], [X_w2v_train, X_w2v_test]]
metric_table(model_list, name_list, y_data, X_data)
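

# To see exactly which rare classes go unpredicted (the source of the
# suppressed warnings), a per-class report helps. A sketch, not part of the
# original notebook; zero_division=0 assumes scikit-learn >= 0.22.

# In[ ]:


from sklearn.metrics import classification_report
print(classification_report(y_bow_test.values.ravel(),
                            lr_bow.predict(X_bow_test), zero_division=0))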


# ### Support Vector Machine with the tf-idf features is the best of the SVCs

# In[5]:


svc_bow = pickle.load(open('svc_bow.pkl', 'rb'))
svc_tf = pickle.load(open('svc_tf.pkl', 'rb'))
svc_hash = pickle.load(open('svc_hash.pkl', 'rb'))
svc_w2v = pickle.load(open('svc_w2v.pkl', 'rb'))

model_list = [svc_bow, svc_tf, svc_hash, svc_w2v]
name_list = ["SVC with bow", "SVC with tf_idf", "SVC with hash", "SVC with word2vec"]
y_data = [[y_bow_train, y_bow_test], [y_tf_train, y_tf_test],
          [y_hash_train, y_hash_test], [y_w2v_train, y_w2v_test]]
X_data = [[X_bow_train, X_bow_test], [X_tf_train, X_tf_test],
          [X_hash_train, X_hash_test], [X_w2v_train, X_w2v_test]]
metric_table(model_list, name_list, y_data, X_data)


# ### K Nearest Neighbours with the bag-of-words features is the best of the KNNs

# In[6]:


knn_bow = pickle.load(open('knn_bow.pkl', 'rb'))
knn_tf = pickle.load(open('knn_tf.pkl', 'rb'))
knn_hash = pickle.load(open('knn_hash.pkl', 'rb'))
knn_w2v = pickle.load(open('knn_w2v.pkl', 'rb'))

model_list = [knn_bow, knn_tf, knn_hash, knn_w2v]
name_list = ["KNN with bow", "KNN with tf_idf", "KNN with hash", "KNN with word2vec"]
y_data = [[y_bow_train, y_bow_test], [y_tf_train, y_tf_test],
          [y_hash_train, y_hash_test], [y_w2v_train, y_w2v_test]]
X_data = [[X_bow_train, X_bow_test], [X_tf_train, X_tf_test],
          [X_hash_train, X_hash_test], [X_w2v_train, X_w2v_test]]
metric_table(model_list, name_list, y_data, X_data)


# ### Bernoulli Naive Bayes with the tf_idf features is the best of the naive Bayes models

# In[7]:


bnb_bow = pickle.load(open('bnb_bow.pkl', 'rb'))
bnb_tf = pickle.load(open('bnb_tf.pkl', 'rb'))
bnb_hash = pickle.load(open('bnb_hash.pkl', 'rb'))
bnb_w2v = pickle.load(open('bnb_w2v.pkl', 'rb'))

model_list = [bnb_bow, bnb_tf, bnb_hash, bnb_w2v]
name_list = ["Bernoulli Naive Bayes with bow", "Bernoulli Naive Bayes with tf_idf",
             "Bernoulli Naive Bayes with hash", "Bernoulli Naive Bayes with word2vec"]
y_data = [[y_bow_train, y_bow_test], [y_tf_train, y_tf_test],
          [y_hash_train, y_hash_test], [y_w2v_train, y_w2v_test]]
X_data = [[X_bow_train, X_bow_test], [X_tf_train, X_tf_test],
          [X_hash_train, X_hash_test], [X_w2v_train, X_w2v_test]]
metric_table(model_list, name_list, y_data, X_data)


# ### In conclusion, Random Forest with the word2vec features is the overall winner, with the highest testing accuracy, precision, recall and F1 score

# In[8]:


### Compare the best classifier from each family head to head
model_list = [rf_w2v, lr_bow, svc_tf, knn_bow, bnb_tf]
name_list = ["Random Forest with word2vec", "Logistic Regression with bag-of-words",
             "SVC with tf_idf", "KNN with bag-of-words", "Bernoulli Naive Bayes with tf_idf"]
y_data = [[y_w2v_train, y_w2v_test], [y_bow_train, y_bow_test], [y_tf_train, y_tf_test],
          [y_bow_train, y_bow_test], [y_tf_train, y_tf_test]]
X_data = [[X_w2v_train, X_w2v_test], [X_bow_train, X_bow_test], [X_tf_train, X_tf_test],
          [X_bow_train, X_bow_test], [X_tf_train, X_tf_test]]
metric_table(model_list, name_list, y_data, X_data)
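

# With a winner chosen, it can be re-pickled under a descriptive name for
# downstream use. A minimal sketch; 'best_model.pkl' is a hypothetical
# filename, not one used elsewhere in this project.

# In[ ]:


with open('best_model.pkl', 'wb') as f:  # hypothetical output path
    pickle.dump(rf_w2v, f)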