[85d58a]: / code / ModelEvaluation.py

Download this file

159 lines (106 with data), 6.5 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
import warnings
warnings.filterwarnings('ignore')
def matric_table(model_list, name_list, y_data, X_data):
    """Build a styled comparison table of classification scores.

    Parameters
    ----------
    model_list : list of fitted estimators exposing ``.predict``.
    name_list  : list of display names, parallel to ``model_list``.
    y_data     : list of ``[y_train, y_test]`` pairs, one per model.
    X_data     : list of ``[X_train, X_test]`` pairs, one per model.

    Returns
    -------
    pandas ``Styler`` indexed by model name with training/testing accuracy
    and testing precision/recall/f1 (all in percent, ``average='weighted'``
    for multi-class), the per-column maximum highlighted in light green.

    Note: the original appended recall before precision while the column
    labels list precision first, so the two metrics were swapped in the
    displayed table; values are now appended in label order.
    """
    result = []
    for model, name, (y_train, y_test), (X_train, X_test) in zip(
            model_list, name_list, y_data, X_data):
        # Predict once per split instead of once per metric.
        train_pred = model.predict(X_train)
        test_pred = model.predict(X_test)
        result.append([
            name,
            accuracy_score(y_train, train_pred) * 100,
            accuracy_score(y_test, test_pred) * 100,
            precision_score(y_test, test_pred, average='weighted') * 100,
            recall_score(y_test, test_pred, average='weighted') * 100,
            f1_score(y_test, test_pred, average='weighted') * 100,
        ])
    df = pd.DataFrame(
        data=result,
        columns=['Model', 'Training Accuracy %', 'Testing Accuracy %',
                 'Testing precision %', 'Testing recall %',
                 'Testing f1_score %'])
    df = df.set_index('Model')
    return df.style.highlight_max(color='lightgreen', axis=0)
# In[2]:
# Load the persisted train/test splits for every feature-extraction scheme:
# bag-of-words (bow), tf-idf (tf), feature hashing (hash) and word2vec (w2v).
# Each scheme contributes X_<tag>_train/test and y_<tag>_train/test CSVs.
for _tag in ('bow', 'tf', 'hash', 'w2v'):
    for _prefix in ('X', 'y'):
        for _split in ('train', 'test'):
            _name = '{}_{}_{}'.format(_prefix, _tag, _split)
            globals()[_name] = pd.read_csv(_name + '.csv')
# ### Random Forest with word2vec extracted data is the best among all Random Forest Classifers
# In[3]:
# Load the four pickled Random Forest models. `with` closes each file
# handle (the original `pickle.load(open(...))` never closed them).
with open('rf_bow.pkl', 'rb') as f:
    rf_bow = pickle.load(f)
with open('rf_tf.pkl', 'rb') as f:
    rf_tf = pickle.load(f)
with open('rf_hash.pkl', 'rb') as f:
    rf_hash = pickle.load(f)
with open('rf_w2v.pkl', 'rb') as f:
    rf_w2v = pickle.load(f)
model_list = [rf_bow, rf_tf, rf_hash, rf_w2v]
name_list = ["Random Forest with bow", "Random Forest with tf_idf",
             "Random Forest with hash", "Random Forest with word2vec"]
y_data = [[y_bow_train, y_bow_test], [y_tf_train, y_tf_test],
          [y_hash_train, y_hash_test], [y_w2v_train, y_w2v_test]]
X_data = [[X_bow_train, X_bow_test], [X_tf_train, X_tf_test],
          [X_hash_train, X_hash_test], [X_w2v_train, X_w2v_test]]
# Bare expression: renders the styled table when run as a notebook cell.
matric_table(model_list, name_list, y_data, X_data)
# ### Logistic Regression with bag-of-words extracted data is the best among all logistic Regression Classifers
# In[4]:
# NOTE(review): sklearn may warn that some classes are never predicted;
# those classes are rare in this imbalanced data, so the scores are not
# materially affected.
# Load the four pickled Logistic Regression models; `with` closes each
# file handle (the original `pickle.load(open(...))` never closed them).
with open('lr_bow.pkl', 'rb') as f:
    lr_bow = pickle.load(f)
with open('lr_tf.pkl', 'rb') as f:
    lr_tf = pickle.load(f)
with open('lr_hash.pkl', 'rb') as f:
    lr_hash = pickle.load(f)
with open('lr_w2v.pkl', 'rb') as f:
    lr_w2v = pickle.load(f)
model_list = [lr_bow, lr_tf, lr_hash, lr_w2v]
# Typo fix: "Logistic Regressiont" -> "Logistic Regression" in the last label.
name_list = ["Logistic Regression with bow", "Logistic Regression with tf_idf",
             "Logistic Regression with hash", "Logistic Regression with word2vec"]
y_data = [[y_bow_train, y_bow_test], [y_tf_train, y_tf_test],
          [y_hash_train, y_hash_test], [y_w2v_train, y_w2v_test]]
X_data = [[X_bow_train, X_bow_test], [X_tf_train, X_tf_test],
          [X_hash_train, X_hash_test], [X_w2v_train, X_w2v_test]]
# Bare expression: renders the styled table when run as a notebook cell.
matric_table(model_list, name_list, y_data, X_data)
# ### Support Vectors Machine with tf-idf extracted data is the best among all SVCs
# In[5]:
# Load the four pickled SVC models; `with` closes each file handle
# (the original `pickle.load(open(...))` never closed them).
with open('svc_bow.pkl', 'rb') as f:
    svc_bow = pickle.load(f)
with open('svc_tf.pkl', 'rb') as f:
    svc_tf = pickle.load(f)
with open('svc_hash.pkl', 'rb') as f:
    svc_hash = pickle.load(f)
with open('svc_w2v.pkl', 'rb') as f:
    svc_w2v = pickle.load(f)
model_list = [svc_bow, svc_tf, svc_hash, svc_w2v]
name_list = ["SVC with bow", "SVC with tf_idf",
             "SVC with hash", "SVC with word2vec"]
y_data = [[y_bow_train, y_bow_test], [y_tf_train, y_tf_test],
          [y_hash_train, y_hash_test], [y_w2v_train, y_w2v_test]]
X_data = [[X_bow_train, X_bow_test], [X_tf_train, X_tf_test],
          [X_hash_train, X_hash_test], [X_w2v_train, X_w2v_test]]
# Bare expression: renders the styled table when run as a notebook cell.
matric_table(model_list, name_list, y_data, X_data)
# ### K Nearest Neighours with bag-of-words extracted data is the best among all KNNs
# In[6]:
# Load the four pickled KNN models; `with` closes each file handle
# (the original `pickle.load(open(...))` never closed them).
with open('knn_bow.pkl', 'rb') as f:
    knn_bow = pickle.load(f)
with open('knn_tf.pkl', 'rb') as f:
    knn_tf = pickle.load(f)
with open('knn_hash.pkl', 'rb') as f:
    knn_hash = pickle.load(f)
with open('knn_w2v.pkl', 'rb') as f:
    knn_w2v = pickle.load(f)
model_list = [knn_bow, knn_tf, knn_hash, knn_w2v]
name_list = ["KNN with bow", "KNN with tf_idf",
             "KNN with hash", "KNN with word2vec"]
y_data = [[y_bow_train, y_bow_test], [y_tf_train, y_tf_test],
          [y_hash_train, y_hash_test], [y_w2v_train, y_w2v_test]]
X_data = [[X_bow_train, X_bow_test], [X_tf_train, X_tf_test],
          [X_hash_train, X_hash_test], [X_w2v_train, X_w2v_test]]
# Bare expression: renders the styled table when run as a notebook cell.
matric_table(model_list, name_list, y_data, X_data)
# ### Binomial Naive Bayes with tf_idf extracted data is the best among all BNBs
# In[7]:
# Load the four pickled Naive Bayes models; `with` closes each file
# handle (the original `pickle.load(open(...))` never closed them).
with open('bnb_bow.pkl', 'rb') as f:
    bnb_bow = pickle.load(f)
with open('bnb_tf.pkl', 'rb') as f:
    bnb_tf = pickle.load(f)
with open('bnb_hash.pkl', 'rb') as f:
    bnb_hash = pickle.load(f)
with open('bnb_w2v.pkl', 'rb') as f:
    bnb_w2v = pickle.load(f)
model_list = [bnb_bow, bnb_tf, bnb_hash, bnb_w2v]
name_list = ["Binomial Naive Bayes with bow", "Binomial Naive Bayes with tf_idf",
             "Binomial Naive Bayes with hash", "Binomial Naive Bayes with word2vec"]
y_data = [[y_bow_train, y_bow_test], [y_tf_train, y_tf_test],
          [y_hash_train, y_hash_test], [y_w2v_train, y_w2v_test]]
X_data = [[X_bow_train, X_bow_test], [X_tf_train, X_tf_test],
          [X_hash_train, X_hash_test], [X_w2v_train, X_w2v_test]]
# Bare expression: renders the styled table when run as a notebook cell.
matric_table(model_list, name_list, y_data, X_data)
# ### In conclusion, Random Forest with word2vec dataset is the winner among all classifers with highest score on test accuracy, precision, recall and F1 scores
# In[8]:
### Find the best classifer among all classifers
# Compare the per-algorithm champions head to head in one table.
# Each entry: (model, display name, (y_train, y_test), (X_train, X_test)).
_champions = [
    (rf_w2v, "Random Forest with word2vec",
     (y_w2v_train, y_w2v_test), (X_w2v_train, X_w2v_test)),
    (lr_bow, "Logistic Regression with bag-of-words",
     (y_bow_train, y_bow_test), (X_bow_train, X_bow_test)),
    (svc_tf, "SVC with tf_idf",
     (y_tf_train, y_tf_test), (X_tf_train, X_tf_test)),
    (knn_bow, "KNN with bag-of-words",
     (y_bow_train, y_bow_test), (X_bow_train, X_bow_test)),
    (bnb_tf, "Binomial Naive Bayes with tf_idf",
     (y_tf_train, y_tf_test), (X_tf_train, X_tf_test)),
]
model_list = [m for m, _, _, _ in _champions]
name_list = [n for _, n, _, _ in _champions]
y_data = [list(y) for _, _, y, _ in _champions]
X_data = [list(x) for _, _, _, x in _champions]
matric_table(model_list, name_list, y_data, X_data)
# In[ ]: