# ModelEvaluation.py
# Evaluates pickled classifiers (Random Forest, Logistic Regression, SVC, KNN,
# Bernoulli/Binomial Naive Bayes) on four feature extractions (bow, tf-idf,
# hashing, word2vec) and tabulates accuracy/precision/recall/F1.
1
#!/usr/bin/env python
# coding: utf-8

# In[1]:


# Standard library
import pickle
import warnings

# Third-party
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer

# Notebook-era convenience: suppress sklearn's UndefinedMetricWarning noise
# (some classifiers never predict certain rare classes on this data).
warnings.filterwarnings('ignore')
13
14
def matric_table(model_list, name_list, y_data, X_data):
    """Build a styled metric-comparison table for a list of fitted classifiers.

    Parameters
    ----------
    model_list : list of fitted estimators exposing ``predict``.
    name_list : list of str row labels, one per model.
    y_data : list of ``[y_train, y_test]`` label pairs, one per model.
    X_data : list of ``[X_train, X_test]`` feature pairs, one per model.

    Returns
    -------
    pandas Styler over a DataFrame indexed by model name, with the maximum
    of each metric column highlighted in light green.

    Notes
    -----
    Fixes a column-labelling bug in the original: values were appended in the
    order (recall, precision) while the columns were labelled
    ('Testing precision %', 'Testing recall %'), so the two metrics were
    swapped in the displayed table.
    """
    result = []
    for model, name, (y_train, y_test), (X_train, X_test) in zip(
            model_list, name_list, y_data, X_data):
        # Predict once per split instead of once per metric -- predict() is
        # the expensive call for most estimators.
        train_pred = model.predict(X_train)
        test_pred = model.predict(X_test)
        result.append([
            name,
            accuracy_score(y_train, train_pred) * 100,
            accuracy_score(y_test, test_pred) * 100,
            # Order now matches the column labels below: precision, recall.
            precision_score(y_test, test_pred, average='weighted') * 100,
            recall_score(y_test, test_pred, average='weighted') * 100,
            f1_score(y_test, test_pred, average='weighted') * 100,
        ])
    df = pd.DataFrame(data=result, columns=['Model', 'Training Accuracy %', 'Testing Accuracy %', 'Testing precision %', 'Testing recall %', 'Testing f1_score %'])
    df = df.set_index('Model')
    return df.style.highlight_max(color='lightgreen', axis=0)
28
29
30
# In[2]:


def _load_split(tag):
    """Read the train/test feature and label CSVs for one extraction *tag*.

    Returns (X_train, X_test, y_train, y_test) as DataFrames read from
    'X_<tag>_train.csv', 'X_<tag>_test.csv', 'y_<tag>_train.csv',
    'y_<tag>_test.csv' in the working directory.
    """
    return (pd.read_csv('X_' + tag + '_train.csv'),
            pd.read_csv('X_' + tag + '_test.csv'),
            pd.read_csv('y_' + tag + '_train.csv'),
            pd.read_csv('y_' + tag + '_test.csv'))


# One train/test split per feature-extraction technique.
X_bow_train, X_bow_test, y_bow_train, y_bow_test = _load_split('bow')
X_tf_train, X_tf_test, y_tf_train, y_tf_test = _load_split('tf')
X_hash_train, X_hash_test, y_hash_train, y_hash_test = _load_split('hash')
X_w2v_train, X_w2v_test, y_w2v_train, y_w2v_test = _load_split('w2v')
53
54
55
# ### Random Forest with word2vec extracted data is the best among all Random Forest Classifiers

# In[3]:


# Load the pre-trained Random Forest models. Context managers close the file
# handles deterministically -- the original `pickle.load(open(...))` leaked
# them. SECURITY: pickle.load must only be used on trusted local artifacts.
with open('rf_bow.pkl', 'rb') as f:
    rf_bow = pickle.load(f)
with open('rf_tf.pkl', 'rb') as f:
    rf_tf = pickle.load(f)
with open('rf_hash.pkl', 'rb') as f:
    rf_hash = pickle.load(f)
with open('rf_w2v.pkl', 'rb') as f:
    rf_w2v = pickle.load(f)

# Compare each Random Forest against the split it was trained on.
model_list = [rf_bow, rf_tf, rf_hash, rf_w2v]
name_list = ["Random Forest with bow", "Random Forest with tf_idf", "Random Forest with hash", "Random Forest with word2vec"]
y_data = [[y_bow_train, y_bow_test], [y_tf_train, y_tf_test], [y_hash_train, y_hash_test], [y_w2v_train, y_w2v_test]]
X_data = [[X_bow_train, X_bow_test], [X_tf_train, X_tf_test], [X_hash_train, X_hash_test], [X_w2v_train, X_w2v_test]]
matric_table(model_list, name_list, y_data, X_data)
70
71
72
# ### Logistic Regression with bag-of-words extracted data is the best among all Logistic Regression classifiers

# In[4]:


# NOTE: some classes are never predicted by these classifiers, which triggers
# sklearn's UndefinedMetricWarning; the data is imbalanced, so the rare class
# barely affects accuracy.
# Load the pre-trained Logistic Regression models. Context managers close the
# file handles deterministically -- the original `pickle.load(open(...))`
# leaked them. SECURITY: pickle.load only on trusted local artifacts.
with open('lr_bow.pkl', 'rb') as f:
    lr_bow = pickle.load(f)
with open('lr_tf.pkl', 'rb') as f:
    lr_tf = pickle.load(f)
with open('lr_hash.pkl', 'rb') as f:
    lr_hash = pickle.load(f)
with open('lr_w2v.pkl', 'rb') as f:
    lr_w2v = pickle.load(f)

model_list = [lr_bow, lr_tf, lr_hash, lr_w2v]
# Fixed display-label typo: "Logistic Regressiont" -> "Logistic Regression".
name_list = ["Logistic Regression with bow", "Logistic Regression with tf_idf", "Logistic Regression with hash", "Logistic Regression with word2vec"]
y_data = [[y_bow_train, y_bow_test], [y_tf_train, y_tf_test], [y_hash_train, y_hash_test], [y_w2v_train, y_w2v_test]]
X_data = [[X_bow_train, X_bow_test], [X_tf_train, X_tf_test], [X_hash_train, X_hash_test], [X_w2v_train, X_w2v_test]]
matric_table(model_list, name_list, y_data, X_data)
88
89
90
# ### Support Vector Machine with tf-idf extracted data is the best among all SVCs

# In[5]:


# Load the pre-trained SVC models. Context managers close the file handles
# deterministically -- the original `pickle.load(open(...))` leaked them.
# SECURITY: pickle.load only on trusted local artifacts.
with open('svc_bow.pkl', 'rb') as f:
    svc_bow = pickle.load(f)
with open('svc_tf.pkl', 'rb') as f:
    svc_tf = pickle.load(f)
with open('svc_hash.pkl', 'rb') as f:
    svc_hash = pickle.load(f)
with open('svc_w2v.pkl', 'rb') as f:
    svc_w2v = pickle.load(f)

# Compare each SVC against the split it was trained on.
model_list = [svc_bow, svc_tf, svc_hash, svc_w2v]
name_list = ["SVC with bow", "SVC with tf_idf", "SVC with hash", "SVC with word2vec"]
y_data = [[y_bow_train, y_bow_test], [y_tf_train, y_tf_test], [y_hash_train, y_hash_test], [y_w2v_train, y_w2v_test]]
X_data = [[X_bow_train, X_bow_test], [X_tf_train, X_tf_test], [X_hash_train, X_hash_test], [X_w2v_train, X_w2v_test]]
matric_table(model_list, name_list, y_data, X_data)
105
106
107
# ### K Nearest Neighbours with bag-of-words extracted data is the best among all KNNs

# In[6]:


# Load the pre-trained KNN models. Context managers close the file handles
# deterministically -- the original `pickle.load(open(...))` leaked them.
# SECURITY: pickle.load only on trusted local artifacts.
with open('knn_bow.pkl', 'rb') as f:
    knn_bow = pickle.load(f)
with open('knn_tf.pkl', 'rb') as f:
    knn_tf = pickle.load(f)
with open('knn_hash.pkl', 'rb') as f:
    knn_hash = pickle.load(f)
with open('knn_w2v.pkl', 'rb') as f:
    knn_w2v = pickle.load(f)

# Compare each KNN against the split it was trained on.
model_list = [knn_bow, knn_tf, knn_hash, knn_w2v]
name_list = ["KNN with bow", "KNN with tf_idf", "KNN with hash", "KNN with word2vec"]
y_data = [[y_bow_train, y_bow_test], [y_tf_train, y_tf_test], [y_hash_train, y_hash_test], [y_w2v_train, y_w2v_test]]
X_data = [[X_bow_train, X_bow_test], [X_tf_train, X_tf_test], [X_hash_train, X_hash_test], [X_w2v_train, X_w2v_test]]
matric_table(model_list, name_list, y_data, X_data)
122
123
124
# ### Binomial Naive Bayes with tf_idf extracted data is the best among all BNBs

# In[7]:


# Load the pre-trained Naive Bayes models. Context managers close the file
# handles deterministically -- the original `pickle.load(open(...))` leaked
# them. SECURITY: pickle.load only on trusted local artifacts.
with open('bnb_bow.pkl', 'rb') as f:
    bnb_bow = pickle.load(f)
with open('bnb_tf.pkl', 'rb') as f:
    bnb_tf = pickle.load(f)
with open('bnb_hash.pkl', 'rb') as f:
    bnb_hash = pickle.load(f)
with open('bnb_w2v.pkl', 'rb') as f:
    bnb_w2v = pickle.load(f)

# Compare each Naive Bayes model against the split it was trained on.
model_list = [bnb_bow, bnb_tf, bnb_hash, bnb_w2v]
name_list = ["Binomial Naive Bayes with bow", "Binomial Naive Bayes with tf_idf", "Binomial Naive Bayes with hash", "Binomial Naive Bayes with word2vec"]
y_data = [[y_bow_train, y_bow_test], [y_tf_train, y_tf_test], [y_hash_train, y_hash_test], [y_w2v_train, y_w2v_test]]
X_data = [[X_bow_train, X_bow_test], [X_tf_train, X_tf_test], [X_hash_train, X_hash_test], [X_w2v_train, X_w2v_test]]
matric_table(model_list, name_list, y_data, X_data)
139
140
141
# ### In conclusion, Random Forest with word2vec dataset is the winner among all classifiers with highest score on test accuracy, precision, recall and F1 scores

# In[8]:


### Find the best classifier among all classifiers
# One row per per-family winner: (model, label, (y_train, y_test), (X_train, X_test)).
finalists = [
    (rf_w2v, "Random Forest with word2vec", (y_w2v_train, y_w2v_test), (X_w2v_train, X_w2v_test)),
    (lr_bow, "Logistic Regression with bag-of-words", (y_bow_train, y_bow_test), (X_bow_train, X_bow_test)),
    (svc_tf, "SVC with tf_idf", (y_tf_train, y_tf_test), (X_tf_train, X_tf_test)),
    (knn_bow, "KNN with bag-of-words", (y_bow_train, y_bow_test), (X_bow_train, X_bow_test)),
    (bnb_tf, "Binomial Naive Bayes with tf_idf", (y_tf_train, y_tf_test), (X_tf_train, X_tf_test)),
]
# Unzip the finalist rows into the four parallel lists matric_table expects.
model_list, name_list, y_data, X_data = (list(column) for column in zip(*finalists))
matric_table(model_list, name_list, y_data, X_data)
152
153
154
# In[ ]:
155
156
157
158