Diff of /code/ModelTraining.py [000000] .. [85d58a]

Switch to unified view

a b/code/ModelTraining.py
1
#!/usr/bin/env python
2
# coding: utf-8
3
4
# In[1]:
5
6
7
import pandas as pd
8
import numpy as np
9
from sklearn.model_selection import train_test_split
10
from sklearn.preprocessing import LabelEncoder
11
import warnings
12
warnings.filterwarnings('ignore')
13
14
# Load the pre-split feature tables (X_*) and label tables (y_*) for each of
# the four feature sets (tags: bow, tf, hash, w2v). All paths are relative to
# the current working directory. Tuple unpacking consumes each map() from left
# to right, so the files are read in exactly the order they are listed.
X_bow_train, X_bow_test, y_bow_train, y_bow_test = map(
    pd.read_csv,
    ('X_bow_train.csv', 'X_bow_test.csv', 'y_bow_train.csv', 'y_bow_test.csv'),
)

X_tf_train, X_tf_test, y_tf_train, y_tf_test = map(
    pd.read_csv,
    ('X_tf_train.csv', 'X_tf_test.csv', 'y_tf_train.csv', 'y_tf_test.csv'),
)

X_hash_train, X_hash_test, y_hash_train, y_hash_test = map(
    pd.read_csv,
    ('X_hash_train.csv', 'X_hash_test.csv', 'y_hash_train.csv', 'y_hash_test.csv'),
)

X_w2v_train, X_w2v_test, y_w2v_train, y_w2v_test = map(
    pd.read_csv,
    ('X_w2v_train.csv', 'X_w2v_test.csv', 'y_w2v_train.csv', 'y_w2v_test.csv'),
)
34
35
36
# In[2]:
37
38
39
import pickle
40
from sklearn.ensemble import RandomForestClassifier
41
# Random forest: one classifier per feature set, all built from the same
# settings. max_features=None means every split may consider all features,
# and oob_score=True asks the forest to compute an out-of-bag score during fit.
_RF_SETTINGS = dict(
    n_estimators=100,
    max_features=None,
    oob_score=True,
    n_jobs=-1,
    random_state=0,
)

rf_bow = RandomForestClassifier(**_RF_SETTINGS)
rf_tf = RandomForestClassifier(**_RF_SETTINGS)
rf_hash = RandomForestClassifier(**_RF_SETTINGS)
rf_w2v = RandomForestClassifier(**_RF_SETTINGS)

# Fit each forest on its matching training split, in the listed order.
for _forest, _features, _labels in (
    (rf_bow, X_bow_train, y_bow_train),
    (rf_tf, X_tf_train, y_tf_train),
    (rf_hash, X_hash_train, y_hash_train),
    (rf_w2v, X_w2v_train, y_w2v_train),
):
    _forest.fit(_features, _labels)
67
68
69
# In[3]:
70
71
72
# Persist each fitted random-forest model to its own pickle file.
# The context manager flushes and closes every file handle as soon as its
# dump finishes, instead of leaving the open() result unclosed.
for _path, _estimator in (
    ('rf_bow.pkl', rf_bow),
    ('rf_tf.pkl', rf_tf),
    ('rf_hash.pkl', rf_hash),
    ('rf_w2v.pkl', rf_w2v),
):
    with open(_path, 'wb') as _handle:
        pickle.dump(_estimator, _handle)
76
77
78
# In[4]:
79
80
81
# Logistic regression is not inherently a multiclass classifier.
# The estimators are created with default arguments, so the multiclass
# strategy is whatever the installed scikit-learn defaults to -- presumably
# 'auto' (one-vs-rest for binary targets, multinomial otherwise); confirm
# against the installed version.
from sklearn.linear_model import LogisticRegression

lr_bow, lr_tf, lr_hash, lr_w2v = (LogisticRegression() for _ in range(4))

# Fit each model on its matching training split, in the listed order.
for _classifier, _features, _labels in (
    (lr_bow, X_bow_train, y_bow_train),
    (lr_tf, X_tf_train, y_tf_train),
    (lr_hash, X_hash_train, y_hash_train),
    (lr_w2v, X_w2v_train, y_w2v_train),
):
    _classifier.fit(_features, _labels)
94
95
96
# In[5]:
97
98
99
# Persist each fitted logistic-regression model to its own pickle file.
# The context manager flushes and closes every file handle as soon as its
# dump finishes, instead of leaving the open() result unclosed.
for _path, _estimator in (
    ('lr_bow.pkl', lr_bow),
    ('lr_tf.pkl', lr_tf),
    ('lr_hash.pkl', lr_hash),
    ('lr_w2v.pkl', lr_w2v),
):
    with open(_path, 'wb') as _handle:
        pickle.dump(_estimator, _handle)
103
104
105
# In[6]:
106
107
108
# Support vector classifiers are not inherently multiclass classifiers.
# NOTE(review): SVC() is built without a `kernel` argument, so scikit-learn's
# default kernel applies (documented as 'rbf'), not a linear one -- confirm
# which kernel was intended.
# NOTE(review): per the scikit-learn documentation, decision_function_shape
# only selects the shape of decision_function's output; multiclass training is
# always one-vs-one internally, so 'ovr' here does not reduce computation.
from sklearn.svm import SVC

_SVC_SETTINGS = {'decision_function_shape': 'ovr'}

svc_bow = SVC(**_SVC_SETTINGS)
svc_tf = SVC(**_SVC_SETTINGS)
svc_hash = SVC(**_SVC_SETTINGS)
svc_w2v = SVC(**_SVC_SETTINGS)

# Fit each model on its matching training split, in the listed order.
for _classifier, _features, _labels in (
    (svc_bow, X_bow_train, y_bow_train),
    (svc_tf, X_tf_train, y_tf_train),
    (svc_hash, X_hash_train, y_hash_train),
    (svc_w2v, X_w2v_train, y_w2v_train),
):
    _classifier.fit(_features, _labels)
121
122
123
# In[7]:
124
125
126
# Persist each fitted support-vector model to its own pickle file.
# The context manager flushes and closes every file handle as soon as its
# dump finishes, instead of leaving the open() result unclosed.
for _path, _estimator in (
    ('svc_bow.pkl', svc_bow),
    ('svc_tf.pkl', svc_tf),
    ('svc_hash.pkl', svc_hash),
    ('svc_w2v.pkl', svc_w2v),
):
    with open(_path, 'wb') as _handle:
        pickle.dump(_estimator, _handle)
130
131
132
# In[8]:
133
134
135
# k-nearest-neighbours: one classifier per feature set, each voting over the
# 3 nearest training samples.
from sklearn.neighbors import KNeighborsClassifier

_KNN_SETTINGS = {'n_neighbors': 3}

knn_bow = KNeighborsClassifier(**_KNN_SETTINGS)
knn_tf = KNeighborsClassifier(**_KNN_SETTINGS)
knn_hash = KNeighborsClassifier(**_KNN_SETTINGS)
knn_w2v = KNeighborsClassifier(**_KNN_SETTINGS)

# Fit each model on its matching training split, in the listed order.
for _classifier, _features, _labels in (
    (knn_bow, X_bow_train, y_bow_train),
    (knn_tf, X_tf_train, y_tf_train),
    (knn_hash, X_hash_train, y_hash_train),
    (knn_w2v, X_w2v_train, y_w2v_train),
):
    _classifier.fit(_features, _labels)
147
148
149
# In[9]:
150
151
152
# Persist each fitted k-nearest-neighbours model to its own pickle file.
# The context manager flushes and closes every file handle as soon as its
# dump finishes, instead of leaving the open() result unclosed.
for _path, _estimator in (
    ('knn_bow.pkl', knn_bow),
    ('knn_tf.pkl', knn_tf),
    ('knn_hash.pkl', knn_hash),
    ('knn_w2v.pkl', knn_w2v),
):
    with open(_path, 'wb') as _handle:
        pickle.dump(_estimator, _handle)
156
157
158
# In[10]:
159
160
161
# Naive Bayes, Bernoulli variant.
# Bernoulli NB treats every feature as a binary indicator: besides the
# presence of a keyword, it also takes into account that a keyword is absent
# from a document.
# NOTE(review): the estimators use default arguments, so inputs are presumably
# binarized at the default threshold -- confirm this is meaningful for the
# tf, hash and w2v feature sets, which may not be simple occurrence counts.
from sklearn.naive_bayes import BernoulliNB

bnb_bow, bnb_tf, bnb_hash, bnb_w2v = (BernoulliNB() for _ in range(4))

# Fit each model on its matching training split, in the listed order.
for _classifier, _features, _labels in (
    (bnb_bow, X_bow_train, y_bow_train),
    (bnb_tf, X_tf_train, y_tf_train),
    (bnb_hash, X_hash_train, y_hash_train),
    (bnb_w2v, X_w2v_train, y_w2v_train),
):
    _classifier.fit(_features, _labels)
176
177
178
# In[11]:
179
180
181
# Persist each fitted Bernoulli naive-Bayes model to its own pickle file.
# The context manager flushes and closes every file handle as soon as its
# dump finishes, instead of leaving the open() result unclosed.
for _path, _estimator in (
    ('bnb_bow.pkl', bnb_bow),
    ('bnb_tf.pkl', bnb_tf),
    ('bnb_hash.pkl', bnb_hash),
    ('bnb_w2v.pkl', bnb_w2v),
):
    with open(_path, 'wb') as _handle:
        pickle.dump(_estimator, _handle)
185
186
187
# In[ ]:
188
189
190
191