--- /dev/null
+++ b/code/ModelTraining.py
@@ -0,0 +1,191 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# In[1]:
+
+
+import pandas as pd
+import warnings
+warnings.filterwarnings('ignore')
+
+# Load the train/test split for each feature representation.
+# .squeeze() turns each single-column target DataFrame into a Series,
+# so the classifiers receive the 1-D y they expect.
+X_bow_train = pd.read_csv('X_bow_train.csv')
+X_bow_test = pd.read_csv('X_bow_test.csv')
+y_bow_train = pd.read_csv('y_bow_train.csv').squeeze()
+y_bow_test = pd.read_csv('y_bow_test.csv').squeeze()
+
+X_tf_train = pd.read_csv('X_tf_train.csv')
+X_tf_test = pd.read_csv('X_tf_test.csv')
+y_tf_train = pd.read_csv('y_tf_train.csv').squeeze()
+y_tf_test = pd.read_csv('y_tf_test.csv').squeeze()
+
+X_hash_train = pd.read_csv('X_hash_train.csv')
+X_hash_test = pd.read_csv('X_hash_test.csv')
+y_hash_train = pd.read_csv('y_hash_train.csv').squeeze()
+y_hash_test = pd.read_csv('y_hash_test.csv').squeeze()
+
+X_w2v_train = pd.read_csv('X_w2v_train.csv')
+X_w2v_test = pd.read_csv('X_w2v_test.csv')
+y_w2v_train = pd.read_csv('y_w2v_train.csv').squeeze()
+y_w2v_test = pd.read_csv('y_w2v_test.csv').squeeze()
+
+
+# In[2]:
+
+
+import pickle
+from sklearn.ensemble import RandomForestClassifier
+
+# Train a random forest per representation, considering all features at each
+# split (max_features=None) and tracking the out-of-bag score.
+rf_bow = RandomForestClassifier(n_estimators=100,
+                                max_features=None,
+                                oob_score=True,
+                                n_jobs=-1,
+                                random_state=0)
+rf_tf = RandomForestClassifier(n_estimators=100,
+                               max_features=None,
+                               oob_score=True,
+                               n_jobs=-1,
+                               random_state=0)
+rf_hash = RandomForestClassifier(n_estimators=100,
+                                 max_features=None,
+                                 oob_score=True,
+                                 n_jobs=-1,
+                                 random_state=0)
+rf_w2v = RandomForestClassifier(n_estimators=100,
+                                max_features=None,
+                                oob_score=True,
+                                n_jobs=-1,
+                                random_state=0)
+
+rf_bow.fit(X_bow_train, y_bow_train)
+rf_tf.fit(X_tf_train, y_tf_train)
+rf_hash.fit(X_hash_train, y_hash_train)
+rf_w2v.fit(X_w2v_train, y_w2v_train)
+
+
+# In[3]:
+
+
+# Persist the fitted forests; context managers ensure the files are closed.
+with open('rf_bow.pkl', 'wb') as f:
+    pickle.dump(rf_bow, f)
+with open('rf_tf.pkl', 'wb') as f:
+    pickle.dump(rf_tf, f)
+with open('rf_hash.pkl', 'wb') as f:
+    pickle.dump(rf_hash, f)
+with open('rf_w2v.pkl', 'wb') as f:
+    pickle.dump(rf_w2v, f)
+
+
+# In[4]:
+
+
+# Logistic regression is not an inherently multiclass classifier.
+# With the default multi_class='auto', scikit-learn uses one-vs-rest when the
+# problem is binary (or the solver is liblinear), and a multinomial loss otherwise.
+from sklearn.linear_model import LogisticRegression
+
+lr_bow = LogisticRegression()
+lr_tf = LogisticRegression()
+lr_hash = LogisticRegression()
+lr_w2v = LogisticRegression()
+
+lr_bow.fit(X_bow_train, y_bow_train)
+lr_tf.fit(X_tf_train, y_tf_train)
+lr_hash.fit(X_hash_train, y_hash_train)
+lr_w2v.fit(X_w2v_train, y_w2v_train)
+
+
+# In[5]:
+
+
+with open('lr_bow.pkl', 'wb') as f:
+    pickle.dump(lr_bow, f)
+with open('lr_tf.pkl', 'wb') as f:
+    pickle.dump(lr_tf, f)
+with open('lr_hash.pkl', 'wb') as f:
+    pickle.dump(lr_hash, f)
+with open('lr_w2v.pkl', 'wb') as f:
+    pickle.dump(lr_w2v, f)
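+
+
+# Sketch (not part of the original pipeline): to force one-vs-rest explicitly
+# instead of relying on multi_class='auto', the estimator can be wrapped in
+# scikit-learn's OneVsRestClassifier, which fits one binary model per class.
+# The lr_bow_ovr name is illustrative only.
+from sklearn.multiclass import OneVsRestClassifier
+
+lr_bow_ovr = OneVsRestClassifier(LogisticRegression())
+lr_bow_ovr.fit(X_bow_train, y_bow_train)
+print('explicit OvR accuracy (BoW):', lr_bow_ovr.score(X_bow_test, y_bow_test))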
+
+
+# In[6]:
+
+
+# Train a linear SVM (kernel='linear'; SVC defaults to RBF). SVC is not an
+# inherently multiclass classifier either: it fits one-vs-one pairwise
+# classifiers internally, and decision_function_shape='ovr' only reshapes the
+# decision function into one-vs-rest form. For a faster, truly one-vs-rest
+# linear SVM, sklearn.svm.LinearSVC is an alternative.
+from sklearn.svm import SVC
+
+svc_bow = SVC(kernel='linear', decision_function_shape='ovr')
+svc_tf = SVC(kernel='linear', decision_function_shape='ovr')
+svc_hash = SVC(kernel='linear', decision_function_shape='ovr')
+svc_w2v = SVC(kernel='linear', decision_function_shape='ovr')
+
+svc_bow.fit(X_bow_train, y_bow_train)
+svc_tf.fit(X_tf_train, y_tf_train)
+svc_hash.fit(X_hash_train, y_hash_train)
+svc_w2v.fit(X_w2v_train, y_w2v_train)
+
+
+# In[7]:
+
+
+with open('svc_bow.pkl', 'wb') as f:
+    pickle.dump(svc_bow, f)
+with open('svc_tf.pkl', 'wb') as f:
+    pickle.dump(svc_tf, f)
+with open('svc_hash.pkl', 'wb') as f:
+    pickle.dump(svc_hash, f)
+with open('svc_w2v.pkl', 'wb') as f:
+    pickle.dump(svc_w2v, f)
+
+
+# In[8]:
+
+
+# Train a k-nearest-neighbours classifier (k=3) on each representation.
+from sklearn.neighbors import KNeighborsClassifier
+
+knn_bow = KNeighborsClassifier(n_neighbors=3)
+knn_tf = KNeighborsClassifier(n_neighbors=3)
+knn_hash = KNeighborsClassifier(n_neighbors=3)
+knn_w2v = KNeighborsClassifier(n_neighbors=3)
+
+knn_bow.fit(X_bow_train, y_bow_train)
+knn_tf.fit(X_tf_train, y_tf_train)
+knn_hash.fit(X_hash_train, y_hash_train)
+knn_w2v.fit(X_w2v_train, y_w2v_train)
+
+
+# In[9]:
+
+
+with open('knn_bow.pkl', 'wb') as f:
+    pickle.dump(knn_bow, f)
+with open('knn_tf.pkl', 'wb') as f:
+    pickle.dump(knn_tf, f)
+with open('knn_hash.pkl', 'wb') as f:
+    pickle.dump(knn_hash, f)
+with open('knn_w2v.pkl', 'wb') as f:
+    pickle.dump(knn_w2v, f)
+
+
+# In[10]:
+
+
+# Train Bernoulli naive Bayes. BernoulliNB treats each feature as a binary
+# presence/absence indicator (inputs are binarized at 0.0 by default) and,
+# unlike multinomial NB, it explicitly uses the absence of a term in a
+# document as evidence. Note that for the continuous w2v features this
+# binarization keeps only the sign of each dimension.
+from sklearn.naive_bayes import BernoulliNB
+
+bnb_bow = BernoulliNB()
+bnb_tf = BernoulliNB()
+bnb_hash = BernoulliNB()
+bnb_w2v = BernoulliNB()
+
+bnb_bow.fit(X_bow_train, y_bow_train)
+bnb_tf.fit(X_tf_train, y_tf_train)
+bnb_hash.fit(X_hash_train, y_hash_train)
+bnb_w2v.fit(X_w2v_train, y_w2v_train)
+
+
+# In[11]:
+
+
+with open('bnb_bow.pkl', 'wb') as f:
+    pickle.dump(bnb_bow, f)
+with open('bnb_tf.pkl', 'wb') as f:
+    pickle.dump(bnb_tf, f)
+with open('bnb_hash.pkl', 'wb') as f:
+    pickle.dump(bnb_hash, f)
+with open('bnb_w2v.pkl', 'wb') as f:
+    pickle.dump(bnb_w2v, f)
+
+
+# In[ ]:
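+
+
+# Sanity-check sketch (assumes the files written above): reload one pickled
+# model and score it on its held-out split. rf_bow_loaded is illustrative only.
+with open('rf_bow.pkl', 'rb') as f:
+    rf_bow_loaded = pickle.load(f)
+print('RF + BoW out-of-bag score:', rf_bow_loaded.oob_score_)
+print('RF + BoW test accuracy:', rf_bow_loaded.score(X_bow_test, y_bow_test))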