|
a |
|
b/code/DataTransformationAndPreparation.py |
|
|
1 |
#!/usr/bin/env python |
|
|
2 |
# coding: utf-8 |
|
|
3 |
|
|
|
4 |
# In[1]: |
|
|
5 |
|
|
|
6 |
|
|
|
7 |
import pandas as pd |
|
|
8 |
import numpy as np |
|
|
9 |
from sklearn.decomposition import PCA |
|
|
10 |
import warnings |
|
|
11 |
warnings.filterwarnings('ignore') |
|
|
12 |
|
|
|
13 |
|
|
|
14 |
# function for PCA as feature selection |
|
|
15 |
# the cutoff is the number of components that represents 99% of the variance |
|
|
16 |
# returns the reduced dataset with the PCA components that represent 99% of the variance |
|
|
17 |
def PCA_project(data, data_name="", threshold = 99):
    """Project ``data`` onto its leading principal components.

    A PCA is fitted on ``data`` and the smallest number of leading
    components whose cumulative explained variance reaches ``threshold``
    is kept. If the threshold is never reached, every component is kept.

    Args:
        data: 2-D numeric table with a ``.shape`` attribute, one row per
            sample and one column per feature.
        data_name: Label used only in the progress message that is printed.
        threshold: Cumulative explained variance to reach, in percent.

    Returns:
        A ``pandas.DataFrame`` with one row per input row and one
        integer-labelled column per retained component.
    """
    n_samples, n_features = data.shape[0], data.shape[1]

    # PCA cannot extract more than min(n_samples, n_features) components;
    # requesting n_features on a table with fewer rows than columns raises.
    max_component = min(n_samples, n_features)

    # One fit yields both the variance profile and the projection, so the
    # retained components are simply the leading columns of the projection.
    pca = PCA(n_components=max_component)
    projected = np.asarray(pca.fit_transform(data))

    # Cumulative explained variance in percent. The rounding (4 decimals per
    # component, then to a whole percent for the comparison) is kept from the
    # earlier implementation so the cutoff semantics do not shift.
    cumulative = np.cumsum(
        np.round(pca.explained_variance_ratio_, decimals=4) * 100
    )
    below_cutoff = np.round(cumulative) < threshold

    if below_cutoff.all():
        n_keep = len(cumulative)
    else:
        # argmin finds the first component that reaches the cutoff; the +1
        # includes that component, otherwise the retained set would explain
        # less than the requested variance (and could even be empty).
        n_keep = int(np.argmin(below_cutoff)) + 1

    print('%s reduce features from %d to %d'% (data_name, n_features, n_keep))
    return pd.DataFrame(projected[:, :n_keep])
|
|
35 |
|
|
|
36 |
# Read the four feature sets (bag of words, tf-idf, hashing vectorizer,
# word2vec) and apply PCA to each one. The 'prompt' column is dropped
# before the projection.
df_bow = pd.read_csv('bag_word_df.csv')
bow_P = PCA_project(data=df_bow.drop(columns='prompt'), data_name='bag of words')

df_tf_idf = pd.read_csv('tf_idf.csv')
tf_idf_P = PCA_project(data=df_tf_idf.drop(columns='prompt'), data_name='tf_idf')

df_hash_vectorize = pd.read_csv('hash_vectorize.csv')
hash_P = PCA_project(
    data=df_hash_vectorize.drop(columns='prompt'),
    data_name='hash_vectorize',
)

df_w2v = pd.read_csv('df_w2v.csv')
w2v_P = PCA_project(data=df_w2v.drop(columns='prompt'), data_name='word2vec')

# Save the transformed data, one CSV per feature set, without the index.
for _csv_path, _reduced in (
    ('bow_P.csv', bow_P),
    ('tf_idf_P.csv', tf_idf_P),
    ('hash_P.csv', hash_P),
    ('w2v_P.csv', w2v_P),
):
    _reduced.to_csv(_csv_path, index=False)
|
|
54 |
|
|
|
55 |
|
|
|
56 |
# In[2]: |
|
|
57 |
|
|
|
58 |
|
|
|
59 |
from sklearn.preprocessing import LabelEncoder |
|
|
60 |
from sklearn.model_selection import train_test_split |
|
|
61 |
# The target is the 'prompt' column of the cleaned data set.
Text = pd.read_csv('cleaned_data.csv')
y = Text["prompt"]

# Every feature set is split the same way: 20% held out for testing,
# a fixed seed, and stratification on the target.
_split_options = {'test_size': 0.2, 'random_state': 0, 'stratify': y}

X_bow_train, X_bow_test, y_bow_train, y_bow_test = train_test_split(
    bow_P, y, **_split_options
)
X_tf_train, X_tf_test, y_tf_train, y_tf_test = train_test_split(
    tf_idf_P, y, **_split_options
)
X_hash_train, X_hash_test, y_hash_train, y_hash_test = train_test_split(
    hash_P, y, **_split_options
)
X_w2v_train, X_w2v_test, y_w2v_train, y_w2v_test = train_test_split(
    w2v_P, y, **_split_options
)

# Write every part of every split to its own CSV, without the index.
# The order matches the order in which the splits were produced.
for _csv_path, _part in (
    ('X_bow_train.csv', X_bow_train),
    ('X_bow_test.csv', X_bow_test),
    ('y_bow_train.csv', y_bow_train),
    ('y_bow_test.csv', y_bow_test),
    ('X_tf_train.csv', X_tf_train),
    ('X_tf_test.csv', X_tf_test),
    ('y_tf_train.csv', y_tf_train),
    ('y_tf_test.csv', y_tf_test),
    ('X_hash_train.csv', X_hash_train),
    ('X_hash_test.csv', X_hash_test),
    ('y_hash_train.csv', y_hash_train),
    ('y_hash_test.csv', y_hash_test),
    ('X_w2v_train.csv', X_w2v_train),
    ('X_w2v_test.csv', X_w2v_test),
    ('y_w2v_train.csv', y_w2v_train),
    ('y_w2v_test.csv', y_w2v_test),
):
    pd.DataFrame(_part).to_csv(_csv_path, index=False)
|
|
88 |
|
|
|
89 |
|
|
|
90 |
# In[ ]: |
|
|
91 |
|
|
|
92 |
|
|
|
93 |
|
|
|
94 |
|