
code/DataTransformationAndPreparation.py

#!/usr/bin/env python
# coding: utf-8

# In[1]:


import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')


# PCA as feature reduction:
# keep the smallest number of components whose cumulative explained
# variance reaches `threshold` percent (99% by default) and return the
# data projected onto those components
def PCA_project(data, data_name="", threshold=99):
    max_component = data.shape[1]
    full_pca = PCA(n_components=max_component)
    full_pca.fit(data)
    # cumulative explained variance, in percent
    cum_var = np.cumsum(full_pca.explained_variance_ratio_) * 100
    # find the first index where the cumulative variance reaches the
    # threshold; i + 1 components are needed to include that component
    n_components = max_component
    for i, v in enumerate(cum_var):
        if v >= threshold:
            n_components = i + 1
            break
    principal = PCA(n_components=n_components)
    principal.fit(data)
    print('%s: reduced features from %d to %d' % (data_name, max_component, n_components))
    return pd.DataFrame(principal.transform(data))
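
# Note (added sketch, not part of the original pipeline): scikit-learn can
# select by variance fraction directly. Passing a float in (0, 1) as
# n_components with svd_solver='full' keeps enough components to explain
# that fraction of the variance, which matches PCA_project's intent:
def PCA_project_builtin(data, variance_fraction=0.99):
    pca = PCA(n_components=variance_fraction, svd_solver='full')
    return pd.DataFrame(pca.fit_transform(data))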

# read the four feature sets (bag of words, tf-idf, hashing vectorizer,
# word2vec) and apply PCA to each; 'prompt' is the label column
df_bow = pd.read_csv('bag_word_df.csv')
bow_P = PCA_project(df_bow.drop('prompt', axis=1), 'bag of words')

df_tf_idf = pd.read_csv('tf_idf.csv')
tf_idf_P = PCA_project(df_tf_idf.drop('prompt', axis=1), 'tf_idf')

df_hash_vectorize = pd.read_csv('hash_vectorize.csv')
hash_P = PCA_project(df_hash_vectorize.drop('prompt', axis=1), 'hash_vectorize')

df_w2v = pd.read_csv('df_w2v.csv')
w2v_P = PCA_project(df_w2v.drop('prompt', axis=1), 'word2vec')

# save the transformed datasets
bow_P.to_csv('bow_P.csv', index=False)
tf_idf_P.to_csv('tf_idf_P.csv', index=False)
hash_P.to_csv('hash_P.csv', index=False)
w2v_P.to_csv('w2v_P.csv', index=False)


# In[2]:


from sklearn.model_selection import train_test_split

# the labels come from the cleaned text file; the reduced feature frames
# above are assumed to be row-aligned with it
Text = pd.read_csv('cleaned_data.csv')
y = Text["prompt"]
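
# Added sanity check (a sketch): the alignment assumption above only holds
# if every feature frame has the same number of rows as the label column.
for name, frame in [('bow', bow_P), ('tf_idf', tf_idf_P),
                    ('hash', hash_P), ('w2v', w2v_P)]:
    assert len(frame) == len(y), '%s row count does not match labels' % name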

# stratified 80/20 splits; the shared random_state and stratify target give
# all four feature sets the same train/test row partition
X_bow_train, X_bow_test, y_bow_train, y_bow_test = train_test_split(
    bow_P, y, test_size=0.2, random_state=0, stratify=y)
X_tf_train, X_tf_test, y_tf_train, y_tf_test = train_test_split(
    tf_idf_P, y, test_size=0.2, random_state=0, stratify=y)
X_hash_train, X_hash_test, y_hash_train, y_hash_test = train_test_split(
    hash_P, y, test_size=0.2, random_state=0, stratify=y)
X_w2v_train, X_w2v_test, y_w2v_train, y_w2v_test = train_test_split(
    w2v_P, y, test_size=0.2, random_state=0, stratify=y)
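
# Added check (a sketch): stratify=y should keep the class mix nearly
# identical across train and test; print it for one feature set to confirm.
print(y_bow_train.value_counts(normalize=True).round(3))
print(y_bow_test.value_counts(normalize=True).round(3))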

# persist every split so the modeling notebooks can read them directly
pd.DataFrame(X_bow_train).to_csv('X_bow_train.csv', index=False)
pd.DataFrame(X_bow_test).to_csv('X_bow_test.csv', index=False)
pd.DataFrame(y_bow_train).to_csv('y_bow_train.csv', index=False)
pd.DataFrame(y_bow_test).to_csv('y_bow_test.csv', index=False)

pd.DataFrame(X_tf_train).to_csv('X_tf_train.csv', index=False)
pd.DataFrame(X_tf_test).to_csv('X_tf_test.csv', index=False)
pd.DataFrame(y_tf_train).to_csv('y_tf_train.csv', index=False)
pd.DataFrame(y_tf_test).to_csv('y_tf_test.csv', index=False)

pd.DataFrame(X_hash_train).to_csv('X_hash_train.csv', index=False)
pd.DataFrame(X_hash_test).to_csv('X_hash_test.csv', index=False)
pd.DataFrame(y_hash_train).to_csv('y_hash_train.csv', index=False)
pd.DataFrame(y_hash_test).to_csv('y_hash_test.csv', index=False)

pd.DataFrame(X_w2v_train).to_csv('X_w2v_train.csv', index=False)
pd.DataFrame(X_w2v_test).to_csv('X_w2v_test.csv', index=False)
pd.DataFrame(y_w2v_train).to_csv('y_w2v_train.csv', index=False)
pd.DataFrame(y_w2v_test).to_csv('y_w2v_test.csv', index=False)
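
# The sixteen writes above could be collapsed into a loop; a sketch using
# the same variable names, kept commented out so the files are not written
# a second time:
#
# splits = {'bow': (X_bow_train, X_bow_test, y_bow_train, y_bow_test),
#           'tf': (X_tf_train, X_tf_test, y_tf_train, y_tf_test),
#           'hash': (X_hash_train, X_hash_test, y_hash_train, y_hash_test),
#           'w2v': (X_w2v_train, X_w2v_test, y_w2v_train, y_w2v_test)}
# templates = ('X_%s_train', 'X_%s_test', 'y_%s_train', 'y_%s_test')
# for tag, parts in splits.items():
#     for template, part in zip(templates, parts):
#         pd.DataFrame(part).to_csv(template % tag + '.csv', index=False)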


# In[ ]:

