|
a |
|
b/code/DataPrepreocssing.py |
|
|
1 |
#!/usr/bin/env python |
|
|
2 |
# coding: utf-8 |
|
|
3 |
|
|
|
4 |
# In[1]: |
|
|
5 |
|
|
|
6 |
|
|
|
7 |
import pandas as pd |
|
|
8 |
import numpy as np |
|
|
9 |
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer |
|
|
10 |
from gensim.models import Word2Vec |
|
|
11 |
import warnings |
|
|
12 |
warnings.filterwarnings('ignore') |
|
|
13 |
# Read the cleaned data; the "new_text" and "prompt" columns are used below.
Text = pd.read_csv('cleaned_data.csv')

# Create and fit the TF-IDF model on the cleaned text.
text_vectorize = TfidfVectorizer()
X_tf_idf = text_vectorize.fit_transform(Text["new_text"])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(text_vectorize, "get_feature_names_out"):
    feature_names = text_vectorize.get_feature_names_out()
else:
    feature_names = text_vectorize.get_feature_names()

# Densify straight into a NumPy array instead of going through a nested
# Python list, which costs far more memory for the same resulting frame.
df_tf_idf = pd.DataFrame(X_tf_idf.toarray(), columns=feature_names)

# Concatenate the prompt column with the TF-IDF matrix and save the result.
text_tf_idf = pd.concat([Text["prompt"], df_tf_idf], axis=1)
text_tf_idf.to_csv("tf_idf.csv", index=False)
|
|
27 |
|
|
|
28 |
# Hashing-vectorizer features: the hashed feature space is sized at three
# times the number of distinct prompts.
n = Text["prompt"].nunique()
text_hashvectorize = HashingVectorizer(n_features=3 * n)
X_hash = text_hashvectorize.fit_transform(Text["new_text"])

# Hashed features have no names, so the frame keeps integer column labels.
df_hash_vectorize = pd.DataFrame(X_hash.toarray())

# Put the prompt column in front of the hashed matrix and write it to disk.
text_hash_vectorize = pd.concat(
    (Text["prompt"], df_hash_vectorize),
    axis="columns",
)
text_hash_vectorize.to_csv("hash_vectorize.csv", index=False)
|
|
38 |
|
|
|
39 |
# Extract features using a bag-of-words (raw term count) model.
bag_word = CountVectorizer()
feature_bow = bag_word.fit_transform(Text["new_text"].values)

# Map each count column to its vocabulary term.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(bag_word, "get_feature_names_out"):
    bow_feature_names = bag_word.get_feature_names_out()
else:
    bow_feature_names = bag_word.get_feature_names()

# Densify straight into a NumPy array instead of a nested Python list.
df_bow = pd.DataFrame(feature_bow.toarray(), columns=bow_feature_names)

# Concatenate the prompt column with the bag-of-words matrix and save it.
bag_word_df = pd.concat([Text['prompt'], df_bow], axis=1)
bag_word_df.to_csv('bag_word_df.csv', index=False)
|
|
49 |
|
|
|
50 |
# Create the list-of-lists (one token list per document) format that gensim
# expects. split() without an argument is used instead of split(" ") so that
# repeated, leading or trailing spaces do not produce empty-string tokens,
# which min_count = 1 would otherwise keep as a vocabulary word.
Text['new_text_clean'] = Text['new_text'].apply(lambda x: x.split())

# Train the word2vec model on the tokenised documents.
w2v_model = Word2Vec(Text['new_text_clean'], min_count=1, vector_size=100, window=5)
|
|
55 |
|
|
|
56 |
|
|
|
57 |
# Take the average of the word vectors for the words contained in each sentence |
|
|
58 |
def word_avg_vect(data, model, num_features):
    """Average the word vectors of each tokenised sentence.

    Args:
        data: Iterable of sentences, each an iterable of string tokens.
        model: Object exposing ``model.wv`` with an ``index_to_key`` vocabulary
            and vector lookup by token (``model.wv[token]``).
        num_features: Length of the zero vector emitted for a sentence that
            has no token in the vocabulary; it should match the width of the
            model's word vectors so all rows line up.

    Returns:
        A ``pandas.DataFrame`` with one row per sentence, in input order,
        holding the element-wise mean of that sentence's known word vectors
        (or zeros when none of its tokens are known).
    """
    vocab = set(model.wv.index_to_key)
    rows = []
    # Average sentence by sentence. Stacking all per-sentence matrices into a
    # single np.array first produces a ragged array when sentence lengths
    # differ, which NumPy >= 1.24 rejects with a ValueError.
    for sentence in data:
        known = [model.wv[token] for token in sentence if token in vocab]
        if known:
            rows.append(np.mean(known, axis=0))
        else:
            rows.append(np.zeros(num_features, dtype=float))

    return pd.DataFrame(rows)
|
|
70 |
|
|
|
71 |
# Average the word vectors of every document. The zero-vector width is read
# from the trained model rather than repeated as a literal, so it cannot drift
# from the vector_size the model was trained with.
X_w2v = word_avg_vect(Text['new_text_clean'], w2v_model, w2v_model.wv.vector_size)
# Concatenate the prompt column with the averaged w2v matrix and save it.
df_w2v = pd.concat([Text["prompt"], X_w2v], axis=1)
df_w2v.to_csv("df_w2v.csv", index=False)
|
|
75 |
|
|
|
76 |
|
|
|
77 |
# In[ ]: |
|
|
78 |
|
|
|
79 |
|
|
|
80 |
|
|
|
81 |
|