[85d58a]: / code / DataPrepreocssing.py

Download this file

82 lines (57 with data), 2.6 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer
from gensim.models import Word2Vec
import warnings
warnings.filterwarnings('ignore')
#read cleaned data
Text = pd.read_csv('cleaned_data.csv')
# Create and fit tf_idf model
text_vectorize = TfidfVectorizer()
X_tf_idf = text_vectorize.fit_transform(Text["new_text"])
dense_list = X_tf_idf.todense().tolist()
feature_names = text_vectorize.get_feature_names()
df_tf_idf = pd.DataFrame(dense_list, columns = feature_names)
# concatenate prompt column with tf_idf matrix
text_tf_idf = pd.concat([Text["prompt"], df_tf_idf], axis = 1)
text_tf_idf.to_csv(f"tf_idf.csv", index=False)
# Create and fit hashvector model
n = Text['prompt'].nunique()
text_hashvectorize = HashingVectorizer(n_features = n*3)
X_hash = text_hashvectorize.fit_transform(Text["new_text"])
df_hash_vectorize = pd.DataFrame(X_hash.toarray())
# concatenate prompt column with hash vectorized matrix
text_hash_vectorize = pd.concat([Text["prompt"], df_hash_vectorize], axis = 1)
text_hash_vectorize.to_csv(f"hash_vectorize.csv", index=False)
# extract feature using bag_of_words
bag_word = CountVectorizer()
feature_bow = bag_word.fit_transform(Text["new_text"].values)
# maping feature
df_bow = pd.DataFrame(feature_bow.todense().tolist(), columns = bag_word.get_feature_names())
# concatenate prompt column with bow matrix
bag_word_df = pd.concat([Text['prompt'], df_bow], axis = 1)
bag_word_df.to_csv('bag_word_df.csv',index=False)
# Create the list of list format for gensim w2v modeling
Text['new_text_clean'] = Text['new_text'].apply(lambda x: x.split(" "))
# Train the word2vec model
w2v_model = Word2Vec(Text['new_text_clean'], min_count = 1,vector_size = 100, window = 5)
# Take the average of the word vectors for the words contained in each sentence
def word_avg_vect(data, model, num_features):
words = set(model.wv.index_to_key)
X_vect = np.array([np.array([model.wv[i] for i in s if i in words]) for s in data])
X_vect_avg = []
for v in X_vect:
if v.size:
X_vect_avg.append(v.mean(axis = 0))
else:
X_vect_avg.append(np.zeros(num_features, dtype = float))
df_vect_avg = pd.DataFrame(X_vect_avg)
return df_vect_avg
X_w2v = word_avg_vect(Text['new_text_clean'], w2v_model, 100)
# concatenate prompt column with averaged w2v matrix
df_w2v = pd.concat([Text["prompt"], X_w2v], axis = 1)
df_w2v.to_csv(f"df_w2v.csv", index=False)
# In[ ]: