Diff of /code/DataPrepreocssing.py [000000] .. [85d58a]

Switch to side-by-side view

--- a
+++ b/code/DataPrepreocssing.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# In[1]:
+
+
+import pandas as pd
+import numpy as np
+from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer
+from gensim.models import Word2Vec
+import warnings
+warnings.filterwarnings('ignore')
+#read cleaned data
+Text = pd.read_csv('cleaned_data.csv')
+
+# Create and fit tf_idf model
+text_vectorize = TfidfVectorizer()
+X_tf_idf = text_vectorize.fit_transform(Text["new_text"])
+
+dense_list = X_tf_idf.todense().tolist()
+feature_names = text_vectorize.get_feature_names()
+df_tf_idf = pd.DataFrame(dense_list, columns = feature_names)
+
+# concatenate prompt column with tf_idf matrix
+text_tf_idf = pd.concat([Text["prompt"], df_tf_idf], axis = 1)
+text_tf_idf.to_csv(f"tf_idf.csv", index=False)
+
+# Create and fit hashvector model
+n = Text['prompt'].nunique()
+text_hashvectorize = HashingVectorizer(n_features = n*3)
+X_hash = text_hashvectorize.fit_transform(Text["new_text"])
+
+df_hash_vectorize = pd.DataFrame(X_hash.toarray())
+
+# concatenate prompt column with hash vectorized matrix
+text_hash_vectorize = pd.concat([Text["prompt"], df_hash_vectorize], axis = 1)
+text_hash_vectorize.to_csv(f"hash_vectorize.csv", index=False)
+
+# extract feature using bag_of_words
+bag_word = CountVectorizer()
+feature_bow = bag_word.fit_transform(Text["new_text"].values)
+
+# maping feature 
+df_bow = pd.DataFrame(feature_bow.todense().tolist(), columns = bag_word.get_feature_names())
+
+# concatenate prompt column with bow matrix
+bag_word_df = pd.concat([Text['prompt'], df_bow], axis = 1)
+bag_word_df.to_csv('bag_word_df.csv',index=False)
+
+# Create the list of list format for gensim w2v modeling 
+Text['new_text_clean'] = Text['new_text'].apply(lambda x: x.split(" "))
+
+# Train the word2vec model
+w2v_model = Word2Vec(Text['new_text_clean'], min_count = 1,vector_size = 100, window = 5)
+
+
+# Take the average of the word vectors for the words contained in each sentence
+def word_avg_vect(data, model, num_features):
+    words = set(model.wv.index_to_key)
+    X_vect = np.array([np.array([model.wv[i] for i in s if i in words]) for s in data])
+    X_vect_avg = []
+    for v in X_vect:
+        if v.size:
+            X_vect_avg.append(v.mean(axis = 0))
+        else:
+            X_vect_avg.append(np.zeros(num_features, dtype = float))
+
+    df_vect_avg = pd.DataFrame(X_vect_avg)
+    return df_vect_avg
+
+X_w2v = word_avg_vect(Text['new_text_clean'], w2v_model, 100)
+# concatenate prompt column with averaged w2v matrix
+df_w2v = pd.concat([Text["prompt"], X_w2v], axis = 1)
+df_w2v.to_csv(f"df_w2v.csv", index=False)
+
+
+# In[ ]:
+
+
+
+