--- /dev/null
+++ b/code/Data Cleanse.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+# coding: utf-8

+# In[1]:


+import pandas as pd
+import warnings
+warnings.filterwarnings('ignore')
+import re
+
+import nltk
+from nltk.stem import WordNetLemmatizer
+from nltk.tokenize import word_tokenize
+from nltk.corpus import stopwords
+import numpy as np
+
+# The NLTK resources used below must be available; download them once if needed:
+# nltk.download('punkt'); nltk.download('wordnet'); nltk.download('stopwords')
+
+# Load raw data
+df = pd.read_csv('overview-of-recordings.csv')
+
+# Start cleansing: count fully duplicated rows (kept for inspection)
+duplicate = df.duplicated().sum()
+
+# Keep only the columns needed for cleaning; .copy() avoids a
+# SettingWithCopyWarning when a new column is inserted later
+Text = df[['phrase', 'prompt']].copy()
+
+# Save English stopwords
+stopwords_list = set(stopwords.words("english"))
+
+lemmatizer = WordNetLemmatizer()
+
+# Clean text data
+def phrase_cleanse(phrase):
+    # Tokenize: divide the phrase into separate words
+    token_words = word_tokenize(phrase)
+
+    # Convert all tokens to lower case
+    words_step1 = [word.lower() for word in token_words]
+
+    # Strip all punctuation characters
+    words_step2 = [re.sub(r'[^\w\s]', '', word) for word in words_step1]
+
+    # Keep only purely alphabetic tokens that are not stopwords
+    words_step3 = [word for word in words_step2
+                   if word.isalpha() and word not in stopwords_list]
+
+    # Lemmatization: group inflected forms of the same word into one,
+    # keeping only tokens longer than 2 characters
+    lem_list = [lemmatizer.lemmatize(word) for word in words_step3 if len(word) > 2]
+
+    return " ".join(lem_list)
+
+# Apply the cleaning function to every phrase and save the result
+text = np.array(Text.loc[:, 'phrase'])
+new_text = [phrase_cleanse(i) for i in text]
+Text.insert(2, 'new_text', new_text)
+Text.to_csv("cleaned_data.csv", index=False)
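+
+
+# In[2]:
+
+
+# Quick sanity check (an illustrative sketch; the sample sentence is an
+# assumed example, not taken from the dataset):
+sample = "I have a sharp pain in my lower back!"
+print(phrase_cleanse(sample))  # expected output (roughly): "sharp pain lower back"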