# Recovered from diff view of /code/Data Cleanse.py [000000] .. [85d58a]
1
#!/usr/bin/env python
# coding: utf-8

# In[1]:

import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import nltk
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import numpy as np
import string
import re

# Load raw data exported from the recordings overview.
df = pd.read_csv('overview-of-recordings.csv')

# Start cleansing: count fully duplicated rows (kept for inspection;
# duplicates are not dropped here).
duplicate = df.duplicated().sum()

# Keep only the columns needed for text cleaning.
Text = df[['phrase', 'prompt']]

# Save English stopwords once as a set for O(1) membership tests below.
stopwords_list = set(stopwords.words("english"))

# Clean text data
def phrase_cleanse(phrase):
    """Normalize one raw phrase into a cleaned, space-joined string.

    Pipeline: tokenize -> lowercase -> strip punctuation -> keep
    alphabetic non-stopword tokens -> lemmatize tokens longer than
    two characters -> join with single spaces.

    Parameters
    ----------
    phrase : str
        Raw phrase text to clean.

    Returns
    -------
    str
        Space-joined cleaned tokens (may be empty if nothing survives).
    """
    # Tokenize and divide the phrase into separate words.
    token_words = word_tokenize(phrase)

    # Lowercase each token and clear all punctuation in one pass.
    stripped = [re.sub(r'[^\w\s]', '', w.lower()) for w in token_words]

    # Keep purely alphabetic tokens that are not English stopwords.
    # FIX: test membership against the set directly — the original
    # rebuilt list(stopwords_list) per token, an O(n) scan each time.
    kept = [w for w in stripped if w.isalpha() and w not in stopwords_list]

    # Lemmatization — group different forms of the same word; only
    # tokens with more than 2 characters are kept.
    lem = WordNetLemmatizer()
    lem_list = [lem.lemmatize(w) for w in kept if len(w) > 2]

    return " ".join(lem_list)
64
65
# Clean every phrase and attach the result as a new column.
text = np.array(Text.loc[:, 'phrase'])
new_text = [phrase_cleanse(p) for p in text]
Text.insert(2, 'new_text', new_text)

# Persist the cleaned data for downstream modelling.
# FIX: plain string literal — the original used an f-string with no
# placeholders.
Text.to_csv("cleaned_data.csv", index=False)


# In[ ]: