--- /dev/null
+++ b/mimic_icd9_coding/utils/preproc.py
@@ -0,0 +1,151 @@
+#%%
+import string
+
+import nltk
+from nltk import word_tokenize, sent_tokenize
+from nltk.tokenize import RegexpTokenizer
+from nltk.corpus import stopwords
+from nltk.stem.snowball import SnowballStemmer
+
+def remove_special_chars(data):
+    # replace special (mis-encoded) characters in the notes
+    # using a unicode character dictionary
+    special_chars = {'¶': ' ', 'Þ': ' ', 'Û': ' '}
+    return data.replace(special_chars, regex=True)
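+
+# Example (hypothetical input; assumes pandas imported as pd and a Series of
+# note strings):
+#   notes = pd.Series(['pt resting ¶ comfortable'])
+#   remove_special_chars(notes)
+#   0    pt resting   comfortable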
+
+
+def lowercase_tokenizer(data, no_sentences):
+    """Lowercase each note and tokenize it into word tokens."""
+    tokenizer = RegexpTokenizer(r'\w+')
+    if no_sentences:
+        data = data.apply(lambda x: tokenizer.tokenize(x.lower()))
+    else:
+        # each row is a list of sentences; tokenize each sentence separately
+        data = data.apply(lambda sents: [tokenizer.tokenize(s.lower()) for s in sents])
+    return data
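+
+# Example (hypothetical input; \w+ tokens are lowercased, punctuation dropped):
+#   lowercase_tokenizer(pd.Series(['Pt denies chest pain.']), no_sentences=True)
+#   0    ['pt', 'denies', 'chest', 'pain']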
+
+
+
+def main_preproc_row(text):
+    """ With a single piece of text as input, run every preprocessing step defined above"""
+    raise NotImplementedError
+
+def main_preproc_series(data, stemming=True, no_punct=False, no_sentences=True, stop_words=True):
+    """ With input series of text, run every preprocessing step defined above"""
+
+    data = data.astype(str)
+
+    # remove special characters
+    data = remove_special_chars(data)
+
+    # split into sentences -- consider skipping this, since sentence
+    # structure may not be common in clinical notes
+    if not no_sentences:
+        data = data.apply(sent_tokenize)
+
+    # # remove empty rows
+    # data = data[data.map(lambda d: len(d)) > 0]
+
+    # convert everything to lowercase and tokenize
+    data = lowercase_tokenizer(data, no_sentences)
+
+    # Remove numbers (and later, names)
+    # NOTE: this and the filters below assume flat token lists (no_sentences=True)
+    data = data.apply(lambda x: [s for s in x if s.isalpha()])
+
+    # filter out punctuation tokens
+    if no_punct:
+        data = data.apply(lambda l: [x for x in l if x not in string.punctuation])
+
+    # filter out stop words -- note: this also removes fragments like 'haven'
+    # (from "haven't"); we could remove those together with names etc.
+    if stop_words:
+        stop_word_set = set(stopwords.words('english'))
+        data = data.apply(lambda l: [x for x in l if x not in stop_word_set])
+
+    # perform stemming
+    if stemming:
+        stemmer = SnowballStemmer("english", ignore_stopwords=True)
+        data = data.apply(lambda x: [stemmer.stem(a) for a in x])
+
+    return data
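+
+# Example (hypothetical input; assumes the NLTK 'stopwords' corpus is downloaded,
+# e.g. nltk.download('stopwords'), plus 'punkt' if sentence splitting is enabled):
+#   main_preproc_series(pd.Series(['The patient is resting comfortably.']))
+#   0    ['patient', 'rest', 'comfort']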
+
+
+def bert_preproc(data):
+    """With a single row of text as input, run the minimal preprocessing for BERT:
+    special-character removal and sentence splitting. Because this operates on one
+    row, call it on a Series via .apply(). The remaining steps are left commented
+    out, since BERT's own tokenizer handles casing, punctuation and stop words."""
+    # data = data.astype(str)
+
+    # remove special characters
+    data = remove_special_chars(data)
+
+    # split into sentences -- consider skipping this, since sentence
+    # structure may not be common in clinical notes
+    data = sent_tokenize(data)
+
+    # # remove empty rows
+    # data = data[data.map(lambda d: len(d)) > 0]
+
+    # convert everything to lowercase and tokenize
+    # data = lowercase_tokenizer(data, no_sentences)
+
+    # Remove numbers (and later names)
+    # data = data.apply(lambda x: [s for s in x if s.isalpha()])
+
+    # filter out punctuation
+    # if no_punct:
+    #    data = data.apply(lambda l: filter(lambda x: x not in string.punctuation, l))
+
+    # filter out stop words -- note: removes haven, we can remove with other names etc.
+    # if stop_words:
+        # stop_words = stopwords.words('english')
+        # data = data.apply(lambda l: list(filter(lambda x: x not in stop_words, l)))
+
+    # perform stemming
+    # if stemming:
+        # stemmer = SnowballStemmer("english", ignore_stopwords=True)
+        # data = data.apply(lambda x: [stemmer.stem(a) for a in x])
+
+    return data
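+
+# Example (hypothetical input; each note becomes a list of sentence strings):
+#   bert_preproc('Pt stable. Discharged home.')
+#   ['Pt stable.', 'Discharged home.']
+#   df['NOTE TEXT'].apply(bert_preproc)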
+
+# %%
+# Filter out some POS tags that we are not interested in
+# filtered_words = {key: value for (key, value) in word_count.items() if nltk.pos_tag([key])[0][1] not in ['CC', 'IN', 'DT', 'CD']}
+
+#%% 
+def demo():
+    """Load a sample notes sheet and run the full preprocessing pipeline."""
+    import pandas as pd
+    # NOTE: environment-specific paths
+    folder_path = "/home/br384/project/prot_data/"
+    data_path = folder_path + "evaluation_data.xlsx"
+    df = pd.read_excel(data_path, 'Notes1')
+    data = df['NOTE TEXT']
+    return main_preproc_series(data)
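+
+# Hypothetical quick check (assumes the spreadsheet above exists and the NLTK
+# corpora have been downloaded):
+if __name__ == "__main__":
+    print(demo().head())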