|
a |
|
b/process_data.py |
|
|
1 |
#!/usr/bin/env python |
|
|
2 |
""" |
|
|
3 |
Script to process the notes by tokenizing them and merging the token: |
|
|
4 |
1. Load in the data |
|
|
5 |
2. Drop duplicates |
|
|
6 |
3. Merge `category`, `description`, and `text` into a new column called `note` |
|
|
4. Tokenize text using `scispacy` and create new column called `processed_note` to save tokenized text
|
|
8 |
5. Save a csv file onto disk |
|
|
9 |
""" |
|
|
10 |
import pandas as pd |
|
|
11 |
import spacy |
|
|
12 |
|
|
|
13 |
from pathlib import Path |
|
|
14 |
|
|
|
# scispacy biomedical model used purely as a tokenizer; parser, NER and
# tagger components are disabled since only token text is needed (speed).
nlp = spacy.load('en_core_sci_md', disable=['parser', 'ner', 'tagger'])

# Input CSV of raw notes and output path for the processed CSV written
# by the __main__ block below.
raw_csv = Path('./data/raw_dataset.csv')
proc_csv = Path('./data/proc_dataset.csv')
|
18 |
|
|
|
def tokenize_text(text):
    """Tokenize *text* with the module-level scispacy pipeline.

    Returns the tokens re-joined into a single space-separated string.
    """
    return ' '.join(token.text for token in nlp(text))
|
22 |
|
|
|
def group_eth(eth):
    """Collapse a free-text ethnicity label into one of five coarse groups.

    Parameters
    ----------
    eth : str
        Raw ethnicity value from the CSV (e.g. 'BLACK/AFRICAN AMERICAN').
        Non-string values (e.g. NaN, which pandas stores as a float) are
        mapped to 'unknown' instead of raising AttributeError.

    Returns
    -------
    str
        One of 'white', 'black', 'hispanic', 'asian', 'unknown'.
    """
    # Missing values in a pandas string column arrive as float NaN; the
    # original code crashed on .lower() for those. Treat them as unknown.
    if not isinstance(eth, str):
        return 'unknown'
    eth = eth.lower()
    # Substring checks in the original priority order ('white' wins first).
    if 'white' in eth:
        return 'white'
    elif 'black' in eth:
        return 'black'
    elif 'hispanic' in eth:
        return 'hispanic'
    elif 'asian' in eth:
        return 'asian'
    return 'unknown'
|
|
35 |
|
|
|
if __name__ == '__main__':
    # 1. Load the raw notes and drop exact duplicate rows.
    df = pd.read_csv(raw_csv)
    df = df.drop_duplicates()

    # 2. Merge category, description, and text into a single 'note' column.
    #    One str.cat call with a list of Series replaces the original two
    #    chained calls; result is identical ('\n'-separated concatenation).
    df['note'] = df['category'].str.cat([df['description'], df['text']], sep='\n')

    # 3. Collapse free-text ethnicity labels into coarse groups.
    df['ethnicity'] = df['ethnicity'].apply(group_eth)

    # 4. Tokenize each merged note with scispacy (slowest step).
    df['processed_note'] = df['note'].apply(tokenize_text)

    # 5. Drop the source columns now folded into 'note', then persist.
    #    (inplace=True is avoided per current pandas guidance.)
    df = df.drop(columns=['text', 'description'])
    df.to_csv(proc_csv, index=False)