|
a |
|
b/src/extract.py |
|
|
1 |
import csv |
|
|
2 |
import pandas as pd |
|
|
3 |
import re |
|
|
4 |
import spacy |
|
|
5 |
import sys |
|
|
6 |
|
|
|
7 |
# Shared spaCy pipeline, loaded once at import time. extract_phrases() calls
# it on each note and reads the resulting sentence boundaries (doc.sents).
nlp = spacy.load('en')

# Metadata CSV read by main() when the script is run with -t.
TRAINING_METADATA_FILE = './data/notes_metadata_trainingset.csv'
# Metadata CSV read by main() when the script is run with -v.
# NOTE(review): this is an empty placeholder; presumably it has to be set to a
# real path before the -v mode can be used -- confirm.
VALIDATION_METADATA_FILE = ''
|
|
11 |
|
|
|
12 |
|
|
|
13 |
def extract_phrases(metadata_frame, phrase_length, target_patterns, output_file):
    """Extracts phrases around target terms indicating bleeding.

    Walks the metadata one mrn at a time and, within each mrn, one note date
    at a time. Every note for that (mrn, note date) pair is split into
    sentences with the spacy sentence splitter. Whenever a sentence matches
    at least one target pattern, a window of phrase_length sentences around
    it is extracted (clipped at the start and end of the note). One CSV row
    is written per (mrn, note date) pair, even when nothing matched.

    Windows of neighbouring matching sentences are appended independently,
    so a sentence may appear more than once in the extracted text.

    Args:
        metadata_frame: The expanded metadata frame containing clinical notes
            for all mrns in the training or validation set. Must provide the
            columns 'mrn', 'note_date', 'noteid' and 'text'.
        phrase_length: The number of phrases to extract when a term indicating
            a bleeding event is found in the clinical notes.
            (e.g. if phrase_length = 5, this function extracts 2 phrases before
            the phrase containing the term, the phrase containing the term,
            and 2 phrases after the phrase containing the term)
        target_patterns: The regexes for extracting terms indicating bleeding.
        output_file: The name of the file where the training or validation data
            will be written. The file is overwritten and receives the columns
            'mrn', 'note_date', 'note_ids', 'note' and 'targets'.

    Returns:
        None.

    Raises:
        None explicitly; file and regex errors propagate to the caller.
    """

    output_fieldnames = ['mrn', 'note_date', 'note_ids', 'note', 'targets']

    # Compile once instead of re-resolving every pattern for every sentence.
    compiled_patterns = [re.compile(pattern) for pattern in target_patterns]
    # Number of sentences kept before the sentence containing the term.
    sentences_before = phrase_length // 2

    with open(output_file, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(output_fieldnames)

        mrns = metadata_frame['mrn'].unique()
        total = len(mrns)
        for count, mrn in enumerate(mrns, 1):
            mrn_frame = metadata_frame[metadata_frame['mrn'] == mrn]
            for date in mrn_frame['note_date'].unique():
                date_frame = mrn_frame[mrn_frame['note_date'] == date]
                note_ids = list(date_frame['noteid'])
                extracted_notes = []
                all_targets = []

                for note in date_frame['text']:
                    # Byte strings (Python 2 `str`) are decoded for spacy;
                    # anything that is not text (e.g. NaN for a missing note)
                    # is skipped.
                    if isinstance(note, bytes):
                        note = note.decode('utf-8')
                    elif not isinstance(note, str):
                        continue

                    sentences = [str(sent) for sent in nlp(note).sents]
                    for i, sentence in enumerate(sentences):
                        # group(0) is the full matched term. re.findall would
                        # return only the capture groups (or tuples of them)
                        # for patterns that contain groups.
                        targets = [match.group(0)
                                   for pattern in compiled_patterns
                                   for match in pattern.finditer(sentence)]
                        if not targets:
                            continue
                        all_targets += targets

                        # Window of phrase_length sentences around sentence i,
                        # clipped to the bounds of the note.
                        beg = max(i - sentences_before, 0)
                        end = min(i - sentences_before + phrase_length,
                                  len(sentences))
                        extracted_notes += sentences[beg:end]

                writer.writerow([mrn, date, note_ids,
                                 ' '.join(extracted_notes), all_targets])

            # '\r' rewrites the same console line to show running progress.
            sys.stdout.write('\rCompleted %d of %d, %d%%'
                             % (count, total, count * 100 // total))
            sys.stdout.flush()
|
|
80 |
|
|
|
81 |
|
|
|
82 |
def main():
    """Main function.

    Reads training or validation metadata, and number of phrases to extract
    when a term indicating a bleeding event is encountered. Calls the function
    to extract notes from the metadata.

    All command-line arguments are validated before the metadata file is
    read. On invalid input a message is printed and the script exits with
    status 1.

    Usage:
        python extract.py [-t | -v] <phrase_length>

    Args:
        -t: Training.
        -v: Validation.
        phrase_length: The number of phrases to extract (1 to 5).

    Example:
        python extract.py -t 3
    """

    # read CLI arguments
    if len(sys.argv) < 3:
        print('Insufficient input.')
        sys.exit(1)
    mode, raw_phrase_length = sys.argv[1], sys.argv[2]

    if mode not in ('-t', '-v'):
        print('Incorrect input. Enter -t for training data, or -v for validation data.')
        sys.exit(1)

    # read phrase length
    try:
        phrase_length = int(raw_phrase_length)
    except ValueError:
        print('Phrase length must be an integer. Enter a number between 1 and 5.')
        sys.exit(1)
    if phrase_length < 1:
        print('Phrase length too small. Enter a number between 1 and 5.')
        sys.exit(1)
    elif phrase_length > 5:
        print('Phrase length too big. Enter a number between 1 and 5.')
        sys.exit(1)

    # read training or validation metadata
    if mode == '-t':
        dataset, metadata_file = 'training', TRAINING_METADATA_FILE
    else:
        dataset, metadata_file = 'validation', VALIDATION_METADATA_FILE
    if not metadata_file:
        # An empty path would otherwise fail inside pandas with an unrelated
        # looking error.
        print('No %s metadata file is configured.' % dataset)
        sys.exit(1)
    print('Reading %s metadata file...' % dataset)
    metadata_frame = pd.read_csv(metadata_file)
    output_file = './data/%s_%dphrases.csv' % (dataset, phrase_length)

    # define regex for bleeding target terms
    # Multi-word terms use \s+ between the words. \b is a zero-width word
    # boundary and can never match between two words.
    bleeding_targets = [
        r"(?<!non)(?<!non )(?<!non-)(?<!re)(bleed(?!ing)|bleeding(?!\stime))",
        r"blood loss",
        r"blood per rectum",
        r"(?<!non-)(?<!non)(?<!non )bloody",
        r"brbpr",
        r"coffee[\- ](ground|grounds)",
        r"ecchymos[ie]s",
        r"epistaxis",
        r"exsanguination",
        r"\bl?gib\b",
        r"((\bg|gua?iac)([\-]|\s+)((pos(itive)?)|\+)|guaiac\(\+\))",
        r"(?<!splinter\s)hem{1,2}or{1,2}h{1,2}age?",
        r"hematem[a-z]+",
        r"hematochezia",
        r"hematoma",
        r"hematuria",
        r"hemoperitoneum",
        r"hemoptysis",
        r"hemothorax",
        r"hemopericardium",
        r"hemarthrosis",
        r"hemarthroses",
        r"hemearthrosis",
        r"sanguineous",
        r"haemorrhage",
        r"diffuse\s+alveolar\s+hemorrhage",
        r"dah",
        r"epidural\s+hematoma",
        r"edh",
        r"intracranial\s+hemorrhage",
        # Common misspelling; not covered by the generic hemorrhage pattern.
        r"intracranial\s+hemhorrage",
        # Only the word-initial form of "ich" is kept: an unanchored "ich"
        # would also match inside ordinary words such as "which".
        r"\bich",
        r"mel[ae]n(a|ic)",
        r"(ng|ngt)\s+lavage\s+((positive)|(pos)|\+)",
        r"((positive)|(pos)|\+) (ng|ngt) lavage",
        r"(fecal\s+occult(\s+blood)?|\bob|\bfob)\s+pos(itive)?",
        r"sah",
        r"sdh",
        r"(maroon|red)\s+(stool|bowel\s+movement|bm)",
        r"vomit[a-z]* blood",
    ]

    # extract phrases from notes
    print('Writing extracted phrases...')
    extract_phrases(metadata_frame=metadata_frame,
                    phrase_length=phrase_length,
                    target_patterns=bleeding_targets,
                    output_file=output_file)
|
|
179 |
|
|
|
180 |
|
|
|
181 |
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()