[b64007]: src/extract.py

import csv
import pandas as pd
import re
import spacy
import sys

nlp = spacy.load('en')

TRAINING_METADATA_FILE = './data/notes_metadata_trainingset.csv'
VALIDATION_METADATA_FILE = ''


def extract_phrases(metadata_frame, phrase_length, target_patterns, output_file):
    """Extracts phrases around target terms indicating bleeding.

    Extracts clinical notes for each unique mrn in the training or validation
    metadata and uses the spacy sentence splitter to split these notes into
    sentences. If a term indicating a bleeding event is identified in any of
    these sentences, extracts phrases from the clinical notes before and after
    the term, as specified by phrase_length.

    Args:
        metadata_frame: The expanded metadata frame containing clinical notes
            for all mrns in the training or validation set.
        phrase_length: The number of phrases to extract when a term indicating
            a bleeding event is found in the clinical notes.
            (e.g. if phrase_length = 5, this function extracts the 2 phrases
            before the phrase containing the term, the phrase containing the
            term, and the 2 phrases after it)
        target_patterns: The regexes for extracting terms indicating bleeding.
        output_file: The name of the file to which the training or validation
            data, including the extracted notes, will be written.

    Returns:
        None.

    Raises:
        None.
    """
    output_fieldnames = ['mrn', 'note_date', 'note_ids', 'note', 'targets']
    with open(output_file, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(output_fieldnames)
        count = 0
        mrns = metadata_frame['mrn'].unique()
        for mrn in mrns:
            mrn_frame = metadata_frame[metadata_frame['mrn'] == mrn]
            dates = mrn_frame['note_date'].unique()
            for date in dates:
                notes = list(mrn_frame[mrn_frame['note_date'] == date]['text'])
                note_ids = list(mrn_frame[mrn_frame['note_date'] == date]['noteid'])
                extracted_notes = []
                all_targets = []
                for note in notes:
                    if isinstance(note, str):
                        doc = nlp(unicode(note, 'utf-8'))
                        sentences = [sent.text for sent in doc.sents]
                        for i in range(len(sentences)):
                            targets = []
                            for target_pattern in target_patterns:
                                matches = re.findall(target_pattern, sentences[i])
                                targets += matches
                                all_targets += matches
                            if len(targets) > 0:
                                # beginning index of the sentence window
                                beg = i - int(phrase_length / 2)
                                if beg < 0:
                                    beg = 0
                                # ending index of the sentence window
                                end = i + phrase_length - int(phrase_length / 2)
                                if end > len(sentences):
                                    end = len(sentences)
                                extracted_notes += sentences[beg:end]
                # one output row per mrn and note date
                extracted_notes = ' '.join(extracted_notes)
                line = [mrn, date, note_ids, extracted_notes, all_targets]
                writer.writerow(line)
            count += 1
            sys.stdout.write('\rCompleted %d of %d, %d%%'
                             % (count, len(mrns), count * 100 / len(mrns)))
            sys.stdout.flush()
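
# Illustrative call (hypothetical data; the column names 'mrn', 'note_date',
# 'noteid', and 'text' are the ones the function reads above):
#
#     frame = pd.DataFrame({
#         'mrn': [101, 101],
#         'note_date': ['2016-01-01', '2016-01-01'],
#         'noteid': [1, 2],
#         'text': ['Pt stable overnight. Episode of melena this morning. Will monitor.',
#                  'No acute events today.'],
#     })
#     extract_phrases(metadata_frame=frame, phrase_length=3,
#                     target_patterns=[r"mel[ae]n(a|ic)"],
#                     output_file='example_3phrases.csv')
#
# With phrase_length = 3, the sentence containing 'melena' is written out
# together with one sentence before it and one sentence after it.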


def main():
    """Main function.

    Reads training or validation metadata, and the number of phrases to
    extract when a term indicating a bleeding event is encountered. Calls the
    function to extract notes from the metadata.

    Usage:
        python extract.py [-t | -v] <phrase_length>

    Args:
        -t: Training.
        -v: Validation.
        phrase_length: The number of phrases to extract.

    Example:
        python extract.py -t 3
    """
    # read CLI arguments
    if len(sys.argv) < 3:
        print 'Insufficient input.'
        exit()
    # read training or validation metadata
    if sys.argv[1] == '-t':
        print 'Reading training metadata file...'
        metadata_frame = pd.read_csv(TRAINING_METADATA_FILE)
        OUTPUT_FILE = ['./data/training']
    elif sys.argv[1] == '-v':
        print 'Reading validation metadata file...'
        metadata_frame = pd.read_csv(VALIDATION_METADATA_FILE)
        OUTPUT_FILE = ['./data/validation']
    else:
        print 'Incorrect input. Enter -t for training data, or -v for validation data.'
        exit()
    # read phrase length
    phrase_length = int(sys.argv[2])
    if phrase_length < 1:
        print 'Phrase length too small. Enter a number between 1 and 5.'
        exit()
    elif phrase_length > 5:
        print 'Phrase length too big. Enter a number between 1 and 5.'
        exit()
    OUTPUT_FILE.append(str(phrase_length) + 'phrases.csv')
    # define regexes for bleeding target terms
    bleeding_targets = [
        r"(?<!non)(?<!non )(?<!non-)(?<!re)(bleed(?!ing)|bleeding(?!\stime))",
        r"blood loss",
        r"blood per rectum",
        r"(?<!non-)(?<!non)(?<!non )bloody",
        r"brbpr",
        r"coffee[\- ](ground|grounds)",
        r"ecchymos[ie]s",
        r"epistaxis",
        r"exsanguination",
        r"\bl?gib\b",
        r"((\bg|gua?iac)([\-]|\s+)((pos(itive)?)|\+)|guaiac\(\+\))",
        r"(?<!splinter\s)hem{1,2}or{1,2}h{1,2}age?",
        r"hematem[a-z]+",
        r"hematochezia",
        r"hematoma",
        r"hematuria",
        r"hemoperitoneum",
        r"hemoptysis",
        r"hemothorax",
        r"hemopericardium",
        r"hemarthrosis",
        r"hemarthroses",
        r"hemearthrosis",
        r"sanguineous",
        r"haemorrhage",
        r"diffuse\s+alveolar\s+hemorrhage",
        r"dah",
        r"epidural\s+hematoma",
        r"edh",
        r"intracranial\s+hemorrhage",
        r"intracranial\s+hemhorrage",
        r"\bich",
        r"mel[ae]n(a|ic)",
        r"(ng|ngt)\s+lavage\s+((positive)|(pos)|\+)",
        r"((positive)|(pos)|\+) (ng|ngt) lavage",
        r"(fecal\s+occult(\s+blood)?|\bob|\bfob)\s+pos(itive)?",
        r"sah",
        r"sdh",
        r"(maroon|red)\s+(stool|bowel\s+movement|bm)",
        r"vomit[a-z]* blood",
    ]
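    # Illustrative matches (informal, not exhaustive): the first pattern above
    # matches 'bleed' and 'bleeding' but not 'nonbleeding', 'rebleed', or
    # 'bleeding time'; mel[ae]n(a|ic) matches 'melena' and 'melanic'; and the
    # guaiac pattern matches 'guaiac positive', 'guaiac +', and 'guaiac(+)'.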
    # extract phrases from notes
    OUTPUT_FILE = '_'.join(OUTPUT_FILE)
    print 'Writing extracted phrases...'
    extract_phrases(metadata_frame=metadata_frame,
                    phrase_length=phrase_length,
                    target_patterns=bleeding_targets,
                    output_file=OUTPUT_FILE)


if __name__ == '__main__':
    main()
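
# Example run (illustrative): `python extract.py -t 5` reads
# TRAINING_METADATA_FILE and writes, for each matched term, a window of up to
# five sentences to ./data/training_5phrases.csv.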