Diff of /src/extract.py [000000] .. [b64007]

Switch to unified view

a b/src/extract.py
1
import csv
2
import pandas as pd
3
import re
4
import spacy
5
import sys
6
7
# Shared spaCy pipeline, loaded once when the module is imported;
# extract_phrases() uses it to split note text into sentences.
nlp = spacy.load('en')
8
9
# CSV of note metadata that main() reads when invoked with -t.
TRAINING_METADATA_FILE = './data/notes_metadata_trainingset.csv'
10
# CSV of note metadata that main() reads when invoked with -v.
# NOTE(review): this is an empty string, so main() would hand '' to
# pd.read_csv -- set a real path before using the -v option.
VALIDATION_METADATA_FILE = ''
11
12
13
def extract_phrases(metadata_frame, phrase_length, target_patterns, output_file):
    """Extracts phrases around target terms indicating bleeding.

    Groups the clinical notes in the metadata by mrn and note date. Each note
    is split into sentences with the spacy sentence splitter. Whenever a
    sentence matches one of the target patterns, a window of phrase_length
    sentences around it is collected. One row is written to the output file
    per (mrn, note date) pair, even when nothing matched for that pair.

    Args:
        metadata_frame: The expanded metadata frame containing clinical notes
            for all mrns in the training or validation set. It is indexed by
            the columns 'mrn', 'note_date', 'noteid' and 'text'.
        phrase_length: The number of phrases to extract when a term indicating
            a bleeding event is found in the clinical notes.
            (e.g. if phrase_length = 5, this function extracts 2 phrases before
            the phrase containing the term, the phrase containing the term,
            and 2 phrases after the phrase containing the term)
        target_patterns: The regex strings for extracting terms indicating
            bleeding.
        output_file: The name of the CSV file the extracted notes are written
            to. The file is opened in 'w' mode, so existing content is
            replaced.

    Returns:
        None.
    """

    output_fieldnames = ['mrn', 'note_date', 'note_ids', 'note', 'targets']
    # Compile once up front instead of handing the raw strings to the re
    # module for every sentence of every note.
    compiled_patterns = [re.compile(pattern) for pattern in target_patterns]
    # Window layout: `before` sentences precede the matching sentence, the
    # remaining `after` sentences start at the matching sentence itself.
    before = phrase_length // 2
    after = phrase_length - before

    with open(output_file, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(output_fieldnames)
        mrns = metadata_frame['mrn'].unique()
        total = len(mrns)
        for count, mrn in enumerate(mrns, 1):
            mrn_frame = metadata_frame[metadata_frame['mrn'] == mrn]
            for date in mrn_frame['note_date'].unique():
                # Filter once and reuse for both the texts and the note ids.
                date_frame = mrn_frame[mrn_frame['note_date'] == date]
                note_ids = list(date_frame['noteid'])
                extracted_notes = []
                all_targets = []
                for note in date_frame['text']:
                    # spacy wants text, not bytes. Decoding here (rather than
                    # calling the Python 2 only `unicode` builtin) keeps the
                    # function usable on Python 2 and Python 3.
                    if isinstance(note, bytes):
                        note = note.decode('utf-8')
                    elif not isinstance(note, type(u'')):
                        # Missing notes (e.g. NaN) carry no text to scan.
                        continue
                    sentences = [str(sent) for sent in nlp(note).sents]
                    for i, sentence in enumerate(sentences):
                        # group(0) is the full matched term. re.findall would
                        # return only the capture groups (fragments or tuples)
                        # for the patterns that contain parentheses.
                        targets = [match.group(0)
                                   for pattern in compiled_patterns
                                   for match in pattern.finditer(sentence)]
                        if not targets:
                            continue
                        all_targets += targets
                        # Clamp the window to the bounds of the note.
                        beg = max(i - before, 0)
                        end = min(i + after, len(sentences))
                        extracted_notes += sentences[beg:end]
                line = [mrn, date, note_ids, ' '.join(extracted_notes), all_targets]
                writer.writerow(line)
            # Progress indicator, rewritten in place on a single console line.
            sys.stdout.write('\rCompleted %d of %d, %d%%' % (count, total, count * 100 // total))
            sys.stdout.flush()
80
81
82
def main():
    """Main function.

    Reads training or validation metadata, and number of phrases to extract
    when a term indicating a bleeding event is encountered. Calls the function
    to extract notes from the metadata.

    All command line arguments are validated before the metadata file is
    read, so a bad invocation fails immediately. On invalid input a message
    is printed and the process exits with status 1.

    Usage:
    python extract.py [-t | -v] <phrase_length>

    Args:
        -t: Training.
        -v: Validation.
        phrase_length: The number of phrases to extract (1 to 5).

    Example:
    python extract.py -t 3
    """

    # NOTE: print() is called with a single argument throughout, which
    # behaves the same under Python 2 and Python 3.

    # read CLI arguments
    if len(sys.argv) < 3:
        print('Insufficient input.')
        sys.exit(1)

    mode = sys.argv[1]
    if mode not in ('-t', '-v'):
        print('Incorrect input. Enter -t for training data, or -v for validation data.')
        sys.exit(1)

    # read phrase length
    try:
        phrase_length = int(sys.argv[2])
    except ValueError:
        print('Phrase length must be a whole number between 1 and 5.')
        sys.exit(1)
    if phrase_length < 1:
        print('Phrase length too small. Enter a number between 1 and 5.')
        sys.exit(1)
    elif phrase_length > 5:
        print('Phrase length too big. Enter a number between 1 and 5.')
        sys.exit(1)

    # read training or validation metadata
    if mode == '-t':
        print('Reading training metadata file...')
        metadata_frame = pd.read_csv(TRAINING_METADATA_FILE)
        output_prefix = './data/training'
    else:
        print('Reading validation metadata file...')
        metadata_frame = pd.read_csv(VALIDATION_METADATA_FILE)
        output_prefix = './data/validation'

    # e.g. ./data/training_3phrases.csv
    output_file = '_'.join([output_prefix, str(phrase_length) + 'phrases.csv'])

    # define regex for bleeding target terms
    #
    # Multi-word terms are joined with \s+ : a \b between two letters is a
    # zero-width word boundary that can never match there, so patterns such as
    # "diffuse\balveolar\bhemorrhage" never fired.
    #
    # Every entry is followed by a comma: adjacent string literals without one
    # are concatenated by Python into a single (useless) pattern.
    #
    # NOTE(review): the short abbreviations "dah", "edh", "sah" and "sdh" are
    # unanchored and therefore also match inside longer words -- confirm
    # whether they should be wrapped in \b.
    bleeding_targets = [
        r"(?<!non)(?<!non )(?<!non-)(?<!re)(bleed(?!ing)|bleeding(?!\stime))",
        r"blood loss",
        r"blood per rectum",
        r"(?<!non-)(?<!non)(?<!non )bloody",
        r"brbpr",
        r"coffee[\- ](ground|grounds)",
        r"ecchymos[ie]s",
        r"epistaxis",
        r"exsanguination",
        r"\bl?gib\b",
        r"((\bg|gua?iac)([\-]|\s+)((pos(itive)?)|\+)|guaiac\(\+\))",
        r"(?<!splinter\s)hem{1,2}or{1,2}h{1,2}age?",
        r"hematem[a-z]+",
        r"hematochezia",
        r"hematoma",
        r"hematuria",
        r"hemoperitoneum",
        r"hemoptysis",
        r"hemothorax",
        r"hemopericardium",
        r"hemarthrosis",
        r"hemarthroses",
        r"hemearthrosis",
        r"sanguineous",
        r"haemorrhage",
        r"diffuse\s+alveolar\s+hemorrhage",
        r"dah",
        r"epidural\s+hematoma",
        r"edh",
        r"intracranial\s+hemorrhage",
        r"intracranial\s+hemhorrage",
        # Only the anchored form of "ich" is kept. The bare "ich" entry was
        # swallowed by the missing commas and never active; enabling it now
        # would match inside common words such as "which".
        r"\bich",
        r"mel[ae]n(a|ic)",
        r"(ng|ngt)\s+lavage\s+((positive)|(pos)|\+)",
        r"((positive)|(pos)|\+) (ng|ngt) lavage",
        r"(fecal\s+occult(\s+blood)?|\bob|\bfob)\s+pos(itive)?",
        r"sah",
        r"sdh",
        r"(maroon|red)\s+(stool|bowel\s+movement|bm)",
        r"vomit[a-z]* blood",
    ]

    # extract phrases from notes
    print('Writing extracted phrases...')
    extract_phrases(metadata_frame=metadata_frame,
                    phrase_length=phrase_length,
                    target_patterns=bleeding_targets,
                    output_file=output_file)
179
180
181
# Run the extraction only when the file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()