bleeding-detector / Git / Diff of /src/extract.py

Models:

DavidFeaster/

bleeding-detector

Downloads: 1

Diff of /src/extract.py [000000] .. [b64007]

Switch to unified view

 b/src/extract.py
+import csv
+import pandas as pd
+import re
+import spacy
+import sys
+nlp = spacy.load('en')  # spaCy pipeline loaded once at import time; extract_phrases uses it to split notes into sentences
+TRAINING_METADATA_FILE = './data/notes_metadata_trainingset.csv'  # CSV that main() reads for the -t option
+VALIDATION_METADATA_FILE = ''  # NOTE(review): empty path, presumably a placeholder -- main() hands it to pd.read_csv for -v; set a real path before using -v
def _collect_windows(sentences, patterns, phrase_length):
    """Gathers the sentence windows around every sentence matching a pattern.

    Args:
        sentences: List of sentence strings from a single note, in order.
        patterns: Compiled regular expressions for the target terms.
        phrase_length: Total number of sentences in each window.

    Returns:
        A tuple (window_sentences, matches). window_sentences is the
        concatenation of the window taken around each matching sentence, so
        sentences are repeated when windows overlap. matches holds the
        re.findall results of every pattern, in the order encountered.
    """
    half = phrase_length // 2
    window_sentences = []
    matches = []
    for index, sentence in enumerate(sentences):
        hits = []
        for pattern in patterns:
            hits += pattern.findall(sentence)
        if not hits:
            continue
        matches += hits
        # Clamp the window to the bounds of the note.
        beg = max(index - half, 0)
        end = min(index + phrase_length - half, len(sentences))
        window_sentences += sentences[beg:end]
    return window_sentences, matches


def extract_phrases(metadata_frame, phrase_length, target_patterns, output_file):
    """Extracts phrases around target terms indicating bleeding.

    Groups the clinical notes in the metadata by mrn and note date, splits
    each note into sentences with the spacy sentence splitter, and, whenever a
    sentence matches one of the target patterns, extracts that sentence
    together with the sentences around it. One CSV row is written per
    (mrn, note_date) pair, including pairs in which nothing matched (the
    'note' column is then empty and 'targets' is an empty list).

    Args:
        metadata_frame: The expanded metadata frame containing clinical notes
            for all mrns in the training or validation set. The columns
            'mrn', 'note_date', 'noteid' and 'text' are read.
        phrase_length: The number of phrases to extract when a term indicating
            a bleeding event is found in the clinical notes.
            (e.g. if phrase_length = 5, this function extracts 2 phrases before
            the phrase containing the term, the phrase containing the term,
            and 2 phrases after the phrase containing the term)
        target_patterns: The regexes for extracting terms indicating bleeding.
            Matches are collected with findall, so a pattern with capture
            groups contributes its group contents rather than the whole match.
        output_file: The name of the file where the training or validation data
            will be written. An existing file is overwritten.

    Returns:
        None.

    Raises:
        None.
    """
    output_fieldnames = ['mrn', 'note_date', 'note_ids', 'note', 'targets']
    # Compile each pattern once instead of looking it up for every sentence.
    compiled_patterns = [re.compile(pattern) for pattern in target_patterns]
    # The csv module expects newline='' on Python 3 and binary mode on Python 2.
    if sys.version_info[0] >= 3:
        f = open(output_file, 'w', newline='')
    else:
        f = open(output_file, 'wb')
    with f:
        writer = csv.writer(f)
        writer.writerow(output_fieldnames)
        mrns = metadata_frame['mrn'].unique()
        total = len(mrns)
        for count, mrn in enumerate(mrns, 1):
            mrn_frame = metadata_frame[metadata_frame['mrn'] == mrn]
            for date in mrn_frame['note_date'].unique():
                date_frame = mrn_frame[mrn_frame['note_date'] == date]
                note_ids = list(date_frame['noteid'])
                extracted_notes = []
                all_targets = []
                for note in date_frame['text']:
                    if isinstance(note, bytes):
                        # Byte strings (the Python 2 str type) are decoded
                        # before being handed to spacy.
                        note = note.decode('utf-8')
                    elif not isinstance(note, str):
                        # Cells that do not hold text are skipped.
                        continue
                    sentences = [str(sent) for sent in nlp(note).sents]
                    windows, targets = _collect_windows(
                        sentences, compiled_patterns, phrase_length)
                    extracted_notes += windows
                    all_targets += targets
                writer.writerow([mrn, date, note_ids,
                                 ' '.join(extracted_notes), all_targets])
            # '\r' rewrites the same console line to act as a progress meter.
            sys.stdout.write('\rCompleted %d of %d, %d%%' % (count, total, count * 100 // total))
            sys.stdout.flush()
+def main():
+    """Main function.
+    Reads training or validation metadata, and number of phrases to extract
+    when a term indicating a bleeding event is encountered. Calls the function
+    to extract notes from the metadata.
+    Usage:
+    python extract.py [-t | -v] <phrase_length>
+    Args:
+        -t: Training.
+        -v: Validation.
+        phrase_length: The number of phrases to extract.
+    Example:
+    python extract.py -t 3
+    """
+    # read CLI arguments
+    if len(sys.argv) < 3:
+        print 'Insufficient input.'
+        exit()
+    # read training or validation metadata
+    if sys.argv[1] == '-t':
+        print 'Reading training metadata file...'
+        metadata_frame = pd.read_csv(TRAINING_METADATA_FILE)
+        OUTPUT_FILE = ['./data/training']
+    elif sys.argv[1] == '-v':
+        print 'Reading validation metadata file...'
+        metadata_frame = pd.read_csv(VALIDATION_METADATA_FILE)
+        OUTPUT_FILE = ['./data/validation']
+    else:
+        print 'Incorrect input. Enter -t for training data, or -v for validation data.'
+        exit()
+    # read phrase length
+    phrase_length = int(sys.argv[2])
+    if phrase_length < 1:
+        print 'Phrase length too small. Enter a number between 0 and 5.'
+        exit()
+    elif phrase_length > 5:
+        print 'Phrase length too big. Enter a number between 0 and 5.'
+        exit()
+    OUTPUT_FILE.append(str(phrase_length) + 'phrases.csv')
+    # define regex for bleeding target terms
+    bleeding_targets = [r"(?<!non)(?<!non )(?<!non-)(?<!re)(bleed(?!ing)|bleeding(?!\stime))",
+                        r"blood loss",
+                        r"blood per rectum",
+                        r"(?<!non-)(?<!non)(?<!non )bloody",
+                        r"brbpr", r"coffee[\- ](ground|grounds)",
+                        r"ecchymos[ie]s",
+                        r"epistaxis",
+                        r"exsanguination",
+                        r"\bl?gib\b",
+                        r"((\bg|gua?iac)([\-]|\s+)((pos(itive)?)|\+)|guaiac\(\+\))",
+                        r"(?<!splinter\s)hem{1,2}or{1,2}h{1,2}age?",
+                        r"hematem[a-z]+",
+                        r"hematochezia",
+                        r"hematoma",
+                        r"hematuria",
+                        r"hemoperitoneum",
+                        r"hemoptysis",
+                        r"hemothorax",
+                        r"hemopericardium",
+                        r"hemarthrosis",
+                        r"hemarthroses",
+                        r"hemearthrosis",
+                        r"sanguineous",
+                        r"haemorrhage",
+                        r"diffuse\balveolar\bhemorrhage",
+                        r"dah",
+                        r"epidural\bhematoma",
+                        r"edh",
+                        r"intracranial\bhemorrhage",
+                        r"intracranial\bhemhorrage"
+                        r"intracranial\bhemorrhage"
+                        r"ich",
+                        r"\bich", r"mel[ae]n(a|ic)",
+                        r"(ng|ngt)\s+lavage\s+((positive)|(pos)|\+)",
+                        r"((positive)|(pos)|\+) (ng|ngt) lavage",
+                        r"(fecal\s+occult(\s+blood)?|\bob|\bfob)\s+pos(itive)?",
+                        r"sah",
+                        r"sdh",
+                        r"(maroon|red)\s+(stool|bowel\s+movement|bm)",
+                        r"vomit[a-z]* blood",
+                        ]
+    # extract phrases from notes
+    OUTPUT_FILE = '_'.join(OUTPUT_FILE)
+    print 'Writing extracted phrases...'
+    extract_phrases(metadata_frame = metadata_frame,
+        phrase_length = phrase_length,
+        target_patterns = bleeding_targets,
+        output_file = OUTPUT_FILE)
+if __name__ == '__main__':
+    main()