bleeding-detector / Git / [b64007] /src/extract.py

Models:
DavidFeaster/
bleeding-detector
Downloads: 1
[b64007]: / src / extract.py
History
Download this file
182 lines (160 with data), 5.8 kB

import csv
import pandas as pd
import re
import spacy
import sys

# Shared spaCy pipeline, loaded once when this module is imported;
# extract_phrases() uses it to split clinical notes into sentences.
# NOTE(review): 'en' is a shortcut model name -- confirm that the installed
# spaCy version still resolves it.
nlp = spacy.load('en')

# Metadata CSV that main() reads when the script is run with -t.
TRAINING_METADATA_FILE = './data/notes_metadata_trainingset.csv'
# Metadata CSV that main() reads when the script is run with -v.
# Currently empty, so it has to be filled in before -v can be used.
VALIDATION_METADATA_FILE = ''


def extract_phrases(metadata_frame, phrase_length, target_patterns, output_file):
	"""Write windows of sentences surrounding bleeding terms to a CSV file.

	The metadata is processed per mrn and, within each mrn, per note_date.
	Every note that is a string is split into sentences with the module-level
	spaCy pipeline. Whenever a sentence matches at least one target pattern,
	a window of sentences around it is collected. One row is written for each
	(mrn, note_date) pair, even when nothing matched, and a progress line is
	printed to stdout after each mrn.

	Args:
		metadata_frame: DataFrame providing the 'mrn', 'note_date', 'noteid'
			and 'text' columns.
		phrase_length: Number of sentences in each window. The window starts
			int(phrase_length/2) sentences before the matching sentence and
			spans phrase_length sentences, clipped to the note boundaries
			(e.g. phrase_length = 5 gives 2 sentences before the match, the
			match itself, and 2 sentences after it).
		target_patterns: Iterable of regex patterns identifying terms that
			indicate bleeding.
		output_file: Path of the CSV file to write. It is opened in write
			mode, so an existing file is overwritten.

	Returns:
		None.

	Raises:
		Nothing explicitly.
	"""

	header = ['mrn', 'note_date', 'note_ids', 'note', 'targets']
	with open(output_file, 'w') as out:
		rows = csv.writer(out)
		rows.writerow(header)
		unique_mrns = metadata_frame['mrn'].unique()
		total = len(unique_mrns)
		for done, mrn in enumerate(unique_mrns, 1):
			patient_notes = metadata_frame[metadata_frame['mrn'] == mrn]
			for date in patient_notes['note_date'].unique():
				same_day = patient_notes[patient_notes['note_date'] == date]
				notes = list(same_day['text'])
				note_ids = list(same_day['noteid'])
				window_sentences = []
				all_targets = []
				for note in notes:
					# missing notes are not strings; skip them
					if not isinstance(note, str):
						continue
					doc = nlp(unicode(note, 'utf-8'))
					sentences = [str(sent) for sent in doc.sents]
					for idx, sentence in enumerate(sentences):
						hits = []
						for target_pattern in target_patterns:
							hits += re.findall(target_pattern, sentence)
						all_targets += hits
						if not hits:
							continue
						# windows of neighbouring matches are not merged, so a
						# sentence can be collected more than once
						half = int(phrase_length/2)
						start = max(idx - half, 0)
						stop = min(idx + phrase_length - half, len(sentences))
						window_sentences += sentences[start:stop]
				rows.writerow([mrn, date, note_ids, ' '.join(window_sentences), all_targets])
			sys.stdout.write('\rCompleted %d of %d, %d%%' % (done, total, done*100/total))
			sys.stdout.flush()


def _bleeding_target_patterns():
	"""Returns the regex patterns for terms indicating a bleeding event."""
	return [
		r"(?<!non)(?<!non )(?<!non-)(?<!re)(bleed(?!ing)|bleeding(?!\stime))",
		r"blood loss",
		r"blood per rectum",
		r"(?<!non-)(?<!non)(?<!non )bloody",
		r"brbpr",
		r"coffee[\- ](ground|grounds)",
		r"ecchymos[ie]s",
		r"epistaxis",
		r"exsanguination",
		r"\bl?gib\b",
		r"((\bg|gua?iac)([\-]|\s+)((pos(itive)?)|\+)|guaiac\(\+\))",
		r"(?<!splinter\s)hem{1,2}or{1,2}h{1,2}age?",
		r"hematem[a-z]+",
		r"hematochezia",
		r"hematoma",
		r"hematuria",
		r"hemoperitoneum",
		r"hemoptysis",
		r"hemothorax",
		r"hemopericardium",
		r"hemarthrosis",
		r"hemarthroses",
		r"hemearthrosis",
		r"sanguineous",
		r"haemorrhage",
		# Multi-word terms are joined with \s+ rather than \b: a word boundary
		# can never occur between two letters, so the \b form never matched.
		r"diffuse\s+alveolar\s+hemorrhage",
		r"dah",
		r"epidural\s+hematoma",
		r"edh",
		r"intracranial\s+hemorrhage",
		# common misspelling, not covered by the generic hemorrhage pattern
		r"intracranial\s+hemhorrage",
		# Only the word-initial form of "ich" is kept; an unanchored "ich"
		# would also match inside ordinary words such as "which".
		r"\bich",
		r"mel[ae]n(a|ic)",
		r"(ng|ngt)\s+lavage\s+((positive)|(pos)|\+)",
		r"((positive)|(pos)|\+) (ng|ngt) lavage",
		r"(fecal\s+occult(\s+blood)?|\bob|\bfob)\s+pos(itive)?",
		r"sah",
		r"sdh",
		r"(maroon|red)\s+(stool|bowel\s+movement|bm)",
		r"vomit[a-z]* blood",
	]


def main():
	"""Main function.

	Validates the command line arguments, reads the training or validation
	metadata, and calls extract_phrases() to write the extracted notes to
	./data/training_<phrase_length>phrases.csv or
	./data/validation_<phrase_length>phrases.csv.

	All arguments are validated before the metadata file is read, so a bad
	invocation fails immediately. Every rejected invocation prints a message
	and exits with status 1.

	Usage:
	python extract.py [-t | -v] <phrase_length>

	Args:
		-t: Training.
		-v: Validation.
		phrase_length: The number of phrases to extract, from 1 to 5.

	Example:
	python extract.py -t 3
	"""

	# read CLI arguments
	if len(sys.argv) < 3:
		print('Insufficient input.')
		sys.exit(1)

	mode = sys.argv[1]
	if mode not in ('-t', '-v'):
		print('Incorrect input. Enter -t for training data, or -v for validation data.')
		sys.exit(1)

	# read phrase length
	try:
		phrase_length = int(sys.argv[2])
	except ValueError:
		print('Phrase length must be an integer between 1 and 5.')
		sys.exit(1)
	if phrase_length < 1:
		print('Phrase length too small. Enter a number between 1 and 5.')
		sys.exit(1)
	elif phrase_length > 5:
		print('Phrase length too big. Enter a number between 1 and 5.')
		sys.exit(1)

	# read training or validation metadata
	if mode == '-t':
		print('Reading training metadata file...')
		metadata_file = TRAINING_METADATA_FILE
		output_prefix = './data/training'
	else:
		if not VALIDATION_METADATA_FILE:
			print('Validation metadata file is not configured. Set VALIDATION_METADATA_FILE.')
			sys.exit(1)
		print('Reading validation metadata file...')
		metadata_file = VALIDATION_METADATA_FILE
		output_prefix = './data/validation'
	metadata_frame = pd.read_csv(metadata_file)

	# extract phrases from notes
	output_file = '%s_%dphrases.csv' % (output_prefix, phrase_length)
	print('Writing extracted phrases...')
	extract_phrases(metadata_frame = metadata_frame,
		phrase_length = phrase_length,
		target_patterns = _bleeding_target_patterns(),
		output_file = output_file)


# Run the extraction only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
	main()