--- a
+++ b/src/extract.py
@@ -0,0 +1,182 @@
+import csv
+import pandas as pd
+import re
+import spacy
+import sys
+
+nlp = spacy.load('en')  # spacy 'en' pipeline, loaded once at import; extract_phrases uses it to split notes into sentences
+
+TRAINING_METADATA_FILE = './data/notes_metadata_trainingset.csv'  # read by main() when run with -t
+VALIDATION_METADATA_FILE = ''  # read by main() when run with -v; NOTE(review): empty placeholder, set a real path before using -v
+
+
+def extract_phrases(metadata_frame, phrase_length, target_patterns, output_file):
+	"""Extracts phrases around target terms indicating bleeding.
+
+	Extracts clinical notes for each unique mrn in the training or validation
+	metadata. Uses spacy sentence splitter to extract sentences among these
+	notes. If a term indicating a bleeding event is identified in any of these
+	sentences, extracts phrases from the clinical notes before and after the
+	term, specified by phrase_length.
+
+	Args:
+		metadata_frame: The expanded metadata frame containing clinical notes
+			for all mrns in the training or validation set.
+		phrase_length: The number of phrases to extract when a term indicating
+			a bleeding event is found in the clinical notes.
+			(e.g. if phrase_length = 5, this function extracts 2 phrases before
+			the phrase containing the term, the phrase containting the term,
+			and 2 phrases after the phrase containing the term)
+		targets_patterns: The regex for extracting terms indicating bleeding.
+		output_file: The name of the file where the training or validation data
+			will be written, with the extracted notes appended to it.
+
+	Returns:
+		None.
+
+	Raises:
+		None.
+	"""
+
+	output_fieldnames = ['mrn', 'note_date', 'note_ids', 'note', 'targets']
+	with open(output_file, 'w') as f:
+		writer = csv.writer(f)
+		writer.writerow(output_fieldnames)
+		count = 0
+		mrns = metadata_frame['mrn'].unique()
+		for mrn in mrns:
+			mrn_frame = metadata_frame[metadata_frame['mrn'] == mrn]
+			dates = mrn_frame['note_date'].unique()
+			for date in dates:
+				notes = list(mrn_frame[mrn_frame['note_date'] == date]['text'])
+				note_ids = list(mrn_frame[mrn_frame['note_date'] == date]['noteid'])
+				extracted_notes = []
+				all_targets = []
+				for note in notes:
+					if isinstance(note, str):
+						doc = nlp(unicode(note, 'utf-8'))
+						sentences = [str(i) for i in list(doc.sents)]
+						for i in range(len(sentences)):
+							targets = []
+							for target_pattern in target_patterns:
+								targets += re.findall(target_pattern, sentences[i])
+								all_targets += re.findall(target_pattern, sentences[i])
+							if len(targets) > 0:
+								# beginning index
+								beg = i - int(phrase_length/2)
+								if beg < 0:
+									beg = 0
+								# ending index
+								end = i + phrase_length - int(phrase_length/2)
+								if end > len(sentences):
+									end = len(sentences)
+								extracted_notes += sentences[beg:end]
+				extracted_notes = ' '.join(extracted_notes)
+				line = [mrn, date, note_ids, extracted_notes, all_targets]
+				writer.writerow(line)
+			count += 1
+			sys.stdout.write('\rCompleted %d of %d, %d%%' % (count, len(mrns), count*100/len(mrns)))
+			sys.stdout.flush()
+
+
+def main():
+	"""Main function.
+
+	Reads training or validation metadata, and number of phrases to extract
+	when a term indicating a bleeding event is encountered. Calls the function
+	to extract notes from the metadata.
+
+	Usage:
+	python extract.py [-t | -v] <phrase_length>
+
+	Args:
+		-t: Training.
+		-v: Validation.
+		phrase_length: The number of phrases to extract.
+
+	Example:
+	python extract.py -t 3
+	"""
+
+	# read CLI arguments
+	if len(sys.argv) < 3:
+		print 'Insufficient input.'
+		exit()
+
+	# read training or validation metadata
+	if sys.argv[1] == '-t':
+		print 'Reading training metadata file...'
+		metadata_frame = pd.read_csv(TRAINING_METADATA_FILE)
+		OUTPUT_FILE = ['./data/training']
+	elif sys.argv[1] == '-v':
+		print 'Reading validation metadata file...'
+		metadata_frame = pd.read_csv(VALIDATION_METADATA_FILE)
+		OUTPUT_FILE = ['./data/validation']
+	else:
+		print 'Incorrect input. Enter -t for training data, or -v for validation data.'
+		exit()
+
+	# read phrase length
+	phrase_length = int(sys.argv[2])
+	if phrase_length < 1:
+		print 'Phrase length too small. Enter a number between 0 and 5.'
+		exit()
+	elif phrase_length > 5:
+		print 'Phrase length too big. Enter a number between 0 and 5.'
+		exit()
+	OUTPUT_FILE.append(str(phrase_length) + 'phrases.csv')
+
+	# define regex for bleeding target terms
+	bleeding_targets = [r"(?<!non)(?<!non )(?<!non-)(?<!re)(bleed(?!ing)|bleeding(?!\stime))", 
+						r"blood loss", 
+						r"blood per rectum", 
+						r"(?<!non-)(?<!non)(?<!non )bloody", 
+						r"brbpr", r"coffee[\- ](ground|grounds)", 
+						r"ecchymos[ie]s", 
+						r"epistaxis", 
+						r"exsanguination", 
+						r"\bl?gib\b", 
+						r"((\bg|gua?iac)([\-]|\s+)((pos(itive)?)|\+)|guaiac\(\+\))", 
+						r"(?<!splinter\s)hem{1,2}or{1,2}h{1,2}age?", 
+						r"hematem[a-z]+", 
+						r"hematochezia", 
+						r"hematoma", 
+						r"hematuria", 
+						r"hemoperitoneum", 
+						r"hemoptysis",
+						r"hemothorax",
+						r"hemopericardium",
+						r"hemarthrosis",
+						r"hemarthroses",
+						r"hemearthrosis",
+						r"sanguineous",
+						r"haemorrhage",
+						r"diffuse\balveolar\bhemorrhage",
+						r"dah",
+						r"epidural\bhematoma",
+						r"edh",
+						r"intracranial\bhemorrhage",
+						r"intracranial\bhemhorrage"
+						r"intracranial\bhemorrhage"
+						r"ich",
+						r"\bich", r"mel[ae]n(a|ic)", 
+						r"(ng|ngt)\s+lavage\s+((positive)|(pos)|\+)", 
+						r"((positive)|(pos)|\+) (ng|ngt) lavage", 
+						r"(fecal\s+occult(\s+blood)?|\bob|\bfob)\s+pos(itive)?", 
+						r"sah", 
+						r"sdh", 
+						r"(maroon|red)\s+(stool|bowel\s+movement|bm)", 
+						r"vomit[a-z]* blood",
+						]
+
+	# extract phrases from notes
+	OUTPUT_FILE = '_'.join(OUTPUT_FILE)
+	print 'Writing extracted phrases...'
+	extract_phrases(metadata_frame = metadata_frame, 
+		phrase_length = phrase_length, 
+		target_patterns = bleeding_targets, 
+		output_file = OUTPUT_FILE)
+
+
+if __name__ == '__main__':  # run the CLI only when executed as a script, not on import
+	main()
\ No newline at end of file