"""generateTurkTasks.py: write turk task CSV files, one per batch of patients.

For every patient returned by ``get_supplemental_list`` the script loads the
patient record with ``get_patient_by_EMPI``, takes the ``(_, ef_value, note)``
triples produced by ``get_ef_value_notes``, converts each note to HTML with
the keyword matches highlighted, and writes the resulting rows to
``/home/ubuntu/www/turkTasks_<batch>.csv``.

All of the work happens at module level, so importing this file runs it.
"""
import csv
import itertools
import re

from loader import get_patient_by_EMPI
from extract_data import get_ef_value_notes
from shared_values import get_supplemental_list

# Raw string so that regex escapes such as \s and \. reach the re module
# untouched instead of relying on Python passing unknown escapes through.
keywords = [r'(?:ef|ejection fraction)\s*(?:of|is)?[:\s]*([0-9]*\.?[0-9]*)\s*%']

# Compiled once up front; the patterns do not change between notes.
_PATTERNS = [re.compile(keyword) for keyword in keywords]

# Number of consecutive patients written to each CSV file.
_BATCH_SIZE = 20
_OUTPUT_PREFIX = "/home/ubuntu/www/turkTasks_"
_CSV_HEADER = ['image1', 'guess', 'empi', 'note_id']


def _wrap_match(match):
    """Return the text of *match* wrapped in a highlight ``<span>``."""
    return "<span class='highlight'>" + match.group(0) + "</span>"


def _note_to_html(note):
    """Return *note* with CRLF turned into ``<br>`` and matches highlighted.

    Each pattern is applied in turn to the output of the previous one.
    """
    html = note.replace("\r\n", "<br>")
    for pattern in _PATTERNS:
        html = pattern.sub(_wrap_match, html)
    return html


def _rows_for_patient(patient):
    """Return the CSV rows ``(html_note, ef_value, patient, note_id)``.

    One row is produced for every entry that ``get_ef_value_notes`` yields
    for the patient's data.
    """
    patient_data = get_patient_by_EMPI(patient)
    rows = []
    for (_, ef_value, note) in get_ef_value_notes(patient_data):
        # The id is the fourth '|'-separated field on the note's second line;
        # it is read from the raw note, before any HTML conversion.
        note_id = note.split('\n')[1].split('|')[3]
        rows.append((_note_to_html(note), ef_value, patient, note_id))
    return rows


allpatients = get_supplemental_list()

# Consecutive indices share the same index // _BATCH_SIZE value, so groupby
# yields the patients in fixed-size runs without any prior sorting.
batches = itertools.groupby(enumerate(allpatients),
                            lambda item: item[0] // _BATCH_SIZE)
for key, patients in batches:
    filename = _OUTPUT_PREFIX + str(key) + ".csv"
    # Single-argument print(...) emits the same text as the print statement.
    print("Working on: " + filename)

    rows = []
    for (_, patient) in patients:
        print(patient)
        rows.extend(_rows_for_patient(patient))

    # NOTE: 'wb' is the Python 2 convention for csv output; under Python 3
    # the csv module expects a text-mode file opened with newline=''.
    with open(filename, 'wb') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(_CSV_HEADER)
        csvwriter.writerows(rows)