EHRKit-2022 / Git / [2d4573] /ehrkit/ehrkit.py

Models:
philipB/
EHRKit-2022
Downloads: 1
[2d4573]: / ehrkit / ehrkit.py
History
Download this file
725 lines (607 with data), 33.2 kB

from datetime import date
import pymysql
#from sshtunnel import SSHTunnelForwarder
from ehrkit.classes import Patient, Disease, Diagnosis, Prescription, Procedure
from ehrkit.solr_lib import *
from datetime import datetime
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from gensim import corpora, models, similarities
from collections import defaultdict
import re
import sys
import os
import pprint
import string
import torch
import requests
from sklearn.feature_extraction.text import TfidfVectorizer

dir_path = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
sys.path.append(dir_path)
from scripts.train_word2vec import train_word2vec
from scripts.abb_extraction import output_abb


# TODO: adding external library
import torch
from transformers import AutoTokenizer, AutoModelWithLMHead

class ehr_db:
    """Connection object to Tangra MySQL Server.

    Attributes:
        cnx: pymysql connection object
        cur: pymysql cursor object
    """

    def __init__(self, sess):
        self.cnx = sess['cnx']
        self.cur = sess['cur']
        self.patients = {}
        self.note_event_flag = False


    def get_patients(self, n):
        """Retrieves n patient objects from the database, adds them to self.patients

        Note:
            Patient sorted by ROW_ID in database
        Note:
            If n == -1, returns all patients.
        Args:
            n (int): Number of patient objects to return
        Returns:
            none
        """
        if n == -1:
            self.cur.execute("SELECT SUBJECT_ID, GENDER, DOB, DOD FROM mimic.PATIENTS")
        else:
            self.cur.execute("SELECT SUBJECT_ID, GENDER, DOB, DOD FROM mimic.PATIENTS LIMIT %d" % n)
        raw = self.cur.fetchall()


        for p in raw:
            data = {}
            data["id"] = p[0]
            data["sex"] = p[1]
            data["dob"] = p[2]

            # QUESTION: why use %Y and not %y? %Y only holds last two digits of year. How to tell difference between 100yo patient and newborn?
            if data["dob"] != None and isinstance(data["dob"], str):
                data["dob"] = datetime.strptime(data["dob"][0:10], "%Y-%m-%d")

            data["dod"] = p[3]

            if data["dod"] != None and isinstance(data["dod"], str):
                data["dod"] = datetime.strptime(data["dod"][0:10], "%Y-%m-%d")

            data["alive"] = (data["dod"] == None)

            self.patients[data["id"]] = Patient(data)

    def count_patients(self):
        '''Counts and returns the number of patients as an int in the database.'''

        self.cur.execute("SELECT COUNT(*) FROM mimic.PATIENTS")
        raw = self.cur.fetchall()
        return int(raw[0][0])

    def count_docs(self, query, getAll = False, inverted = False):
        '''
        returns document count of tables
        query is a list of table names
        setting getAll to true returns count of all rows in all tables
        setting inverted = False returns count of rows in tables specified in *args
        setting inverted = True returns count of rows in all tables except those specified in *args
        '''
        table_count = self.cur.execute("SELECT TABLE_NAME, TABLE_ROWS from information_schema.tables where TABLE_SCHEMA = 'mimic' ")
        numtup = self.cur.fetchall()
        #numtup(nested tuple) structure: ((TABLE_NAME(str), TABLE_ROWS(int)),...)
        count = 0
        if getAll:
            for i in range(table_count):
                count = count + numtup[i][1]
            return count
        if inverted:
            for i in range(table_count):
                if numtup[i][0] in query:
                    continue
                count = count+numtup[i][1]
            return count
        for i in range(table_count):
            if numtup[i][0] in query:
                count = count+numtup[i][1]
        return count


    #is this redundant?

    def get_note_events(self):
        """
        adds note_events to patient objects in self.patients
        depends on get_patients(have to call it first to populate ehrdb with patients)
        return: None
        """
        #TODO: Currently only adds one NoteEvent
        for patient in self.patients.values():
            if patient.note_events is None:
                self.cur.execute("select ROW_ID, TEXT from mimic.NOTEEVENTS where SUBJECT_ID = %d" %patient.id)
                rawt = self.cur.fetchall()
                ls = []
                for p in rawt:
                    sent_list = sent_tokenize(p[1])
                    ls.append((p[0],sent_list))
                    patient.addNE(ls)
        self.note_event_flag = True

    def longest_NE(self):
        '''
        returns the longest note event in the patient dict
        '''
        #TODO: Currently only considers one NoteEvent per patient
        maxpid, maxlen = None, 0
        for patient in self.patients.values():
            for doc in patient.note_events:
                pid = patient.id
                rowid = doc[0]
                leng = len(doc[1])
                if leng>maxlen:
                    maxlen = leng
                    maxpid = pid
                    maxrowid = rowid
        return maxpid, maxrowid, maxlen

    def get_document(self, id):
        """Returns the text of a specific patient record given the ID (row ID in NOTEEVENTS).
        """
        text = ""
        self.cur.execute("select TEXT from mimic.NOTEEVENTS where ROW_ID = %d" % id)
        text = self.cur.fetchall()
        return text[0][0]

    def get_all_patient_document_ids(self, patientID):
        """Collects the ROW_ID of every NOTEEVENTS row whose SUBJECT_ID is patientID.

        The one-column result rows are flattened into a plain list of ids.
        """
        sql = "select ROW_ID from mimic.NOTEEVENTS where SUBJECT_ID = %d" % patientID
        self.cur.execute(sql)
        return flatten(self.cur.fetchall())

    def list_all_patient_ids(self):
        """Collects every SUBJECT_ID found in mimic.PATIENTS into a flat list.
        """
        self.cur.execute("select SUBJECT_ID from mimic.PATIENTS")
        id_rows = self.cur.fetchall()
        return flatten(id_rows)

    def list_all_document_ids(self):
        """Collects every ROW_ID found in mimic.NOTEEVENTS into a flat list.
        """
        self.cur.execute("select ROW_ID from mimic.NOTEEVENTS")
        id_rows = self.cur.fetchall()
        return flatten(id_rows)

    def get_document_sents(self, docID):

        """Returns list of sentences in a record.
        """
        self.cur.execute("select TEXT from mimic.NOTEEVENTS where ROW_ID = %d" % docID)
        raw = self.cur.fetchall()
        sent_list = sent_tokenize(raw[0][0])
        if not sent_list:
            print("No document text found.")
        return sent_list

    def get_abbreviations(self, doc_id):
        ''' Returns the distinct abbreviation candidates found in a document.

            A token counts as a candidate when it begins with two uppercase
            ASCII letters. The result is built from a set, so its order is not
            meaningful.
        '''
        two_capitals = re.compile(r'[A-Z]{2}')
        candidates = {
            word
            for sent in self.get_document_sents(doc_id)
            for word in word_tokenize(sent)
            if two_capitals.match(word)
        }
        return list(candidates)

    def get_abbreviation_sent_ids(self, doc_id):
        ''' Returns every abbreviation candidate in a document paired with the
            index of the sentence it occurs in: [(abbreviation, sent_id), ...]

            A token counts as a candidate when it begins with two uppercase
            ASCII letters. Repeated occurrences are kept.
        '''
        two_capitals = re.compile(r'[A-Z]{2}')
        sentences = self.get_document_sents(doc_id)
        return [
            (word, sent_id)
            for sent_id, sent in enumerate(sentences)
            for word in word_tokenize(sent)
            if two_capitals.match(word)
        ]


    def get_documents_d(self, date):
        """Returns a list of all document IDs recorded on date. Format of YYYY-MM-DD for date.

        Args:
            date: Value compared against NOTEEVENTS.CHARTDATE.
        Returns:
            Flattened ROW_IDs of the matching rows (empty when none match).
        """
        # The date is bound as a query parameter so the driver quotes it,
        # instead of being spliced into the SQL text.
        self.cur.execute("select ROW_ID from mimic.NOTEEVENTS where CHARTDATE = %s", (date,))
        ids = self.cur.fetchall()
        if not ids:
            print("No values returned. Note that date must be formatted YYYY-MM-DD.")
        return flatten(ids)

    def get_documents_q(self, query, n = -1):
        """Returns a list of the IDs of documents whose text contains query,
        e.g. "Service: SURGERY".

        Args:
            query (str): Substring to look for in NOTEEVENTS.TEXT.
            n (int): Maximum number of IDs to return; -1 searches without a limit.
        Returns:
            Flattened ROW_IDs of the matching rows (empty when none match).
        """
        # Wrap in SQL wildcards for a substring match; the pattern is bound as
        # a parameter so quotes inside query cannot break or alter the statement.
        pattern = "%" + query + "%"
        sql = "select ROW_ID from mimic.NOTEEVENTS where TEXT like %s"
        if n == -1:
            self.cur.execute(sql, (pattern,))
        else:
            self.cur.execute(sql + " limit %s", (pattern, int(n)))
        ids = self.cur.fetchall()
        if not ids:
            print("No values returned. Note that the query must be formatted such as Service: Surgery")
        return flatten(ids)

    def get_documents_icd9_alt(self,query):
        '''
        Looks up DIAGNOSES_ICD rows whose ICD9_CODE contains query (substring
        match) and resolves each code through the ICD-9 lookup object `tree`.

        returns: list of (ROW_ID, parent node of the matched code, that
                 node's description) tuples; empty list when nothing matches
        dependency: does not depend on calling get_patients

        NOTE(review): `tree` is not defined or imported by name in this file;
        presumably it is expected to come from the star import - confirm.
        NOTE(review): the query is formatted straight into the SQL text.
        '''
        # Surround with SQL wildcards so any code containing the text matches.
        query = "%"+str(query)+"%"
        self.cur.execute("select ROW_ID, ICD9_CODE from mimic.DIAGNOSES_ICD where ICD9_CODE like '%s'" %query)
        raws = self.cur.fetchall()
        docs = []
        for raw in raws:
            print(raw)#debug
            # NOTE(review): this condition is always true (a character cannot
            # equal both 'V' and 'E'), so the else branch never runs and every
            # code gets its dot after the third character. The intended rule
            # for V/E codes should be confirmed before changing it.
            if raw[1][0] != 'V' or raw[1][0] != 'E':
                modified = raw[1][0:3]+'.'+raw[1][3:]

            else:
                modified = raw[1][0:2]+'.'+raw[1][2:]
            print(modified)#debug
            # Uses the parent of the node found for the dotted code.
            rt = tree.find(modified).parent
            description = rt.description
            docs.append((raw[0],rt,description))


        if not docs:
            print("No values returned.")
        return docs

    def get_documents_icd9(self,code):
        '''
        returns: documents in DIAGNOSES_ICD given icd 9 Code query
        dependancy: does not depend on calling get_patients
        '''
        code = str(code)
        self.cur.execute("select ROW_ID from mimic.DIAGNOSES_ICD where ICD9_CODE = '%s'" % code)
        ids = self.cur.fetchall()
        if not ids:
            print("No values returned.")
            return None
        self.cur.execute("select SHORT_TITLE from mimic.D_ICD_DIAGNOSES where ICD9_CODE ='%s'" % code)

        d = {code: (flatten(self.cur.fetchall()), flatten(ids))}

        return d

    def get_prescription(self):
        """ TODO: NEEDS TO BE FIXED. CURRENTLY HAS IDs HARDCODED IN.
        """
        for patient in self.patients.values():
            self.cur.execute("select DRUG from mimic.PRESCRIPTIONS where ROW_ID = 2968759 or ROW_ID = 2968760")
            drugtuple = self.cur.fetchall()
            druglist = []
            for drug in drugtuple:
                druglist.append(drug[0])
            patient.addPrescriptions(druglist)

    def count_all_prescriptions(self):
        """ Tallies how often each DRUG value occurs in PRESCRIPTIONS.

            Returns a dictionary mapping each medicine to the number of rows
            naming it. Reads the whole table, so it takes a long time to run.
        """
        self.cur.execute("select DRUG from mimic.PRESCRIPTIONS")
        tally = {}
        for med in flatten(self.cur.fetchall()):
            tally[med] = tally.get(med, 0) + 1
        return tally

    def get_diagnoses(self):
        """Adds diagnoses (converted from ICD-9 code) from DIAGNOSES_ICD to patient.diagnoses for each patient in patients dictionary.
        """
        codes = []
        diags = {}
        for patient in self.patients.values():
            self.cur.execute("select ICD9_CODE from mimic.DIAGNOSES_ICD where SUBJECT_ID = %d" % patient.id)
            codes = self.cur.fetchall()
            for code in codes:
                if code not in diags:
                    self.cur.execute("select LONG_TITLE from mimic.D_ICD_DIAGNOSES where ICD9_CODE = \"%s\"" % code)
                    diags[code] = self.cur.fetchall()
                patient.diagnose(diags[code])

    def get_procedures(self):
        """Adds procedures (converted from ICD-9 code) from PROCEDURES_ICD to patient.procedures for each patient in patients dictionary.
        """
        codes = []
        procs = {}
        for patient in self.patients.values():
            self.cur.execute("select ICD9_CODE from mimic.PROCEDURES_ICD where SUBJECT_ID = %d" % patient.id)
            codes = self.cur.fetchall()
            for code in codes:
                if code not in procs:
                    self.cur.execute("select LONG_TITLE from mimic.D_ICD_PROCEDURES where ICD9_CODE = \"%s\"" % code)
                    procs[code] = self.cur.fetchall()
                patient.add_procedure(procs[code])

    def extract_patient_words(self, patientID):
        """Uses Gensim to extract all words relevant to a patient and writes these words to a file [patientID].txt.
        """

        # will hold all text to be processed by gensim
        text = []

        if patientID in self.patients:
            patient = self.patients[patientID]

            # Adds note_events to text
            if not patient.note_events:
                self.get_note_events()
            for doc in patient.note_events:
                text.extend(doc[1])

            # Adds prescriptions to text
            if not patient.prescriptions:
                self.get_prescription()
            text.extend(patient.prescriptions)

            # # Adds diagnoses to text
            # if not patient.diagnosis:
            #     self.get_diagnoses()
            # text.extend([diagnosis.name for diagnosis in patient.diagnosis])

            # # Adds procedures to text
            # if not patient.procedures:
            #     self.get_procedures()
            # text.extend([procedure.name for procedure in patient.procedures])

        ### Cleans the documents of punctuation ###
        text = [sent.translate(str.maketrans('', '', string.punctuation)) for sent in text]
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(text)
        names = vectorizer.get_feature_names()
        doc = 0
        feature_index = tfidf_matrix[doc,:].nonzero()[1]
        scores = zip(feature_index, [tfidf_matrix[doc, x] for x in feature_index])

        print("TEMPORARY OUTPUT FOR TASK T4.4")
        for w, s in [(names[i], s) for (i, s) in scores]:
            print(w, s)
        return scores
    def extract_key_words(self, text):
        """Counts the keywords of text and returns the 50 most frequent.

        Tokens are runs of word characters and hyphens, lower-cased. Common
        English words, purely numeric tokens and single-character tokens are
        ignored.

        Args:
            text (str): Text to analyse.
        Returns:
            list: Up to 50 (token, count) tuples, highest count first; tokens
            with equal counts keep their order of first appearance. Empty
            when text contains no keywords.
        """
        # Stop-word list (code from AAN Keyword Cloud), kept as
        # whitespace-separated text so it stays readable and cannot be broken
        # by line wrapping inside a quoted entry.
        common_words = set("""
            figure a able about above abroad according accordingly across actually adj after
            afterwards again against ago ahead ain't all allow allows almost alone along
            alongside already also although always am amid amidst among amongst an and another
            any anybody anyhow anyone anything anyway anyways anywhere apart appear appreciate
            appropriate are aren't around as a's aside ask asking associated at available away
            awfully b back backward backwards be became because become becomes becoming been
            before beforehand begin behind being believe below beside besides best better
            between beyond both brief but by c came can cannot cant can't caption cause causes
            certain certainly changes clearly c'mon co co. com come comes concerning
            consequently consider considering contain containing contains corresponding could
            couldn't course c's currently d dare daren't definitely described despite did
            didn't different directly do does doesn't doing done don't down downwards during e
            each edu eg eight eighty either else elsewhere end ending enough entirely
            especially et etc even ever evermore every everybody everyone everything
            everywhere ex exactly example except f fairly far farther few fewer fifth first
            five followed following follows for forever former formerly forth forward found
            four from further furthermore g get gets getting given gives go goes going gone got
            gotten greetings h had hadn't half happens hardly has hasn't have haven't having he
            he'd he'll hello help hence her here hereafter hereby herein here's hereupon hers
            herself he's hi him himself his hither hopefully how howbeit however hundred i i'd
            ie if ignored i'll i'm immediate in inasmuch inc inc. indeed indicate indicated
            indicates inner inside insofar instead into inward is isn't it it'd it'll its it's
            itself i've j just k keep keeps kept know known knows l last lately later latter
            latterly least less lest let let's like liked likely likewise little look looking
            looks low lower ltd m made mainly make makes many may maybe mayn't me mean meantime
            meanwhile merely might mightn't mine minus miss more moreover most mostly mr mrs
            much must mustn't my myself n name namely nd near nearly necessary need needn't
            needs neither never neverf neverless nevertheless new next nine ninety no nobody
            non none nonetheless noone no-one nor normally not nothing notwithstanding novel
            now nowhere o obviously of off often oh ok okay old on once one ones one's only
            onto opposite or other others otherwise ought oughtn't our ours ourselves out
            outside over overall own p particular particularly past per perhaps placed please
            plus possible presumably probably provided provides q que quite qv r rather rd re
            really reasonably recent recently regarding regardless regards relatively
            respectively right round s said same saw say saying says second secondly see
            seeing seem seemed seeming seems seen self selves sensible sent serious seriously
            seven several shall shan't she she'd she'll she's should shouldn't since six so
            some somebody someday somehow someone something sometime sometimes somewhat
            somewhere soon sorry specified specify specifying still sub such sup sure t take
            taken taking tell tends th than thank thanks thanx that that'll thats that's
            that've the their theirs them themselves then thence there thereafter thereby
            there'd therefore therein there'll there're theres there's thereupon there've
            these they they'd they'll they're they've thing things think third thirty this
            thorough thoroughly those though three through throughout thru thus till to
            together too took toward towards tried tries truly try trying t's twice two u un
            under underneath undoing unfortunately unless unlike unlikely until unto up upon
            upwards us use used useful uses using usually v value various versus very via viz
            vs w want wants was wasn't way we we'd welcome well we'll went were we're weren't
            we've what whatever what'll what's what've when whence whenever where whereafter
            whereas whereby wherein where's whereupon wherever whether which whichever while
            whilst whither who who'd whoever whole who'll whom whomever who's whose why will
            willing wish with within without wonder won't would wouldn't x y yes yet you you'd
            you'll your you're yours yourself yourselves you've z zero
        """.split())

        token_counts = {}
        for token in re.findall(r'[\w\-]+', text):
            token = token.lower()
            if token in common_words or token.isdigit() or len(token) == 1:
                continue
            token_counts[token] = token_counts.get(token, 0) + 1

        # Highest counts first, top 50 only. The sort is stable, so ties keep
        # first-appearance order. No font-size scaling is computed here: it was
        # never returned and divided by the top count, failing on empty input.
        return sorted(token_counts.items(), key=lambda item: item[1], reverse=True)[:50]


    def extract_phrases(self, docID):
        """Writes the text of one NOTEEVENTS row to the phrase-at-scale input
        file and launches the phrase generator through spark-submit.

        Args:
            docID (int): ROW_ID of the note to process.

        NOTE(review): the input file is written relative to this package's
        parent directory, while the spark-submit command refers to
        external/phrase-at-scale relative to the current working directory -
        confirm the expected working directory.
        """
        self.cur.execute("SELECT TEXT FROM mimic.NOTEEVENTS WHERE ROW_ID = %d" % docID)
        doc = self.cur.fetchall()
        # Resolve the text before opening the file so a missing row does not
        # leave a truncated input file behind.
        doc_text = doc[0][0]
        upperdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        raw_doc_path = os.path.join(upperdir, "external", "phrase-at-scale", "data", "raw_doc.txt")
        # The context manager closes the file even if the write fails.
        with open(raw_doc_path, "w+") as f:
            f.write(doc_text)

        cmd = '~/venv/lib/python3.6/site-packages/pyspark/bin/spark-submit --master local[200] --driver-memory 4G external/phrase-at-scale/phrase_generator.py'
        os.system(cmd)

    def output_note_events_file_by_patients(self, directory):
        '''
        input: file path like EHRKit/output/patients
        return: none
        output: Noteevents Text fields saved in EHRKit/output/patients/patient[SUBJECT_ID]/[ROW_ID].txt files
        '''

        #self.cur.execute('select SUBJECT_ID, count(ROW_ID) from mimic.NOTEEVENTS group by SUBJECT_ID having count(ROW_ID) > 10 limit 1')
        self.cur.execute('select SUBJECT_ID, count(ROW_ID) from (select SUBJECT_ID, ROW_ID from mimic.NOTEEVENTS limit 10000) as SMALLNE group by SUBJECT_ID having count(ROW_ID) > 10 limit 10')
        patients = self.cur.fetchall()
        print('Format: (Patient ID, Document count) \n', patients)
        for patient in patients:
            pid = patient[0]
            print('patient %d' %pid)
            self.cur.execute('select ROW_ID from (select SUBJECT_ID, ROW_ID from mimic.NOTEEVENTS limit 10000) as SMALLNE where SUBJECT_ID = %d' %pid)
            docids = self.cur.fetchall()
            for num,doctup in enumerate(docids, start = 1):
                docid = doctup[0]
                self.cur.execute('select TEXT from mimic.NOTEEVENTS where ROW_ID = %d' %docid)
                doctext = self.cur.fetchall()
                try:
                    os.makedirs(directory+'patient%d' %pid)
                    docpath = os.path.join(directory, 'patient%d' %pid)
                except FileExistsError:
                    docpath = os.path.join(directory, 'patient%d' %pid)
                with open(os.path.join(docpath, '%d.txt' %docid), 'w+') as f:
                    f.write(doctext[0][0])
                print('patient document %d saved' %docid)
        print('Done, please check EHRKit/Output/patients/ for files')

    def output_note_events_discharge_summary(self, directory):
        '''
        input: file path like EHRKit/output/
        return: none
        output: Noteevents Text fields saved in EHRKit/output/discharge_summary/[ROW_ID].txt files
        '''

        #self.cur.execute('select SUBJECT_ID, count(ROW_ID) from mimic.NOTEEVENTS group by SUBJECT_ID having count(ROW_ID) > 10 limit 1')
        self.cur.execute("select ROW_ID, TEXT from (select * from mimic.NOTEEVENTS limit 10000) as SMALLNE where CATEGORY = 'Discharge summary' limit 100")
        raw = self.cur.fetchall()
        for doc in raw:
            docid = doc[0]
            doctext = doc[1]
            print('Discharge Summary %d' %docid)
            try:
                os.makedirs(directory)
                docpath = directory
            except FileExistsError:
                docpath = directory
            with open(os.path.join(docpath, '%d.txt' %docid), 'w+') as f:
                f.write(doctext)
                print('discharge summary %d saved' %docid)
        print('Done, please check EHRKit/output/discharge_summary for files')

    def outputAbbreviation(self, directory):
        '''
        Placeholder: this method has no implementation yet and returns None
        without touching the database or the file system.

        Intended contract, as originally described:
        input: file path like EHRKit/output/
        return: none
        output: Noteevents text files containing abbreviation "AB" in e.g. EHRKit/output/AB/194442.txt
        '''


    def count_gender(self, gender):
        ''' Counts how many patients there are of a certain gender in the database.
            Argument gender must be a capitalized single-letter string.
        '''

        self.cur.execute('SELECT COUNT(*) FROM mimic.PATIENTS WHERE GENDER = \'%s\'' % gender)
        count = self.cur.fetchall()

        return count[0][0]

    def docs_with_phrase(self, phrase):
        ''' Writes document text containing phrase to files named with document IDs.
        '''

        self.cur.execute('SELECT ROW_ID, TEXT FROM mimic.NOTEEVENTS WHERE TEXT LIKE \'%%%s%%\' LIMIT 1' % phrase)
        docs = self.cur.fetchall()
        os.mkdir("docs_with_phrase_%s" % phrase)

    #TODO: bert tokenize
    def get_bert_tokenize(self, doc_id):
        """Tokenizes the text of a document with the 'bert-base-uncased' tokenizer.

        Args:
            doc_id (int): ROW_ID of the NOTEEVENTS row to tokenize.
        Returns:
            The token list produced by the tokenizer's tokenize() call.
        """
        # BertTokenizer is not imported by name at module level, so bring it
        # into scope here.
        from transformers import BertTokenizer

        text = self.get_document(doc_id)
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        return tokenizer.tokenize(text)

    # TODO: bart sumamrize test
    def summarize_huggingface(self, text, model_name):
        """Summarizes text with a Hugging Face sequence-generation model.

        Args:
            text (str): Text to summarize; input beyond 1024 tokens is truncated.
            model_name (str): Model identifier, e.g. 't5-small' or
                'organisation/model'. Tokenizer and model are both loaded for
                this identifier and cached under ../huggingface/<name>/.
        Returns:
            str: The generated sequences decoded and joined with spaces.
        """
        # For 'organisation/model' identifiers the cache folder is named after
        # the part following the first slash.
        if '/' in model_name:
            path = model_name.split('/')[1]
        else:
            path = model_name

        tokenizer_path = os.path.join(os.path.dirname(__file__), '..', 'huggingface', path, 'tokenizer')
        model_path = os.path.join(os.path.dirname(__file__), '..', 'huggingface', path, 'model')
        # Load the tokenizer that belongs to the requested model; it used to be
        # fixed to 't5-small' no matter which model was asked for.
        tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=tokenizer_path)
        model = AutoModelWithLMHead.from_pretrained(model_name, cache_dir=model_path)

        # truncation=True makes the 1024-token cap explicit.
        inputs = tokenizer([text], max_length=1024, truncation=True, return_tensors='pt')
        # early_stopping=True produces shorter summaries. Changing max_ and min_length doesn't change anything.
        summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=inputs['input_ids'].shape[1], early_stopping=False)
        pieces = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]
        return " ".join(pieces)

    def bert_predict_masked(self, doc_id, sentence_id, mask_id):
        """Masks one word of a document sentence and fills it with the top
        prediction of 'bert-large-uncased-whole-word-masking'.

        Args:
            doc_id (int): ROW_ID of the document.
            sentence_id (int): Index of the sentence within the document.
            mask_id (int): Index of the space-separated word to mask.
        Returns:
            str: The masked sentence with the mask token replaced by the
            decoded highest-scoring token.
        """
        model_id = "bert-large-uncased-whole-word-masking"
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelWithLMHead.from_pretrained(model_id)

        # Pick the requested sentence and swap the chosen word for the mask token.
        words = self.get_document_sents(doc_id)[sentence_id].split(' ')
        words[mask_id] = tokenizer.mask_token
        sequence = ' '.join(words)

        encoded = tokenizer.encode(sequence, return_tensors="pt")
        mask_positions = torch.where(encoded == tokenizer.mask_token_id)[1]

        logits = model(encoded)[0]
        mask_logits = logits[0, mask_positions, :]

        # Keep only the single best candidate for the first mask position.
        best_ids = torch.topk(mask_logits, 1, dim=1).indices[0].tolist()
        if not best_ids:
            return None
        predicted_word = tokenizer.decode([best_ids[0]])
        return sequence.replace(tokenizer.mask_token, predicted_word)

    def close_session(self):
        """Ends DB Session by closing SSH
        Tunnel and MySQL database connection.
        """
        #self.server.stop()
        self.cnx.close()

### ---------------- ###
### HELPER FUNCTIONS ###
### ---------------- ###

def start_session(db_user, db_pass, host='0.0.0.0', port=3306):
    """Opens a MySQL connection, creates a cursor for executing queries and
    selects the mimic database.

    Args:
        db_user (str): Username for the MySQL DB
        db_pass (str): Password for the MySQL DB
        host (str): Server address; defaults to the previously fixed '0.0.0.0'
        port (int): Server port; defaults to the previously fixed 3306

    Returns:
        ehr_db: Session object wrapping the pymysql connection and cursor.
    """
    cnx = pymysql.connect(host=host,
                          user=db_user,
                          password=db_pass,
                          port=port)
    # Session dictionary: MySQL connection (cnx) and DB cursor (cur).
    sess_dict = {'cnx': cnx, 'cur': cnx.cursor()}
    sess = ehr_db(sess_dict)

    # Make mimic the default schema for later statements.
    sess.cur.execute("use mimic")

    return sess

def createPatient(data):
    """Builds one Patient from a data dictionary, first enriching the
    dictionary in place with diagnosis, prescription and history entries.

    NOTE(review): getDiagnoses, getMeds and medicalHistory are not defined or
    imported by name in this file - confirm where they are expected to come from.

    Args:
        data (dict): Dictionary containing patient data; must hold "id"

    Returns:
        patient: Patient object
    """
    data["diagnosis"] = getDiagnoses(data["id"], current=True)
    data["current_prescriptions"] = getMeds(data["id"], current=True)

    # Copy the three history sections across under the same keys.
    history = medicalHistory(data["id"])
    for section in ("past_prescriptions", "past_diagnoses", "procedures"):
        data[section] = history[section]

    return Patient(data)

def flatten(lst):
    """Concatenates the items of every inner sequence into one new list.

    A falsy argument (empty sequence, None) is handed back unchanged.
    """
    if not lst:
        return lst
    flat = []
    for sublist in lst:
        flat.extend(sublist)
    return flat

def numbered_print(lst):
    """Prints each element of lst preceded by its 1-based position and a newline."""
    position = 0
    for elt in lst:
        position += 1
        print(position, '\n', elt)


def init_embedding_model():
    """Runs train_word2vec() from scripts.train_word2vec with no arguments.

    Returns None; whatever train_word2vec returns is discarded.
    """
    train_word2vec()

def get_abbs_sent_ids(text):
    ''' Returns every abbreviation candidate in text paired with the index of
        the sentence it occurs in: [(abbreviation, sent_id), ...]

        A token counts as a candidate when it begins with two uppercase ASCII
        letters. Repeated occurrences are kept.
    '''
    two_capitals = re.compile(r'[A-Z]{2}')
    return [
        (word, sent_id)
        for sent_id, sent in enumerate(sent_tokenize(text))
        for word in word_tokenize(sent)
        if two_capitals.match(word)
    ]
def post_single_dict_to_solr(d: dict, core: str) -> None:
    """POSTs d as a JSON document to the update endpoint of the named Solr core.

    The HTTP response is not inspected.
    """
    endpoint = 'http://tangra.cs.yale.edu:8983/solr/{}/update/json/docs'.format(core)
    requests.post(endpoint, json=d)

def abbs_disambiguate(ABB):
    """Returns the long forms that get_solr_response_umn_wrap reports for ABB.

    The accompanying long-form-to-score map is dropped.
    """
    solr_result = get_solr_response_umn_wrap(ABB)
    long_forms, _score_map = solr_result
    return long_forms

def get_documents_solr(query):
    """Returns, in ascending order, the ids that get_solr_response_mimic
    reports for query. Prints a notice when there are none; scores are dropped.
    """
    solr_result = get_solr_response_mimic(query)
    ids, _scores = solr_result
    if not ids:
        print("No documents found")
    ordered_ids = sorted(ids)
    return ordered_ids



### ------------------- ###
### Tangra DB Structure ###
### ------------------- ###

### DIAGNOSES_ICD Table ###
# Description: Stores ICD-9 Diagnosis Codes for patients
# Source: https://mimic.physionet.org/mimictables/diagnoses_icd/
# ATTRIBUTES:
# HADM_ID = unique ID for a hospital admission (possibly more than 1 per patient)
# SEQ_NUM = Order of priority for ICD diagnoses
# ICD9_CODE = ICD-9 code for patient diagnosis
# SUBJECT_ID = unique ID for each patient

### D_ICD_DIAGNOSES Table ###
# Description: Definition Table for ICD Diagnoses
# Source: https://mimic.physionet.org/mimictables/d_icd_diagnoses/
# ATTRIBUTES:
# SHORT_TITLE
# LONG_TITLE
# ICD9_CODE: FK on DIAGNOSES_ICD.ICD9_CODE

### D_ICD_PROCEDURES Table ###
# Description: Definition Table for ICD procedures
# Source: https://mimic.physionet.org/mimictables/d_icd_procedures/
# ATTRIBUTES:
# SHORT_TITLE
# LONG_TITLE
# ICD9_CODE: FK on PROCEDURES_ICD.ICD9_CODE

### NOTEEVENTS Table ###
# Description: Stores all notes for patients
# Source: https://mimic.physionet.org/mimictables/noteevents/
# ATTRIBUTES:
# SUBJECT_ID = unique ID for patient
# HADM_ID = unique hospital admission ID
# CHARTDATE = date on which the note was charted
# CATEGORY and DESCRIPTION: describe type of note
# CGID = unique ID for caregiver
# ISERROR = if 1, means physician identified note as erroneous
# TEXT = note text

### PATIENTS Table ###
# Description: Demographic chart data for all patients
# Source: https://mimic.physionet.org/mimictables/patients/
# ATTRIBUTES:
# SUBJECT_ID = unique ID for patient
# GENDER
# DOB
# DOD_HOSP: Date of death as recorded by hospital (null if alive)
# DOD_SSN: Date of death as recorded in social security DB. (null if alive)
# DOD_HOSP takes priority over DOD_SSN if both present
# EXPIRE_FLAG = 1 if patient dead

### PROCEDURES_ICD Table ###
# Description: Stores ICD-9 procedures for patients (similar to DIAGNOSES_ICD)
# Source: https://mimic.physionet.org/mimictables/procedures_icd/
# ATTRIBUTES:
# SUBJECT_ID = unique patient ID
# HADM_ID = unique hospital admission ID
# SEQ_NUM = order in which procedures were performed
# ICD9_CODE = ICD-9 code for procedure