# wrapper_functions/medspacy_functions.py
"""Convenience wrappers around medspaCy.

Provides three helpers that each build a medspaCy/spaCy pipeline, run it on
an input string, print a human-readable report and return the results:

* ``get_word_tokenization`` -- clinical-text word tokenization
* ``get_section_detection`` -- section (header/body) detection
* ``get_UMLS_match``        -- UMLS concept matching through QuickUMLS

Run the file as a script to execute the small demo at the bottom.
"""
# pip install medspacy==0.2.0.0

import warnings

import spacy
import medspacy
from medspacy.custom_tokenizer import create_medspacy_tokenizer
from medspacy.section_detection import SectionRule
from medspacy.section_detection import Sectionizer  # noqa: F401  (kept for importers of this module)
from medspacy.util import DEFAULT_PIPENAMES

warnings.filterwarnings("ignore")

# Number of leading characters of the input echoed by get_word_tokenization.
_PREVIEW_CHARS = 100


def get_word_tokenization(text):
    """
    Return the word tokens of ``text`` produced by the medspaCy tokenizer.

    Each token is stripped of surrounding whitespace and tokens that are
    empty after stripping are dropped.

    :param text: a string
    :return: a list of strings

    example:
    >> get_word_tokenization("'''Admission Date:  [**2573-5-30**] ")
    >> ['Admission', 'Date', ':', '[', '*', '*', '2573', '-', '5',...]
    """
    # logging
    print("Getting word tokenizer from medspaCy\n")

    nlp = spacy.blank("en")
    medspacy_tokenizer = create_medspacy_tokenizer(nlp)

    # Only echo the head of the input so long documents do not flood stdout.
    preview = text[:_PREVIEW_CHARS]
    print(f"Input text (truncated): {preview}\n...")

    # filter out empty elements
    stripped = (token.text.strip() for token in medspacy_tokenizer(text))
    return [token_text for token_text in stripped if token_text]


def _build_section_patterns(rules=None):
    """Return the built-in section pattern dicts, extended with ``rules``.

    ``rules`` may be ``None``, a single pattern dict, or a list/tuple of
    pattern dicts. A fresh list is built on every call so callers can never
    mutate shared state.
    """
    pattern_dicts = [
        {"category": "past_medical_history", "literal": "Past Medical History:"},
        {"category": "allergies", "literal": "Allergies:"},
        {"category": "medical_assessment", "literal": "Medical Assessment:"},
        {
            "category": "comment",
            "literal": "Comments:",
            "parents": ["past_medical_history", "allergies"],
        },
    ]
    if rules is None:
        return pattern_dicts
    if isinstance(rules, dict):
        pattern_dicts.append(rules)
    else:
        pattern_dicts.extend(rules)
    return pattern_dicts


def get_section_detection(text, rules=None):
    """
    Split ``text`` into sections (medical history, allergies, comments, ...).

    The sectionizer is added to the pipeline with no pre-loaded rules, so
    only the four patterns hard-coded in this module plus the caller's
    ``rules`` are used for matching. Every detected section is also printed.

    :param text: a string
    :param rules: optional personalized rule(s) in the dict form accepted by
        ``SectionRule.from_dict``, e.g.
        ``{"category": "plan", "literal": "Plan:"}``; either a single dict or
        a list/tuple of such dicts
    :return: a list of the section objects found in ``doc._.sections``

    Example usage:
    >> text1 = '''Past Medical History:
    pt has history of medical events
    Comments: some comment here

    Allergies: apple, seafood
    peanuts'''

    >> get_section_detection(text1)

    One report of the following shape is printed per detected section
    (illustrative; actual values depend on the matched rules):

        CATEGORY.............. past_medical_history
        TITLE................. Past Medical History:
        PARENT................ None
        SECTION TEXT..........
        pt has history of medical events
        ----------------------
    """
    nlp = medspacy.load()

    # The rules must be added to the component that lives in the pipeline.
    # add_pipe() creates its own instance, so a Sectionizer constructed
    # separately would never be consulted when nlp(text) runs.
    sectionizer = nlp.add_pipe("medspacy_sectionizer", config={"rules": None})
    sectionizer.add(
        [SectionRule.from_dict(pattern) for pattern in _build_section_patterns(rules)]
    )

    doc = nlp(text)
    sections = list(doc._.sections)

    print("Getting section detection function from medspaCy\n")
    for section in sections:
        # Top-level sections have no parent; print the falsy value itself then.
        parent = section.parent.category if section.parent else section.parent
        print("CATEGORY.............. {0}".format(section.category))
        print("TITLE................. {0}".format(section.title_span))
        print("PARENT................ {0}".format(parent))
        print("SECTION TEXT..........\n{0}".format(section.body_span))
        print("----------------------")

    return sections


def get_UMLS_match(text):
    """
    Match the UMLS concept for the input text.

    Loads medspaCy with its default pipes plus ``medspacy_quickumls`` and
    reports every entity found in ``text``.

    :param text: a string
    :return: a list of tuples, (entity_text, label, similarity, semtypes)

    Example:
    >> concept_text = 'Decreased dipalmitoyllecithin content found in lung specimens'
    >> get_UMLS_match(concept_text)
    >>
    Entity text : dipalmitoyllecithin
    Label (UMLS CUI) : C0000039
    Similarity : 0.8888888888888888
    Semtypes : {'T119', 'T121'}
    """
    # Work on a copy so the library-level default set is never modified.
    medspacy_pipes = DEFAULT_PIPENAMES.copy()
    if 'medspacy_quickumls' not in medspacy_pipes:
        medspacy_pipes.add('medspacy_quickumls')

    nlp = medspacy.load(enable=medspacy_pipes)
    doc = nlp(text)

    print("Getting UMLS matching from medspaCy\n")
    umls = []
    for ent in doc.ents:
        match = (ent.text, ent.label_, ent._.similarity, ent._.semtypes)
        print('Entity text : {}'.format(match[0]))
        print('Label (UMLS CUI) : {}'.format(match[1]))
        print('Similarity : {}'.format(match[2]))
        print('Semtypes : {}'.format(match[3]))
        umls.append(match)

    return umls


# example code
demo_text1 = '''Admission Date: [**2573-5-30**] Discharge Date: [**2573-7-1**]

Date of Birth: [**2498-8-19**] Sex: F

Service: SURGERY

Allergies:
Hydrochlorothiazide

Attending:[**First Name3 (LF) 1893**]
Chief Complaint:
Abdominal pain

Major Surgical or Invasive Procedure:
PICC line [**6-25**]
ERCP w/ sphincterotomy [**5-31**]


History of Present Illness:
74y female with type 2 dm and a recent stroke affecting her
speech, who presents with 2 days of abdominal pain. Imaging sh'''

demo_text2 = '''Past Medical History:
pt has history of medical events
Comments: some comment here

Allergies: apple, seafood
peanuts
'''

demo_text3 = 'Decreased dipalmitoyllecithin content found in lung specimens'


def _run_demo():
    """Run each wrapper once on its demo text."""
    get_word_tokenization(demo_text1)
    get_section_detection(demo_text2)
    get_UMLS_match(demo_text3)


# Only run the demo when executed as a script, so that importing the wrappers
# does not load three NLP pipelines (one of which needs QuickUMLS resources).
if __name__ == "__main__":
    _run_demo()