EHRKit-2022 / Git / [2d4573] /wrapper_functions/medspacy

Models:
philipB/
EHRKit-2022
Downloads: 1
[2d4573]: / wrapper_functions / medspacy_functions.py
History
Download this file
193 lines (138 with data), 5.4 kB

# pip install medspacy==0.2.0.0


import spacy
import medspacy
from medspacy.util import DEFAULT_PIPENAMES
from medspacy.custom_tokenizer import create_medspacy_tokenizer
import medspacy
from medspacy.section_detection import Sectionizer
from medspacy.section_detection import SectionRule

import warnings
warnings.filterwarnings("ignore")



def get_word_tokenization(text):
    """
    returns a list of strings of tokenized word from a input string

    example:
    >> get_word_tokenization("'''Admission Date:  [**2573-5-30**] ")
    >> ['Admission', 'Date', ':', '[', '*', '*', '2573', '-', '5',...]
    """

    # logging
    print(f"Getting sentence tokenizer from medspaCy\n")

    nlp = spacy.blank("en")

    medspacy_tokenizer = create_medspacy_tokenizer(nlp)

    print(f"Input text (truncated): {text}\n...")
    results = list(medspacy_tokenizer(text))

    'filter out empty elements'
    tokenized = [element.text.strip() for element in results if len(element.text.strip()) > 0]

    return tokenized


def get_section_detection(text,rules=None):
    '''
    given a string as the input, extract sections, consisting of medical history, allergies, comments and so on
    :param text: a string
    :rules: the personalized rules, a dictionary of string, i.e., {"category": "allergies"}
    :return: a list of spacy Section object.


    Example usage:
    >> text1 = 'Past Medical History:
    pt has history of medical events
    Comments: some comment here

    Allergies: apple, seafood
    peanuts'

    >> get_section_detection(text1)
    >>
    CATEGORY.............. past_medical_history
    TITLE................. Past Medical History:
    PARENT................ None
    SECTION TEXT..........

    pt has history of medical events

    ----------------------
    CATEGORY.............. comments
    TITLE................. Comments:
    PARENT................ None
    SECTION TEXT..........
    some comment here


    ----------------------
    CATEGORY.............. allergies
    TITLE................. Allergies:
    PARENT................ None
    SECTION TEXT..........
    apple, seafood
    peanuts
    '''
    nlp = medspacy.load()
    sectionizer = Sectionizer(nlp, rules=None)
    pattern_dicts = [{"category": "past_medical_history", "literal": "Past Medical History:"},
                     {"category": "allergies", "literal": "Allergies:"},
                     {"category": "medical_assessment", "literal": "Medical Assessment:"},
                     {"category": "comment", "literal": "Comments:", "parents": ["past_medical_history", "allergies"]}]
    if rules is not None:
        # combine with personalized rule
        pattern_dicts.append(rules)
    patterns = [SectionRule.from_dict(pattern) for pattern in pattern_dicts]
    sectionizer.add(patterns)
    nlp.add_pipe("medspacy_sectionizer")
    doc = nlp(text)
    sections = []

    print(f"Getting section detection function from medspaCy\n")
    for section in doc._.sections:
        print("CATEGORY.............. {0}".format(section.category))
        print("TITLE................. {0}".format(section.title_span))
        if section.parent:
            print("PARENT................ {0}".format(section.parent.category))
        else:
            print("PARENT................ {0}".format(section.parent))
        print("SECTION TEXT..........\n{0}".format(section.body_span))
        print("----------------------")
        sections.append(section)

    return sections

def get_UMLS_match(text):

    '''
    Match the UMLS concept for the input text.
    :param text: a string
    :return: a list of tuples, (entity_text, label, similarity, semtypes)

    Example:
    >> concept_text = 'Decreased dipalmitoyllecithin content found in lung specimens'
    >> get_UMLS_match(concept_type)
    >>
    Entity text : dipalmitoyllecithin
    Label (UMLS CUI) : C0000039
    Similarity : 0.8888888888888888
    Semtypes : {'T119', 'T121'}
    '''

    medspacy_pipes = DEFAULT_PIPENAMES.copy()

    if 'medspacy_quickumls' not in medspacy_pipes:
        medspacy_pipes.add('medspacy_quickumls')


    nlp = medspacy.load(enable=medspacy_pipes)


    doc = nlp(text)

    umls = []

    print(f"Getting UMLS matching from medspaCy\n")
    for ent in doc.ents:
        print('Entity text : {}'.format(ent.text))
        print('Label (UMLS CUI) : {}'.format(ent.label_))
        print('Similarity : {}'.format(ent._.similarity))
        print('Semtypes : {}'.format(ent._.semtypes))
        umls.append((ent.text,ent.label_,ent._.similarity, ent._.semtypes))

    return umls



# example code
demo_text1 = '''Admission Date:  [**2573-5-30**]              Discharge Date:   [**2573-7-1**]

Date of Birth:  [**2498-8-19**]             Sex:   F

Service: SURGERY

Allergies:
Hydrochlorothiazide

Attending:[**First Name3 (LF) 1893**]
Chief Complaint:
Abdominal pain

Major Surgical or Invasive Procedure:
PICC line [**6-25**]
ERCP w/ sphincterotomy [**5-31**]


History of Present Illness:
74y female with type 2 dm and a recent stroke affecting her
speech, who presents with 2 days of abdominal pain. Imaging sh'''
get_word_tokenization(demo_text1)

demo_text2 = '''Past Medical History: 
pt has history of medical events
Comments: some comment here

Allergies: apple, seafood
peanuts
'''
get_section_detection(demo_text2)



demo_text3 = 'Decreased dipalmitoyllecithin content found in lung specimens'
get_UMLS_match(demo_text3)