[2d4573]: / wrapper_functions / medspacy_functions.py

Download this file

193 lines (138 with data), 5.4 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
# pip install medspacy==0.2.0.0
import spacy
import medspacy
from medspacy.util import DEFAULT_PIPENAMES
from medspacy.custom_tokenizer import create_medspacy_tokenizer
import medspacy
from medspacy.section_detection import Sectionizer
from medspacy.section_detection import SectionRule
import warnings
warnings.filterwarnings("ignore")
def get_word_tokenization(text):
    """
    Tokenize a string into words with the medspaCy tokenizer.

    A blank English spaCy pipeline is created only to build the medspaCy
    tokenizer; no other pipeline component is run. Token texts are stripped
    of surrounding whitespace and tokens that are empty after stripping are
    dropped.

    :param text: a string
    :return: a list of strings, one per non-empty token

    example:
    >> get_word_tokenization("Admission Date: [**2573-5-30**] ")
    >> ['Admission', 'Date', ':', '[', '*', '*', '2573', '-', '5',...]
    """
    # Only this many leading characters of the input are echoed to the log,
    # so the "(truncated)" label below is accurate even for very long notes.
    preview_length = 100

    # logging
    print("Getting word tokenizer from medspaCy\n")
    nlp = spacy.blank("en")
    medspacy_tokenizer = create_medspacy_tokenizer(nlp)
    print(f"Input text (truncated): {text[:preview_length]}\n...")

    # Strip each token once, then filter out empty elements.
    stripped_tokens = (token.text.strip() for token in medspacy_tokenizer(text))
    return [token_text for token_text in stripped_tokens if token_text]
def get_section_detection(text, rules=None):
    """
    Split a clinical note into sections (medical history, allergies, comments and so on).

    A medspaCy pipeline is loaded and a sectionizer without the default rules
    is registered on it. The built-in patterns defined below, plus any
    caller-supplied rules, are added to that same sectionizer before the text
    is processed. Every detected section is printed and collected.

    :param text: a string
    :param rules: optional personalized rule(s); either one dictionary accepted by
        SectionRule.from_dict, i.e., {"category": "allergies", "literal": "Allergies:"},
        or a list/tuple of such dictionaries
    :return: a list of the section objects taken from doc._.sections

    Example usage (output is illustrative; one record is printed per section):
    >> text1 = "Past Medical History:\\npt has history of medical events\\nAllergies: apple, seafood"
    >> get_section_detection(text1)
    >>
    CATEGORY.............. past_medical_history
    TITLE................. Past Medical History:
    PARENT................ None
    SECTION TEXT..........
    pt has history of medical events
    ----------------------
    ...
    """
    pattern_dicts = [
        {"category": "past_medical_history", "literal": "Past Medical History:"},
        {"category": "allergies", "literal": "Allergies:"},
        {"category": "medical_assessment", "literal": "Medical Assessment:"},
        {"category": "comment", "literal": "Comments:",
         "parents": ["past_medical_history", "allergies"]},
    ]
    if rules is not None:
        # combine with personalized rule(s): a single dict or a sequence of dicts
        if isinstance(rules, dict):
            pattern_dicts.append(rules)
        else:
            pattern_dicts.extend(rules)
    patterns = [SectionRule.from_dict(pattern) for pattern in pattern_dicts]

    nlp = medspacy.load()
    # Keep the component that add_pipe returns and add the rules to it.
    # Building a separate Sectionizer object and then calling add_pipe creates
    # a second, unrelated component, so the rules above would never be applied.
    sectionizer = nlp.add_pipe("medspacy_sectionizer", config={"rules": None})
    sectionizer.add(patterns)

    doc = nlp(text)
    sections = []
    print("Getting section detection function from medspaCy\n")
    for section in doc._.sections:
        # Show the parent's category when there is a parent, otherwise the
        # (falsy) parent value itself, e.g. None.
        parent_label = section.parent.category if section.parent else section.parent
        print("CATEGORY.............. {0}".format(section.category))
        print("TITLE................. {0}".format(section.title_span))
        print("PARENT................ {0}".format(parent_label))
        print("SECTION TEXT..........\n{0}".format(section.body_span))
        print("----------------------")
        sections.append(section)
    return sections
def get_UMLS_match(text):
    """
    Find UMLS concepts mentioned in the input text.

    The medspaCy pipeline is loaded with its default components plus the
    'medspacy_quickumls' component, the text is run through it, and every
    entity found is printed and collected.

    :param text: a string
    :return: a list of tuples, (entity_text, label, similarity, semtypes)

    Example:
    >> concept_text = 'Decreased dipalmitoyllecithin content found in lung specimens'
    >> get_UMLS_match(concept_text)
    >>
    Entity text : dipalmitoyllecithin
    Label (UMLS CUI) : C0000039
    Similarity : 0.8888888888888888
    Semtypes : {'T119', 'T121'}
    """
    # Work on a copy so the shared default set of pipe names is not modified.
    enabled_pipes = DEFAULT_PIPENAMES.copy()
    if 'medspacy_quickumls' not in enabled_pipes:
        enabled_pipes.add('medspacy_quickumls')

    pipeline = medspacy.load(enable=enabled_pipes)
    parsed = pipeline(text)

    matches = []
    print("Getting UMLS matching from medspaCy\n")
    for entity in parsed.ents:
        surface = entity.text
        print(f'Entity text : {surface}')
        cui = entity.label_
        print(f'Label (UMLS CUI) : {cui}')
        similarity = entity._.similarity
        print(f'Similarity : {similarity}')
        semtypes = entity._.semtypes
        print(f'Semtypes : {semtypes}')
        matches.append((surface, cui, similarity, semtypes))
    return matches
# Example code: the statements below sit at module level, so they execute
# every time this file is run or imported.

# Demo 1: word tokenization of a clinical note excerpt (the text is cut off
# mid-word at the end).
demo_text1 = '''Admission Date: [**2573-5-30**] Discharge Date: [**2573-7-1**]
Date of Birth: [**2498-8-19**] Sex: F
Service: SURGERY
Allergies:
Hydrochlorothiazide
Attending:[**First Name3 (LF) 1893**]
Chief Complaint:
Abdominal pain
Major Surgical or Invasive Procedure:
PICC line [**6-25**]
ERCP w/ sphincterotomy [**5-31**]
History of Present Illness:
74y female with type 2 dm and a recent stroke affecting her
speech, who presents with 2 days of abdominal pain. Imaging sh'''
get_word_tokenization(demo_text1)
# Demo 2: section detection on a short note containing three section titles.
demo_text2 = '''Past Medical History:
pt has history of medical events
Comments: some comment here
Allergies: apple, seafood
peanuts
'''
get_section_detection(demo_text2)
# Demo 3: UMLS concept matching on a single sentence.
demo_text3 = 'Decreased dipalmitoyllecithin content found in lung specimens'
get_UMLS_match(demo_text3)