|
a |
|
b/mimic-cxr/section_parser.py |
|
|
1 |
|
|
|
2 |
import re |
|
|
3 |
|
|
|
4 |
|
|
|
5 |
def section_text(text):
    """Split a radiology report into its sections.

    Assumes text is in a radiology report format, e.g.:

        COMPARISON: Chest radiograph dated XYZ.

        IMPRESSION: ABC...

    A section header is a newline, a single space, and then a run of
    capital letters / spaces / ``()/,-`` terminated by a colon and one
    whitespace character. Everything before the first header is returned
    as a section named 'preamble'. If no header is found at all, the whole
    text is returned as a single section named 'full report'.

    After splitting, section names are passed through
    ``normalize_section_names``, empty 'impression'/'findings' sections
    are dropped, and, when neither of those two sections remains, the last
    section is split at its first blank line (``'\\n \\n'``) into the
    original section and a trailing 'last_paragraph' section.

    Args:
        text: the full report as a single string.

    Returns a three element tuple:
        sections - list containing the text of each section
        section_names - a normalized version of the section name
        section_idx - list of start indices of the text in the section,
            i.e. ``text[section_idx[i]:]`` starts with ``sections[i]``
    """
    p_section = re.compile(
        r'\n ([A-Z ()/,-]+):\s', re.DOTALL)

    sections = []
    section_names = []
    section_idx = []

    s = p_section.search(text)

    if s:
        # everything up to the first header name is the preamble
        sections.append(text[0:s.start(1)])
        section_names.append('preamble')
        section_idx.append(0)

        while s:
            current_section = s.group(1).lower()
            # get the start of the text for this section
            idx_start = s.end()
            # skip past the first newline to avoid some bad parses;
            # if there is no further newline, search from the section start
            newline_at = text.find('\n', idx_start)
            search_from = idx_start if newline_at == -1 else newline_at

            s = p_section.search(text, search_from)

            # the section runs until the next header, or to the end of text
            idx_end = len(text) if s is None else s.start()

            sections.append(text[idx_start:idx_end])
            section_names.append(current_section)
            section_idx.append(idx_start)
    else:
        sections.append(text)
        section_names.append('full report')
        section_idx.append(0)

    section_names = normalize_section_names(section_names)

    # remove empty sections
    # this handles when the report starts with a finding-like statement
    #  .. but this statement is not a section, more like a report title
    #  e.g. p10/p10103318/s57408307
    #    CHEST, PA LATERAL:
    #
    #    INDICATION:   This is the actual section ....
    # it also helps when there are multiple findings sections
    # usually one is empty
    # (iterate backwards so that popping does not shift pending indices)
    for i in reversed(range(len(section_names))):
        if (section_names[i] in ('impression', 'findings')
                and sections[i].strip() == ''):
            sections.pop(i)
            section_names.pop(i)
            section_idx.pop(i)

    if 'impression' not in section_names and 'findings' not in section_names:
        # create a new section for the final paragraph: everything after
        # the first blank line of the last section
        separator = '\n \n'
        head, found, tail = sections[-1].partition(separator)
        if found:
            sections[-1] = head
            sections.append(tail)
            section_names.append('last_paragraph')
            # the paragraph text starts after the separator, so the
            # separator length must be included in the start index
            section_idx.append(section_idx[-1] + len(head) + len(separator))

    return sections, section_names, section_idx
|
|
89 |
|
|
|
90 |
|
|
|
91 |
# Exact (lower-cased, stripped) header text -> canonical section name.
# The trailing numbers are carried over from the original table and appear
# to be occurrence counts of each header -- TODO confirm.
_FREQUENT_SECTIONS = {
    "preamble": "preamble",  # 227885
    "impression": "impression",  # 187759
    "comparison": "comparison",  # 154647
    "indication": "indication",  # 153730
    "findings": "findings",  # 149842
    "examination": "examination",  # 94094
    "technique": "technique",  # 81402
    "history": "history",  # 45624
    "comparisons": "comparison",  # 8686
    "clinical history": "history",  # 7121
    "reason for examination": "indication",  # 5845
    "notification": "notification",  # 5749
    "reason for exam": "indication",  # 4430
    "clinical information": "history",  # 4024
    "exam": "examination",  # 3907
    "clinical indication": "indication",  # 1945
    "conclusion": "impression",  # 1802
    "chest, two views": "findings",  # 1735
    "recommendation(s)": "recommendations",  # 1700
    "type of examination": "examination",  # 1678
    "reference exam": "comparison",  # 347
    "patient history": "history",  # 251
    "addendum": "addendum",  # 183
    "comparison exam": "comparison",  # 163
    "date": "date",  # 108
    "comment": "comment",  # 88
    "findings and impression": "impression",  # 87
    "wet read": "wet read",  # 83
    "comparison film": "comparison",  # 79
    "recommendations": "recommendations",  # 72
    "findings/impression": "impression",  # 47
    "pfi": "history",
    'recommendation': 'recommendations',
    'wetread': 'wet read',
    # NOTE(review): 'ndication' looks like a typo of 'indication' but is
    # mapped to 'impression'; kept as-is -- confirm against the report.
    'ndication': 'impression',  # 1
    'impresson': 'impression',  # 2
    'imprression': 'impression',  # 1
    'imoression': 'impression',  # 1
    'impressoin': 'impression',  # 1
    'imprssion': 'impression',  # 1
    'impresion': 'impression',  # 1
    'imperssion': 'impression',  # 1
    'mpression': 'impression',  # 1
    'impession': 'impression',  # 3
    'findings/ impression': 'impression',  # 1
    'finding': 'findings',  # 8
    'findins': 'findings',
    'findindgs': 'findings',  # 1
    'findgings': 'findings',  # 1
    'findngs': 'findings',  # 1
    'findnings': 'findings',  # 1
    'finidngs': 'findings',  # 2
    'idication': 'indication',  # 1
    'reference findings': 'findings',  # 1
    'comparision': 'comparison',  # 2
    'comparsion': 'comparison',  # 1
    'comparrison': 'comparison',  # 1
    'comparisions': 'comparison',  # 1
}

# If a header looks like it is describing the entire study (view names,
# body part, ...) it is treated as equivalent to a findings section.
_FINDINGS_KEYWORDS = [
    'chest',
    'portable',
    'pa and lateral',
    'lateral and pa',
    'ap and lateral',
    'lateral and ap',
    'frontal and',
    'two views',
    'frontal view',
    'pa view',
    'ap view',
    'one view',
    'lateral view',
    'bone window',
    'frontal upright',
    'frontal semi-upright',
    'ribs',
    'pa and lat',
]
_FINDINGS_PATTERN = re.compile('({})'.format('|'.join(_FINDINGS_KEYWORDS)))

# Canonical names that a header is collapsed to when it merely *contains*
# one of them; checked in this order, first hit wins.
_MAIN_SECTIONS = ('impression', 'findings', 'history', 'comparison',
                  'addendum')


def normalize_section_names(section_names):
    """Map raw section header names onto a small canonical vocabulary.

    Each name is lower-cased and stripped, then resolved by the first rule
    that applies:

      1. exact lookup in the table of known headers and misspellings;
      2. substring match against the main section names ('impression',
         'findings', 'history', 'comparison', 'addendum'), in that order;
      3. a match against the view/body-part keywords, which yields
         'findings';
      4. otherwise the lower-cased, stripped name is kept unchanged.

    Args:
        section_names: iterable of header strings.

    Returns:
        A new list with one normalized name per input name; the input is
        not modified.
    """
    normalized = []
    for raw_name in section_names:
        # first, lower case all
        name = raw_name.lower().strip()

        if name in _FREQUENT_SECTIONS:
            normalized.append(_FREQUENT_SECTIONS[name])
            continue

        # group similar phrasings, e.g. any header containing 'impression'
        main_name = next((m for m in _MAIN_SECTIONS if m in name), None)
        if main_name is not None:
            normalized.append(main_name)
            continue

        if _FINDINGS_PATTERN.search(name) is not None:
            normalized.append('findings')
            continue

        normalized.append(name)

    return normalized
|
|
205 |
|
|
|
206 |
|
|
|
207 |
def custom_mimic_cxr_rules():
    """Return the hand-curated per-study overrides.

    Returns a two element tuple of freshly built dicts, both keyed by the
    study file stem (e.g. 's50913680'):
        custom_section_names - study stem -> section name to use
        custom_indices - study stem -> two-element [start, end] list;
            presumably character offsets into the report text, with
            [0, 0] for reports that have no findings at all -- confirm
            against the caller
    """
    section_name_overrides = {
        's50913680': 'recommendations',  # files/p11/p11851243/s50913680.txt
        's59363654': 'examination',  # files/p12/p12128253/s59363654.txt
        's59279892': 'technique',  # files/p13/p13150370/s59279892.txt
        's59768032': 'recommendations',  # files/p13/p13249077/s59768032.txt
        's57936451': 'indication',  # files/p14/p14325424/s57936451.txt
        's50058765': 'indication',  # files/p14/p14731346/s50058765.txt
        's53356173': 'examination',  # files/p15/p15898350/s53356173.txt
        's53202765': 'technique',  # files/p16/p16076182/s53202765.txt
        's50808053': 'technique',  # files/p16/p16631485/s50808053.txt
        's51966317': 'indication',  # files/p10/p10817099/s51966317.txt
        's50743547': 'examination',  # files/p11/p11388341/s50743547.txt
        's56451190': 'note',  # files/p11/p11842879/s56451190.txt
        's59067458': 'recommendations',  # files/p11/p11984647/s59067458.txt
        's59215320': 'examination',  # files/p12/p12408912/s59215320.txt
        's55124749': 'indication',  # files/p12/p12428492/s55124749.txt
        's54365831': 'indication',  # files/p13/p13876470/s54365831.txt
        's59087630': 'recommendations',  # files/p14/p14267880/s59087630.txt
        's58157373': 'recommendations',  # files/p15/p15032392/s58157373.txt
        's56482935': 'recommendations',  # files/p15/p15388421/s56482935.txt
        's58375018': 'recommendations',  # files/p15/p15505556/s58375018.txt
        's54654948': 'indication',  # files/p17/p17090359/s54654948.txt
        's55157853': 'examination',  # files/p18/p18975498/s55157853.txt
        's51491012': 'history',  # files/p19/p19314266/s51491012.txt
    }

    index_overrides = {
        's50525523': [201, 349],  # files/p10/p10602608/s50525523.txt
        's57564132': [233, 554],  # files/p10/p10637168/s57564132.txt
        's59982525': [313, 717],  # files/p11/p11989982/s59982525.txt
        's53488209': [149, 475],  # files/p12/p12458657/s53488209.txt
        's54875119': [234, 988],  # files/p13/p13687044/s54875119.txt
        's50196495': [59, 399],  # files/p13/p13894879/s50196495.txt
        's56579911': [59, 218],  # files/p15/p15394326/s56579911.txt
        's52648681': [292, 631],  # files/p15/p15666238/s52648681.txt
        's59889364': [172, 453],  # files/p15/p15835529/s59889364.txt
        's53514462': [73, 377],  # files/p16/p16297706/s53514462.txt
        's59505494': [59, 450],  # files/p16/p16730991/s59505494.txt
        's53182247': [59, 412],  # files/p16/p16770442/s53182247.txt
        's51410602': [47, 320],  # files/p17/p17069955/s51410602.txt
        's56412866': [522, 822],  # files/p17/p17612000/s56412866.txt
        's54986978': [59, 306],  # files/p17/p17912487/s54986978.txt
        's59003148': [262, 505],  # files/p17/p17916384/s59003148.txt
        's57150433': [61, 394],  # files/p18/p18335791/s57150433.txt
        's56760320': [219, 457],  # files/p18/p18418794/s56760320.txt
        's59562049': [158, 348],  # files/p18/p18502016/s59562049.txt
        's52674888': [145, 296],  # files/p19/p19381919/s52674888.txt
        's55258338': [192, 568],  # files/p13/p13719117/s55258338.txt
        's59330497': [140, 655],  # files/p15/p15479218/s59330497.txt
        's52119491': [179, 454],  # files/p17/p17959278/s52119491.txt
    }

    # below have no findings at all in the entire report
    studies_without_findings = (
        's58235663',  # files/p11/p11573679/s58235663.txt
        's50798377',  # files/p12/p12632853/s50798377.txt
        's54168089',  # files/p14/p14463099/s54168089.txt
        's53071062',  # files/p15/p15774521/s53071062.txt
        's56724958',  # files/p16/p16175671/s56724958.txt
        's54231141',  # files/p16/p16312859/s54231141.txt
        's53607029',  # files/p17/p17603668/s53607029.txt
        's52035334',  # files/p19/p19349312/s52035334.txt
    )
    # each study gets its own [0, 0] list, appended after the explicit ranges
    for study in studies_without_findings:
        index_overrides[study] = [0, 0]

    return section_name_overrides, index_overrides