# mimic-cxr/section_parser.py
import re
3
4
5
def section_text(text):
    """Split a radiology report into sections.

    Assumes text is in a radiology report format, e.g.:

        COMPARISON:  Chest radiograph dated XYZ.

        IMPRESSION:  ABC...

    A section header is a newline and a single space, followed by a run of
    capital letters (spaces and ``()/,-`` are also allowed), a colon and one
    whitespace character. Everything before the first header is returned as
    the 'preamble' section; a report with no header at all is returned as a
    single 'full report' section.

    Empty 'impression' / 'findings' sections are dropped. If neither of
    those sections remains, the last section is split at its first blank
    line and the remainder is returned as an extra 'last_paragraph' section.

    Returns a three element tuple:
        sections - list containing the text of each section
        section_names - a normalized version of the section name
        section_idx - list of start indices of the text in the section
    """
    p_section = re.compile(r'\n ([A-Z ()/,-]+):\s')

    sections = []
    section_names = []
    section_idx = []

    s = p_section.search(text)

    if s:
        # everything up to the first header name is the preamble
        sections.append(text[0:s.start(1)])
        section_names.append('preamble')
        section_idx.append(0)

        while s:
            current_section = s.group(1).lower()
            # get the start of the text for this section
            idx_start = s.end()
            # skip past the first newline to avoid some bad parses:
            # the next header is only looked for from the end of the
            # current line onwards
            idx_newline = text.find('\n', idx_start)
            search_from = idx_start if idx_newline == -1 else idx_newline

            s = p_section.search(text, search_from)
            idx_end = len(text) if s is None else s.start()

            sections.append(text[idx_start:idx_end])
            section_names.append(current_section)
            section_idx.append(idx_start)
    else:
        sections.append(text)
        section_names.append('full report')
        section_idx.append(0)

    section_names = normalize_section_names(section_names)

    # remove empty sections
    # this handles when the report starts with a finding-like statement
    #  .. but this statement is not a section, more like a report title
    #  e.g. p10/p10103318/s57408307
    #    CHEST, PA LATERAL:
    #
    #    INDICATION:   This is the actual section ....
    # it also helps when there are multiple findings sections
    # usually one is empty
    kept = [
        (sec, name, idx)
        for sec, name, idx in zip(sections, section_names, section_idx)
        if not (name in ('impression', 'findings') and sec.strip() == '')
    ]
    sections = [item[0] for item in kept]
    section_names = [item[1] for item in kept]
    section_idx = [item[2] for item in kept]

    if 'impression' not in section_names and 'findings' not in section_names:
        # create a new section for the final paragraph
        separator = '\n \n'
        head, found, tail = sections[-1].partition(separator)
        if found:
            sections[-1] = head
            sections.append(tail)
            section_names.append('last_paragraph')
            # the new section starts after the separator, so its offset
            # has to include the separator length as well as the head
            section_idx.append(section_idx[-1] + len(head) + len(separator))

    return sections, section_names, section_idx
89
90
91
# Exact (lower-cased, stripped) header -> canonical section name.
# The trailing numbers are carried over from the original table and look
# like occurrence counts of each header.
_FREQUENT_SECTIONS = {
    "preamble": "preamble",  # 227885
    "impression": "impression",  # 187759
    "comparison": "comparison",  # 154647
    "indication": "indication",  # 153730
    "findings": "findings",  # 149842
    "examination": "examination",  # 94094
    "technique": "technique",  # 81402
    "history": "history",  # 45624
    "comparisons": "comparison",  # 8686
    "clinical history": "history",  # 7121
    "reason for examination": "indication",  # 5845
    "notification": "notification",  # 5749
    "reason for exam": "indication",  # 4430
    "clinical information": "history",  # 4024
    "exam": "examination",  # 3907
    "clinical indication": "indication",  # 1945
    "conclusion": "impression",  # 1802
    "chest, two views": "findings",  # 1735
    "recommendation(s)": "recommendations",  # 1700
    "type of examination": "examination",  # 1678
    "reference exam": "comparison",  # 347
    "patient history": "history",  # 251
    "addendum": "addendum",  # 183
    "comparison exam": "comparison",  # 163
    "date": "date",  # 108
    "comment": "comment",  # 88
    "findings and impression": "impression",  # 87
    "wet read": "wet read",  # 83
    "comparison film": "comparison",  # 79
    "recommendations": "recommendations",  # 72
    "findings/impression": "impression",  # 47
    "pfi": "history",
    'recommendation': 'recommendations',
    'wetread': 'wet read',
    # NOTE(review): 'ndication' looks like a truncated "indication" but is
    # mapped to impression - confirm against the source report before
    # changing it.
    'ndication': 'impression',  # 1
    'impresson': 'impression',  # 2
    'imprression': 'impression',  # 1
    'imoression': 'impression',  # 1
    'impressoin': 'impression',  # 1
    'imprssion': 'impression',  # 1
    'impresion': 'impression',  # 1
    'imperssion': 'impression',  # 1
    'mpression': 'impression',  # 1
    'impession': 'impression',  # 3
    'findings/ impression': 'impression',  # 1
    'finding': 'findings',  # 8
    'findins': 'findings',
    'findindgs': 'findings',  # 1
    'findgings': 'findings',  # 1
    'findngs': 'findings',  # 1
    'findnings': 'findings',  # 1
    'finidngs': 'findings',  # 2
    'idication': 'indication',  # 1
    'reference findings': 'findings',  # 1
    'comparision': 'comparison',  # 2
    'comparsion': 'comparison',  # 1
    'comparrison': 'comparison',  # 1
    'comparisions': 'comparison',  # 1
}

# Headers that look like they describe the entire study (view / body part);
# these are treated as equivalent to findings.
_FINDINGS_PHRASES = [
    'chest',
    'portable',
    'pa and lateral',
    'lateral and pa',
    'ap and lateral',
    'lateral and ap',
    'frontal and',
    'two views',
    'frontal view',
    'pa view',
    'ap view',
    'one view',
    'lateral view',
    'bone window',
    'frontal upright',
    'frontal semi-upright',
    'ribs',
    'pa and lat',
]
_P_FINDINGS = re.compile('({})'.format('|'.join(_FINDINGS_PHRASES)))

# Canonical names matched as substrings, in priority order, so that similar
# phrasings (e.g. "final impression") are grouped together.
_MAIN_SECTIONS = (
    'impression', 'findings', 'history', 'comparison',
    'addendum',
)


def normalize_section_names(section_names):
    """Map raw section headers onto a small set of canonical names.

    Each name is lower-cased and stripped, then resolved by the first rule
    that applies:

    1. an exact match in the table of frequent headers and known typos;
    2. the first main section name ('impression', 'findings', 'history',
       'comparison', 'addendum', in that order) contained in the header;
    3. 'findings' if the header contains one of the study-description
       phrases (e.g. 'portable', 'pa and lateral');
    4. otherwise the lower-cased, stripped header itself.

    Args:
        section_names: iterable of header strings.

    Returns:
        A new list of normalized names, the same length and order as the
        input. The input is not modified.
    """
    normalized = []
    for raw_name in section_names:
        name = raw_name.lower().strip()

        if name in _FREQUENT_SECTIONS:
            normalized.append(_FREQUENT_SECTIONS[name])
            continue

        main_section = next((m for m in _MAIN_SECTIONS if m in name), None)
        if main_section is not None:
            normalized.append(main_section)
        elif _P_FINDINGS.search(name) is not None:
            normalized.append('findings')
        else:
            normalized.append(name)

    return normalized
205
206
207
def custom_mimic_cxr_rules():
    """Return the hand-maintained per-study overrides.

    Returns a two element tuple:
        custom_section_names - dict mapping a study identifier (e.g.
            's50913680') to a section name
        custom_indices - dict mapping a study identifier to a two element
            list of integers

    Both dicts are built fresh on every call. The trailing comment on each
    entry is the path of the report the entry refers to.

    NOTE(review): presumably the section name is the section to use for
    that study, and the integer pair is a [start, end] character range
    into the report text - confirm against the caller.
    """
    custom_section_names = {
        's50913680': 'recommendations',  # files/p11/p11851243/s50913680.txt
        's59363654': 'examination',  # files/p12/p12128253/s59363654.txt
        's59279892': 'technique',  # files/p13/p13150370/s59279892.txt
        's59768032': 'recommendations',  # files/p13/p13249077/s59768032.txt
        's57936451': 'indication',  # files/p14/p14325424/s57936451.txt
        's50058765': 'indication',  # files/p14/p14731346/s50058765.txt
        's53356173': 'examination',  # files/p15/p15898350/s53356173.txt
        's53202765': 'technique',  # files/p16/p16076182/s53202765.txt
        's50808053': 'technique',  # files/p16/p16631485/s50808053.txt
        's51966317': 'indication',  # files/p10/p10817099/s51966317.txt
        's50743547': 'examination',  # files/p11/p11388341/s50743547.txt
        's56451190': 'note',  # files/p11/p11842879/s56451190.txt
        's59067458': 'recommendations',  # files/p11/p11984647/s59067458.txt
        's59215320': 'examination',  # files/p12/p12408912/s59215320.txt
        's55124749': 'indication',  # files/p12/p12428492/s55124749.txt
        's54365831': 'indication',  # files/p13/p13876470/s54365831.txt
        's59087630': 'recommendations',  # files/p14/p14267880/s59087630.txt
        's58157373': 'recommendations',  # files/p15/p15032392/s58157373.txt
        's56482935': 'recommendations',  # files/p15/p15388421/s56482935.txt
        's58375018': 'recommendations',  # files/p15/p15505556/s58375018.txt
        's54654948': 'indication',  # files/p17/p17090359/s54654948.txt
        's55157853': 'examination',  # files/p18/p18975498/s55157853.txt
        's51491012': 'history',  # files/p19/p19314266/s51491012.txt
    }

    custom_indices = {
        's50525523': [201, 349],  # files/p10/p10602608/s50525523.txt
        's57564132': [233, 554],  # files/p10/p10637168/s57564132.txt
        's59982525': [313, 717],  # files/p11/p11989982/s59982525.txt
        's53488209': [149, 475],  # files/p12/p12458657/s53488209.txt
        's54875119': [234, 988],  # files/p13/p13687044/s54875119.txt
        's50196495': [59, 399],  # files/p13/p13894879/s50196495.txt
        's56579911': [59, 218],  # files/p15/p15394326/s56579911.txt
        's52648681': [292, 631],  # files/p15/p15666238/s52648681.txt
        's59889364': [172, 453],  # files/p15/p15835529/s59889364.txt
        's53514462': [73, 377],  # files/p16/p16297706/s53514462.txt
        's59505494': [59, 450],  # files/p16/p16730991/s59505494.txt
        's53182247': [59, 412],  # files/p16/p16770442/s53182247.txt
        's51410602': [47, 320],  # files/p17/p17069955/s51410602.txt
        's56412866': [522, 822],  # files/p17/p17612000/s56412866.txt
        's54986978': [59, 306],  # files/p17/p17912487/s54986978.txt
        's59003148': [262, 505],  # files/p17/p17916384/s59003148.txt
        's57150433': [61, 394],  # files/p18/p18335791/s57150433.txt
        's56760320': [219, 457],  # files/p18/p18418794/s56760320.txt
        's59562049': [158, 348],  # files/p18/p18502016/s59562049.txt
        's52674888': [145, 296],  # files/p19/p19381919/s52674888.txt
        's55258338': [192, 568],  # files/p13/p13719117/s55258338.txt
        's59330497': [140, 655],  # files/p15/p15479218/s59330497.txt
        's52119491': [179, 454],  # files/p17/p17959278/s52119491.txt
        # below have no findings at all in the entire report
        's58235663': [0, 0],  # files/p11/p11573679/s58235663.txt
        's50798377': [0, 0],  # files/p12/p12632853/s50798377.txt
        's54168089': [0, 0],  # files/p14/p14463099/s54168089.txt
        's53071062': [0, 0],  # files/p15/p15774521/s53071062.txt
        's56724958': [0, 0],  # files/p16/p16175671/s56724958.txt
        's54231141': [0, 0],  # files/p16/p16312859/s54231141.txt
        's53607029': [0, 0],  # files/p17/p17603668/s53607029.txt
        's52035334': [0, 0],  # files/p19/p19349312/s52035334.txt
    }

    return custom_section_names, custom_indices