Switch to unified view

a b/mimic-cxr/create_section_files.py
1
# This script extracts the impression, findings, last-paragraph and comparison
# sections from MIMIC-CXR reports.
# It writes them to a single CSV file, one row per JPG image of each study.
3
import json
4
import sys
5
import os
6
import argparse
7
import csv
8
from pathlib import Path
9
10
from tqdm import tqdm
11
12
# local folder import
13
import section_parser as sp
14
from local_config import PATH_TO_MIMIC_CXR
15
16
# Command-line interface. Defaults are derived from the locally configured
# PATH_TO_MIMIC_CXR root; main() reads these three options.
parser = argparse.ArgumentParser()
parser.add_argument('--reports_path',
                    default=f"{PATH_TO_MIMIC_CXR}/mimic-cxr-reports/files",
                    help=('Path to folder with radiology reports,'
                          ' e.g. /data/mimic-cxr/files'))
# The help text here used to be a copy of the --reports_path help; this
# option points at the JPG image tree, not at the reports.
parser.add_argument('--mimic_cxr_jpg_path',
                    default=f"{PATH_TO_MIMIC_CXR}/mimic-cxr-jpg/2.0.0/files",
                    help=('Path to folder with the JPG images,'
                          ' e.g. /data/mimic-cxr-jpg/2.0.0/files'))
parser.add_argument('--output_path',
                    default='reports_processed',
                    help='Path to folder for the output CSV file.')
28
29
30
def list_rindex(l, s):
    """Return the index of the *last* element of ``l`` that matches ``s``.

    The lookup is done on a reversed copy of ``l`` and the position found
    there is mapped back to an index into the original sequence. A
    ``ValueError`` from ``index`` propagates when ``s`` is not present.
    """
    mirrored = l[::-1]
    # Distance of the match from the end of the original sequence.
    hops_from_end = mirrored.index(s)
    return len(l) - 1 - hops_from_end
33
34
35
def main(args):
    """Section every MIMIC-CXR report and write one CSV row per JPG image.

    Walks ``reports_path`` (group folders ``pXX`` -> patient folders ``p*``
    -> study reports ``s*.txt``), splits each report with
    ``sp.section_text`` and, for every ``.jpg`` file found in the matching
    study folder under ``mimic_cxr_jpg_path``, records one row. All rows are
    written to ``<output_path>/mimic_cxr_sectioned.csv``.

    Each row holds: the study id, the text of the last occurrence of each of
    the impression / findings / last_paragraph / comparison sections (empty
    when the section is absent), the dicom id, the image file name, the
    image folder relative to the dataset root, and the report file name.

    Args:
        args: Command-line arguments (without the program name), parsed by
            the module-level ``parser``.

    Raises:
        FileNotFoundError: Propagated from ``os.listdir`` when a report has
            no matching study folder under ``mimic_cxr_jpg_path``.
    """
    args = parser.parse_args(args)

    reports_path = Path(args.reports_path)
    mimic_cxr_jpg_path = Path(args.mimic_cxr_jpg_path)
    output_path = Path(args.output_path)

    # parents/exist_ok: nested output folders work and there is no
    # check-then-create race.
    output_path.mkdir(parents=True, exist_ok=True)

    # not all reports can be automatically sectioned
    # we load in some dictionaries which have manually determined sections
    # NOTE(review): these rules are loaded but never applied below --
    # confirm whether custom_indices was meant to override section_text.
    custom_section_names, custom_indices = sp.custom_mimic_cxr_rules()

    # Sections exported for every study, in CSV column order.
    wanted_sections = ('impression', 'findings',
                       'last_paragraph', 'comparison')

    # get all higher up folders (p00, p01, etc)
    p_grp_folders = sorted(
        p for p in os.listdir(reports_path)
        if p.startswith('p') and len(p) == 3
    )

    # study_sections will have one row per (study, image) pair
    study_sections = []
    for p_grp in p_grp_folders:
        # get patient folders, usually around ~6k per group folder
        cxr_path = reports_path / p_grp
        p_folders = sorted(
            p for p in os.listdir(cxr_path) if p.startswith('p')
        )

        # For each patient in this grouping folder
        print(p_grp)
        for p in tqdm(p_folders):
            patient_path = cxr_path / p

            # get the filename for all their free-text reports; requiring
            # the .txt suffix keeps directories and stray files out, and
            # sorting makes the output order reproducible.
            studies = sorted(
                s for s in os.listdir(patient_path)
                if s.startswith('s') and s.endswith('.txt')
            )

            for s in studies:
                # get study string name without the txt extension
                s_stem = s[:-len('.txt')]

                # images of this study live under the same relative folder
                # in the JPG tree
                rel_study_folder = Path(p_grp) / p / s_stem
                img_path = mimic_cxr_jpg_path / rel_study_folder
                corr_dicom_ids = sorted(
                    d[:-len('.jpg')] for d in os.listdir(img_path)
                    if d.endswith('.jpg')
                )

                # load in the free-text report
                with open(patient_path / s, 'r') as fp:
                    text = fp.read()

                # split text into sections
                sections, section_names, section_idx = sp.section_text(
                    text
                )

                study_sectioned = [s_stem]
                for sn in wanted_sections:
                    if sn in section_names:
                        # a section name may repeat; keep its last occurrence
                        idx = list_rindex(section_names, sn)
                        study_sectioned.append(sections[idx].strip())
                    else:
                        study_sectioned.append(None)

                # append once per dicom_id
                img_folder = Path("files") / rel_study_folder
                for dicom_id in corr_dicom_ids:
                    study_sections.append(study_sectioned + [
                        dicom_id,
                        f"{dicom_id}.jpg",
                        img_folder,
                        f'{s_stem}.txt',
                    ])

    # write out a single CSV with the sections
    # newline='' is required by the csv module so that newlines embedded in
    # the report text are quoted correctly and no extra '\r' is emitted.
    with open(output_path / 'mimic_cxr_sectioned.csv', 'w',
              newline='') as fp:
        csvwriter = csv.writer(fp)
        # write header: one name per row value. The leading 'study' column
        # was previously missing, leaving the header one name short.
        csvwriter.writerow(['study', *wanted_sections, 'dicom_id',
                            'Img_Filename', 'Img_Folder', 'Note_file'])
        csvwriter.writerows(study_sections)
115
116
117
if __name__ == '__main__':
    # Script entry point: hand everything after the program name to main().
    cli_args = sys.argv[1:]
    main(cli_args)