Diff of /validate.py [000000] .. [27805f]

Switch to unified view

a b/validate.py
1
# import pydicom
2
# import numpy as np
3
# from PIL import Image
4
#
5
# tolerance = 1
6
#
7
# # Load the DICOM file
8
# dicom = pydicom.dcmread(r"/home/ubuntu/nlp_project/Code/physionet.org/files/mimic-cxr/2.1.0/files/p10/p10000032/s50414267/02aa804e-bde0afdd-112c0b34-7bc16630-4e384014.dcm")
9
# dicom_pixels = dicom.pixel_array
10
#
11
# # Load the PNG file
12
# png_image = Image.open(r"/home/ubuntu/nlp_project/Code/physionet.org/files/mimic-cxr/2.1.0/files/p10/p10000032/s50414267/out_png/02aa804e-bde0afdd-112c0b34-7bc16630-4e384014.png")
13
# png_pixels = np.array(png_image)
14
#
15
# # Compare shapes
16
# assert dicom_pixels.shape == png_pixels.shape, "Shapes of DICOM and PNG do not match"
17
#
18
# # Compare pixel values
19
# difference = np.abs(dicom_pixels - png_pixels)
20
#
21
# dicom_non_zero_elements = dicom_pixels[dicom_pixels != 0]
22
# dicom_non_zero_count = len(dicom_non_zero_elements)
23
#
24
# # Get non-zero elements in PNG array
25
# png_non_zero_elements = png_pixels[png_pixels != 0]
26
# png_non_zero_count = len(png_non_zero_elements)
27
#
28
# print(f"Non-zero elements in DICOM: {dicom_non_zero_count}")
29
# print(f"Non-zero elements in PNG: {png_non_zero_count}")
30
#
31
# # If needed, print the actual non-zero elements (be cautious for large arrays)
32
# print(f"Non-zero elements in DICOM array:\n{dicom_non_zero_elements}")
33
# print(f"Non-zero elements in PNG array:\n{png_non_zero_elements}")
34
#
35
#
36
# print(f"First 10 non-zero elements in DICOM array:\n{dicom_non_zero_elements[:10]}")
37
# print(f"First 10 non-zero elements in PNG array:\n{png_non_zero_elements[:10]}")
38
#
39
#
40
# # assert np.all(difference < tolerance), f"Pixel difference exceeds tolerance: {difference.max()}"
41
42
43
import os
44
import pandas as pd
45
from tqdm import tqdm
46
import pydicom
47
import numpy as np
48
from PIL import Image
49
50
51
# Function to convert DICOM to PNG
52
def convert_dicom_to_png(dicom_path, output_path):
    """Convert a DICOM image to an 8-bit PNG.

    The pixel data is min-max scaled to the 0-255 range and written to
    ``output_path``. Any failure is reported on stdout and swallowed, so
    the function always returns ``None`` and never raises.

    Args:
        dicom_path: Path of the DICOM file to read.
        output_path: Path of the PNG file to write.
    """
    try:
        dicom = pydicom.dcmread(dicom_path)

        # Work in float64: subtracting the minimum in the stored integer
        # dtype can wrap around when the pixel data is signed.
        pixels = dicom.pixel_array.astype(np.float64)
        lowest = pixels.min()
        value_range = pixels.max() - lowest

        if value_range == 0:
            # Flat image: scaling would compute 0/0 and cast NaN to uint8.
            # Emit an all-black image of the same shape instead.
            scaled = np.zeros(pixels.shape, dtype=np.uint8)
        else:
            scaled = ((pixels - lowest) / value_range * 255).astype(np.uint8)

        # NOTE(review): the photometric interpretation of the DICOM is not
        # consulted here, so inverted-grayscale images are saved as stored.
        Image.fromarray(scaled).save(output_path)
    except Exception as e:
        print(f"Error converting {dicom_path} to PNG: {e}")
66
67
68
# Function to extract findings and impressions from a report
69
def extract_findings_and_impression(file_path):
    """Extract the FINDINGS and IMPRESSION sections from a report file.

    Each section starts right after its ``"FINDINGS:"`` / ``"IMPRESSION:"``
    marker (first occurrence) and runs until the other marker if that marker
    appears later in the text, otherwise until the end of the file.

    Args:
        file_path: Path of the plain-text report to read.

    Returns:
        A ``(findings, impression)`` tuple of stripped strings. A section
        whose marker is missing is returned as an empty string.
    """
    # Decode explicitly so the result does not depend on the platform's
    # default encoding; undecodable bytes are replaced rather than raising.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
        content = file.read()

    findings_marker = "FINDINGS:"
    impression_marker = "IMPRESSION:"
    findings_start = content.find(findings_marker)
    impression_start = content.find(impression_marker)

    def _section(start, marker, other_start):
        """Return the stripped text of one section, or "" if it is absent."""
        if start == -1:
            return ""
        # Only stop at the other marker when it really follows this one.
        # Using a missing marker's -1 as the slice end would silently drop
        # the last character of the section.
        end = other_start if other_start > start else len(content)
        return content[start + len(marker):end].strip()

    findings = _section(findings_start, findings_marker, impression_start)
    impression = _section(impression_start, impression_marker, findings_start)

    return findings, impression
87
88
89
# Main logic to create the DataFrame
90
# Walk <root>/<group>/<patient>/<study>/ and build one row per DICOM image,
# pairing it with the findings/impression text of the study report
# (<patient>/<study>.txt) and converting the image to PNG next to the DICOM.
reports_root_path = input("Enter the root path for reports: ").strip()

# Ensure the path exists
if not os.path.exists(reports_root_path):
    raise FileNotFoundError(f"The specified path does not exist: {reports_root_path}")

data = []

for p_grp in os.listdir(reports_root_path):
    cxr_path = os.path.join(reports_root_path, p_grp)

    # Skip stray files at the group level; os.listdir() would raise
    # NotADirectoryError on them.
    if not os.path.isdir(cxr_path):
        continue

    for p in os.listdir(cxr_path):
        res_path = os.path.join(cxr_path, p)
        if not os.path.isdir(res_path):
            continue

        # List the patient folder once and split it into study directories
        # and study report files.
        entries = os.listdir(res_path)
        dicom_dirs = [d for d in entries if os.path.isdir(os.path.join(res_path, d))]
        txt_files = {f for f in entries if f.endswith('.txt') and f.startswith('s')}

        for dicom_dir in dicom_dirs:
            # A study is only used when its report "<study>.txt" exists.
            report_file = f"{dicom_dir}.txt"
            if report_file not in txt_files:
                continue

            report_path = os.path.join(res_path, report_file)
            findings, impressions = extract_findings_and_impression(report_path)

            dicom_path = os.path.join(res_path, dicom_dir)
            dicom_files = [os.path.join(dicom_path, f) for f in os.listdir(dicom_path) if f.endswith('.dcm')]

            for dicom_file in dicom_files:
                dicom_id = os.path.basename(dicom_file)
                # Swap only the trailing extension; str.replace() would also
                # rewrite any ".dcm" occurring earlier in the path.
                png_path = dicom_file[:-len('.dcm')] + '.png'

                # Convert the DICOM to PNG
                # NOTE(review): convert_dicom_to_png reports failures by
                # printing only, so the row below is recorded even when the
                # PNG was not written.
                convert_dicom_to_png(dicom_file, png_path)

                data_entry = {
                    "dicom_path": dicom_file,
                    "png_path": png_path,
                    "dicom_id": dicom_id,
                    "findings": findings,
                    "impressions": impressions
                }

                data.append(data_entry)

                print(f"Processed PNG path: {data_entry['png_path']}")

df = pd.DataFrame(data)

print(df.head())
print(f"Total entries: {len(df)}")

# Save the DataFrame to a CSV file
df.to_csv('data_with_png_paths.csv', index=False)