|
a |
|
b/build_dataset.sh |
|
|
1 |
#!/bin/bash
#
# Downloads the PhysioNet files this pipeline needs (MIMIC-CXR-JPG metadata,
# Chest ImaGenome, MIMIC-IV), post-processes the Chest ImaGenome scene graphs
# and runs the dataset_builder preprocessing scripts.

# Capture the start time (epoch seconds) so the total runtime can be reported.
start_time=$(date +%s)

# Read username and password (for PhysioNet).
#   -r    keep backslashes literal instead of treating them as escapes
#   IFS=  keep leading/trailing whitespace that may be part of the password
#   -s    do not echo the password while it is typed
read -r -p "Username: " USERNAME
IFS= read -r -s -p "Password: " PASSWORD
# With -s the user's Enter is not echoed; finish the prompt line ourselves.
# The prompt itself goes to stderr, so the newline does too.
printf '\n' >&2

# Remote base URLs of the datasets and the sub-directories that are fetched.
MIMIC_CXR="https://physionet.org/files/mimic-cxr-jpg/2.0.0"
CHEST_IMAGENOME_BASE="https://physionet.org/files/chest-imagenome/1.0.0"
CHEST_IMAGENOME_SILVER="${CHEST_IMAGENOME_BASE}/silver_dataset"
CHEST_IMAGENOME_GOLD="${CHEST_IMAGENOME_BASE}/gold_dataset"
CHEST_IMAGENOME_UTILS="${CHEST_IMAGENOME_BASE}/utils/scene_postprocessing"
CHEST_IMAGENOME_SEMANTICS="${CHEST_IMAGENOME_BASE}/semantics"
MIMIC_IV="https://physionet.org/files/mimiciv/2.2"
|
|
18 |
|
|
|
19 |
# Options shared by every wget call. They are kept in an array so that each
# value (in particular a password containing spaces or glob characters)
# reaches wget as exactly one argument.
#   -r   recursive retrieval
#   -N   timestamping
#   -c   continue partially downloaded files
#   -np  never ascend to the parent directory
# NOTE(review): passing the password on the command line makes it visible in
# the process list; consider ~/.netrc or `wget --ask-password` instead.
WGET_PARAMS=(-r -N -c -np --user "$USERNAME" --password "$PASSWORD")

#######################################
# Download one file with wget and unpack it if it is an archive.
# Globals:
#   WGET_PARAMS (read) - array of options passed to every wget call
# Arguments:
#   $1 - URL of the file to download
#   $2 - local directory in which wget stores the file; also the directory
#        a .zip archive is extracted into
# Outputs:
#   Diagnostics on stderr when the download fails or the file is missing.
# Returns:
#   1 if the file is not present after the download attempt; otherwise the
#   status of the extraction command (0 for files that are not archives).
#######################################
download_and_extract() {
  local file_url=$1
  local destination_dir=$2
  local file_name=${file_url##*/}
  local local_path="$destination_dir/$file_name"

  # Download the file. A non-zero wget status is reported but is not fatal on
  # its own: whether the file actually exists is checked right below.
  if ! wget "${WGET_PARAMS[@]}" "$file_url"; then
    printf 'warning: wget reported an error for %s\n' "$file_url" >&2
  fi

  # Never hand a non-existent file to unzip/gzip.
  if [[ ! -f "$local_path" ]]; then
    printf 'error: %s is missing after the download attempt; nothing to extract\n' \
      "$local_path" >&2
    return 1
  fi

  case "$file_name" in
    *.zip)
      # -o: overwrite existing files without prompting
      unzip -o "$local_path" -d "$destination_dir"
      ;;
    *.gz)
      # -f: overwrite the output of an earlier run. Plain `gzip -d` refuses to
      # replace an existing file, which made re-running the script fail here
      # (the .gz is removed after decompression, so wget fetches it again).
      gzip -d -f "$local_path"
      ;;
  esac
}
|
|
41 |
|
|
|
42 |
# Every download below passes the local directory that parallels the remote
# URL (host and path without the scheme) as the second argument.

# --- MIMIC-CXR-JPG: metadata table ---
cxr_dest="physionet.org/files/mimic-cxr-jpg/2.0.0"
download_and_extract "$MIMIC_CXR/mimic-cxr-2.0.0-metadata.csv.gz" "$cxr_dest"

# --- Chest ImaGenome: silver scene graphs, gold annotations, utils, semantics ---
imagenome_dest="physionet.org/files/chest-imagenome/1.0.0"

download_and_extract "$CHEST_IMAGENOME_SILVER/scene_graph.zip" "$imagenome_dest/silver_dataset"

for gold_file in \
  "gold_attributes_relations_500pts_500studies1st.txt" \
  "gold_bbox_coordinate_annotations_1000images.csv"; do
  download_and_extract "$CHEST_IMAGENOME_GOLD/$gold_file" "$imagenome_dest/gold_dataset"
done

download_and_extract "$CHEST_IMAGENOME_UTILS/scenegraph_postprocessing.py" "$imagenome_dest/utils/scene_postprocessing"

for semantics_file in \
  "attribute_relations_v1.txt" \
  "label_to_UMLS_mapping.json" \
  "objects_extracted_from_reports_v1.txt"; do
  download_and_extract "$CHEST_IMAGENOME_SEMANTICS/$semantics_file" "$imagenome_dest/semantics"
done

# --- MIMIC-IV: hosp module tables ---
mimic_iv_dest="physionet.org/files/mimiciv/2.2"

for hosp_table in \
  admissions \
  diagnoses_icd \
  d_icd_diagnoses \
  d_icd_procedures \
  d_labitems \
  labevents \
  microbiologyevents \
  patients \
  prescriptions \
  procedures_icd \
  transfers; do
  download_and_extract "$MIMIC_IV/hosp/${hosp_table}.csv.gz" "$mimic_iv_dest/hosp"
done

# --- MIMIC-IV: icu module tables ---
for icu_table in \
  chartevents \
  d_items \
  icustays \
  inputevents \
  outputevents; do
  download_and_extract "$MIMIC_IV/icu/${icu_table}.csv.gz" "$mimic_iv_dest/icu"
done
|
|
73 |
|
|
|
74 |
#######################################
# Convert the Chest ImaGenome silver scene graphs into tabular files by
# running the downloaded scenegraph_postprocessing.py with a generated
# settings file. Does nothing when both tabular outputs already exist.
#
# The work happens in a subshell, so the caller's working directory is never
# changed, and a failed `cd` aborts the step instead of writing the settings
# file and launching Python in the wrong directory.
# Globals:   none
# Arguments: none
# Outputs:   "Done with scene postprocessing" on stdout on success;
#            diagnostics on stderr on failure
# Returns:   0 if the outputs already exist or the step succeeded,
#            non-zero otherwise
#######################################
run_scene_postprocessing() {
  local imagenome_dir="physionet.org/files/chest-imagenome/1.0.0"
  local tabular_dir="$imagenome_dir/silver_dataset/scene_tabular"
  local script_dir="$imagenome_dir/utils/scene_postprocessing"

  # Skip the step when both expected outputs are already present.
  if [[ -f "$tabular_dir/attribute_relations_tabular.txt" \
    && -f "$tabular_dir/bbox_objects_tabular.txt" ]]; then
    return 0
  fi

  (
    if ! cd "$script_dir"; then
      printf 'error: cannot enter %s\n' "$script_dir" >&2
      exit 1
    fi

    # Settings file for the Python script; the paths are relative to
    # script_dir.
    printf '%s\n' \
      '{' \
      '    "SCENE_DIR": "../../silver_dataset/scene_graph",' \
      '    "OUTPUT_DIR": "../../silver_dataset/scene_tabular",' \
      '    "OUTPUT_TYPE": ["attributes", "objects"],' \
      '    "RDF_LEVEL": "study_id",' \
      '    "RESOURCE": "../../semantics/label_to_UMLS_mapping.json",' \
      '    "AGGREGATION": "last",' \
      '    "INCLUDE_SECTIONS": "all"' \
      '}' > scenegraph_postprocessing_settings.json || exit 1

    python scenegraph_postprocessing.py || exit 1
    echo "Done with scene postprocessing"
  )
}

# Report a failure but keep going, as the script did before.
if ! run_scene_postprocessing; then
  printf 'error: scene postprocessing failed\n' >&2
fi
|
|
96 |
|
|
|
97 |
# Cohort / label preprocessing that produces the per-split dataset CSVs.
SAVE_DIR="dataset_builder/preprocessed_data/"
PREPROCESS_SCRIPTS=("preprocess_cohort.py" "preprocess_label.py")
SPLITS=("train" "valid" "test")

mkdir -p "$SAVE_DIR"

# Arguments shared by every preprocessing script.
preprocess_args=(
  --mimic_cxr_jpg_dir "physionet.org/files/mimic-cxr-jpg/2.0.0/"
  --chest_imagenome_dir "physionet.org/files/chest-imagenome/1.0.0/"
  --save_dir "$SAVE_DIR"
)

for split_name in "${SPLITS[@]}"; do
  # A split whose CSV already exists needs no work.
  [[ -f "${SAVE_DIR}/${split_name}_dataset.csv" ]] && continue

  # The scripts are not told which split is missing; each one is simply
  # re-run with the shared arguments.
  for preprocess_script in "${PREPROCESS_SCRIPTS[@]}"; do
    python "dataset_builder/${preprocess_script}" "${preprocess_args[@]}"
  done
done
|
|
114 |
|
|
|
115 |
# Build one database per split with dataset_builder/preprocess_db.py.
# Maps each split name to the number of patients requested for it.
declare -A splits=( ["test"]=400 ["train"]=800 )

# Arguments that are identical for every split.
db_shared_args=(
  --mimic_iv_dir "physionet.org/files/mimiciv/2.2/"
  --mimic_cxr_jpg_dir "physionet.org/files/mimic-cxr-jpg/2.0.0/"
  --chest_imagenome_dir "physionet.org/files/chest-imagenome/1.0.0/"
  --db_name mimic_iv_cxr
  --out_dir "./database"
  --deid
  --timeshift
  --current_time "2105-12-31 23:59:00"
  --start_year 2100
  --time_span 5
  --cur_patient_ratio 0.1
)

for db_split in "${!splits[@]}"; do
  patient_count=${splits[$db_split]}

  printf 'Processing %s split with %s patients...\n' "$db_split" "$patient_count"

  python dataset_builder/preprocess_db.py \
    --split "$db_split" \
    "${db_shared_args[@]}" \
    --num_patient "$patient_count"
done

printf '%s\n' "Database preprocessing complete."
|
|
140 |
|
|
|
141 |
# Generate the answers for every split: reads dataset/mimic_iv_cxr/_<split>.json
# plus the split's database and writes dataset/mimic_iv_cxr/<split>.json.
answer_source_args=(
  --mimic_iv_dir "physionet.org/files/mimiciv/2.2/"
  --mimic_cxr_jpg_dir "physionet.org/files/mimic-cxr-jpg/2.0.0/"
  --chest_imagenome_dir "physionet.org/files/chest-imagenome/1.0.0/"
)

for answer_split in "${SPLITS[@]}"; do
  answer_json_dir="dataset/mimic_iv_cxr"

  python dataset_builder/generate_answer.py \
    "${answer_source_args[@]}" \
    --json_file_path "${answer_json_dir}/_${answer_split}.json" \
    --db_file_path "database/mimic_iv_cxr/${answer_split}/mimic_iv_cxr.db" \
    --output_path "${answer_json_dir}/${answer_split}.json"
done
|
|
151 |
|
|
|
152 |
# Report how long the whole script took: difference between the current time
# and the start time captured at the top, both in epoch seconds.
end_time=$(date +%s)
runtime=$(( end_time - start_time ))
printf 'Script runtime: %s seconds\n' "$runtime"