Diff of /build_dataset.sh [000000] .. [dec218]

Switch to unified view

a b/build_dataset.sh
1
#!/bin/bash

# Capture the start time (seconds since the epoch) so the total runtime can be
# reported at the end of the script.
start_time=$(date +%s)

# Read username and password (for PhysioNet).
# -r keeps backslashes in the typed credentials literal; -s hides the password.
read -r -p "Username: " USERNAME
read -r -s -p "Password: " PASSWORD
# With -s the Enter key is not echoed, so terminate the prompt line explicitly.
# The prompt itself is written to stderr, so the newline goes there as well.
printf '\n' >&2

# Base URLs of the PhysioNet resources that are downloaded below
MIMIC_CXR="https://physionet.org/files/mimic-cxr-jpg/2.0.0"
CHEST_IMAGENOME_BASE="https://physionet.org/files/chest-imagenome/1.0.0"
CHEST_IMAGENOME_SILVER="${CHEST_IMAGENOME_BASE}/silver_dataset"
CHEST_IMAGENOME_GOLD="$CHEST_IMAGENOME_BASE/gold_dataset"
CHEST_IMAGENOME_UTILS="$CHEST_IMAGENOME_BASE/utils/scene_postprocessing"
CHEST_IMAGENOME_SEMANTICS="$CHEST_IMAGENOME_BASE/semantics"
MIMIC_IV="https://physionet.org/files/mimiciv/2.2"

# Define wget parameters for readability:
#   -r recursive, -N timestamping, -c continue partial downloads, -np no parent.
# NOTE(review): this is a single string that is word-split where it is
# expanded, so credentials containing whitespace or glob characters will not
# be passed intact, and --password puts the secret on the wget command line.
WGET_PARAMS="-r -N -c -np --user $USERNAME --password $PASSWORD"
21
22
# Helper function to download a file and extract it when it is an archive.
# Globals:
#   WGET_PARAMS (read) - option string handed to wget; split on whitespace
#                        on purpose because it holds several options.
# Arguments:
#   $1 - URL of the file to download
#   $2 - directory in which the downloaded file is looked up for extraction
# Outputs:
#   Diagnostics on stderr when the download or the lookup fails.
# Returns:
#   0 on success; non-zero when the file is missing after the download or
#   when the extraction step fails.
download_and_extract() {
    local file_url=$1
    local destination_dir=$2
    local file_name
    # Declared separately so a basename failure is not masked by 'local'.
    file_name=$(basename "$file_url") || return
    local target="$destination_dir/$file_name"

    # Download the file
    # shellcheck disable=SC2086 # WGET_PARAMS carries several options in one string
    if ! wget $WGET_PARAMS "$file_url"; then
        echo "Warning: wget reported an error for $file_url" >&2
    fi

    # Without the file there is nothing to extract; report the failure instead
    # of letting unzip/gzip run on a missing path.
    if [[ ! -f "$target" ]]; then
        echo "Error: $target not found after download" >&2
        return 1
    fi

    case "$file_name" in
        *.zip)
            # Extract if it's a zip file (-o: overwrite)
            unzip -o "$target" -d "$destination_dir" || return
            ;;
        *.gz)
            # Extract if it's a gzip file. -f overwrites output left by an
            # earlier run, matching the overwrite behaviour of the zip branch.
            gzip -d -f "$target" || return
            ;;
    esac
}
41
42
# Fetch the MIMIC-CXR metadata file
download_and_extract "$MIMIC_CXR/mimic-cxr-2.0.0-metadata.csv.gz" "physionet.org/files/mimic-cxr-jpg/2.0.0"

# Fetch the Chest ImaGenome files: scene graphs, gold annotations,
# the postprocessing utility, and the semantics files (in that order)
download_and_extract "$CHEST_IMAGENOME_SILVER/scene_graph.zip" "physionet.org/files/chest-imagenome/1.0.0/silver_dataset"

for gold_file in \
    gold_attributes_relations_500pts_500studies1st.txt \
    gold_bbox_coordinate_annotations_1000images.csv
do
    download_and_extract "$CHEST_IMAGENOME_GOLD/$gold_file" "physionet.org/files/chest-imagenome/1.0.0/gold_dataset"
done

download_and_extract "$CHEST_IMAGENOME_UTILS/scenegraph_postprocessing.py" "physionet.org/files/chest-imagenome/1.0.0/utils/scene_postprocessing"

for semantics_file in \
    attribute_relations_v1.txt \
    label_to_UMLS_mapping.json \
    objects_extracted_from_reports_v1.txt
do
    download_and_extract "$CHEST_IMAGENOME_SEMANTICS/$semantics_file" "physionet.org/files/chest-imagenome/1.0.0/semantics"
done

# Fetch the MIMIC-IV hosp module files, one gzip-compressed CSV per name
for hosp_table in \
    admissions diagnoses_icd d_icd_diagnoses d_icd_procedures d_labitems \
    labevents microbiologyevents patients prescriptions procedures_icd \
    transfers
do
    download_and_extract "$MIMIC_IV/hosp/${hosp_table}.csv.gz" "physionet.org/files/mimiciv/2.2/hosp"
done

# Fetch the MIMIC-IV icu module files, one gzip-compressed CSV per name
for icu_table in chartevents d_items icustays inputevents outputevents; do
    download_and_extract "$MIMIC_IV/icu/${icu_table}.csv.gz" "physionet.org/files/mimiciv/2.2/icu"
done
73
74
# Save the current directory so it can be restored after the postprocessing step
orig_dir=$(pwd)

# Run the scene-graph postprocessing script, but only when at least one of the
# two tabular output files is missing.
if [[ ! -f "physionet.org/files/chest-imagenome/1.0.0/silver_dataset/scene_tabular/attribute_relations_tabular.txt" \
    || ! -f "physionet.org/files/chest-imagenome/1.0.0/silver_dataset/scene_tabular/bbox_objects_tabular.txt" ]]; then
    # The settings file and the script use paths relative to the script's own
    # directory, so nothing is written or executed unless the cd succeeds.
    if cd "physionet.org/files/chest-imagenome/1.0.0/utils/scene_postprocessing"; then
        echo '{
        "SCENE_DIR": "../../silver_dataset/scene_graph",
        "OUTPUT_DIR": "../../silver_dataset/scene_tabular",
        "OUTPUT_TYPE": ["attributes", "objects"],
        "RDF_LEVEL": "study_id",
        "RESOURCE": "../../semantics/label_to_UMLS_mapping.json",
        "AGGREGATION": "last",
        "INCLUDE_SECTIONS": "all"
    }' > scenegraph_postprocessing_settings.json
        if python scenegraph_postprocessing.py; then
            echo "Done with scene postprocessing"
        else
            echo "Error: scenegraph_postprocessing.py failed" >&2
        fi
    else
        echo "Error: scene postprocessing directory not found; skipping scene postprocessing" >&2
    fi
fi

# Return to the original directory; the remaining steps use relative paths,
# so continuing from anywhere else would be wrong.
cd "$orig_dir" || exit 1
96
97
# Preprocessing: run every preprocessing script for each split whose
# dataset CSV is not yet present in SAVE_DIR
SAVE_DIR="dataset_builder/preprocessed_data/"
PREPROCESS_SCRIPTS=("preprocess_cohort.py" "preprocess_label.py")
SPLITS=("train" "valid" "test")

mkdir -p "$SAVE_DIR"

# Options passed unchanged to every preprocessing script
preprocess_opts=(
    --mimic_cxr_jpg_dir "physionet.org/files/mimic-cxr-jpg/2.0.0/"
    --chest_imagenome_dir "physionet.org/files/chest-imagenome/1.0.0/"
    --save_dir "$SAVE_DIR"
)

for split in "${SPLITS[@]}"; do
    # Nothing to do for a split whose CSV already exists
    if [ -f "${SAVE_DIR}/${split}_dataset.csv" ]; then
        continue
    fi

    for script in "${PREPROCESS_SCRIPTS[@]}"; do
        python "dataset_builder/${script}" "${preprocess_opts[@]}"
    done
done
114
115
# DB preprocessing: call preprocess_db.py once per entry of 'splits'
declare -A splits=( ["test"]=400 ["train"]=800 )  # split name -> number of patients

# Options that are identical for every split; --split is placed before them
# and --num_patient after them on the command line
db_opts=(
    --mimic_iv_dir "physionet.org/files/mimiciv/2.2/"
    --mimic_cxr_jpg_dir "physionet.org/files/mimic-cxr-jpg/2.0.0/"
    --chest_imagenome_dir "physionet.org/files/chest-imagenome/1.0.0/"
    --db_name mimic_iv_cxr
    --out_dir "./database"
    --deid
    --timeshift
    --current_time "2105-12-31 23:59:00"
    --start_year 2100
    --time_span 5
    --cur_patient_ratio 0.1
)

for split in "${!splits[@]}"; do
    num_patient=${splits[$split]}

    echo "Processing $split split with $num_patient patients..."

    python dataset_builder/preprocess_db.py \
        --split "$split" \
        "${db_opts[@]}" \
        --num_patient "$num_patient"
done

echo "Database preprocessing complete."
140
141
# Answer generation: call generate_answer.py once for every entry of SPLITS,
# reading "_<split>.json" and writing "<split>.json"
for split in "${SPLITS[@]}"; do
    answer_opts=(
        --mimic_iv_dir "physionet.org/files/mimiciv/2.2/"
        --mimic_cxr_jpg_dir "physionet.org/files/mimic-cxr-jpg/2.0.0/"
        --chest_imagenome_dir "physionet.org/files/chest-imagenome/1.0.0/"
        --json_file_path "dataset/mimic_iv_cxr/_${split}.json"
        --db_file_path "database/mimic_iv_cxr/${split}/mimic_iv_cxr.db"
        --output_path "dataset/mimic_iv_cxr/${split}.json"
    )
    python dataset_builder/generate_answer.py "${answer_opts[@]}"
done
151
152
# Take the end timestamp (seconds since the epoch), subtract the start
# timestamp recorded in start_time, and print the elapsed seconds
end_time="$(date +%s)"
runtime=$(( end_time - start_time ))
printf 'Script runtime: %s seconds\n' "$runtime"