#!/usr/bin/env bash
#
# build_dataset.sh - fetch the PhysioNet source files and build the dataset.
#
# Steps, in order:
#   1. Prompt for PhysioNet credentials.
#   2. Mirror the required MIMIC-CXR-JPG, Chest ImaGenome and MIMIC-IV files
#      under ./physionet.org/ with wget, unpacking .zip / .gz archives.
#   3. Run Chest ImaGenome's scenegraph_postprocessing.py (skipped when both
#      tabular output files already exist).
#   4. Run the dataset_builder preprocessing scripts, the database
#      preprocessing and the answer generation.
#
# Usage: run from the directory that contains dataset_builder/.
# Requires: bash >= 4 (associative arrays), wget, unzip, gzip, python.

# Abort on the first failing command, on unset variables and on failures
# inside pipelines, so a broken download never feeds the later stages.
set -euo pipefail

# Remote locations on PhysioNet.
readonly MIMIC_CXR="https://physionet.org/files/mimic-cxr-jpg/2.0.0"
readonly CHEST_IMAGENOME_BASE="https://physionet.org/files/chest-imagenome/1.0.0"
readonly CHEST_IMAGENOME_SILVER="${CHEST_IMAGENOME_BASE}/silver_dataset"
readonly CHEST_IMAGENOME_GOLD="${CHEST_IMAGENOME_BASE}/gold_dataset"
readonly CHEST_IMAGENOME_UTILS="${CHEST_IMAGENOME_BASE}/utils/scene_postprocessing"
readonly CHEST_IMAGENOME_SEMANTICS="${CHEST_IMAGENOME_BASE}/semantics"
readonly MIMIC_IV="https://physionet.org/files/mimiciv/2.2"

# Local mirrors. `wget -r` recreates host/path below the current directory,
# so each local directory is the URL without its scheme.
readonly MIMIC_CXR_DIR="${MIMIC_CXR#https://}"
readonly CHEST_IMAGENOME_DIR="${CHEST_IMAGENOME_BASE#https://}"
readonly MIMIC_IV_DIR="${MIMIC_IV#https://}"

# Output directory of the preprocessing scripts (passed to them verbatim).
readonly SAVE_DIR="dataset_builder/preprocessed_data/"
readonly PREPROCESS_SCRIPTS=("preprocess_cohort.py" "preprocess_label.py")
readonly SPLITS=("train" "valid" "test")

# wget arguments; filled in by main() once the credentials are known.
# Kept as an array so credentials containing spaces or glob characters reach
# wget as single, unmodified arguments.
WGET_PARAMS=()

#######################################
# Download one file with wget and unpack it when it is a .zip or .gz archive.
# Globals:
#   WGET_PARAMS (read)
# Arguments:
#   $1 - URL of the file to download
#   $2 - local directory where wget stores the file (mirror of the URL path)
# Returns:
#   0 on success, non-zero when the download or the extraction fails.
#######################################
download_and_extract() {
  local file_url=$1
  local destination_dir=$2
  local file_name=${file_url##*/}
  local target="${destination_dir}/${file_name}"

  # `gzip -d` deletes the archive after expanding it. On a re-run the
  # expanded file is therefore the only trace of a finished download; skip
  # it instead of fetching and expanding the archive a second time.
  if [[ "$file_name" == *.gz && -f "${target%.gz}" && ! -e "$target" ]]; then
    printf 'Already downloaded and extracted: %s\n' "${target%.gz}"
    return 0
  fi

  if ! wget "${WGET_PARAMS[@]}" "$file_url"; then
    printf 'error: download failed: %s\n' "$file_url" >&2
    return 1
  fi

  case "$file_name" in
    *.zip)
      # -o: overwrite existing files without prompting
      unzip -o "$target" -d "$destination_dir"
      ;;
    *.gz)
      # -f: replace a stale or partial output file instead of prompting
      gzip -df "$target"
      ;;
  esac
}

#######################################
# Download every PhysioNet file the pipeline needs.
#######################################
download_all() {
  local name

  # MIMIC-CXR metadata
  download_and_extract \
    "${MIMIC_CXR}/mimic-cxr-2.0.0-metadata.csv.gz" "$MIMIC_CXR_DIR"

  # Chest ImaGenome files
  download_and_extract \
    "${CHEST_IMAGENOME_SILVER}/scene_graph.zip" \
    "${CHEST_IMAGENOME_DIR}/silver_dataset"

  local -a gold_files=(
    "gold_attributes_relations_500pts_500studies1st.txt"
    "gold_bbox_coordinate_annotations_1000images.csv"
  )
  for name in "${gold_files[@]}"; do
    download_and_extract \
      "${CHEST_IMAGENOME_GOLD}/${name}" "${CHEST_IMAGENOME_DIR}/gold_dataset"
  done

  download_and_extract \
    "${CHEST_IMAGENOME_UTILS}/scenegraph_postprocessing.py" \
    "${CHEST_IMAGENOME_DIR}/utils/scene_postprocessing"

  local -a semantics_files=(
    "attribute_relations_v1.txt"
    "label_to_UMLS_mapping.json"
    "objects_extracted_from_reports_v1.txt"
  )
  for name in "${semantics_files[@]}"; do
    download_and_extract \
      "${CHEST_IMAGENOME_SEMANTICS}/${name}" "${CHEST_IMAGENOME_DIR}/semantics"
  done

  # MIMIC-IV hosp module
  local -a hosp_tables=(
    admissions diagnoses_icd d_icd_diagnoses d_icd_procedures d_labitems
    labevents microbiologyevents patients prescriptions procedures_icd
    transfers
  )
  for name in "${hosp_tables[@]}"; do
    download_and_extract \
      "${MIMIC_IV}/hosp/${name}.csv.gz" "${MIMIC_IV_DIR}/hosp"
  done

  # MIMIC-IV icu module
  local -a icu_tables=(
    chartevents d_items icustays inputevents outputevents
  )
  for name in "${icu_tables[@]}"; do
    download_and_extract \
      "${MIMIC_IV}/icu/${name}.csv.gz" "${MIMIC_IV_DIR}/icu"
  done
}

#######################################
# Write the settings file and run scenegraph_postprocessing.py, unless both
# tabular output files are already present.
#######################################
run_scene_postprocessing() {
  local tabular_dir="${CHEST_IMAGENOME_DIR}/silver_dataset/scene_tabular"

  if [[ -f "${tabular_dir}/attribute_relations_tabular.txt" \
    && -f "${tabular_dir}/bbox_objects_tabular.txt" ]]; then
    return 0
  fi

  # Run in a subshell: the directory change cannot leak into the rest of the
  # script, and a failed cd stops before the settings file is written into
  # the wrong place.
  (
    cd "${CHEST_IMAGENOME_DIR}/utils/scene_postprocessing" || exit 1

    cat > scenegraph_postprocessing_settings.json <<'JSON'
{
    "SCENE_DIR": "../../silver_dataset/scene_graph",
    "OUTPUT_DIR": "../../silver_dataset/scene_tabular",
    "OUTPUT_TYPE": ["attributes", "objects"],
    "RDF_LEVEL": "study_id",
    "RESOURCE": "../../semantics/label_to_UMLS_mapping.json",
    "AGGREGATION": "last",
    "INCLUDE_SECTIONS": "all"
}
JSON

    python scenegraph_postprocessing.py
  )
  echo "Done with scene postprocessing"
}

#######################################
# Run the cohort and label preprocessing scripts for every split whose
# <split>_dataset.csv is still missing from SAVE_DIR.
#######################################
run_preprocessing() {
  local split script

  mkdir -p "$SAVE_DIR"

  # NOTE(review): the scripts receive no split argument, so presumably one
  # pass writes all splits and the later iterations are skipped - confirm
  # against dataset_builder/preprocess_*.py.
  for split in "${SPLITS[@]}"; do
    if [[ -f "${SAVE_DIR%/}/${split}_dataset.csv" ]]; then
      continue
    fi
    for script in "${PREPROCESS_SCRIPTS[@]}"; do
      python "dataset_builder/${script}" \
        --mimic_cxr_jpg_dir "${MIMIC_CXR_DIR}/" \
        --chest_imagenome_dir "${CHEST_IMAGENOME_DIR}/" \
        --save_dir "$SAVE_DIR"
    done
  done
}

#######################################
# Build the database for each split listed in the patient-count table.
#######################################
run_db_preprocessing() {
  # Split name -> number of patients passed to preprocess_db.py.
  local -A patients_per_split=(["test"]=400 ["train"]=800)
  local split num_patient

  for split in "${!patients_per_split[@]}"; do
    num_patient=${patients_per_split[$split]}

    echo "Processing $split split with $num_patient patients..."

    python dataset_builder/preprocess_db.py \
      --split "$split" \
      --mimic_iv_dir "${MIMIC_IV_DIR}/" \
      --mimic_cxr_jpg_dir "${MIMIC_CXR_DIR}/" \
      --chest_imagenome_dir "${CHEST_IMAGENOME_DIR}/" \
      --db_name mimic_iv_cxr \
      --out_dir "./database" \
      --deid \
      --timeshift \
      --current_time "2105-12-31 23:59:00" \
      --start_year 2100 \
      --time_span 5 \
      --cur_patient_ratio 0.1 \
      --num_patient "$num_patient"
  done

  echo "Database preprocessing complete."
}

#######################################
# Run generate_answer.py for every split in SPLITS.
#######################################
run_answer_generation() {
  local split

  # NOTE(review): this loop includes "valid", but run_db_preprocessing only
  # builds "test" and "train" - confirm where
  # database/mimic_iv_cxr/valid/mimic_iv_cxr.db comes from.
  for split in "${SPLITS[@]}"; do
    python dataset_builder/generate_answer.py \
      --mimic_iv_dir "${MIMIC_IV_DIR}/" \
      --mimic_cxr_jpg_dir "${MIMIC_CXR_DIR}/" \
      --chest_imagenome_dir "${CHEST_IMAGENOME_DIR}/" \
      --json_file_path "dataset/mimic_iv_cxr/_${split}.json" \
      --db_file_path "database/mimic_iv_cxr/${split}/mimic_iv_cxr.db" \
      --output_path "dataset/mimic_iv_cxr/${split}.json"
  done
}

main() {
  local start_time end_time runtime
  local physionet_user physionet_password

  start_time=$(date +%s)

  # -r keeps backslashes in the credentials literal; -s hides the password.
  read -r -p "Username: " physionet_user
  read -r -s -p "Password: " physionet_password
  printf '\n' # read -s does not echo the user's newline

  # NOTE: the password is passed on wget's command line and is therefore
  # visible to other local users in the process list while wget runs.
  WGET_PARAMS=(
    -r -N -c -np
    --user "$physionet_user"
    --password "$physionet_password"
  )

  download_all
  run_scene_postprocessing
  run_preprocessing
  run_db_preprocessing
  run_answer_generation

  end_time=$(date +%s)
  runtime=$((end_time - start_time))
  echo "Script runtime: $runtime seconds"
}

main "$@"