In [None]:
import os
import shutil
import sys
from concurrent.futures import as_completed

# Resolve the directory two levels above the current working directory and
# append it to the import path (presumably so the `src` package imported in
# the next cell can be found -- this depends on where the notebook is run from).
src_path = os.path.abspath("../..")
print(src_path)
sys.path.append(src_path)

In [None]:
from src.utils import create_directory, raw_data_path, processed_data_path, set_seed

In [None]:
# Fix the random seed through the project helper from src.utils
# (which libraries it seeds is defined there, not in this notebook).
set_seed(seed=42)

In [None]:
import pandas as pd

In [None]:
# Location of the raw MIMIC-IV 2.2 download below the project's raw-data root.
# NOTE(review): mimic_iv_path is not referenced again in the visible cells.
mimic_iv_path = os.path.join(raw_data_path, "physionet.org/files/mimiciv/2.2")
# Folder holding the cohort table and the per-admission event files that the
# cells below read from and write to.
output_path = os.path.join(processed_data_path, "mimic4")

In [None]:
# Load the cohort table from output_path (it must already exist there;
# it is not produced by this notebook) and preview it.
cohort = pd.read_csv(os.path.join(output_path, "cohort.csv"))
print(cohort.shape)
cohort.head()

In [None]:
# Parse the admission- and stay-level in/out time columns into datetimes.
for time_col in ("hadm_intime", "hadm_outtime", "stay_intime", "stay_outtime"):
    cohort[time_col] = pd.to_datetime(cohort[time_col])

In [None]:
# Distinct hadm_id values of the cohort as a set; .tolist() converts the
# NumPy array returned by unique() into native Python scalars first.
hadm_ids = set(cohort.hadm_id.unique().tolist())
len(hadm_ids)

In [None]:
import ast
import numpy as np


def safe_literal_eval(s):
    """Parse a string-encoded Python literal, tolerating missing values.

    Args:
        s: a cell value. Strings are parsed with ``ast.literal_eval``;
            scalar missing values (``None``, ``NaN``, ``pd.NA``, ...) map to
            ``np.nan``; anything else (for example a list that has already
            been parsed) is returned unchanged.

    Returns:
        The parsed literal, ``np.nan`` for a missing value, or ``s`` itself
        when it is neither a string nor missing.

    Raises:
        ValueError, SyntaxError: if ``s`` is a string that is not a valid
            Python literal (propagated from ``ast.literal_eval``).
    """
    if isinstance(s, str):
        return ast.literal_eval(s)
    # pd.isna() on a list/array returns an element-wise array whose truth
    # value is ambiguous, so only test genuine scalars for missingness.
    if pd.api.types.is_scalar(s) and pd.isna(s):
        return np.nan
    # Already-parsed containers pass through, which keeps the function
    # idempotent when the calling cell is executed more than once.
    return s


# Replace the string-encoded label_diagnosis values with the Python objects
# they describe; missing entries stay NaN (see safe_literal_eval).
cohort.label_diagnosis = cohort.label_diagnosis.apply(safe_literal_eval)

Helper: imports and setup for parallel processing

In [None]:
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from pandarallel import pandarallel

In [None]:
# Initialise pandarallel with progress bars enabled.
# NOTE(review): no pandarallel call is visible in the cells of this section --
# confirm whether this setup is still needed.
pandarallel.initialize(progress_bar=True)

Merge: combine the selected event tables into one file per admission

In [None]:
# Event tables whose per-admission files are combined by merge_and_save,
# listed in the order in which they are read.
events_selected = [
    "labevents",
    "microbiologyevents",
    "prescriptions",
    "transfers",
    "procedureevents",
]

In [None]:
def merge_and_save(events, hadm_id, folder_name):
    """Merge one admission's per-table event files into a single CSV.

    For every name in ``events`` the file
    ``<output_path>/event_<name>/event_<hadm_id>.csv`` is read if it exists;
    missing files are skipped. The collected rows are sorted by ``timestamp``,
    the admission's patient-demographics and admission-info rows are placed in
    front of them, and the result is written to
    ``<output_path>/<folder_name>/event_<hadm_id>.csv``.

    Relies on the notebook-level ``output_path``.

    Args:
        events: iterable of event-table names (e.g. ``"labevents"``).
        hadm_id: admission id used in the file names.
        folder_name: sub-folder of ``output_path`` that receives the merged
            file; it is created if it does not exist yet.

    Returns:
        True once the merged file has been written.

    Raises:
        AssertionError: if none of the requested event files exists.
        FileNotFoundError: if the demographics or admission-info file of the
            admission is missing.
    """
    columns = ["hadm_id", "event_type", "timestamp", "event_value", "timestamp_avail"]

    event_frames = []
    for event in events:
        event_file = os.path.join(output_path, f"event_{event}/event_{hadm_id}.csv")
        try:
            event_frames.append(pd.read_csv(event_file, usecols=columns))
        except FileNotFoundError:
            # No file for this table and admission: skip it (presumably the
            # admission simply has no rows in that table).
            continue

    assert len(event_frames) > 0, hadm_id
    merged = pd.concat(event_frames)
    merged["hadm_id"] = merged["hadm_id"].astype(int)
    # NOTE(review): timestamps are sorted exactly as read from the CSV (no date
    # parsing). If they are strings the order is lexicographic, which matches
    # chronological order only for ISO-8601-like formats -- confirm.
    merged = merged.sort_values(by="timestamp", ascending=True)

    # Static per-admission rows go first, ahead of the time-ordered events.
    demographics = pd.read_csv(
        os.path.join(output_path, f"event_patient_demographics/event_{hadm_id}.csv")
    )
    admission_info = pd.read_csv(
        os.path.join(output_path, f"event_admission_info/event_{hadm_id}.csv")
    )
    merged = pd.concat([demographics, admission_info, merged])[columns]

    file_path = os.path.join(output_path, f"{folder_name}/event_{hadm_id}.csv")
    # exist_ok=True keeps this safe when several worker threads get here at once.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    merged.to_csv(file_path, index=False)

    return True

In [None]:
# Drop the merged files of any previous run so stale outputs cannot survive.
# Done in Python rather than with `!rm -r` so that paths containing spaces are
# handled correctly, a missing folder is not an error, and the cell does not
# depend on a POSIX shell.
selected_dir = os.path.join(output_path, "event_selected")
if os.path.isdir(selected_dir):
    shutil.rmtree(selected_dir)

In [None]:
# Create the folder that receives the merged per-admission files
# (create_directory is the project helper imported from src.utils).
create_directory(f"{output_path}/event_selected")

In [None]:
# Merge the selected event tables for every admission on a small thread pool.
# The futures are kept and drained with as_completed so that
#   * the progress bar tracks finished merges rather than submissions, and
#   * exceptions raised inside merge_and_save are recorded instead of being
#     silently discarded together with the future.
# Failures do not abort the run; they are collected in merge_failures
# (hadm_id -> exception) for inspection.
merge_failures = {}
with ThreadPoolExecutor(max_workers=4) as executor:
    future_to_hadm_id = {
        executor.submit(
            merge_and_save,
            events_selected,
            hadm_id,
            "event_selected"
        ): hadm_id
        for hadm_id in hadm_ids
    }
    for future in tqdm(as_completed(future_to_hadm_id), total=len(future_to_hadm_id)):
        try:
            future.result()
        except Exception as exc:
            merge_failures[future_to_hadm_id[future]] = exc

print(f"{len(merge_failures)} of {len(hadm_ids)} admissions failed to merge")

Statistics: number of merged events per admission

In [None]:
from tqdm import tqdm

In [None]:
# Count the rows of each admission's merged event file; admissions without a
# merged file are reported and recorded with a length of 0.
hadm_id_to_len = {}
for hadm_id in tqdm(hadm_ids):
    selected_csv = os.path.join(output_path, f"event_selected/event_{hadm_id}.csv")
    try:
        n_events = len(pd.read_csv(selected_csv))
    except FileNotFoundError:
        print(f"{hadm_id} not found!")
        n_events = 0
    hadm_id_to_len[hadm_id] = n_events

In [None]:
# Attach the merged-file row count to every cohort row via its hadm_id
# (0 where the merged file was not found) and preview the result.
cohort["len_selected"] = cohort.hadm_id.map(hadm_id_to_len)
cohort.head()

In [None]:
# Number of rows in the cohort table.
len(cohort)

In [None]:
# Distribution of hadm_los, including tail percentiles
# (presumably the hospital-admission length of stay -- units not shown here).
cohort.hadm_los.describe(percentiles=[.1, .25, .5, .75, .9, .95, .99])

In [None]:
# Distribution of stay_los, including tail percentiles
# (presumably the length of the individual stay -- units not shown here).
cohort.stay_los.describe(percentiles=[.1, .25, .5, .75, .9, .95, .99])

In [None]:
# Distribution of the number of rows in each admission's merged event file.
cohort.len_selected.describe(percentiles=[.1, .25, .5, .75, .9, .95, .99])

In [None]:
# Persist the cohort, now including len_selected, as a separate file in
# output_path; the original cohort.csv is left untouched.
cohort.to_csv(os.path.join(output_path, 'cohort+len.csv'), index=False)