In [None]:
import os
import sys

# Resolve the directory two levels above the working directory and add it to
# the import path so the `src` package used by later cells can be imported.
src_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
print(src_path)
sys.path.append(src_path)

In [None]:
from src.utils import create_directory, raw_data_path, processed_data_path, set_seed

In [None]:
# Fix the random seed through the project helper imported from src.utils.
# NOTE(review): which RNGs this covers (random / numpy / torch ...) is defined in
# src.utils.set_seed and is not visible here — confirm before relying on it.
set_seed(seed=42)

In [None]:
import pandas as pd

In [None]:
# Location of the MIMIC-IV 2.2 files under the project's raw-data directory.
mimic_iv_path = os.path.join(raw_data_path, "physionet.org/files/mimiciv/2.2")
# Folder under the processed-data directory that this notebook reads from and writes to.
output_path = os.path.join(processed_data_path, "mimic4")

In [None]:
# Load the cohort table from `output_path`, report its size and preview the first rows.
cohort_csv = os.path.join(output_path, "cohort+len.csv")
cohort = pd.read_csv(cohort_csv)
print(cohort.shape)
cohort.head()

In [None]:
# Convert the four *_intime / *_outtime columns from their CSV text form to datetimes.
for time_col in ("hadm_intime", "hadm_outtime", "stay_intime", "stay_outtime"):
    cohort[time_col] = pd.to_datetime(cohort[time_col])

In [None]:
# Distinct hadm_id values present in the cohort, kept as a set; show how many there are.
unique_hadm = cohort["hadm_id"].unique()
hadm_ids = set(unique_hadm.tolist())
len(hadm_ids)

In [None]:
import ast
import numpy as np


def safe_literal_eval(s):
    """Parse a stringified Python literal, tolerating missing and already-parsed values.

    Parameters
    ----------
    s : str, list, tuple, dict, set, or missing scalar
        A cell value. Strings are expected to hold a Python literal
        (for example ``"['a', 'b']"``).

    Returns
    -------
    object
        * the evaluated literal when ``s`` is a string;
        * ``s`` itself when it is already a list/tuple/dict/set, so that
          applying this function a second time (e.g. re-running the cell)
          is a no-op instead of an error;
        * ``np.nan`` when ``s`` is a missing scalar (``None``, ``NaN``, ``NaT`` ...).

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when a string is not a valid
        literal, or when ``s`` is some other unsupported non-missing value.
    """
    if isinstance(s, str):
        return ast.literal_eval(s)
    # Containers must be checked before pd.isna: pd.isna on a list returns an
    # array, whose truth value is ambiguous.
    if isinstance(s, (list, tuple, dict, set)):
        return s
    if pd.isna(s):
        return np.nan
    # Anything else keeps the original behaviour (literal_eval decides / raises).
    return ast.literal_eval(s)


# Turn each label_diagnosis cell into a Python object via safe_literal_eval.
cohort["label_diagnosis"] = cohort["label_diagnosis"].apply(safe_literal_eval)

In [None]:
# Read qa_note.jsonl from `output_path` as line-delimited JSON.
qa_note_path = os.path.join(output_path, "qa_note.jsonl")
qa_note = pd.read_json(qa_note_path, lines=True)
qa_note

In [None]:
# Number of distinct hadm_id values that appear in qa_note.
qa_note.hadm_id.nunique()

In [None]:
# Read qa_event.jsonl from `output_path` as line-delimited JSON.
qa_event_path = os.path.join(output_path, "qa_event.jsonl")
qa_event = pd.read_json(qa_event_path, lines=True)
qa_event

In [None]:
# Number of distinct hadm_id values that appear in qa_event.
qa_event.hadm_id.nunique()

In [None]:
# Row count per event_type across all event QA records.
qa_event.event_type.value_counts()

Helper imports and parallel-processing setup

In [None]:
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from pandarallel import pandarallel

In [None]:
# Initialise pandarallel with its progress bar enabled.
# NOTE(review): no pandarallel call appears in the cells visible in this notebook — confirm it is still needed.
pandarallel.initialize(progress_bar=True)

Cohort statistics, filtering, and train/val/test split

In [None]:
# Keep only cohort rows whose hadm_id also occurs in qa_note, then show the remaining row count.
has_note_qa = cohort["hadm_id"].isin(qa_note["hadm_id"].unique())
cohort = cohort[has_note_qa]
len(cohort)

In [None]:
# Keep only cohort rows whose hadm_id also occurs in qa_event, then show the remaining row count.
has_event_qa = cohort["hadm_id"].isin(qa_event["hadm_id"].unique())
cohort = cohort[has_event_qa]
len(cohort)

In [None]:
# Summary statistics of hadm_los, including the 10th through 99th percentiles.
cohort["hadm_los"].describe(percentiles=[0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99])

In [None]:
# NOTE(review): 548.490833 looks like a value copied by hand from the hadm_los
# describe() output; dividing by 24 presumably converts hours to days — confirm
# the unit and prefer computing it from the data (e.g. a quantile) instead.
548.490833 / 24

In [None]:
# Summary statistics of stay_los, including the 10th through 99th percentiles.
cohort["stay_los"].describe(percentiles=[0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99])

In [None]:
# NOTE(review): 265.649889 looks like a value copied by hand from the stay_los
# describe() output; dividing by 24 presumably converts hours to days — confirm
# the unit and prefer computing it from the data (e.g. a quantile) instead.
265.649889 / 24

In [None]:
# Summary statistics of len_selected, including the 10th through 99th percentiles.
cohort["len_selected"].describe(percentiles=[0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99])

In [None]:
# Drop rows whose len_selected exceeds a fixed cutoff.
# NOTE(review): 1256.650000 is hard-coded, presumably copied from one of the
# percentiles printed by the len_selected describe() cell above — confirm which
# one, and consider cohort.len_selected.quantile(...) so the cutoff follows the data.
cohort_filtered = cohort[cohort.len_selected <= 1256.650000]
cohort_filtered

In [None]:
# hadm_los summary statistics after the len_selected filter.
cohort_filtered["hadm_los"].describe(percentiles=[0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99])

In [None]:
# stay_los summary statistics after the len_selected filter.
cohort_filtered["stay_los"].describe(percentiles=[0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99])

In [None]:
# len_selected summary statistics after the len_selected filter.
cohort_filtered["len_selected"].describe(percentiles=[0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99])

In [None]:
# Distinct subject_id values in the filtered cohort; these are the units that get split.
all_patients = cohort_filtered["subject_id"].unique()
len(all_patients)

In [None]:
from sklearn.model_selection import train_test_split


# Patient-level split, so that no subject_id ends up in more than one partition.
# Step 1: hold out 10% of the patients as the test set.
train_val_patients, test_patients = train_test_split(all_patients, test_size=0.1, random_state=42)
# Step 2: carve the validation set out of the remaining 90%
# (0.111 * 0.9 ~= 0.10 of all patients, giving roughly 80/10/10).
# This has to split `train_val_patients`: splitting `all_patients` a second time
# would place held-out test patients back into train/val.
train_patients, val_patients = train_test_split(train_val_patients, test_size=0.111, random_state=42)

# Guard against leakage between the held-out test set and the other partitions.
assert set(train_patients).isdisjoint(test_patients)
assert set(val_patients).isdisjoint(test_patients)

In [None]:
# Number of patients in each partition, printed in train / val / test order.
for patient_ids in (train_patients, val_patients, test_patients):
    print(patient_ids.shape)

In [None]:
# Materialise one cohort frame per partition by selecting the rows of its patients.
train, val, test = (
    cohort_filtered[cohort_filtered["subject_id"].isin(patient_ids)].reset_index(drop=True)
    for patient_ids in (train_patients, val_patients, test_patients)
)

In [None]:
# (rows, columns) of each partition frame, printed in train / val / test order.
for split_frame in (train, val, test):
    print(split_frame.shape)

In [None]:
# Write each partition to its own CSV in `output_path`, without the index column.
for split_frame, csv_name in (
    (train, "cohort_train.csv"),
    (val, "cohort_val.csv"),
    (test, "cohort_test.csv"),
):
    split_frame.to_csv(os.path.join(output_path, csv_name), index=False)

In [None]:
# Pick 100 test patients for a small evaluation subset. With an integer
# test_size, train_test_split puts that absolute number of samples into its
# second output; the first output is discarded.
_, test_subset_patients = train_test_split(test_patients, test_size=100, random_state=42)
len(test_subset_patients)

In [None]:
# Draw exactly one row per sampled test patient.
# GroupBy.sample with a fixed random_state keeps the draw repeatable even when
# this cell is re-run on its own (the earlier x.sample(1) depended on global RNG
# state), and replaces the groupby.apply pattern.
test_subset = (
    test[test.subject_id.isin(test_subset_patients)]
    .groupby("subject_id")
    .sample(n=1, random_state=42)
    .reset_index(drop=True)
)
test_subset

In [None]:
# Summary statistics of len_selected within the sampled test subset.
test_subset.len_selected.describe()

In [None]:
# Sanity check: distinct patients and distinct hadm_id values in the subset.
# One row is drawn per subject_id above, so the first count equals the number of rows.
print(test_subset.subject_id.nunique())
print(test_subset.hadm_id.nunique())

In [None]:
# Write the sampled test subset to `output_path` without the index column.
test_subset.to_csv(os.path.join(output_path, "cohort_test_subset.csv"), index=False)

In [None]:
# All note QA records whose hadm_id is part of the sampled test subset.
subset_hadm_ids = test_subset["hadm_id"].unique()
qa_note_test_subset = qa_note[qa_note["hadm_id"].isin(subset_hadm_ids)]
qa_note_test_subset

In [None]:
# Draw exactly one event QA record per hadm_id in the sampled test subset.
# GroupBy.sample with a fixed random_state keeps the draw repeatable even when
# this cell is re-run on its own (the earlier x.sample(1) depended on global RNG
# state), and replaces the groupby.apply pattern.
qa_event_test_subset = (
    qa_event[qa_event.hadm_id.isin(test_subset.hadm_id.unique())]
    .groupby("hadm_id")
    .sample(n=1, random_state=42)
    .reset_index(drop=True)
)
qa_event_test_subset

In [None]:
# Number of distinct hadm_id values that have an event QA record in the subset.
qa_event_test_subset.hadm_id.nunique()

In [None]:
# Row count per event_type among the sampled event QA records.
qa_event_test_subset.event_type.value_counts()

In [None]:
# Stack the sampled event QA rows on top of the note QA rows and renumber the index from 0.
qa_test_subset = pd.concat(
    [qa_event_test_subset, qa_note_test_subset],
    ignore_index=True,
)
qa_test_subset

In [None]:
# Write the combined event + note QA subset to `output_path` without the index column.
qa_test_subset.to_csv(os.path.join(output_path, "qa_test_subset.csv"), index=False)