In [None]:
import os
import sys

# Make the project importable: the `src` package imported below is looked up
# relative to the directory two levels above the current working directory.
src_path = os.path.abspath('../..')
print(src_path)
# Guard the append so re-running this cell does not pile up duplicate entries.
if src_path not in sys.path:
    sys.path.append(src_path)

In [None]:
from src.utils import create_directory, raw_data_path, processed_data_path, set_seed

In [None]:
set_seed(seed=42)

In [None]:
import pandas as pd

In [None]:
mimic_iv_path = os.path.join(raw_data_path, "physionet.org/files/mimiciv/2.2")
mimic_iv_note_path = os.path.join(raw_data_path, "physionet.org/files/mimic-iv-note/2.2")
output_path = os.path.join(processed_data_path, "mimic4")

In [None]:
cohort = pd.read_csv(os.path.join(output_path, "cohort.csv"))
print(cohort.shape)
cohort.head()

In [None]:
# Parse the four in/out time columns into pandas datetimes; the `.dt`
# accessor used on hadm_intime later requires a datetime dtype.
for time_col in ("hadm_intime", "hadm_outtime", "stay_intime", "stay_outtime"):
    cohort[time_col] = pd.to_datetime(cohort[time_col])

In [None]:
hadm_ids = set(cohort.hadm_id.unique().tolist())
len(hadm_ids)

## helper

In [None]:
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from pandarallel import pandarallel

In [None]:
pandarallel.initialize(progress_bar=True)

In [None]:
def save_group(group_df, hadm_id, event_type):
    """Write one admission's events to its own CSV file.

    Args:
        group_df: DataFrame holding the rows to write.
        hadm_id: Admission identifier; cast to ``int`` to build the file name.
        event_type: Event family name; selects the ``event_<event_type>``
            sub-directory under the module-level ``output_path``.

    Returns:
        ``True`` once ``to_csv`` has returned.
    """
    # The target directory is expected to exist already; it is not created here.
    destination = f"{output_path}/event_{event_type}/event_{int(hadm_id)}.csv"
    group_df.to_csv(destination, index=False)
    return True

## patients

In [None]:
patients = pd.read_csv(os.path.join(mimic_iv_path, "hosp/patients.csv.gz"))
print(patients.shape)
patients.head()

In [None]:
# Attach patient-level fields; the inner join keeps only cohort rows whose
# subject_id is present in `patients`.
cohort = cohort.merge(patients[["subject_id", "gender", "anchor_age", "anchor_year"]], on="subject_id", how="inner")
# Age at admission = (admission year - anchor_year) + anchor_age.
# NOTE(review): presumes anchor_age is the patient's age in anchor_year, as
# described in the MIMIC-IV documentation — confirm.
cohort["age"] = cohort.hadm_intime.dt.year - cohort.anchor_year + cohort.anchor_age
print(cohort.shape)
cohort.head()

In [None]:
print(cohort.age.min())
print(cohort.age.max())
print(cohort.age.mean())
print(cohort.age.std())

In [None]:
cohort.gender.value_counts()

## admissions

In [None]:
admissions = pd.read_csv(os.path.join(mimic_iv_path, "hosp/admissions.csv.gz"))
print(admissions.shape)
admissions.head()

In [None]:
cohort = cohort.merge(admissions[["subject_id", "hadm_id", "admission_type", "admission_location", "insurance", "language", "marital_status", "race"]], on=["subject_id", "hadm_id"], how="inner")
print(cohort.shape)
cohort.head()

## discharge

In [None]:
discharge = pd.read_csv(os.path.join(mimic_iv_note_path, "note/discharge.csv.gz"))
print(discharge.shape)
discharge.head()

In [None]:
import re

def extract_chief_complaint(discharge_summary):
    """Extract the chief-complaint text from a discharge summary.

    The header is matched either verbatim ("Chief Complaint:") or with its
    first word replaced by the "___" placeholder ("___ Complaint:"). Any
    whitespace after the colon is skipped, newlines included, so the complaint
    may sit on the header line or on the line that follows it. The capture
    stops at the end of that line or at the end of the text.

    Args:
        discharge_summary: Full note text. Non-string values (``None``, NaN)
            are treated as "no note".

    Returns:
        The stripped complaint text, or ``None`` when the input is not a
        string, no header is found, or the complaint is blank.
    """
    # A missing note would otherwise make re.search raise TypeError.
    if not isinstance(discharge_summary, str):
        return None

    # `(?:\n|$)` also accepts a complaint that is the very last line of the
    # text and therefore has no trailing newline.
    pattern = r"(?:Chief Complaint|___ Complaint):\s*(.+?)\s*(?:\n|$)"
    match = re.search(pattern, discharge_summary)
    if match is None:
        return None

    complaint = match.group(1).strip()
    # A whitespace-only capture counts as missing, so downstream
    # `pd.isna` checks do not emit an empty "chief complaint: " field.
    return complaint or None

In [None]:
extract_chief_complaint(discharge.iloc[42332].text)

In [None]:
extract_chief_complaint(discharge.iloc[4332].text)

In [None]:
discharge["chief_complaint"] = discharge.text.parallel_apply(extract_chief_complaint)

In [None]:
discharge.head()

In [None]:
discharge.isna().sum()

In [None]:
# Attach the extracted chief complaint to each admission.
# The inner join keeps only admissions that have a matching row in `discharge`.
# NOTE(review): if `discharge` can hold several notes per hadm_id this merge
# would duplicate cohort rows — verify uniqueness of (subject_id, hadm_id).
cohort = cohort.merge(discharge[["subject_id", "hadm_id", "chief_complaint"]], on=["subject_id", "hadm_id"], how="inner")
print(cohort.shape)
cohort.head()

## post-process

In [None]:
cohort = cohort.drop(columns=["anchor_age", "anchor_year"])
cohort.head()

In [None]:
cohort.isna().sum()

In [None]:
cohort.admission_type.unique()

In [None]:
cohort.admission_location.unique()

In [None]:
cohort.insurance.unique()

In [None]:
cohort.language.unique()

In [None]:
cohort.marital_status.unique()

In [None]:
cohort.race.unique()

In [None]:
event_type = "patient_demographics"

In [None]:
def generate_event_value(x):
    """Render one cohort row as a demographics description string.

    Args:
        x: Row-like object exposing ``gender``, ``age``, ``race``,
            ``marital_status`` and ``insurance`` attributes.

    Returns:
        A comma-separated "key: value" string; the marital-status field is
        left out when its value is missing.
    """
    demographics = f"gender: {x.gender}, age: {x.age}, race: {x.race}"
    # Missing marital status is omitted rather than printed as "nan".
    marital = "" if pd.isna(x.marital_status) else f", marital status: {x.marital_status}"
    return demographics + marital + f", insurance: {x.insurance}"

In [None]:
# Copy each demographic field into a "meta_"-prefixed column so the raw values
# can be exported alongside the rendered event_value string.
demographic_fields = ["gender", "age", "race", "marital_status", "insurance"]
meta_cols = []
for field in demographic_fields:
    meta_name = "meta_" + field
    cohort[meta_name] = cohort[field]
    meta_cols.append(meta_name)

In [None]:
cohort["timestamp"] = 0
cohort["timestamp_avail"] = 0

In [None]:
print(generate_event_value(cohort.iloc[5]))

In [None]:
print(generate_event_value(cohort.iloc[520]))

In [None]:
cohort["event_type"] = event_type
cohort["event_value"] = cohort.parallel_apply(generate_event_value, axis=1)

In [None]:
cohort[cohort.hadm_id == 29079034]

In [None]:
cohort.groupby("hadm_id").event_type.count().describe()

In [None]:
!rm -r {output_path}/'event_{event_type}'

In [None]:
create_directory(f"{output_path}/event_{event_type}")

In [None]:
groups = cohort.groupby("hadm_id")
export_cols = ["hadm_id", "event_type", "timestamp", "event_value", "timestamp_avail"] + meta_cols

with ThreadPoolExecutor(max_workers=4) as executor:
    # Queue one CSV write per admission and keep every future so that
    # failures can be surfaced below.
    futures = []
    for hadm_id, group_df in groups:
        futures.append(
            executor.submit(save_group, group_df[export_cols], hadm_id, event_type)
        )

    # Future.result() re-raises any exception raised inside save_group;
    # without it a failed write would go unnoticed.
    for future in tqdm(futures, total=len(futures)):
        future.result()

In [None]:
!ls -1 {output_path}/'event_{event_type}' | wc -l

In [None]:
event_type = "admission_info"

In [None]:
def generate_event_value(x):
    """Render one cohort row as an admission-info description string.

    Note: this reuses the name of the demographics formatter defined in an
    earlier cell and replaces it from this point on.

    Args:
        x: Row-like object exposing ``admission_type``,
            ``admission_location`` and ``chief_complaint`` attributes.

    Returns:
        "type: ..., location: ..." with a trailing chief-complaint field when
        one is available.
    """
    summary = f"type: {x.admission_type}, location: {x.admission_location}"
    # No complaint was extracted for this admission: stop at type/location.
    if pd.isna(x.chief_complaint):
        return summary
    return summary + f", chief complaint: {x.chief_complaint}"

In [None]:
# Copy each admission field into a "meta_"-prefixed column so the raw values
# can be exported alongside the rendered event_value string.
admission_fields = ["admission_type", "admission_location", "chief_complaint"]
meta_cols = []
for field in admission_fields:
    meta_name = "meta_" + field
    cohort[meta_name] = cohort[field]
    meta_cols.append(meta_name)

In [None]:
print(generate_event_value(cohort.iloc[5]))

In [None]:
print(generate_event_value(cohort.iloc[520]))

In [None]:
cohort["event_type"] = event_type
cohort["event_value"] = cohort.parallel_apply(generate_event_value, axis=1)

In [None]:
cohort[cohort.hadm_id == 29079034]

In [None]:
cohort.groupby("hadm_id").event_type.count().describe()

In [None]:
!rm -r {output_path}/'event_{event_type}'

In [None]:
create_directory(f"{output_path}/event_{event_type}")

In [None]:
groups = cohort.groupby("hadm_id")
export_cols = ["hadm_id", "event_type", "timestamp", "event_value", "timestamp_avail"] + meta_cols

with ThreadPoolExecutor(max_workers=4) as executor:
    # Queue one CSV write per admission and keep every future so that
    # failures can be surfaced below.
    futures = []
    for hadm_id, group_df in groups:
        futures.append(
            executor.submit(save_group, group_df[export_cols], hadm_id, event_type)
        )

    # Future.result() re-raises any exception raised inside save_group;
    # without it a failed write would go unnoticed.
    for future in tqdm(futures, total=len(futures)):
        future.result()

In [None]:
!ls -1 {output_path}/'event_{event_type}' | wc -l