covid-ehr-benchmarks / Git / [d6904d] /datasets/cdsl/preprocess.py

Models:
philipB/
covid-ehr-benchmarks
Downloads: 1
[d6904d]: / datasets / cdsl / preprocess.py
History
Download this file
1513 lines (1290 with data), 54.4 kB

# %% [markdown]
# # hm dataset pre-processing
# 
# import packages

# %%
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle as pkl
import torch
import math
import datetime
from tqdm import tqdm
import datetime
import re
from functools import reduce

# %% [markdown]
# ## Demographic data

# %%
demographic = pd.read_csv('./raw_data/19_04_2021/COVID_DSL_01.CSV', encoding='ISO-8859-1', sep='|')
print(len(demographic))
demographic.head()

# %%
med = pd.read_csv('./raw_data/19_04_2021/COVID_DSL_04.CSV', encoding='ISO-8859-1', sep='|')
print(len(med))
med.head()

# %%
len(med['ID_ATC7'].unique())

# %% [markdown]
# Drop patients with missing labels

# %%
print(len(demographic))
demographic = demographic.dropna(axis=0, how='any', subset=['IDINGRESO', 'F_INGRESO_ING', 'F_ALTA_ING', 'MOTIVO_ALTA_ING'])
print(len(demographic))

# %%
def outcome2num(x):
    """Encode a discharge reason as a binary label.

    Returns 1 when ``x`` equals 'Fallecimiento' and 0 for any other value.
    """
    return 1 if x == 'Fallecimiento' else 0

def to_one_hot(x, feature):
    """Return 1 when ``x`` equals ``feature`` and 0 otherwise."""
    matched = x == feature
    return 1 if matched else 0

# %%
# select necessary columns from demographic
demographic = demographic[
        [
            'IDINGRESO', 
            'EDAD',
            'SEX',
            'F_INGRESO_ING', 
            'F_ALTA_ING', 
            'MOTIVO_ALTA_ING', 
            'ESPECIALIDAD_URGENCIA', 
            'DIAG_URG'
        ]
    ]

# rename column
demographic = demographic.rename(columns={
    'IDINGRESO': 'PATIENT_ID',
    'EDAD': 'AGE',
    'SEX': 'SEX',
    'F_INGRESO_ING': 'ADMISSION_DATE',
    'F_ALTA_ING': 'DEPARTURE_DATE',
    'MOTIVO_ALTA_ING': 'OUTCOME',
    'ESPECIALIDAD_URGENCIA': 'DEPARTMENT_OF_EMERGENCY',
    'DIAG_URG': 'DIAGNOSIS_AT_EMERGENCY_VISIT'
})

# SEX: male: 1; female: 0
# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: the in-place form acts on an intermediate Series and is not
# guaranteed to update `demographic` (it never does under pandas Copy-on-Write).
demographic['SEX'] = demographic['SEX'].replace({'MALE': 1, 'FEMALE': 0})

# outcome: Fallecimiento(dead): 1; others: 0
demographic['OUTCOME'] = demographic['OUTCOME'].map(outcome2num)

# diagnosis at emergency visit (loss rate < 10%)
# demographic['DIFFICULTY_BREATHING'] = demographic['DIAGNOSIS_AT_EMERGENCY_VISIT'].map(lambda x: to_one_hot(x, 'DIFICULTAD RESPIRATORIA')) # 1674
# demographic['SUSPECT_COVID'] = demographic['DIAGNOSIS_AT_EMERGENCY_VISIT'].map(lambda x: to_one_hot(x, 'SOSPECHA COVID-19')) # 960
# demographic['FEVER'] = demographic['DIAGNOSIS_AT_EMERGENCY_VISIT'].map(lambda x: to_one_hot(x, 'FIEBRE')) # 455

# department of emergency (loss rate < 10%)
# demographic['EMERGENCY'] = demographic['DEPARTMENT_OF_EMERGENCY'].map(lambda x: to_one_hot(x, 'Medicina de Urgencias')) # 3914

# %%
# del useless data
demographic = demographic[
        [
            'PATIENT_ID',
            'AGE',
            'SEX',
            'ADMISSION_DATE',
            'DEPARTURE_DATE',
            'OUTCOME',
            # 'DIFFICULTY_BREATHING',
            # 'SUSPECT_COVID',
            # 'FEVER',
            # 'EMERGENCY'
        ]
    ]

# %%
demographic.describe().to_csv('demographic_overview.csv', mode='w', index=False)
demographic.describe()

# %% [markdown]
# ### Analyze data

# %%
plt.scatter(demographic['PATIENT_ID'], demographic['AGE'], s=1)
plt.xlabel('Patient Id')
plt.ylabel('Age')
plt.title('Patient-Age Scatter Plot')

# %%
plt.scatter(demographic['PATIENT_ID'], demographic['AGE'], s=1)
plt.xlabel('Patient Id')
plt.ylabel('Age')
plt.title('Patient-Age Scatter Plot')

# %%
demographic.to_csv('demographic.csv', mode='w', index=False)
demographic.head()

# %% [markdown]
# ## Vital Signs

# %%
vital_signs = pd.read_csv('./raw_data/19_04_2021/COVID_DSL_02.CSV', encoding='ISO-8859-1', sep='|')
print(len(vital_signs))
vital_signs.head()

# %%
vital_signs = vital_signs.rename(columns={
    'IDINGRESO': 'PATIENT_ID',
    'CONSTANTS_ING_DATE': 'RECORD_DATE',
    'CONSTANTS_ING_TIME': 'RECORD_TIME',
    'FC_HR_ING': 'HEART_RATE',
    'GLU_GLY_ING': 'BLOOD_GLUCOSE',
    'SAT_02_ING': 'OXYGEN_SATURATION',
    'TA_MAX_ING': 'MAX_BLOOD_PRESSURE',
    'TA_MIN_ING': 'MIN_BLOOD_PRESSURE',
    'TEMP_ING': 'TEMPERATURE'
})
vital_signs['RECORD_TIME'] = vital_signs['RECORD_DATE'] + ' ' + vital_signs['RECORD_TIME']
vital_signs['RECORD_TIME'] = vital_signs['RECORD_TIME'].map(lambda x: str(datetime.datetime.strptime(x, '%Y-%m-%d %H:%M')))
vital_signs = vital_signs.drop(['RECORD_DATE', 'SAT_02_ING_OBS', 'BLOOD_GLUCOSE'], axis=1)

# %%
vital_signs.describe()

# %%
vital_signs.head()

# %%
def format_temperature(x):
    """Convert a temperature reading to ``float``.

    String readings may use a comma as the decimal separator (e.g. '36,5');
    the comma is replaced by a dot before conversion.

    Args:
        x: Raw reading, either a string or a numeric value.

    Returns:
        The reading as a float (a NaN input stays NaN).

    Raises:
        ValueError: If a string reading is not numeric after the replacement.
    """
    # isinstance (rather than type(x) == str) also accepts str subclasses such
    # as numpy.str_, which would otherwise skip the comma handling.
    if isinstance(x, str):
        return float(x.replace(',', '.'))
    return float(x)

def format_oxygen(x):
    """Parse an oxygen-saturation reading, discarding values above 100.

    The input is converted with ``float``. Readings greater than 100 are
    returned as NaN; every other reading is returned as the converted float.
    """
    value = float(x)
    return np.nan if value > 100 else value

def format_heart_rate(x):
    """Convert a heart-rate reading to an integer, dropping implausible values.

    Args:
        x: Raw reading (a number or a numeric string), possibly missing.

    Returns:
        The reading truncated to ``int``, or NaN when the reading is missing
        (None/NaN) or greater than 220.
    """
    # int(NaN) raises ValueError and int(None) raises TypeError, so missing
    # readings are passed through as NaN before the conversion.
    if pd.isna(x):
        return np.nan
    rate = int(x)
    if rate > 220:
        return np.nan
    return rate

vital_signs['TEMPERATURE'] = vital_signs['TEMPERATURE'].map(lambda x: format_temperature(x))
vital_signs['OXYGEN_SATURATION'] = vital_signs['OXYGEN_SATURATION'].map(lambda x: format_oxygen(x))
vital_signs['HEART_RATE'] = vital_signs['HEART_RATE'].map(lambda x: format_heart_rate(x))

# %%
vital_signs = vital_signs.replace(0, np.NAN)

# %%
vital_signs = vital_signs.groupby(['PATIENT_ID', 'RECORD_TIME'], dropna=True, as_index = False).mean()
vital_signs.head()

# %%
vital_signs.describe()

# %%
vital_signs.describe().to_csv('vital_signs_overview.csv', index=False, mode='w')
vital_signs.describe()

# %%
"""
#plt.rcParams['figure.figsize'] = [10, 5]
fig=plt.figure(figsize=(16,10), dpi= 100, facecolor='w', edgecolor='k')

plt.subplot(2, 3, 1)
plt.scatter(vital_signs.index, vital_signs['MAX_BLOOD_PRESSURE'], s=1)
plt.xlabel('Index')
plt.ylabel('Max Blood Pressure')
plt.title('Visit-Max Blood Pressure Scatter Plot')

plt.subplot(2, 3, 2)
plt.scatter(vital_signs.index, vital_signs['MIN_BLOOD_PRESSURE'], s=1)
plt.xlabel('Index')
plt.ylabel('Min Blood Pressure')
plt.title('Visit-Min Blood Pressure Scatter Plot')

plt.subplot(2, 3, 3)
plt.scatter(vital_signs.index, vital_signs['TEMPERATURE'], s=1)
plt.xlabel('Index')
plt.ylabel('Temperature')
plt.title('Visit-Temperature Scatter Plot')

plt.subplot(2, 3, 4)
plt.scatter(vital_signs.index, vital_signs['HEART_RATE'], s=1)
plt.xlabel('Index')
plt.ylabel('Heart Rate')
plt.title('Visit-Heart Rate Scatter Plot')

plt.subplot(2, 3, 5)
plt.scatter(vital_signs.index, vital_signs['OXYGEN_SATURATION'], s=1)
plt.xlabel('Index')
plt.ylabel('Oxygen Saturation')
plt.title('Visit-Oxygen Saturation Scatter Plot')

plt.show()
"""
# %%
"""
#plt.rcParams['figure.figsize'] = [10, 5]
fig=plt.figure(figsize=(16,10), dpi= 100, facecolor='w', edgecolor='k')

plt.subplot(2, 3, 1)
plt.hist(vital_signs['MAX_BLOOD_PRESSURE'], bins=30)
plt.xlabel('Index')
plt.ylabel('Max Blood Pressure')
plt.title('Visit-Max Blood Pressure Histogram')

plt.subplot(2, 3, 2)
plt.hist(vital_signs['MIN_BLOOD_PRESSURE'], bins=30)
plt.xlabel('Index')
plt.ylabel('Min Blood Pressure')
plt.title('Visit-Min Blood Pressure Histogram')

plt.subplot(2, 3, 3)
plt.hist(vital_signs['TEMPERATURE'], bins=30)
plt.xlabel('Index')
plt.ylabel('Temperature')
plt.title('Visit-Temperature Histogram')

plt.subplot(2, 3, 4)
plt.hist(vital_signs['HEART_RATE'], bins=30)
plt.xlabel('Index')
plt.ylabel('Heart Rate')
plt.title('Visit-Heart Rate Histogram')

plt.subplot(2, 3, 5)
plt.hist(vital_signs['OXYGEN_SATURATION'], bins=30)
plt.xlabel('Index')
plt.ylabel('Oxygen Saturation')
plt.title('Visit-Oxygen Saturation Histogram')

plt.show()
"""
# %% [markdown]
# ### Missing rate of each visit

# %%
sum(vital_signs.T.isnull().sum()) / ((len(vital_signs.T) - 2) * len(vital_signs))

# %% [markdown]
# ### Normalize data

# %%
"""
for key in vital_signs.keys()[2:]:
    vital_signs[key] = (vital_signs[key] - vital_signs[key].mean()) / (vital_signs[key].std() + 1e-12)

vital_signs.describe()
"""

# %%
vital_signs.to_csv('visual_signs.csv', mode='w', index=False)

# %%
len(vital_signs) / len(vital_signs['PATIENT_ID'].unique())

# %% [markdown]
# ## Lab Tests

# %%
lab_tests = pd.read_csv('./raw_data/19_04_2021/COVID_DSL_06_v2.CSV', encoding='ISO-8859-1', sep=';')
lab_tests = lab_tests.rename(columns={'IDINGRESO': 'PATIENT_ID'})
print(len(lab_tests))

# del useless data
lab_tests = lab_tests[
        [
            'PATIENT_ID',
            'LAB_NUMBER',
            'LAB_DATE',
            'TIME_LAB',
            'ITEM_LAB',
            'VAL_RESULT'
            # UD_RESULT: unit
            # REF_VALUES: reference values
        ]
    ]

lab_tests.head()

# %%
lab_tests = lab_tests.groupby(['PATIENT_ID', 'LAB_NUMBER', 'LAB_DATE', 'TIME_LAB', 'ITEM_LAB'], dropna=True, as_index = False).first()
lab_tests = lab_tests.set_index(['PATIENT_ID', 'LAB_NUMBER', 'LAB_DATE', 'TIME_LAB', 'ITEM_LAB'], drop = True).unstack('ITEM_LAB')['VAL_RESULT'].reset_index()

lab_tests = lab_tests.drop([
    'CFLAG -- ALARMA HEMOGRAMA', 
    'CORONA -- PCR CORONAVIRUS 2019nCoV', 
    'CRIOGLO -- CRIOGLOBULINAS',
    'EGCOVID -- ESTUDIO GENETICO COVID-19',
    'FRO1 -- ',
    'FRO1 -- FROTIS EN SANGRE PERIFERICA',
    'FRO2 -- ',
    'FRO2 -- FROTIS EN SANGRE PERIFERICA',
    'FRO3 -- ',
    'FRO3 -- FROTIS EN SANGRE PERIFERICA',
    'FRO_COMEN -- ',
    'FRO_COMEN -- FROTIS EN SANGRE PERIFERICA',
    'G-CORONAV (RT-PCR) -- Tipo de muestra: ASPIRADO BRONCOALVEOLAR',
    'G-CORONAV (RT-PCR) -- Tipo de muestra: EXUDADO',
    'GRRH -- GRUPO SANGUÖNEO Y FACTOR Rh',
    'HEML -- RECUENTO CELULAR LIQUIDO',
    'HEML -- Recuento Hemat¡es',
    'IFSUERO -- INMUNOFIJACION EN SUERO',
    'OBS_BIOMOL -- OBSERVACIONES GENETICA MOLECULAR',
    'OBS_BIOO -- Observaciones Bioqu¡mica Orina',
    'OBS_CB -- Observaciones Coagulaci¢n',
    'OBS_GASES -- Observaciones Gasometr¡a Arterial',
    'OBS_GASV -- Observaciones Gasometr¡a Venosa',
    'OBS_GEN2 -- OBSERVACIONES GENETICA',
    'OBS_HOR -- Observaciones Hormonas',
    'OBS_MICRO -- Observaciones Microbiolog¡a',
    'OBS_NULA2 -- Observaciones Bioqu¡mica',
    'OBS_NULA3 -- Observaciones Hematolog¡a',
    'OBS_PESP -- Observaciones Pruebas especiales',
    'OBS_SERO -- Observaciones Serolog¡a',
    'OBS_SIS -- Observaciones Orina',
    'PCR VIRUS RESPIRATORIOS -- Tipo de muestra: ASPIRADO BRONCOALVEOLAR',
    'PCR VIRUS RESPIRATORIOS -- Tipo de muestra: BAS',
    'PCR VIRUS RESPIRATORIOS -- Tipo de muestra: ESPUTO',
    'PCR VIRUS RESPIRATORIOS -- Tipo de muestra: EXUDADO',
    'PCR VIRUS RESPIRATORIOS -- Tipo de muestra: LAVADO BRONCOALVEOLAR',
    'PCR VIRUS RESPIRATORIOS -- Tipo de muestra: LAVADO NASOFARÖNGEO',
    'PTGOR -- PROTEINOGRAMA ORINA',
    'RESUL_IFT -- ESTUDIO DE INMUNOFENOTIPO',
    'RESUL_IFT -- Resultado',
    'Resultado -- Resultado',
    'SED1 -- ',
    'SED1 -- SEDIMENTO',
    'SED2 -- ',
    'SED2 -- SEDIMENTO',
    'SED3 -- ',
    'SED3 -- SEDIMENTO',
    'TIPOL -- TIPO DE LIQUIDO',
    'Tecnica -- T\x82cnica',
    'TpMues -- Tipo de muestra',
    'VHCBLOT -- INMUNOBLOT VIRUS HEPATITIS C',
    'VIR_TM -- VIRUS TIPO DE MUESTRA',
    'LEGIORI -- AG. LEGIONELA PNEUMOPHILA EN ORINA',
    'NEUMOORI -- AG NEUMOCOCO EN ORINA',
    'VIHAC -- VIH AC'
    ], axis=1)

    
lab_tests.head()

# %%
lab_tests = lab_tests.replace('Sin resultado.', np.nan)
lab_tests = lab_tests.replace('Sin resultado', np.nan)
lab_tests = lab_tests.replace('----', np.nan).replace('---', np.nan)
lab_tests = lab_tests.replace('> ', '').replace('< ', '')

def change_format(x):
    """Normalise one raw lab-test cell towards a numeric value.

    The rules are tried in order and the first match wins, so a cell carrying
    both a prefix and a suffix (e.g. '> 100 mg/dl') needs a second pass; the
    caller applies this function twice for that reason.

    Args:
        x: Raw cell value (string, number, NaN or None).

    Returns:
        * NaN for None, for 'Indeterminado', and for text without a number.
        * Non-string inputs unchanged.
        * A numeric string with the wrapper removed for 'Negativo (..)',
          'Positivo (..)', 'Zona limite (..)', '>'/'<' prefixes and the
          ' mg/dl', '/æl', ' copias/mL' suffixes ('Negativo (v)' yields '-v').
        * 1.5 / 0.5 / 1 / 0 for the qualitative keywords handled below.
        * Otherwise the string itself when it is a plain number, else the
          first decimal number found inside the text.
    """
    if x is None:
        return np.nan
    # Already numeric (or NaN): nothing to clean.
    if not isinstance(x, str):
        return x

    # Wrapped values: strip the label and the closing parenthesis.
    if x.startswith('Negativo ('):
        return x.replace('Negativo (', '-')[:-1]
    if x.startswith('Positivo ('):
        return x.replace('Positivo (', '')[:-1]
    if x.startswith('Zona limite ('):
        return x.replace('Zona limite (', '')[:-1]

    # Censored values ('> 100', '<5'): keep the bound only.
    if x.startswith('>'):
        return x.replace('> ', '').replace('>', '')
    if x.startswith('<'):
        return x.replace('< ', '').replace('<', '')

    # Unit suffixes.
    if x.endswith(' mg/dl'):
        return x.replace(' mg/dl', '')
    if x.endswith('/æl'):
        return x.replace('/æl', '')
    if x.endswith(' copias/mL'):
        return x.replace(' copias/mL', '')

    # Qualitative results mapped onto a numeric scale.
    if x == 'Numerosos':
        return 1.5
    if x == 'Aislados':
        return 0.5
    if x in ('Se detecta', 'Se observan', 'Normal', 'Positivo'):
        return 1
    if x in ('No se detecta', 'No se observan', 'Negativo'):
        return 0
    if x == 'Indeterminado':
        return np.nan

    # A cell that is entirely a number is kept as-is. The decimal-only search
    # below would otherwise turn integer results such as '12' into NaN.
    if re.fullmatch(r"[-+]?\d+(?:\.\d+)?", x):
        return x

    # Free text: extract the first decimal number, if any.
    num = re.findall(r"[-+]?\d+\.\d+", x)
    if len(num) == 0:
        return np.nan
    return num[0]

feature_value_dict = dict()

for k in tqdm(lab_tests.keys()[4:]):
    lab_tests[k] = lab_tests[k].map(lambda x: change_format(change_format(x)))
    feature_value_dict[k] = lab_tests[k].unique()

# %%
def nan_and_not_nan(x):
    """Flag whether a value is present: 1 for anything but NaN, 0 for NaN.

    Relies on NaN not comparing equal to itself.
    """
    is_present = x == x
    return 1 if is_present else 0

def is_float(num):
    """Return True when ``num`` can be converted with ``float()``.

    Args:
        num: Any value.

    Returns:
        True if ``float(num)`` succeeds, False otherwise.
    """
    try:
        float(num)
    # float() raises TypeError (not ValueError) for None and other
    # unsupported types; both mean "not convertible".
    except (TypeError, ValueError):
        return False
    return True

def is_all_float(x):
    """Return True when every present value in ``x`` is float-convertible.

    NaN (detected as a value that does not equal itself) and None entries are
    skipped; the remaining entries are checked with ``is_float``. An iterable
    with no present values yields True.
    """
    present = (value for value in x if value == value and value is not None)
    return all(is_float(value) for value in present)

def to_float(x):
    """Convert ``x`` with ``float()``, mapping None to NaN."""
    if x is None:
        return np.nan
    return float(x)

other_feature_dict = dict()

for feature in tqdm(feature_value_dict.keys()):
    values = feature_value_dict[feature]
    if is_all_float(values):
        lab_tests[feature] = lab_tests[feature].map(lambda x: to_float(x))
    elif len(values) == 2:
        lab_tests[feature] = lab_tests[feature].map(lambda x: nan_and_not_nan(x))
    else:
        other_feature_dict[feature] = values

# %%
other_feature_dict

# %%
def format_time(t):
    """Normalise a day-first timestamp string to 'YYYY-MM-DD HH:MM:SS'.

    Accepts 'dd/mm/YYYY HH:MM' when the text contains a slash and
    'dd-mm-YYYY HH:MM' otherwise.
    """
    pattern = '%d/%m/%Y %H:%M' if '/' in t else '%d-%m-%Y %H:%M'
    parsed = datetime.datetime.strptime(t, pattern)
    return str(parsed)

lab_tests['RECORD_TIME'] = lab_tests['LAB_DATE'] + ' ' + lab_tests['TIME_LAB']
lab_tests['RECORD_TIME'] = lab_tests['RECORD_TIME'].map(lambda x: format_time(x))
lab_tests = lab_tests.drop(['LAB_NUMBER', 'LAB_DATE', 'TIME_LAB'], axis=1)
# lab_tests = lab_tests.drop(['LAB_NUMBER', 'TIME_LAB'], axis=1)
lab_tests.head()

# %%
# Per-patient mean of every lab feature. numeric_only=True makes the exclusion
# of text columns (e.g. RECORD_TIME) explicit; current pandas raises a
# TypeError for them instead of silently dropping them.
lab_tests_patient = lab_tests.groupby(['PATIENT_ID'], dropna=True, as_index=False).mean(numeric_only=True)
print(len(lab_tests_patient))
# Number of patients that have a value for each feature; position 0 is
# PATIENT_ID and is skipped.
count = lab_tests_patient.count().iloc[1:].tolist()
plt.hist(count)

# %%
patient_total = len(lab_tests_patient)
threshold = patient_total * 0.1
reserved_keys = []

for key in lab_tests_patient.keys():
    if lab_tests_patient[key].count() > threshold:
        reserved_keys.append(key)

print(len(reserved_keys))
reserved_keys

# %%
# RECORD_TIME is a grouping key, not a feature, so it is added to the kept
# columns by hand right after PATIENT_ID.
reserved_keys.insert(1, 'RECORD_TIME')

# Average results that share the same patient and timestamp. numeric_only=True
# keeps text-valued columns out of the mean explicitly; current pandas raises
# a TypeError for them instead of dropping them silently.
lab_tests = lab_tests.groupby(['PATIENT_ID', 'RECORD_TIME'], dropna=True, as_index=False).mean(numeric_only=True)

lab_tests = lab_tests[reserved_keys]
lab_tests.head()
"""
# %% [markdown]
# ### Missing rate of each visit

# %%
sum(lab_tests.T.isnull().sum()) / ((len(lab_tests.T) - 2) * len(lab_tests))

# %% [markdown]
# ### Scatter Plot

# %%
fig=plt.figure(figsize=(16,200), dpi= 100, facecolor='w', edgecolor='k')

i = 1
for key in lab_tests.keys()[2:]:
    plt.subplot(33, 3, i)
    plt.scatter(lab_tests.index, lab_tests[key], s=1)
    plt.ylabel(key)
    i += 1

plt.show()

# %%
fig=plt.figure(figsize=(20,120), dpi= 100, facecolor='w', edgecolor='k')

i = 1
for key in lab_tests.keys()[2:]:
    plt.subplot(23, 4, i)
    plt.hist(lab_tests[key], bins=30)
    q3 = lab_tests[key].quantile(0.75)
    q1 = lab_tests[key].quantile(0.25)
    qh = q3 + 3 * (q3 - q1)
    ql = q1 - 3 * (q3 - q1)
    sigma = 5
    plt.axline([sigma*lab_tests[key].std() + lab_tests[key].mean(), 0], [sigma*lab_tests[key].std() + lab_tests[key].mean(), 1], color = "r", linestyle=(0, (5, 5)))
    plt.axline([-sigma*lab_tests[key].std() + lab_tests[key].mean(), 0], [-sigma*lab_tests[key].std() + lab_tests[key].mean(), 1], color = "r", linestyle=(0, (5, 5)))
    #plt.axline([lab_tests[key].quantile(0.25), 0], [lab_tests[key].quantile(0.25), 1], color = "k", linestyle=(0, (5, 5)))
    #plt.axline([lab_tests[key].quantile(0.75), 0], [lab_tests[key].quantile(0.75), 1], color = "k", linestyle=(0, (5, 5)))
    plt.axline([qh, 0], [qh, 1], color='k', linestyle=(0, (5, 5)))
    plt.axline([ql, 0], [ql, 1], color='k', linestyle=(0, (5, 5)))
    plt.ylabel(key)
    i += 1

plt.show()
"""
# %% [markdown]
# ### Normalize data

# %%
"""
for key in lab_tests.keys()[2:]:
    lab_tests[key] = (lab_tests[key] - lab_tests[key].mean()) / (lab_tests[key].std() + 1e-12)

lab_tests.describe()
"""

# %%
# 【del normalization】
# for key in lab_tests.keys()[2:]:
#     r = lab_tests[lab_tests[key].between(lab_tests[key].quantile(0.05), lab_tests[key].quantile(0.95))]
#     lab_tests[key] = (lab_tests[key] - r[key].mean()) / (r[key].std() + 1e-12)

# %%
lab_tests.to_csv('lab_test.csv', mode='w', index=False)

# %% [markdown]
# # Concat data

# %%
demographic['PATIENT_ID'] = demographic['PATIENT_ID'].map(lambda x: str(int(x)))
vital_signs['PATIENT_ID'] = vital_signs['PATIENT_ID'].map(lambda x: str(int(x)))
lab_tests['PATIENT_ID'] = lab_tests['PATIENT_ID'].map(lambda x: str(int(x)))

# %%
len(demographic['PATIENT_ID'].unique()), len(vital_signs['PATIENT_ID'].unique()), len(lab_tests['PATIENT_ID'].unique())

# %%
train_df = pd.merge(vital_signs, lab_tests, on=['PATIENT_ID', 'RECORD_TIME'], how='outer')

train_df = train_df.groupby(['PATIENT_ID', 'RECORD_TIME'], dropna=True, as_index = False).mean()

train_df = pd.merge(demographic, train_df, on=['PATIENT_ID'], how='left')

train_df.head()

# %%
# drop rows that are missing patient_id, admission_date, record_time, or outcome
train_df = train_df.dropna(axis=0, how='any', subset=['PATIENT_ID', 'ADMISSION_DATE', 'RECORD_TIME', 'OUTCOME'])

# %%
train_df.to_csv('train.csv', mode='w', index=False)
train_df.describe()

# %% [markdown]
# ## Missing rate of each visit

# %%
sum(train_df.T.isnull().sum()) / ((len(train_df.T) - 2) * len(train_df))

# %% [markdown]
# # Split and save data

# %% [markdown]
# * demo: demographic data
# * x: lab test & vital signs
# * y: outcome & length of stay

# %%
patient_ids = train_df['PATIENT_ID'].unique()

demo_cols = ['AGE', 'SEX'] # , 'DIFFICULTY_BREATHING', 'FEVER', 'SUSPECT_COVID', 'EMERGENCY'
test_cols = []

# get column names
for k in train_df.keys():
    if not k in demographic.keys():
        if not k == 'RECORD_TIME':
            test_cols.append(k)

test_median = train_df[test_cols].median()

# %%
test_cols

# %%
train_df['RECORD_TIME_DAY'] = train_df['RECORD_TIME'].map(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%d'))
train_df['RECORD_TIME_HOUR'] = train_df['RECORD_TIME'].map(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%d %H'))
train_df.head()

# %%
# Merge the records of each patient per calendar day and per hour by averaging.
# numeric_only=True is required because train_df still carries the string
# columns RECORD_TIME / RECORD_TIME_DAY / RECORD_TIME_HOUR; current pandas
# raises a TypeError on them instead of dropping them silently.
_stay_keys = ['PATIENT_ID', 'ADMISSION_DATE', 'DEPARTURE_DATE']
train_df_day = train_df.groupby(_stay_keys + ['RECORD_TIME_DAY'], dropna=True, as_index=False).mean(numeric_only=True)
train_df_hour = train_df.groupby(_stay_keys + ['RECORD_TIME_HOUR'], dropna=True, as_index=False).mean(numeric_only=True)

len(train_df), len(train_df_day), len(train_df_hour)

# %% [markdown]
# 
# ```
# number of visits (total)
# - Original data: 168777
# - Merge by hour: 130141
# - Merge by day:  42204
# ```

# %%
len(train_df['PATIENT_ID'].unique())

# %%
def get_visit_intervals(df):
    """Return the number of rows (visits) recorded for each patient.

    Args:
        df: DataFrame with a 'PATIENT_ID' column.

    Returns:
        A list of row counts, one per patient, ordered by each patient's first
        appearance in ``df``. Rows whose PATIENT_ID is missing are not counted.
    """
    # A single grouped count replaces one full-frame boolean filter per
    # patient; sort=False keeps first-appearance order.
    return df.groupby('PATIENT_ID', sort=False).size().tolist()

# %%
ls_org = get_visit_intervals(train_df)
ls_hour = get_visit_intervals(train_df_hour)
ls_day = get_visit_intervals(train_df_day)

# %%
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import matplotlib.font_manager as font_manager
import pandas as pd
import numpy as np
"""
csfont = {'fontname':'Times New Roman', 'fontsize': 18}
font = 'Times New Roman'
fig=plt.figure(figsize=(18,4), dpi= 100, facecolor='w', edgecolor='k')
plt.style.use('seaborn-whitegrid')
color = 'cornflowerblue'
ec = 'None'
alpha=0.5

ax = plt.subplot(1, 3, 1)
ax.hist(ls_org, bins=20, weights=np.ones(len(ls_org)) / len(ls_org), color=color, ec=ec, alpha=alpha, label='overall')
plt.xlabel('Num of visits (org)',**csfont)
plt.ylabel('Percentage',**csfont)
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
plt.xticks(**csfont)
plt.yticks(**csfont)

ax = plt.subplot(1, 3, 2)
ax.hist(ls_hour, bins=20, weights=np.ones(len(ls_hour)) / len(ls_hour), color=color, ec=ec, alpha=alpha, label='overall')
plt.xlabel('Num of visits (hour)',**csfont)
plt.ylabel('Percentage',**csfont)
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
plt.xticks(**csfont)
plt.yticks(**csfont)

ax = plt.subplot(1, 3, 3)
ax.hist(ls_day, bins=20, weights=np.ones(len(ls_day)) / len(ls_day), color=color, ec=ec, alpha=alpha, label='overall')
plt.xlabel('Num of visits (day)',**csfont)
plt.ylabel('Percentage',**csfont)
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
plt.xticks(**csfont)
plt.yticks(**csfont)

plt.show()
"""
# %%
def get_statistic(lst, name):
    """Print summary statistics of ``lst`` on a single line.

    The line starts with ``[name]`` and reports the maximum, minimum, median,
    mean and the 80th, 90th and 95th percentiles. Nothing is returned.
    """
    q80, q90, q95 = (np.quantile(lst, q) for q in (0.8, 0.9, 0.95))
    summary = (
        f'[{name}]\tMax:\t{max(lst)}, Min:\t{min(lst)}, '
        f'Median:\t{np.median(lst)}, Mean:\t{np.mean(lst)}, '
        f'80%:\t{q80}, 90%:\t{q90}, 95%:\t{q95}'
    )
    print(summary)

# %%
get_statistic(ls_org, 'ls_org')
get_statistic(ls_hour, 'ls_hour')
get_statistic(ls_day, 'ls_day')

# %%
# %%
# reset_index() is kept on purpose: it adds an 'index' column in front of the
# frame, and later code slices columns by position.
train_df_hour = train_df_hour.reset_index()

# %%
# Both targets are expressed in (fractional) days:
#   LOS      : departure - admission      (whole stay)
#   LOS_HOUR : departure - current record (time left at this record)
# Computed column-wise so the columns are float64 from the start, rather than
# being seeded with date strings and overwritten one row at a time.
_one_day = pd.Timedelta(days=1)
_admission_ts = pd.to_datetime(train_df_hour['ADMISSION_DATE'], format='%Y-%m-%d %H:%M:%S')
_departure_ts = pd.to_datetime(train_df_hour['DEPARTURE_DATE'], format='%Y-%m-%d %H:%M:%S')
_record_ts = pd.to_datetime(train_df_hour['RECORD_TIME_HOUR'], format='%Y-%m-%d %H')
train_df_hour['LOS'] = (_departure_ts - _admission_ts) / _one_day
train_df_hour['LOS_HOUR'] = (_departure_ts - _record_ts) / _one_day

# %%
train_df_hour['LOS']

# %%
# One LOS value per patient, taken from the patient's first row and listed in
# order of first appearance. drop_duplicates avoids filtering the whole frame
# once per patient and calling float() on a one-element Series (deprecated).
los = [float(v) for v in train_df_hour.drop_duplicates(subset='PATIENT_ID')['LOS']]

# %%
get_statistic(los, 'los')

# %%
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import matplotlib.font_manager as font_manager
import pandas as pd
import numpy as np
"""
csfont = {'fontname':'Times New Roman', 'fontsize': 18}
font = 'Times New Roman'
fig=plt.figure(figsize=(6,6), dpi= 100, facecolor='w', edgecolor='k')
plt.style.use('seaborn-whitegrid')
color = 'cornflowerblue'
ec = 'None'
alpha=0.5

ax = plt.subplot(1, 1, 1)
ax.hist(los, bins=20, weights=np.ones(len(los)) / len(los), color=color, ec=ec, alpha=alpha, label='overall')
plt.xlabel('Length of stay',**csfont)
plt.ylabel('Percentage',**csfont)
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
plt.xticks(**csfont)
plt.yticks(**csfont)

plt.show()
"""
# %%
train_df_hour_idx = train_df_hour.reset_index()

# %%
train_df_hour_idx['LOS'] = train_df_hour_idx['ADMISSION_DATE']

for idx in tqdm(range(len(train_df_hour_idx))):
    info = train_df_hour_idx.loc[idx]
    # admission = datetime.datetime.strptime(info['ADMISSION_DATE'], '%Y-%m-%d %H:%M:%S')
    departure = datetime.datetime.strptime(info['DEPARTURE_DATE'], '%Y-%m-%d %H:%M:%S')
    visit_hour = datetime.datetime.strptime(info['RECORD_TIME_HOUR'], '%Y-%m-%d %H')
    hour = (departure - visit_hour).seconds / (24 * 60 * 60) + (departure - visit_hour).days
    train_df_hour_idx.at[idx, 'LOS'] = float(hour)

# %%
# From here on LOS holds the per-record value previously stored in LOS_HOUR.
train_df_hour['LOS'] = train_df_hour['LOS_HOUR']
# drop() returns a new frame; without the assignment LOS_HOUR was never
# removed and leaked into the later column-based statistics.
train_df_hour = train_df_hour.drop(columns=['LOS_HOUR'])

# %%
# los_threshold = 13.0

# visit_num_hour = []

# for pat in tqdm(train_df_hour_idx['PATIENT_ID'].unique()):
#     pat_records = train_df_hour_idx[train_df_hour_idx['PATIENT_ID'] == pat]
#     hour = 0
#     for vis in pat_records.index:
#         pat_visit = pat_records.loc[vis]
#         if pat_visit['LOS_HOUR'] <= los_threshold:
#             hour += 1
#     visit_num_hour.append(hour)
#     if hour == 0:
#         print(pat)

# %%
# import matplotlib.pyplot as plt
# from matplotlib.ticker import PercentFormatter
# import matplotlib.font_manager as font_manager
# import pandas as pd
# import numpy as np
# csfont = {'fontname':'Times New Roman', 'fontsize': 18}
# font = 'Times New Roman'
# fig=plt.figure(figsize=(6,6), dpi= 100, facecolor='w', edgecolor='k')
# plt.style.use('seaborn-whitegrid')
# color = 'cornflowerblue'
# ec = 'None'
# alpha=0.5

# ax = plt.subplot(1, 1, 1)
# ax.hist(visit_num_hour, bins=20, weights=np.ones(len(visit_num_hour)) / len(visit_num_hour), color=color, ec=ec, alpha=alpha, label='overall')
# plt.xlabel('Visit num (80% los)',**csfont)
# plt.ylabel('Percentage',**csfont)
# plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
# plt.xticks(**csfont)
# plt.yticks(**csfont)

# plt.show()

# %%
train_df = train_df_hour
train_df.head()

# %%
train_df.describe()

# %%
get_statistic(train_df['LOS'], 'los')

# %%
train_df[train_df['PATIENT_ID'] == '1']['HEART_RATE'].count()

# %%
# Feature columns: everything after the first five columns of train_df.
cols = train_df.columns[5:]
pats = train_df['PATIENT_ID'].unique()
all_pat_cnt = len(pats)

# missing_rate[col] = share of patients that have no non-null value of `col`
# in any of their rows. A single grouped count replaces filtering the whole
# frame once per patient.
_per_patient_counts = train_df.groupby('PATIENT_ID', sort=False)[list(cols)].count()
missing_rate = {
    col: int((_per_patient_counts[col] == 0).sum()) / all_pat_cnt
    for col in cols
}

missing_rate

# %%
with open('missing_rate.csv', mode='w', encoding='utf-8') as file:
    for col in cols:
        file.write(f'"{col}", {100 * missing_rate[col]}\n')

# %%
train_df['LOS'] = train_df['LOS'].clip(lower=0)

# %%
get_statistic(train_df['LOS'], 'los')

# %%
# the first visit of each person
def init_prev(prev):
    """Fill the gaps of a patient's first record and report where they were.

    Every NaN entry of ``prev`` is overwritten in place with the entry of the
    module-level ``test_median`` at the same position.

    Returns:
        A list with one flag per entry: 1 where the value was missing and has
        been replaced, 0 where a value was already present.
    """
    flags = []
    for pos in range(len(prev)):
        absent = np.isnan(prev[pos])
        if absent:
            # no value recorded: fall back to the median
            prev[pos] = test_median[pos]
        flags.append(1 if absent else 0)
    return flags

# the rest of the visits
def fill_nan(cur, prev):
    """Carry values forward from ``prev`` into the gaps of ``cur``, in place.

    Every NaN entry of ``cur`` is overwritten with the entry of ``prev`` at
    the same position.

    Returns:
        A list with one flag per entry of ``prev``: 1 where ``cur`` was
        missing and has been filled, 0 where ``cur`` already had a value.
    """
    flags = []
    for pos in range(len(prev)):
        absent = np.isnan(cur[pos])
        if absent:
            # nothing recorded at this timestep: reuse the previous value
            cur[pos] = prev[pos]
        flags.append(1 if absent else 0)
    return flags

# %%
x, y, demo, x_lab_len, missing_mask = [], [], [], [], []

# Build, for every patient, the per-visit feature rows (x), the labels (y),
# the demographic vector (demo), the number of visits (x_lab_len) and the
# per-visit missing-value flags (missing_mask).
for pat in tqdm(patient_ids): # for all patients
    # get visits for pat.id == PATIENT_ID
    info = train_df[train_df['PATIENT_ID'] == pat]
    # keep at most the last 76 rows of this patient
    info = info[max(0, len(info) - 76):]
    indexes = info.index
    # first of the retained rows
    # NOTE(review): indexes[0] raises if a patient has no rows in train_df;
    # patient_ids is computed earlier from a previous version of train_df --
    # confirm every id is still present.
    visit = info.loc[indexes[0]] # get the first visit

    # demographic data
    demo.append([visit[k] for k in demo_cols])
    
    # label: outcome is read once, from the first retained row
    outcome = visit['OUTCOME']
    los = []

    # lab test & vital signs
    tests = []
    prev = visit[test_cols]
    miss = [] # missing matrix
    # init_prev fills prev in place and returns the flags of the first row
    miss.append(init_prev(prev)) # fill nan for the first visit for every patient and add missing status to missing matrix
    # leave = datetime.datetime.strptime(visit['DEPARTURE_DATE'], '%Y-%m-%d %H:%M:%S')
    
    first = True
    for i in indexes:
        visit = info.loc[i]
        # now = datetime.datetime.strptime(visit['RECORD_TIME'], '%Y-%m-%d %H')
        cur = visit[test_cols]
        # fill_nan fills cur in place from the previous (already filled) row
        tmp = fill_nan(cur, prev) # fill nan for the rest of the visits
        # the flags of the first row were already recorded by init_prev above,
        # so tmp is discarded on the first iteration
        if not first:
            miss.append(tmp) # add missing status to missing matrix
        tests.append(cur)
        # los_visit = (leave - now).days
        # if los_visit < 0:
        #     los_visit = 0
        los.append(visit['LOS'])
        # the filled row becomes the fill source for the next row
        prev = cur
        first = False

    valid_visit = len(los)
    # outcome = [outcome] * valid_visit
    x_lab_len.append(valid_visit)
    missing_mask.append(miss) # append the patient's missing matrix to the total missing matrix

    # tests = np.pad(tests, ((0, max_visit - valid_visit), (0, 0)))
    # outcome = np.pad(outcome, (0, max_visit - valid_visit))
    # los = np.pad(los, (0, max_visit - valid_visit))
    
    # y holds one [outcome, list of per-visit LOS values] pair per patient
    y.append([outcome, los])
    x.append(tests)

# %%
all_x = x
all_x_demo = demo
all_y = y
all_missing_mask = missing_mask

# %%
all_x_labtest = np.array(all_x, dtype=object)
x_lab_length = [len(_) for _ in all_x_labtest]
x_lab_length = torch.tensor(x_lab_length, dtype=torch.int)
max_length = int(x_lab_length.max())
all_x_labtest = [torch.tensor(_) for _ in all_x_labtest]
all_x_labtest = torch.nn.utils.rnn.pad_sequence((all_x_labtest), batch_first=True)
all_x_demographic = torch.tensor(all_x_demo)
batch_size, demo_dim = all_x_demographic.shape
all_x_demographic = torch.reshape(all_x_demographic.repeat(1, max_length), (batch_size, max_length, demo_dim))
all_x = torch.cat((all_x_demographic, all_x_labtest), 2)

all_y = np.array(all_y, dtype=object)
patient_list = []
for pat in all_y:
    visits = []
    for i in pat[1]:
        visits.append([pat[0], i])
    patient_list.append(visits)
new_all_y = np.array(patient_list, dtype=object)
output_all_y = [torch.Tensor(_) for _ in new_all_y]
output_all_y = torch.nn.utils.rnn.pad_sequence((output_all_y), batch_first=True)

# %%
all_missing_mask = np.array(all_missing_mask, dtype=object)
all_missing_mask = [torch.tensor(_) for _ in all_missing_mask]
all_missing_mask = torch.nn.utils.rnn.pad_sequence((all_missing_mask), batch_first=True)

# %%
all_x.shape

# %%
all_missing_mask.shape

# %%
# save pickle format dataset (torch)
# Create the output directory first: to_pickle does not create missing folders.
os.makedirs('./processed_data', exist_ok=True)
pd.to_pickle(all_x, './processed_data/x.pkl')
pd.to_pickle(all_missing_mask, './processed_data/missing_mask.pkl')
pd.to_pickle(output_all_y, './processed_data/y.pkl')
pd.to_pickle(x_lab_length, './processed_data/visits_length.pkl')

# %%
# Calculate patients' outcome statistics (patients-wise)
outcome_list = []
y_outcome = output_all_y[:, :, 0]
indices = torch.arange(len(x_lab_length), dtype=torch.int64)
for i in indices:
    outcome_list.append(y_outcome[i][0].item())
outcome_list = np.array(outcome_list)
print(len(outcome_list))
unique, count=np.unique(outcome_list,return_counts=True)
data_count=dict(zip(unique,count))
print(data_count)

# %%
# Calculate patients' outcome statistics (records-wise)
outcome_records_list = []
y_outcome = output_all_y[:, :, 0]
indices = torch.arange(len(x_lab_length), dtype=torch.int64)
for i in indices:
    outcome_records_list.extend(y_outcome[i][0:x_lab_length[i]].tolist())
outcome_records_list = np.array(outcome_records_list)
print(len(outcome_records_list))
unique, count=np.unique(outcome_records_list,return_counts=True)
data_count=dict(zip(unique,count))
print(data_count)

# %%
# Calculate patients' mean los and 95% percentile los
los_list = []
y_los = output_all_y[:, :, 1]
indices = torch.arange(len(x_lab_length), dtype=torch.int64)
for i in indices:
    # los_list.extend(y_los[i][: x_lab_length[i].long()].tolist())
    los_list.append(y_los[i][0].item())
los_list = np.array(los_list)
print(los_list.mean() * 0.5)
print(np.median(los_list) * 0.5)
print(np.percentile(los_list, 95))

print('median:', np.median(los_list))
print('Q1:', np.percentile(los_list, 25))
print('Q3:', np.percentile(los_list, 75))

# %%
los_alive_list = np.array([los_list[i] for i in range(len(los_list)) if outcome_list[i] == 0])
los_dead_list = np.array([los_list[i] for i in range(len(los_list)) if outcome_list[i] == 1])
print(len(los_alive_list))
print(len(los_dead_list))

print('[Alive]')
print('median:', np.median(los_alive_list))
print('Q1:', np.percentile(los_alive_list, 25))
print('Q3:', np.percentile(los_alive_list, 75))

print('[Dead]')
print('median:', np.median(los_dead_list))
print('Q1:', np.percentile(los_dead_list, 25))
print('Q3:', np.percentile(los_dead_list, 75))

# %%
cdsl_los_statistics = {
    'overall': los_list,
    'alive': los_alive_list,
    'dead': los_dead_list
}
pd.to_pickle(cdsl_los_statistics, 'cdsl_los_statistics.pkl')

# %%
# calculate visits length Median [Q1, Q3]
visits_list = np.array(x_lab_length)
visits_alive_list = np.array([x_lab_length[i] for i in range(len(x_lab_length)) if outcome_list[i] == 0])
visits_dead_list = np.array([x_lab_length[i] for i in range(len(x_lab_length)) if outcome_list[i] == 1])
print(len(visits_alive_list))
print(len(visits_dead_list))

print('[Total]')
print('median:', np.median(visits_list))
print('Q1:', np.percentile(visits_list, 25))
print('Q3:', np.percentile(visits_list, 75))

print('[Alive]')
print('median:', np.median(visits_alive_list))
print('Q1:', np.percentile(visits_alive_list, 25))
print('Q3:', np.percentile(visits_alive_list, 75))

print('[Dead]')
print('median:', np.median(visits_dead_list))
print('Q1:', np.percentile(visits_dead_list, 25))
print('Q3:', np.percentile(visits_dead_list, 75))

# %%
def check_nan(x):
    """Print whether ``x`` contains at least one NaN element.

    Args:
        x: Object exposing ``.cpu().numpy()`` (e.g. a ``torch.Tensor``).

    Returns:
        None. The verdict is written to stdout.

    The test is element-wise. Summing first and testing the sum would
    report a false positive whenever ``+inf`` and ``-inf`` are both
    present, because ``inf + (-inf)`` is NaN even though no element is.
    """
    if np.isnan(x.cpu().numpy()).any():
        print("some values from input are nan")
    else:
        print("no nan")

# %%
# Print the NaN report for all_x (all_x is defined outside this section).
check_nan(all_x)

# %% [markdown]
# # Draw Charts

# %% [markdown]
# ## Import packages

# %%
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import matplotlib.font_manager as font_manager
import pandas as pd
import numpy as np

# Matplotlib 3.6 renamed its bundled seaborn style sheets to
# 'seaborn-v0_8-<style>' and later releases dropped the old names. Prefer the
# new name when it is installed and fall back to the legacy one otherwise, so
# the script works on both old and current Matplotlib versions.
whitegrid_style = 'seaborn-v0_8-whitegrid'
if whitegrid_style not in plt.style.available:
    whitegrid_style = 'seaborn-whitegrid'
plt.style.use(whitegrid_style)

# Shared histogram styling used by the plotting code further down.
color = 'cornflowerblue'    # fill colour of the "overall" histogram
ec = 'None'                 # edge colour of the "overall" bars (the string 'None')
alpha = 0.5                 # opacity of the "overall" bars
alive_color = 'olivedrab'   # outline colour for the alive cohort
dead_color = 'orchid'       # outline colour for the dead cohort

# %% [markdown]
# ## Read data

# %%
# Preview the first rows of the demographic table. The value of this bare
# expression is only displayed when the cell runs in a notebook.
demographic.head()

# %%
# Load the training split and cast the join key to str on both tables so
# the merge below compares values of the same type.
train = pd.read_csv('./train.csv')
train['PATIENT_ID'] = train['PATIENT_ID'].astype(str)
demographic['PATIENT_ID'] = demographic['PATIENT_ID'].astype(str)

# One row per distinct patient found in train.csv.
pat = pd.DataFrame({'PATIENT_ID': train['PATIENT_ID'].unique()})
# Keep only the demographic rows whose patient appears in the training split.
demo = demographic.merge(pat, how='inner', on='PATIENT_ID')

# Cohorts by OUTCOME flag; demo_overall is the same object as demo, not a copy.
demo_alive = demo[demo['OUTCOME'] == 0]
demo_dead = demo[demo['OUTCOME'] == 1]
demo_overall = demo

# %%
# Write each cohort table to a CSV file in the working directory, leaving
# out the index column.
for cohort_frame, csv_name in (
    (demo, 'demo_overall.csv'),
    (demo_alive, 'demo_alive.csv'),
    (demo_dead, 'demo_dead.csv'),
):
    cohort_frame.to_csv(csv_name, index=False)

# %%
def _rows_for_cohort(table, cohort):
    """Return the cohort's unique-ID frame and the rows of ``table`` matching it.

    The IDs come from ``cohort['PATIENT_ID'].unique()``; matching rows are
    selected with an inner merge on ``PATIENT_ID``.
    """
    ids = pd.DataFrame({"PATIENT_ID": cohort['PATIENT_ID'].unique()})
    return ids, pd.merge(table, ids, how='inner', on='PATIENT_ID')


# For each cohort, restrict lab_tests to its patients and print how many
# distinct patients remain. `patient` ends up holding the overall ID frame.
patient, lab_tests_alive = _rows_for_cohort(lab_tests, demo_alive)
print(len(lab_tests_alive['PATIENT_ID'].unique()))

patient, lab_tests_dead = _rows_for_cohort(lab_tests, demo_dead)
print(len(lab_tests_dead['PATIENT_ID'].unique()))

patient, lab_tests_overall = _rows_for_cohort(lab_tests, demo_overall)
print(len(lab_tests_overall['PATIENT_ID'].unique()))

# %%
# Vital-sign rows of the alive cohort: inner-join on the cohort's unique IDs.
patient = pd.DataFrame({'PATIENT_ID': demo_alive['PATIENT_ID'].unique()})
vital_signs_alive = vital_signs.merge(patient, on='PATIENT_ID', how='inner')
# Distinct patients left after the join (displayed by the notebook).
len(vital_signs_alive['PATIENT_ID'].unique())

# %%
# Vital-sign rows of the dead cohort: inner-join on the cohort's unique IDs.
patient = pd.DataFrame({'PATIENT_ID': demo_dead['PATIENT_ID'].unique()})
vital_signs_dead = vital_signs.merge(patient, on='PATIENT_ID', how='inner')
# Distinct patients left after the join (displayed by the notebook).
len(vital_signs_dead['PATIENT_ID'].unique())

# %%
# Vital-sign rows of the whole cohort: inner-join on its unique IDs.
patient = pd.DataFrame({'PATIENT_ID': demo_overall['PATIENT_ID'].unique()})
vital_signs_overall = vital_signs.merge(patient, on='PATIENT_ID', how='inner')
# Distinct patients left after the join (displayed by the notebook).
len(vital_signs_overall['PATIENT_ID'].unique())

# %%
def plot_feature_histograms(limit=0.05, bins=20):
    """Draw a 4x4 grid of cohort-normalised histograms, one panel per feature.

    This code used to live inside a bare string literal, written out once
    per feature. It is now a function that is NOT called automatically, so
    running the script still draws nothing; call it explicitly to get the
    figure.

    For every feature, the ``limit`` and ``1 - limit`` quantiles of the
    overall cohort define a window, and only rows inside that window are
    plotted for all three cohorts. Each histogram is weighted by
    ``1 / len(cohort rows in window)`` so its bars sum to 1 and can be shown
    as percentages.

    Relies on module-level names prepared earlier in this file: the cohort
    tables ``demo_*``, ``vital_signs_*`` and ``lab_tests_*`` (``overall`` /
    ``alive`` / ``dead``) and the style constants ``color``, ``ec``,
    ``alpha``, ``alive_color`` and ``dead_color``.

    Args:
        limit: Tail fraction trimmed on each side (0.05 keeps the 5%-95%
            range of the overall cohort).
        bins: Number of histogram bins per panel.

    Returns:
        The Matplotlib figure that was drawn and shown.
    """
    font_family = 'Times New Roman'
    csfont = {'fontname': font_family, 'fontsize': 18}

    demo_frames = (demo_overall, demo_alive, demo_dead)
    vital_frames = (vital_signs_overall, vital_signs_alive, vital_signs_dead)
    lab_frames = (lab_tests_overall, lab_tests_alive, lab_tests_dead)

    # (cohort tables, column name, x-axis label); list order is panel order.
    # NOTE(review): the '¡' / '¢' characters are presumably mis-decoded
    # accents; they are kept as-is because they must match the DataFrame
    # column names exactly -- confirm before "fixing" them.
    panels = [
        (demo_frames, 'AGE', 'Age'),
        (vital_frames, 'TEMPERATURE', 'Temperature'),
        (lab_frames, 'CREA -- CREATININA', 'CREA -- CREATININA'),
        (lab_frames, 'HEM -- Hemat¡es', 'HEM -- Hemat¡es'),
        (lab_frames, 'LEUC -- Leucocitos', 'LEUC -- Leucocitos'),
        (lab_frames, 'PLAQ -- Recuento de plaquetas', 'PLAQ -- Recuento de plaquetas'),
        (lab_frames, 'CHCM -- Conc. Hemoglobina Corpuscular Media', 'CHCM'),
        (lab_frames, 'HCTO -- Hematocrito', 'HCTO -- Hematocrito'),
        (lab_frames, 'VCM -- Volumen Corpuscular Medio', 'VCM -- Volumen Corpuscular Medio'),
        (lab_frames, 'HGB -- Hemoglobina', 'HGB -- Hemoglobina'),
        (lab_frames, 'HCM -- Hemoglobina Corpuscular Media', 'HCM -- Hemoglobina Corpuscular Media'),
        (lab_frames, 'NEU -- Neutr¢filos', 'NEU -- Neutr¢filos'),
        (lab_frames, 'NEU% -- Neutr¢filos %', 'NEU% -- Neutr¢filos%'),
        (lab_frames, 'LIN -- Linfocitos', 'LIN -- Linfocitos'),
        (lab_frames, 'LIN% -- Linfocitos %', 'LIN% -- Linfocitos%'),
        (lab_frames, 'ADW -- Coeficiente de anisocitosis', 'ADW -- Coeficiente de anisocitosis'),
    ]

    # Outline style shared by the alive/dead step histograms; only the edge
    # colour differs between the two.
    outline = {'color': 'green', 'alpha': 1, 'histtype': 'step', 'linewidth': 2}

    fig = plt.figure(figsize=(16, 12), dpi=100, facecolor='w', edgecolor='k')
    first_ax = None
    for position, ((overall, alive, dead), key, xlabel) in enumerate(panels, start=1):
        # The trimming window always comes from the overall cohort so the
        # three histograms of a panel cover the same value range.
        low = overall[key].quantile(limit)
        high = overall[key].quantile(1 - limit)

        ax = plt.subplot(4, 4, position)
        if first_ax is None:
            first_ax = ax

        layers = (
            (overall, 'overall', {'color': color, 'ec': ec, 'alpha': alpha}),
            (alive, 'alive', dict(outline, ec=alive_color)),
            (dead, 'dead', dict(outline, ec=dead_color)),
        )
        for frame, label, style in layers:
            values = frame.loc[frame[key].between(low, high), key]
            # Equal weights summing to 1 turn counts into fractions.
            weights = np.ones(len(values)) / len(values)
            ax.hist(values, bins=bins, weights=weights, label=label, **style)

        plt.xlabel(xlabel, **csfont)
        plt.ylabel('Percentage', **csfont)
        # A bar height of 1 is rendered as 100%.
        ax.yaxis.set_major_formatter(PercentFormatter(1))
        # Apply the font to the tick labels after all data has been drawn.
        plt.xticks(**csfont)
        plt.yticks(**csfont)

    # One shared legend, built from the first panel's three histograms.
    # `prop` fully defines the legend font, so no separate `fontsize` is given.
    handles, labels = first_ax.get_legend_handles_labels()
    fig.legend(
        handles,
        labels,
        loc='upper center',
        ncol=5,
        bbox_to_anchor=(0.5, 1.05),
        prop=font_manager.FontProperties(family=font_family, style='normal', size=18),
    )

    fig.tight_layout()
    plt.show()
    return fig