# %% [markdown]
# # hm dataset pre-processing
#
# import packages
# %%
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle as pkl
import torch
import math
import datetime
from tqdm import tqdm
import datetime
import re
from functools import reduce
# %% [markdown]
# ## Demographic data
# %%
# Load the COVID_DSL_01 table (pipe-separated, ISO-8859-1 encoded) and show
# its row count and first rows.
demographic = pd.read_csv('./raw_data/19_04_2021/COVID_DSL_01.CSV', encoding='ISO-8859-1', sep='|')
print(len(demographic))
demographic.head()
# %%
# Load the COVID_DSL_04 table with the same settings. In the visible part of
# this file it is only inspected (row count, preview, distinct ID_ATC7 codes).
med = pd.read_csv('./raw_data/19_04_2021/COVID_DSL_04.CSV', encoding='ISO-8859-1', sep='|')
print(len(med))
med.head()
# %%
len(med['ID_ATC7'].unique())
# %% [markdown]
# Drop patients whose label information is missing
# %%
# Print the row count before and after removing rows that have a missing value
# in any of the four listed columns (these are later renamed to PATIENT_ID,
# ADMISSION_DATE, DEPARTURE_DATE and OUTCOME).
print(len(demographic))
demographic = demographic.dropna(axis=0, how='any', subset=['IDINGRESO', 'F_INGRESO_ING', 'F_ALTA_ING', 'MOTIVO_ALTA_ING'])
print(len(demographic))
# %%
def outcome2num(x):
    """Encode a discharge reason as a binary label.

    Returns 1 when ``x`` equals the string 'Fallecimiento' and 0 for every
    other value (including missing values).
    """
    return 1 if x == 'Fallecimiento' else 0
def to_one_hot(x, feature):
    """Return 1 when ``x`` equals ``feature``, otherwise 0."""
    return 1 if x == feature else 0
# %%
# select necessary columns from demographic
# Keep only the eight raw columns that are referenced below.
demographic = demographic[
[
'IDINGRESO',
'EDAD',
'SEX',
'F_INGRESO_ING',
'F_ALTA_ING',
'MOTIVO_ALTA_ING',
'ESPECIALIDAD_URGENCIA',
'DIAG_URG'
]
]
# Rename the raw column names to the English names used in the rest of the file.
demographic = demographic.rename(columns={
'IDINGRESO': 'PATIENT_ID',
'EDAD': 'AGE',
'SEX': 'SEX',
'F_INGRESO_ING': 'ADMISSION_DATE',
'F_ALTA_ING': 'DEPARTURE_DATE',
'MOTIVO_ALTA_ING': 'OUTCOME',
'ESPECIALIDAD_URGENCIA': 'DEPARTMENT_OF_EMERGENCY',
'DIAG_URG': 'DIAGNOSIS_AT_EMERGENCY_VISIT'
})
# SEX: male: 1; female: 0
# Assign the encoded column back explicitly. The previous
# `demographic['SEX'].replace(..., inplace=True)` mutated a temporary column
# object, which does not reliably update the frame on pandas 2.x/3.x and left
# SEX as strings. Values other than 'MALE'/'FEMALE' (including missing ones)
# become NaN so the column is always numeric.
demographic['SEX'] = demographic['SEX'].map({'MALE': 1, 'FEMALE': 0})
# outcome: Fallecimiento(dead): 1; others: 0
demographic['OUTCOME'] = demographic['OUTCOME'].map(outcome2num)
# diagnosis at emergency visit (loss rate < 10%)
# demographic['DIFFICULTY_BREATHING'] = demographic['DIAGNOSIS_AT_EMERGENCY_VISIT'].map(lambda x: to_one_hot(x, 'DIFICULTAD RESPIRATORIA')) # 1674
# demographic['SUSPECT_COVID'] = demographic['DIAGNOSIS_AT_EMERGENCY_VISIT'].map(lambda x: to_one_hot(x, 'SOSPECHA COVID-19')) # 960
# demographic['FEVER'] = demographic['DIAGNOSIS_AT_EMERGENCY_VISIT'].map(lambda x: to_one_hot(x, 'FIEBRE')) # 455
# department of emergency (loss rate < 10%)
# demographic['EMERGENCY'] = demographic['DEPARTMENT_OF_EMERGENCY'].map(lambda x: to_one_hot(x, 'Medicina de Urgencias')) # 3914
# %%
# del useless data
# Restrict the frame to the identifier, the two demographic features, the two
# stay dates and the label. The commented names belong to the optional
# one-hot features that are disabled above.
demographic = demographic[
[
'PATIENT_ID',
'AGE',
'SEX',
'ADMISSION_DATE',
'DEPARTURE_DATE',
'OUTCOME',
# 'DIFFICULTY_BREATHING',
# 'SUSPECT_COVID',
# 'FEVER',
# 'EMERGENCY'
]
]
# %%
# Save the summary statistics of the demographic table.
# NOTE(review): `index=False` omits the index of `describe()`, which is where
# the statistic names (count, mean, std, ...) live, so the CSV rows are
# unlabeled — confirm this is intended.
demographic.describe().to_csv('demographic_overview.csv', mode='w', index=False)
demographic.describe()
# %% [markdown]
# ### Analyze data
# %%
# Scatter plot of age against patient id.
plt.scatter(demographic['PATIENT_ID'], demographic['AGE'], s=1)
plt.xlabel('Patient Id')
plt.ylabel('Age')
plt.title('Patient-Age Scatter Plot')
# %%
# NOTE(review): this cell repeats the previous plot verbatim.
plt.scatter(demographic['PATIENT_ID'], demographic['AGE'], s=1)
plt.xlabel('Patient Id')
plt.ylabel('Age')
plt.title('Patient-Age Scatter Plot')
# %%
# Persist the cleaned demographic table (overwriting any existing file).
demographic.to_csv('demographic.csv', mode='w', index=False)
demographic.head()
# %% [markdown]
# ## Vital Signs
# %%
# Load the COVID_DSL_02 table (pipe-separated, ISO-8859-1 encoded).
vital_signs = pd.read_csv('./raw_data/19_04_2021/COVID_DSL_02.CSV', encoding='ISO-8859-1', sep='|')
print(len(vital_signs))
vital_signs.head()
# %%
# Rename the raw columns to the English names used in the rest of the file.
vital_signs = vital_signs.rename(columns={
'IDINGRESO': 'PATIENT_ID',
'CONSTANTS_ING_DATE': 'RECORD_DATE',
'CONSTANTS_ING_TIME': 'RECORD_TIME',
'FC_HR_ING': 'HEART_RATE',
'GLU_GLY_ING': 'BLOOD_GLUCOSE',
'SAT_02_ING': 'OXYGEN_SATURATION',
'TA_MAX_ING': 'MAX_BLOOD_PRESSURE',
'TA_MIN_ING': 'MIN_BLOOD_PRESSURE',
'TEMP_ING': 'TEMPERATURE'
})
# Join date and time into one string, then parse it with the
# '%Y-%m-%d %H:%M' format and store the resulting datetime as text again.
vital_signs['RECORD_TIME'] = vital_signs['RECORD_DATE'] + ' ' + vital_signs['RECORD_TIME']
vital_signs['RECORD_TIME'] = vital_signs['RECORD_TIME'].map(lambda x: str(datetime.datetime.strptime(x, '%Y-%m-%d %H:%M')))
# RECORD_DATE is now redundant; SAT_02_ING_OBS and BLOOD_GLUCOSE are dropped too.
vital_signs = vital_signs.drop(['RECORD_DATE', 'SAT_02_ING_OBS', 'BLOOD_GLUCOSE'], axis=1)
# %%
vital_signs.describe()
# %%
vital_signs.head()
# %%
def format_temperature(x):
    """Convert a temperature reading to ``float``.

    String readings may use a decimal comma (e.g. ``'36,5'``), which is
    replaced by a dot before conversion. ``isinstance`` is used so that ``str``
    subclasses such as ``numpy.str_`` also get the comma handling instead of
    failing inside ``float()``.

    Raises:
        ValueError: if a string reading is not a number after the replacement.
    """
    if isinstance(x, str):
        return float(x.replace(',', '.'))
    return float(x)
def format_oxygen(x):
    """Convert an oxygen-saturation reading to ``float``.

    Readings above 100 are replaced with NaN; everything else is returned as
    the converted float.
    """
    reading = float(x)
    return np.nan if reading > 100 else reading
def format_heart_rate(x):
    """Convert a heart-rate reading to ``int``, mapping bad readings to NaN.

    Missing readings (``None`` or NaN) are returned as NaN instead of raising:
    ``int(float('nan'))`` raises ``ValueError``, which previously aborted the
    whole ``.map`` call as soon as one reading was missing. Readings above 220
    are also replaced with NaN.
    """
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return np.nan
    rate = int(x)
    return np.nan if rate > 220 else rate
# Run each per-value formatter over its column.
for _column, _formatter in (
    ('TEMPERATURE', format_temperature),
    ('OXYGEN_SATURATION', format_oxygen),
    ('HEART_RATE', format_heart_rate),
):
    vital_signs[_column] = vital_signs[_column].map(_formatter)
# %%
# Replace every 0 in the table with NaN. `np.nan` is used because the
# `np.NAN` alias was removed in NumPy 2.0 and raises AttributeError there.
vital_signs = vital_signs.replace(0, np.nan)
# %%
# Collapse rows that share the same patient and timestamp into one row holding
# the mean of each remaining column.
vital_signs = vital_signs.groupby(['PATIENT_ID', 'RECORD_TIME'], dropna=True, as_index = False).mean()
vital_signs.head()
# %%
vital_signs.describe()
# %%
# Save the summary statistics.
# NOTE(review): `index=False` omits the statistic names that `describe()`
# keeps in its index — confirm this is intended.
vital_signs.describe().to_csv('vital_signs_overview.csv', index=False, mode='w')
vital_signs.describe()
# %%
"""
#plt.rcParams['figure.figsize'] = [10, 5]
fig=plt.figure(figsize=(16,10), dpi= 100, facecolor='w', edgecolor='k')
plt.subplot(2, 3, 1)
plt.scatter(vital_signs.index, vital_signs['MAX_BLOOD_PRESSURE'], s=1)
plt.xlabel('Index')
plt.ylabel('Max Blood Pressure')
plt.title('Visit-Max Blood Pressure Scatter Plot')
plt.subplot(2, 3, 2)
plt.scatter(vital_signs.index, vital_signs['MIN_BLOOD_PRESSURE'], s=1)
plt.xlabel('Index')
plt.ylabel('Min Blood Pressure')
plt.title('Visit-Min Blood Pressure Scatter Plot')
plt.subplot(2, 3, 3)
plt.scatter(vital_signs.index, vital_signs['TEMPERATURE'], s=1)
plt.xlabel('Index')
plt.ylabel('Temperature')
plt.title('Visit-Temperature Scatter Plot')
plt.subplot(2, 3, 4)
plt.scatter(vital_signs.index, vital_signs['HEART_RATE'], s=1)
plt.xlabel('Index')
plt.ylabel('Heart Rate')
plt.title('Visit-Heart Rate Scatter Plot')
plt.subplot(2, 3, 5)
plt.scatter(vital_signs.index, vital_signs['OXYGEN_SATURATION'], s=1)
plt.xlabel('Index')
plt.ylabel('Oxygen Saturation')
plt.title('Visit-Oxygen Saturation Scatter Plot')
plt.show()
"""
# %%
"""
#plt.rcParams['figure.figsize'] = [10, 5]
fig=plt.figure(figsize=(16,10), dpi= 100, facecolor='w', edgecolor='k')
plt.subplot(2, 3, 1)
plt.hist(vital_signs['MAX_BLOOD_PRESSURE'], bins=30)
plt.xlabel('Index')
plt.ylabel('Max Blood Pressure')
plt.title('Visit-Max Blood Pressure Histogram')
plt.subplot(2, 3, 2)
plt.hist(vital_signs['MIN_BLOOD_PRESSURE'], bins=30)
plt.xlabel('Index')
plt.ylabel('Min Blood Pressure')
plt.title('Visit-Min Blood Pressure Histogram')
plt.subplot(2, 3, 3)
plt.hist(vital_signs['TEMPERATURE'], bins=30)
plt.xlabel('Index')
plt.ylabel('Temperature')
plt.title('Visit-Temperature Histogram')
plt.subplot(2, 3, 4)
plt.hist(vital_signs['HEART_RATE'], bins=30)
plt.xlabel('Index')
plt.ylabel('Heart Rate')
plt.title('Visit-Heart Rate Histogram')
plt.subplot(2, 3, 5)
plt.hist(vital_signs['OXYGEN_SATURATION'], bins=30)
plt.xlabel('Index')
plt.ylabel('Oxygen Saturation')
plt.title('Visit-Oxygen Saturation Histogram')
plt.show()
"""
# %% [markdown]
# ### Missing rate of each visit
# %%
# Fraction of missing cells: total NaN count divided by
# (number of columns minus 2) times the number of rows.
vital_signs.isnull().sum().sum() / ((vital_signs.shape[1] - 2) * vital_signs.shape[0])
# %% [markdown]
# ### Normalize data
# %%
"""
for key in vital_signs.keys()[2:]:
vital_signs[key] = (vital_signs[key] - vital_signs[key].mean()) / (vital_signs[key].std() + 1e-12)
vital_signs.describe()
"""
# %%
# Persist the cleaned vital-sign table.
# NOTE(review): the file name says 'visual_signs', which looks like a typo for
# 'vital_signs'; left unchanged because other code may read this path.
vital_signs.to_csv('visual_signs.csv', mode='w', index=False)
# %%
# Average number of vital-sign rows per patient.
len(vital_signs) / len(vital_signs['PATIENT_ID'].unique())
# %% [markdown]
# ## Lab Tests
# %%
# Load the COVID_DSL_06_v2 table; unlike the other tables it is
# semicolon-separated.
lab_tests = pd.read_csv('./raw_data/19_04_2021/COVID_DSL_06_v2.CSV', encoding='ISO-8859-1', sep=';')
lab_tests = lab_tests.rename(columns={'IDINGRESO': 'PATIENT_ID'})
print(len(lab_tests))
# Keep the identifier, the lab request number, the date/time, the item name
# and the result value; the unit and reference-value columns are dropped.
lab_tests = lab_tests[
[
'PATIENT_ID',
'LAB_NUMBER',
'LAB_DATE',
'TIME_LAB',
'ITEM_LAB',
'VAL_RESULT'
# UD_RESULT: unit
# REF_VALUES: reference values
]
]
lab_tests.head()
# %%
# Keep the first result for each (patient, lab number, date, time, item)
# combination so the keys are unique before pivoting.
lab_tests = lab_tests.groupby(['PATIENT_ID', 'LAB_NUMBER', 'LAB_DATE', 'TIME_LAB', 'ITEM_LAB'], dropna=True, as_index = False).first()
# Pivot from long to wide: one row per (patient, lab number, date, time) and
# one column per ITEM_LAB value, filled with VAL_RESULT.
lab_tests = lab_tests.set_index(['PATIENT_ID', 'LAB_NUMBER', 'LAB_DATE', 'TIME_LAB', 'ITEM_LAB'], drop = True).unstack('ITEM_LAB')['VAL_RESULT'].reset_index()
lab_tests = lab_tests.drop([
'CFLAG -- ALARMA HEMOGRAMA',
'CORONA -- PCR CORONAVIRUS 2019nCoV',
'CRIOGLO -- CRIOGLOBULINAS',
'EGCOVID -- ESTUDIO GENETICO COVID-19',
'FRO1 -- ',
'FRO1 -- FROTIS EN SANGRE PERIFERICA',
'FRO2 -- ',
'FRO2 -- FROTIS EN SANGRE PERIFERICA',
'FRO3 -- ',
'FRO3 -- FROTIS EN SANGRE PERIFERICA',
'FRO_COMEN -- ',
'FRO_COMEN -- FROTIS EN SANGRE PERIFERICA',
'G-CORONAV (RT-PCR) -- Tipo de muestra: ASPIRADO BRONCOALVEOLAR',
'G-CORONAV (RT-PCR) -- Tipo de muestra: EXUDADO',
'GRRH -- GRUPO SANGUÖNEO Y FACTOR Rh',
'HEML -- RECUENTO CELULAR LIQUIDO',
'HEML -- Recuento Hemat¡es',
'IFSUERO -- INMUNOFIJACION EN SUERO',
'OBS_BIOMOL -- OBSERVACIONES GENETICA MOLECULAR',
'OBS_BIOO -- Observaciones Bioqu¡mica Orina',
'OBS_CB -- Observaciones Coagulaci¢n',
'OBS_GASES -- Observaciones Gasometr¡a Arterial',
'OBS_GASV -- Observaciones Gasometr¡a Venosa',
'OBS_GEN2 -- OBSERVACIONES GENETICA',
'OBS_HOR -- Observaciones Hormonas',
'OBS_MICRO -- Observaciones Microbiolog¡a',
'OBS_NULA2 -- Observaciones Bioqu¡mica',
'OBS_NULA3 -- Observaciones Hematolog¡a',
'OBS_PESP -- Observaciones Pruebas especiales',
'OBS_SERO -- Observaciones Serolog¡a',
'OBS_SIS -- Observaciones Orina',
'PCR VIRUS RESPIRATORIOS -- Tipo de muestra: ASPIRADO BRONCOALVEOLAR',
'PCR VIRUS RESPIRATORIOS -- Tipo de muestra: BAS',
'PCR VIRUS RESPIRATORIOS -- Tipo de muestra: ESPUTO',
'PCR VIRUS RESPIRATORIOS -- Tipo de muestra: EXUDADO',
'PCR VIRUS RESPIRATORIOS -- Tipo de muestra: LAVADO BRONCOALVEOLAR',
'PCR VIRUS RESPIRATORIOS -- Tipo de muestra: LAVADO NASOFARÖNGEO',
'PTGOR -- PROTEINOGRAMA ORINA',
'RESUL_IFT -- ESTUDIO DE INMUNOFENOTIPO',
'RESUL_IFT -- Resultado',
'Resultado -- Resultado',
'SED1 -- ',
'SED1 -- SEDIMENTO',
'SED2 -- ',
'SED2 -- SEDIMENTO',
'SED3 -- ',
'SED3 -- SEDIMENTO',
'TIPOL -- TIPO DE LIQUIDO',
'Tecnica -- T\x82cnica',
'TpMues -- Tipo de muestra',
'VHCBLOT -- INMUNOBLOT VIRUS HEPATITIS C',
'VIR_TM -- VIRUS TIPO DE MUESTRA',
'LEGIORI -- AG. LEGIONELA PNEUMOPHILA EN ORINA',
'NEUMOORI -- AG NEUMOCOCO EN ORINA',
'VIHAC -- VIH AC'
], axis=1)
lab_tests.head()
# %%
# Turn the "no result" placeholders and dash runs into NaN.
lab_tests = lab_tests.replace('Sin resultado.', np.nan)
lab_tests = lab_tests.replace('Sin resultado', np.nan)
lab_tests = lab_tests.replace('----', np.nan).replace('---', np.nan)
# NOTE(review): without `regex=True`, `DataFrame.replace` matches whole cell
# values, so this only affects cells that are exactly '> ' or '< ' — it does
# not strip those prefixes from longer values. Confirm the intent.
lab_tests = lab_tests.replace('> ', '').replace('< ', '')
def change_format(x):
    """Normalize one raw lab-result cell towards a numeric value.

    Rules, applied in order:

    * ``None`` -> NaN; non-string values are returned unchanged.
    * ``'Negativo (v)'`` -> ``'-v'``; ``'Positivo (v)'`` and
      ``'Zona limite (v)'`` -> ``'v'``.
    * A leading ``'>'`` or ``'<'`` (with or without a following space) is
      removed.
    * The unit suffixes ``' mg/dl'``, ``'/æl'`` and ``' copias/mL'`` are
      removed.
    * Qualitative words map to fixed numbers (``'Numerosos'`` 1.5,
      ``'Aislados'`` 0.5, detected/positive/normal 1, not detected/negative 0,
      ``'Indeterminado'`` NaN).
    * A plain integer string (optionally signed) is returned stripped of
      surrounding whitespace.
    * Otherwise the first decimal number found in the text is returned as a
      string, or NaN when there is none.

    The integer rule fixes a data-loss bug: the fallback pattern only matches
    numbers containing a decimal point, so values such as ``'142'`` (or
    ``'100'`` left over after stripping ``'> '`` when the function is applied
    twice) used to be turned into NaN.

    Returns:
        A ``str`` holding a number, an ``int``/``float`` code, NaN, or the
        unchanged input for non-string values.
    """
    if x is None:
        return np.nan
    # isinstance (not type() ==) so that str subclasses like numpy.str_ are
    # normalized too.
    if not isinstance(x, str):
        return x
    if x.startswith('Negativo ('):
        return x.replace('Negativo (', '-')[:-1]
    if x.startswith('Positivo ('):
        return x.replace('Positivo (', '')[:-1]
    if x.startswith('Zona limite ('):
        return x.replace('Zona limite (', '')[:-1]
    if x.startswith('>'):
        return x.replace('> ', '').replace('>', '')
    if x.startswith('<'):
        return x.replace('< ', '').replace('<', '')
    if x.endswith(' mg/dl'):
        return x.replace(' mg/dl', '')
    if x.endswith('/æl'):
        return x.replace('/æl', '')
    if x.endswith(' copias/mL'):
        return x.replace(' copias/mL', '')
    if x == 'Numerosos':
        return 1.5
    if x == 'Aislados':
        return 0.5
    if x in ('Se detecta', 'Se observan', 'Normal', 'Positivo'):
        return 1
    if x in ('No se detecta', 'No se observan', 'Negativo'):
        return 0
    if x == 'Indeterminado':
        return np.nan
    stripped = x.strip()
    if re.fullmatch(r"[-+]?\d+", stripped):
        # Integer-valued result: keep it instead of discarding it as NaN.
        return stripped
    decimals = re.findall(r"[-+]?\d+\.\d+", x)
    return decimals[0] if decimals else np.nan
# Normalize every lab column (all columns after the first four) by applying
# change_format twice to each cell, and remember the distinct values that
# remain in each column.
feature_value_dict = {}
for k in tqdm(lab_tests.keys()[4:]):
    normalized = lab_tests[k].map(lambda raw: change_format(change_format(raw)))
    lab_tests[k] = normalized
    feature_value_dict[k] = lab_tests[k].unique()
# %%
def nan_and_not_nan(x):
    """Return 1 for a value that equals itself and 0 otherwise (i.e. for NaN)."""
    # NaN is the only value here that does not compare equal to itself.
    return 1 if x == x else 0
def is_float(num):
    """Return True when ``float(num)`` succeeds, False otherwise.

    ``TypeError`` is handled as well as ``ValueError`` so that values
    ``float()`` cannot accept at all (``None``, lists, ...) report False
    instead of raising.
    """
    try:
        float(num)
    except (TypeError, ValueError):
        return False
    return True
def is_all_float(x):
    """Return True when every present value in ``x`` is convertible to float.

    NaN values (which do not equal themselves) and ``None`` are skipped, so an
    iterable that contains only missing values yields True.
    """
    return all(
        is_float(value)
        for value in x
        # `is not None` replaces the non-idiomatic `!= None` comparison.
        if value == value and value is not None
    )
def to_float(x):
    """Convert ``x`` to ``float``; ``None`` becomes NaN.

    Raises:
        ValueError: if ``x`` is a string that does not represent a number.
    """
    # Identity check instead of `!= None`, per PEP 8.
    if x is None:
        return np.nan
    return float(x)
# Decide how each lab column is encoded, based on its distinct values:
#   * all present values parse as float -> convert the column to float;
#   * exactly two distinct values       -> 1 where a value is present, 0 for NaN;
#   * anything else                     -> recorded in other_feature_dict and
#                                          left untouched.
other_feature_dict = {}
for feature, values in tqdm(feature_value_dict.items()):
    if is_all_float(values):
        lab_tests[feature] = lab_tests[feature].map(to_float)
        continue
    if len(values) == 2:
        lab_tests[feature] = lab_tests[feature].map(nan_and_not_nan)
        continue
    other_feature_dict[feature] = values
# %%
other_feature_dict
# %%
def format_time(t):
    """Parse a day-first 'date time' string and return it as ISO-like text.

    Accepts ``'dd/mm/YYYY HH:MM'`` when the text contains a slash and
    ``'dd-mm-YYYY HH:MM'`` otherwise; returns ``str()`` of the parsed
    ``datetime`` (``'YYYY-mm-dd HH:MM:SS'``).
    """
    pattern = '%d/%m/%Y %H:%M' if '/' in t else '%d-%m-%Y %H:%M'
    parsed = datetime.datetime.strptime(t, pattern)
    return str(parsed)
# Build a single timestamp string from the date and time columns and
# normalize it with format_time.
lab_tests['RECORD_TIME'] = lab_tests['LAB_DATE'] + ' ' + lab_tests['TIME_LAB']
lab_tests['RECORD_TIME'] = lab_tests['RECORD_TIME'].map(lambda x: format_time(x))
# The request number and the separate date/time columns are no longer needed.
lab_tests = lab_tests.drop(['LAB_NUMBER', 'LAB_DATE', 'TIME_LAB'], axis=1)
# lab_tests = lab_tests.drop(['LAB_NUMBER', 'TIME_LAB'], axis=1)
lab_tests.head()
# %%
# Per-patient mean of every numeric lab column. `numeric_only=True` is
# required because the frame still holds string columns (RECORD_TIME and the
# columns listed in other_feature_dict): since pandas 2.0 `.mean()` raises
# TypeError on them instead of silently dropping them.
lab_tests_patient = lab_tests.groupby(['PATIENT_ID'], dropna=True, as_index=False).mean(numeric_only=True)
print(len(lab_tests_patient))
# Histogram of how many patients have a value for each lab column
# (the first entry, PATIENT_ID, is skipped).
count = list(lab_tests_patient.count().iloc[1:])
plt.hist(count)
# %%
# Keep the columns that have a value for more than 10% of the patients.
patient_total = len(lab_tests_patient)
threshold = patient_total * 0.1
reserved_keys = [
    key
    for key in lab_tests_patient.keys()
    if lab_tests_patient[key].count() > threshold
]
print(len(reserved_keys))
reserved_keys
# %%
# RECORD_TIME becomes the second kept column, right after the first entry.
reserved_keys.insert(1, 'RECORD_TIME')
# One row per (patient, timestamp) holding the mean of each numeric column.
# `numeric_only=True` skips the remaining string-valued columns, on which
# `.mean()` raises TypeError since pandas 2.0.
lab_tests = lab_tests.groupby(['PATIENT_ID', 'RECORD_TIME'], dropna=True, as_index=False).mean(numeric_only=True)
lab_tests = lab_tests[reserved_keys]
lab_tests.head()
"""
# %% [markdown]
# ### Missing rate of each visit
# %%
sum(lab_tests.T.isnull().sum()) / ((len(lab_tests.T) - 2) * len(lab_tests))
# %% [markdown]
# ### Scatter Plot
# %%
fig=plt.figure(figsize=(16,200), dpi= 100, facecolor='w', edgecolor='k')
i = 1
for key in lab_tests.keys()[2:]:
plt.subplot(33, 3, i)
plt.scatter(lab_tests.index, lab_tests[key], s=1)
plt.ylabel(key)
i += 1
plt.show()
# %%
fig=plt.figure(figsize=(20,120), dpi= 100, facecolor='w', edgecolor='k')
i = 1
for key in lab_tests.keys()[2:]:
plt.subplot(23, 4, i)
plt.hist(lab_tests[key], bins=30)
q3 = lab_tests[key].quantile(0.75)
q1 = lab_tests[key].quantile(0.25)
qh = q3 + 3 * (q3 - q1)
ql = q1 - 3 * (q3 - q1)
sigma = 5
plt.axline([sigma*lab_tests[key].std() + lab_tests[key].mean(), 0], [sigma*lab_tests[key].std() + lab_tests[key].mean(), 1], color = "r", linestyle=(0, (5, 5)))
plt.axline([-sigma*lab_tests[key].std() + lab_tests[key].mean(), 0], [-sigma*lab_tests[key].std() + lab_tests[key].mean(), 1], color = "r", linestyle=(0, (5, 5)))
#plt.axline([lab_tests[key].quantile(0.25), 0], [lab_tests[key].quantile(0.25), 1], color = "k", linestyle=(0, (5, 5)))
#plt.axline([lab_tests[key].quantile(0.75), 0], [lab_tests[key].quantile(0.75), 1], color = "k", linestyle=(0, (5, 5)))
plt.axline([qh, 0], [qh, 1], color='k', linestyle=(0, (5, 5)))
plt.axline([ql, 0], [ql, 1], color='k', linestyle=(0, (5, 5)))
plt.ylabel(key)
i += 1
plt.show()
"""
# %% [markdown]
# ### Normalize data
# %%
"""
for key in lab_tests.keys()[2:]:
lab_tests[key] = (lab_tests[key] - lab_tests[key].mean()) / (lab_tests[key].std() + 1e-12)
lab_tests.describe()
"""
# %%
# 【del normalization】
# for key in lab_tests.keys()[2:]:
# r = lab_tests[lab_tests[key].between(lab_tests[key].quantile(0.05), lab_tests[key].quantile(0.95))]
# lab_tests[key] = (lab_tests[key] - r[key].mean()) / (r[key].std() + 1e-12)
# %%
# Persist the cleaned lab-test table.
lab_tests.to_csv('lab_test.csv', mode='w', index=False)
# %% [markdown]
# # Concat data
# %%
# Give PATIENT_ID the same representation in all three tables (integer value
# rendered as a string) so the merges below match on it.
demographic['PATIENT_ID'] = demographic['PATIENT_ID'].map(lambda x: str(int(x)))
vital_signs['PATIENT_ID'] = vital_signs['PATIENT_ID'].map(lambda x: str(int(x)))
lab_tests['PATIENT_ID'] = lab_tests['PATIENT_ID'].map(lambda x: str(int(x)))
# %%
# Number of distinct patients in each table.
len(demographic['PATIENT_ID'].unique()), len(vital_signs['PATIENT_ID'].unique()), len(lab_tests['PATIENT_ID'].unique())
# %%
# Outer-join vital signs and lab tests on (patient, timestamp) so a timestamp
# present in either table is kept, then average any duplicate keys.
train_df = pd.merge(vital_signs, lab_tests, on=['PATIENT_ID', 'RECORD_TIME'], how='outer')
train_df = train_df.groupby(['PATIENT_ID', 'RECORD_TIME'], dropna=True, as_index = False).mean()
# Attach the records to the demographic rows; patients without any record get
# a single row with missing RECORD_TIME, which is removed just below.
train_df = pd.merge(demographic, train_df, on=['PATIENT_ID'], how='left')
train_df.head()
# %%
# Drop rows without patient_id, admission_date, record_time, or outcome.
train_df = train_df.dropna(axis=0, how='any', subset=['PATIENT_ID', 'ADMISSION_DATE', 'RECORD_TIME', 'OUTCOME'])
# %%
train_df.to_csv('train.csv', mode='w', index=False)
train_df.describe()
# %% [markdown]
# ## Missing rate of each visit
# %%
# Total NaN count divided by (number of columns minus 2) times the row count.
sum(train_df.T.isnull().sum()) / ((len(train_df.T) - 2) * len(train_df))
# %% [markdown]
# # Split and save data
# %% [markdown]
# * demo: demographic data
# * x: lab test & vital signs
# * y: outcome & length of stay
# %%
patient_ids = train_df['PATIENT_ID'].unique()
demo_cols = ['AGE', 'SEX'] # , 'DIFFICULTY_BREATHING', 'FEVER', 'SUSPECT_COVID', 'EMERGENCY'
# Test columns are all columns that do not come from the demographic table,
# except the timestamp.
test_cols = [
    k
    for k in train_df.keys()
    if k not in demographic.keys() and k != 'RECORD_TIME'
]
# Column-wise medians of the test columns; init_prev uses them as fill values.
test_median = train_df[test_cols].median()
# %%
test_cols
# %%
# Truncate each timestamp to its day and to its hour, as strings.
train_df['RECORD_TIME_DAY'] = train_df['RECORD_TIME'].map(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%d'))
train_df['RECORD_TIME_HOUR'] = train_df['RECORD_TIME'].map(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%d %H'))
train_df.head()
# %%
# Average all numeric columns within each day / hour bucket of a patient's
# stay. `numeric_only=True` is needed because the frame still contains the
# string columns RECORD_TIME and the other bucket column; since pandas 2.0
# `.mean()` raises TypeError on them instead of dropping them.
train_df_day = train_df.groupby(['PATIENT_ID', 'ADMISSION_DATE', 'DEPARTURE_DATE', 'RECORD_TIME_DAY'], dropna=True, as_index=False).mean(numeric_only=True)
train_df_hour = train_df.groupby(['PATIENT_ID', 'ADMISSION_DATE', 'DEPARTURE_DATE', 'RECORD_TIME_HOUR'], dropna=True, as_index=False).mean(numeric_only=True)
len(train_df), len(train_df_day), len(train_df_hour)
# %% [markdown]
#
# ```
# number of visits (total)
# - Original data: 168777
# - Merge by hour: 130141
# - Merge by day: 42204
# ```
# %%
len(train_df['PATIENT_ID'].unique())
# %%
def get_visit_intervals(df):
    """Return the number of rows per patient, in order of first appearance.

    Args:
        df: DataFrame with a 'PATIENT_ID' column.

    Returns:
        list[int]: one row count per distinct patient id.

    A single ``groupby`` replaces filtering the whole frame once per patient,
    which was quadratic in the number of rows. Rows whose PATIENT_ID is
    missing are not counted (they previously produced a 0 entry).
    """
    per_patient = df.groupby('PATIENT_ID', sort=False).size()
    return [int(n) for n in per_patient]
# %%
# Rows per patient at the original, hourly and daily resolution.
ls_org = get_visit_intervals(train_df)
ls_hour = get_visit_intervals(train_df_hour)
ls_day = get_visit_intervals(train_df_day)
# %%
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import matplotlib.font_manager as font_manager
import pandas as pd
import numpy as np
"""
csfont = {'fontname':'Times New Roman', 'fontsize': 18}
font = 'Times New Roman'
fig=plt.figure(figsize=(18,4), dpi= 100, facecolor='w', edgecolor='k')
plt.style.use('seaborn-whitegrid')
color = 'cornflowerblue'
ec = 'None'
alpha=0.5
ax = plt.subplot(1, 3, 1)
ax.hist(ls_org, bins=20, weights=np.ones(len(ls_org)) / len(ls_org), color=color, ec=ec, alpha=alpha, label='overall')
plt.xlabel('Num of visits (org)',**csfont)
plt.ylabel('Percentage',**csfont)
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
plt.xticks(**csfont)
plt.yticks(**csfont)
ax = plt.subplot(1, 3, 2)
ax.hist(ls_hour, bins=20, weights=np.ones(len(ls_hour)) / len(ls_hour), color=color, ec=ec, alpha=alpha, label='overall')
plt.xlabel('Num of visits (hour)',**csfont)
plt.ylabel('Percentage',**csfont)
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
plt.xticks(**csfont)
plt.yticks(**csfont)
ax = plt.subplot(1, 3, 3)
ax.hist(ls_day, bins=20, weights=np.ones(len(ls_day)) / len(ls_day), color=color, ec=ec, alpha=alpha, label='overall')
plt.xlabel('Num of visits (day)',**csfont)
plt.ylabel('Percentage',**csfont)
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
plt.xticks(**csfont)
plt.yticks(**csfont)
plt.show()
"""
# %%
def get_statistic(lst, name):
    """Print summary statistics of ``lst`` on one tab-separated line.

    The line is prefixed with ``[name]`` and lists max, min, median, mean and
    the 80%, 90% and 95% quantiles. Returns None.
    """
    highest, lowest = max(lst), min(lst)
    median, mean = np.median(lst), np.mean(lst)
    q80, q90, q95 = (np.quantile(lst, q) for q in (0.8, 0.9, 0.95))
    print(f'[{name}]\tMax:\t{highest}, Min:\t{lowest}, Median:\t{median}, Mean:\t{mean}, 80%:\t{q80}, 90%:\t{q90}, 95%:\t{q95}')
# %%
# Print the distribution of rows per patient at each resolution.
get_statistic(ls_org, 'ls_org')
get_statistic(ls_hour, 'ls_hour')
get_statistic(ls_day, 'ls_day')
# %%
# Move the current index into an 'index' column (later code slices columns by
# position, so this column must stay in front).
train_df_hour = train_df_hour.reset_index()
# %%
# Parse the three timestamp columns once, for the whole frame, instead of
# calling strptime on every row inside a Python loop.
_admission = pd.to_datetime(train_df_hour['ADMISSION_DATE'], format='%Y-%m-%d %H:%M:%S')
_departure = pd.to_datetime(train_df_hour['DEPARTURE_DATE'], format='%Y-%m-%d %H:%M:%S')
_visit_hour = pd.to_datetime(train_df_hour['RECORD_TIME_HOUR'], format='%Y-%m-%d %H')
_seconds_per_day = 24 * 60 * 60
# LOS: days between admission and departure (the same for every row of a stay).
# LOS_HOUR: days between this row's hour bucket and departure.
# Both are now float columns; they used to be seeded with the admission-date
# strings, which left them with object dtype.
train_df_hour['LOS'] = (_departure - _admission).dt.total_seconds() / _seconds_per_day
train_df_hour['LOS_HOUR'] = (_departure - _visit_hour).dt.total_seconds() / _seconds_per_day
# %%
train_df_hour['LOS']
# %%
# One LOS value per patient, taken from the patient's first row, in order of
# first appearance. `drop_duplicates` keeps exactly those first rows; it
# replaces filtering the frame once per patient and calling `float()` on a
# one-element Series, which pandas has deprecated.
los = [float(v) for v in train_df_hour.drop_duplicates(subset='PATIENT_ID')['LOS']]
# %%
get_statistic(los, 'los')
# %%
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import matplotlib.font_manager as font_manager
import pandas as pd
import numpy as np
"""
csfont = {'fontname':'Times New Roman', 'fontsize': 18}
font = 'Times New Roman'
fig=plt.figure(figsize=(6,6), dpi= 100, facecolor='w', edgecolor='k')
plt.style.use('seaborn-whitegrid')
color = 'cornflowerblue'
ec = 'None'
alpha=0.5
ax = plt.subplot(1, 1, 1)
ax.hist(los, bins=20, weights=np.ones(len(los)) / len(los), color=color, ec=ec, alpha=alpha, label='overall')
plt.xlabel('Length of stay',**csfont)
plt.ylabel('Percentage',**csfont)
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
plt.xticks(**csfont)
plt.yticks(**csfont)
plt.show()
"""
# %%
# Copy of the hourly frame with the index moved into a column; it keeps the
# LOS_HOUR column that is removed from train_df_hour below.
train_df_hour_idx = train_df_hour.reset_index()
# %%
# The removed loop recomputed (departure - record hour) in days for every row,
# which is exactly the value already stored in LOS_HOUR, so reuse it.
train_df_hour_idx['LOS'] = train_df_hour_idx['LOS_HOUR']
# %%
# From here on LOS means "days from this record until departure".
train_df_hour['LOS'] = train_df_hour['LOS_HOUR']
# `drop` returns a new frame; the result was previously discarded, so
# LOS_HOUR was never actually removed.
train_df_hour = train_df_hour.drop(columns=['LOS_HOUR'])
# %%
# los_threshold = 13.0
# visit_num_hour = []
# for pat in tqdm(train_df_hour_idx['PATIENT_ID'].unique()):
# pat_records = train_df_hour_idx[train_df_hour_idx['PATIENT_ID'] == pat]
# hour = 0
# for vis in pat_records.index:
# pat_visit = pat_records.loc[vis]
# if pat_visit['LOS_HOUR'] <= los_threshold:
# hour += 1
# visit_num_hour.append(hour)
# if hour == 0:
# print(pat)
# %%
# import matplotlib.pyplot as plt
# from matplotlib.ticker import PercentFormatter
# import matplotlib.font_manager as font_manager
# import pandas as pd
# import numpy as np
# csfont = {'fontname':'Times New Roman', 'fontsize': 18}
# font = 'Times New Roman'
# fig=plt.figure(figsize=(6,6), dpi= 100, facecolor='w', edgecolor='k')
# plt.style.use('seaborn-whitegrid')
# color = 'cornflowerblue'
# ec = 'None'
# alpha=0.5
# ax = plt.subplot(1, 1, 1)
# ax.hist(visit_num_hour, bins=20, weights=np.ones(len(visit_num_hour)) / len(visit_num_hour), color=color, ec=ec, alpha=alpha, label='overall')
# plt.xlabel('Visit num (80% los)',**csfont)
# plt.ylabel('Percentage',**csfont)
# plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
# plt.xticks(**csfont)
# plt.yticks(**csfont)
# plt.show()
# %%
# From here on `train_df` refers to the hourly-aggregated frame.
train_df = train_df_hour
train_df.head()
# %%
train_df.describe()
# %%
get_statistic(train_df['LOS'], 'los')
# %%
# Number of non-missing heart-rate values for the patient with id '1'.
train_df[train_df['PATIENT_ID'] == '1']['HEART_RATE'].count()
# %%
# Patient-level missing rate: for every column from position 5 onwards, the
# share of patients that have no value at all in that column.
cols = train_df.columns[5:]
pats = train_df['PATIENT_ID'].unique()
all_pat_cnt = len(pats)
# One grouped count replaces filtering the whole frame once per patient,
# which was quadratic in the number of rows.
_never_recorded = train_df.groupby('PATIENT_ID')[list(cols)].count().eq(0)
missing_rate = {col: int(_never_recorded[col].sum()) / all_pat_cnt for col in cols}
missing_rate
# %%
# One line per column: the quoted column name and its missing rate in percent.
with open('missing_rate.csv', mode='w', encoding='utf-8') as file:
    file.writelines(f'"{col}", {100 * missing_rate[col]}\n' for col in cols)
# %%
# Negative LOS values (records whose hour bucket lies after the departure
# time) are raised to 0.
train_df['LOS'] = train_df['LOS'].clip(lower=0)
# %%
get_statistic(train_df['LOS'], 'los')
# %%
# the first visit of each person
def init_prev(prev, median=None):
    """Fill missing entries of a patient's first visit with fallback values.

    Args:
        prev: values of the first visit (pandas Series, list or array);
            modified in place.
        median: fallback value for each position. Defaults to the
            module-level ``test_median``.

    Returns:
        list[int]: 1 at positions that were missing and got filled, else 0.

    Access is positional throughout. For a Series this goes through ``.iloc``
    because integer keys passed to ``[]`` on a label-indexed Series are a
    deprecated positional fallback.
    """
    if median is None:
        median = test_median
    values = getattr(prev, 'iloc', prev)
    fallback = getattr(median, 'iloc', median)
    miss = []
    for idx in range(len(prev)):
        if np.isnan(values[idx]):  # there is no previous record
            values[idx] = fallback[idx]  # replace nan with the median
            miss.append(1)
        else:
            miss.append(0)
    return miss
# the rest of the visits
def fill_nan(cur, prev):
    """Forward-fill missing entries of ``cur`` from ``prev``.

    Args:
        cur: values of the current visit (pandas Series, list or array);
            modified in place.
        prev: values of the previous visit, same length and order.

    Returns:
        list[int]: 1 at positions that were missing in ``cur``, else 0.

    Access is positional throughout. For a Series this goes through ``.iloc``
    because integer keys passed to ``[]`` on a label-indexed Series are a
    deprecated positional fallback.
    """
    current = getattr(cur, 'iloc', cur)
    previous = getattr(prev, 'iloc', prev)
    miss = []
    for idx in range(len(prev)):
        if np.isnan(current[idx]):  # no record in the current timestep
            current[idx] = previous[idx]  # cur <- prev
            miss.append(1)
        else:
            miss.append(0)
    return miss
# %%
# Per-patient accumulators: feature sequences, labels, demographics,
# sequence lengths and missing-value masks.
x, y, demo, x_lab_len, missing_mask = [], [], [], [], []
for pat in tqdm(patient_ids):
    # All rows of this patient, limited to the last 76 of them.
    records = train_df[train_df['PATIENT_ID'] == pat]
    records = records[max(0, len(records) - 76):]
    row_labels = records.index
    head_row = records.loc[row_labels[0]]

    # Demographics and outcome are read from the first kept row.
    demo.append([head_row[k] for k in demo_cols])
    outcome = head_row['OUTCOME']

    # Seed the forward fill: missing values of the first row fall back to the
    # medians, and the resulting flags form the first row of the mask.
    prev = head_row[test_cols]
    miss = [init_prev(prev)]

    tests, los = [], []
    for position, label in enumerate(row_labels):
        visit = records.loc[label]
        cur = visit[test_cols]
        flags = fill_nan(cur, prev)  # fill gaps from the previous visit
        if position > 0:
            # The first visit's mask row was already added by init_prev.
            miss.append(flags)
        tests.append(cur)
        los.append(visit['LOS'])
        prev = cur

    x_lab_len.append(len(los))
    missing_mask.append(miss)
    y.append([outcome, los])
    x.append(tests)
# %%
all_x = x
all_x_demo = demo
all_y = y
all_missing_mask = missing_mask
# %%
# Per-patient visit counts and the longest sequence.
all_x_labtest = np.array(all_x, dtype=object)
x_lab_length = [len(_) for _ in all_x_labtest]
x_lab_length = torch.tensor(x_lab_length, dtype=torch.int)
max_length = int(x_lab_length.max())
# Convert each patient's visits to a tensor and pad all sequences to a common
# length, with the patient axis first.
all_x_labtest = [torch.tensor(_) for _ in all_x_labtest]
all_x_labtest = torch.nn.utils.rnn.pad_sequence((all_x_labtest), batch_first=True)
# Repeat each patient's demographic vector max_length times so it can be
# concatenated in front of the test features along the last axis.
all_x_demographic = torch.tensor(all_x_demo)
batch_size, demo_dim = all_x_demographic.shape
all_x_demographic = torch.reshape(all_x_demographic.repeat(1, max_length), (batch_size, max_length, demo_dim))
all_x = torch.cat((all_x_demographic, all_x_labtest), 2)
all_y = np.array(all_y, dtype=object)
# For every patient, pair the outcome (entry 0) with each per-visit LOS value
# (entry 1), giving one [outcome, los] label per visit.
patient_list = [
    [[entry[0], visit_los] for visit_los in entry[1]]
    for entry in all_y
]
new_all_y = np.array(patient_list, dtype=object)
# Pad the per-patient label sequences to a common length, patient axis first.
output_all_y = torch.nn.utils.rnn.pad_sequence(
    [torch.Tensor(_) for _ in new_all_y],
    batch_first=True,
)
# %%
# Convert each patient's missing-value mask to a tensor and pad all masks to a
# common length, with the patient axis first.
all_missing_mask = np.array(all_missing_mask, dtype=object)
all_missing_mask = [torch.tensor(_) for _ in all_missing_mask]
all_missing_mask = torch.nn.utils.rnn.pad_sequence((all_missing_mask), batch_first=True)
# %%
all_x.shape
# %%
all_missing_mask.shape
# %%
# save pickle format dataset (torch)
# Create the output directory first; pd.to_pickle does not create missing
# parent directories and would fail with FileNotFoundError on a fresh checkout.
os.makedirs('./processed_data', exist_ok=True)
# Features, missing-value mask, labels and per-patient sequence lengths.
pd.to_pickle(all_x, './processed_data/x.pkl')
pd.to_pickle(all_missing_mask, './processed_data/missing_mask.pkl')
pd.to_pickle(output_all_y, './processed_data/y.pkl')
pd.to_pickle(x_lab_length, './processed_data/visits_length.pkl')
# %%
# Calculate patients' outcome statistics (patients-wise)
# Patient-wise outcome counts: the outcome stored at each patient's first
# timestep.
y_outcome = output_all_y[:, :, 0]
indices = torch.arange(len(x_lab_length), dtype=torch.int64)
outcome_list = np.array([y_outcome[i][0].item() for i in indices])
print(len(outcome_list))
unique, count = np.unique(outcome_list, return_counts=True)
data_count = dict(zip(unique, count))
print(data_count)
# %%
# Record-wise outcome counts: the outcome at every real (unpadded) timestep.
y_outcome = output_all_y[:, :, 0]
indices = torch.arange(len(x_lab_length), dtype=torch.int64)
outcome_records_list = np.array([
    value
    for i in indices
    for value in y_outcome[i][0:x_lab_length[i]].tolist()
])
print(len(outcome_records_list))
unique, count = np.unique(outcome_records_list, return_counts=True)
data_count = dict(zip(unique, count))
print(data_count)
# %%
# LOS statistics over the LOS value stored at each patient's first timestep.
y_los = output_all_y[:, :, 1]
indices = torch.arange(len(x_lab_length), dtype=torch.int64)
los_list = np.array([y_los[i][0].item() for i in indices])
# NOTE(review): the 0.5 factor on the mean and median is not explained
# anywhere in this file — confirm what it is meant to represent.
print(los_list.mean() * 0.5)
print(np.median(los_list) * 0.5)
print(np.percentile(los_list, 95))
print('median:', np.median(los_list))
print('Q1:', np.percentile(los_list, 25))
print('Q3:', np.percentile(los_list, 75))
# %%
# Split the per-patient LOS values by outcome (0 = alive, 1 = dead) and print
# group sizes, medians and quartiles.
los_alive_list = np.array([los_list[i] for i in range(len(los_list)) if outcome_list[i] == 0])
los_dead_list = np.array([los_list[i] for i in range(len(los_list)) if outcome_list[i] == 1])
print(len(los_alive_list))
print(len(los_dead_list))
print('[Alive]')
print('median:', np.median(los_alive_list))
print('Q1:', np.percentile(los_alive_list, 25))
print('Q3:', np.percentile(los_alive_list, 75))
print('[Dead]')
print('median:', np.median(los_dead_list))
print('Q1:', np.percentile(los_dead_list, 25))
print('Q3:', np.percentile(los_dead_list, 75))
# %%
# Persist the three LOS arrays for later use.
cdsl_los_statistics = {
'overall': los_list,
'alive': los_alive_list,
'dead': los_dead_list
}
pd.to_pickle(cdsl_los_statistics, 'cdsl_los_statistics.pkl')
# %%
# calculate visits length Median [Q1, Q3]
# Number of visits per patient, overall and split by outcome
# (0 = alive, 1 = dead), reported as median and quartiles.
visits_list = np.array(x_lab_length)
visits_alive_list = np.array([x_lab_length[i] for i in range(len(x_lab_length)) if outcome_list[i] == 0])
visits_dead_list = np.array([x_lab_length[i] for i in range(len(x_lab_length)) if outcome_list[i] == 1])
print(len(visits_alive_list))
print(len(visits_dead_list))
print('[Total]')
print('median:', np.median(visits_list))
print('Q1:', np.percentile(visits_list, 25))
print('Q3:', np.percentile(visits_list, 75))
print('[Alive]')
print('median:', np.median(visits_alive_list))
print('Q1:', np.percentile(visits_alive_list, 25))
print('Q3:', np.percentile(visits_alive_list, 75))
print('[Dead]')
print('median:', np.median(visits_dead_list))
print('Q1:', np.percentile(visits_dead_list, 25))
print('Q3:', np.percentile(visits_dead_list, 75))
# %%
def check_nan(x):
    """Print whether ``x`` contains NaN.

    ``x`` must support ``.cpu().numpy()`` (e.g. a torch tensor). The check is
    done on the sum of all elements, which is NaN as soon as one element is.
    """
    total = np.sum(x.cpu().numpy())
    message = "some values from input are nan" if np.isnan(total) else "no nan"
    print(message)
# %%
# Sanity check: report whether the final feature tensor still contains NaN.
check_nan(all_x)
# %% [markdown]
# # Draw Charts
# %% [markdown]
# ## Import packages
# %%
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import matplotlib.font_manager as font_manager
import pandas as pd
import numpy as np
# Shared plot styling.
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-<name>'
# and later releases removed the old names, so use the new name when this
# installation provides it and fall back to the historical one otherwise.
whitegrid_style = 'seaborn-v0_8-whitegrid'
if whitegrid_style not in plt.style.available:
    whitegrid_style = 'seaborn-whitegrid'
plt.style.use(whitegrid_style)
color = 'cornflowerblue'
ec = 'None'  # colour string meaning "draw no edge"
alpha = 0.5
alive_color = 'olivedrab'
dead_color = 'orchid'
# %% [markdown]
# ## Read data
# %%
# Preview the demographic table.
demographic.head()
# %%
# Keep only the demographic rows of patients that occur in train.csv. The ids
# are cast to str on both tables so the join keys have the same type.
train = pd.read_csv('./train.csv')
for frame in (train, demographic):
    frame['PATIENT_ID'] = frame['PATIENT_ID'].astype(str)
pat = pd.DataFrame({'PATIENT_ID': train['PATIENT_ID'].unique()})
demo = pd.merge(left=demographic, right=pat, how='inner', on='PATIENT_ID')
# Cohorts by outcome: 0 -> alive, 1 -> dead; "overall" is the whole join.
demo_overall = demo
demo_alive = demo.loc[demo['OUTCOME'] == 0]
demo_dead = demo.loc[demo['OUTCOME'] == 1]
# %%
# Write the three cohort tables to CSV files, without the row index.
for csv_path, cohort_frame in (
    ('demo_overall.csv', demo),
    ('demo_alive.csv', demo_alive),
    ('demo_dead.csv', demo_dead),
):
    cohort_frame.to_csv(csv_path, index=False)
# %%
def _lab_tests_of(cohort):
    """Return ``(ids, rows)`` for one cohort table.

    ``ids`` is a one-column frame with the cohort's unique PATIENT_ID values;
    ``rows`` is lab_tests inner-joined with ``ids`` on PATIENT_ID.
    """
    ids = pd.DataFrame({"PATIENT_ID": cohort['PATIENT_ID'].unique()})
    return ids, pd.merge(lab_tests, ids, how='inner', on='PATIENT_ID')


# For each cohort: select its lab-test rows, then print how many distinct
# patients were matched.
patient, lab_tests_alive = _lab_tests_of(demo_alive)
print(len(lab_tests_alive['PATIENT_ID'].unique()))
patient, lab_tests_dead = _lab_tests_of(demo_dead)
print(len(lab_tests_dead['PATIENT_ID'].unique()))
patient, lab_tests_overall = _lab_tests_of(demo_overall)
print(len(lab_tests_overall['PATIENT_ID'].unique()))
# %%
# Vital-sign rows of the patients in demo_alive (inner join on PATIENT_ID).
# The cell's value is the number of distinct patients that matched.
alive_ids = demo_alive['PATIENT_ID'].unique()
patient = pd.DataFrame({"PATIENT_ID": alive_ids})
vital_signs_alive = pd.merge(left=vital_signs, right=patient, on='PATIENT_ID', how='inner')
len(vital_signs_alive['PATIENT_ID'].unique())
# %%
# Vital-sign rows of the patients in demo_dead (inner join on PATIENT_ID).
# The cell's value is the number of distinct patients that matched.
dead_ids = demo_dead['PATIENT_ID'].unique()
patient = pd.DataFrame({"PATIENT_ID": dead_ids})
vital_signs_dead = pd.merge(left=vital_signs, right=patient, on='PATIENT_ID', how='inner')
len(vital_signs_dead['PATIENT_ID'].unique())
# %%
# Vital-sign rows of the patients in demo_overall (inner join on PATIENT_ID).
# The cell's value is the number of distinct patients that matched.
overall_ids = demo_overall['PATIENT_ID'].unique()
patient = pd.DataFrame({"PATIENT_ID": overall_ids})
vital_signs_overall = pd.merge(left=vital_signs, right=patient, on='PATIENT_ID', how='inner')
len(vital_signs_overall['PATIENT_ID'].unique())
# %%
# NOTE: this 4x4 grid of histograms used to be switched off by wrapping the
# plotting code (16 copy-pasted stanzas) in a bare string literal. It is still
# not executed automatically: the function below is only defined. To draw the
# figure, call
#
#     draw_cohort_histogram_grid(
#         (demo_overall, demo_alive, demo_dead),
#         (vital_signs_overall, vital_signs_alive, vital_signs_dead),
#         (lab_tests_overall, lab_tests_alive, lab_tests_dead),
#     )

# One entry per panel, in drawing order: (frame group, column, x-axis label).
# Column names and labels are reproduced verbatim from the original code.
HISTOGRAM_PANELS = (
    ('demo', 'AGE', 'Age'),
    ('vital', 'TEMPERATURE', 'Temperature'),
    ('lab', 'CREA -- CREATININA', 'CREA -- CREATININA'),
    ('lab', 'HEM -- Hemat¡es', 'HEM -- Hemat¡es'),
    ('lab', 'LEUC -- Leucocitos', 'LEUC -- Leucocitos'),
    ('lab', 'PLAQ -- Recuento de plaquetas', 'PLAQ -- Recuento de plaquetas'),
    ('lab', 'CHCM -- Conc. Hemoglobina Corpuscular Media', 'CHCM'),
    ('lab', 'HCTO -- Hematocrito', 'HCTO -- Hematocrito'),
    ('lab', 'VCM -- Volumen Corpuscular Medio', 'VCM -- Volumen Corpuscular Medio'),
    ('lab', 'HGB -- Hemoglobina', 'HGB -- Hemoglobina'),
    ('lab', 'HCM -- Hemoglobina Corpuscular Media', 'HCM -- Hemoglobina Corpuscular Media'),
    ('lab', 'NEU -- Neutr¢filos', 'NEU -- Neutr¢filos'),
    ('lab', 'NEU% -- Neutr¢filos %', 'NEU% -- Neutr¢filos%'),
    ('lab', 'LIN -- Linfocitos', 'LIN -- Linfocitos'),
    ('lab', 'LIN% -- Linfocitos %', 'LIN% -- Linfocitos%'),
    ('lab', 'ADW -- Coeficiente de anisocitosis', 'ADW -- Coeficiente de anisocitosis'),
)


def _fraction_weights(n_rows):
    """Return ``n_rows`` equal weights of ``1 / n_rows`` (bar heights sum to 1)."""
    return np.ones(n_rows) / n_rows


def draw_cohort_histogram_grid(demo_frames, vital_sign_frames, lab_test_frames, limit=0.05):
    """Draw one histogram panel per entry of ``HISTOGRAM_PANELS`` on a 4x4 grid.

    Every panel shows a filled histogram for the overall cohort and two step
    outlines for the alive and dead cohorts. Each histogram uses 20 bins over
    its own data and is weighted so the bar heights are fractions of that
    cohort's retained rows; the y axis is formatted as a percentage.

    Args:
        demo_frames: ``(overall, alive, dead)`` tables used by 'demo' panels.
        vital_sign_frames: ``(overall, alive, dead)`` tables used by 'vital' panels.
        lab_test_frames: ``(overall, alive, dead)`` tables used by 'lab' panels.
        limit: Quantile fraction cut off at each tail. The bounds are computed
            on the overall table and applied to all three tables of a panel.

    Reads the module-level styling names ``color``, ``ec``, ``alpha``,
    ``alive_color`` and ``dead_color``. Shows the figure; returns None.
    """
    frames_by_group = {
        'demo': demo_frames,
        'vital': vital_sign_frames,
        'lab': lab_test_frames,
    }
    csfont = {'fontname': 'Times New Roman', 'fontsize': 18}
    fig = plt.figure(figsize=(16, 12), dpi=100, facecolor='w', edgecolor='k')

    legend_ax = None
    for idx, (group, key, xlabel) in enumerate(HISTOGRAM_PANELS, start=1):
        overall, alive, dead = frames_by_group[group]
        low = overall[key].quantile(limit)
        high = overall[key].quantile(1 - limit)
        kept_overall = overall.loc[overall[key].between(low, high), key]
        kept_alive = alive.loc[alive[key].between(low, high), key]
        kept_dead = dead.loc[dead[key].between(low, high), key]

        ax = plt.subplot(4, 4, idx)
        if legend_ax is None:
            # Only the first panel is labelled; the figure-level legend is
            # built from this panel's handles.
            legend_ax = ax
            overall_label, alive_label, dead_label = 'overall', 'alive', 'dead'
        else:
            overall_label = alive_label = dead_label = None

        ax.hist(kept_overall, bins=20, weights=_fraction_weights(len(kept_overall)),
                color=color, ec=ec, alpha=alpha, label=overall_label)
        plt.xlabel(xlabel, **csfont)
        plt.ylabel('Percentage', **csfont)
        plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
        ax.hist(kept_alive, bins=20, weights=_fraction_weights(len(kept_alive)),
                color='green', ec=alive_color, alpha=1, histtype="step",
                linewidth=2, label=alive_label)
        ax.hist(kept_dead, bins=20, weights=_fraction_weights(len(kept_dead)),
                color='green', ec=dead_color, alpha=1, histtype="step",
                linewidth=2, label=dead_label)
        plt.xticks(**csfont)
        plt.yticks(**csfont)

    handles, labels = legend_ax.get_legend_handles_labels()
    plt.figlegend(
        handles, labels, loc='upper center', ncol=5, fontsize=18,
        bbox_to_anchor=(0.5, 1.05),
        prop=font_manager.FontProperties(family='Times New Roman', style='normal', size=18),
    )
    fig.tight_layout()
    plt.show()