--- a +++ b/datasets/cdsl/preprocess.py @@ -0,0 +1,1512 @@ +# %% [markdown] +# # hm dataset pre-processing +# +# import packages + +# %% +import os +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +import pickle as pkl +import torch +import math +import datetime +from tqdm import tqdm +import datetime +import re +from functools import reduce + +# %% [markdown] +# ## Demographic data + +# %% +demographic = pd.read_csv('./raw_data/19_04_2021/COVID_DSL_01.CSV', encoding='ISO-8859-1', sep='|') +print(len(demographic)) +demographic.head() + +# %% +med = pd.read_csv('./raw_data/19_04_2021/COVID_DSL_04.CSV', encoding='ISO-8859-1', sep='|') +print(len(med)) +med.head() + +# %% +len(med['ID_ATC7'].unique()) + +# %% [markdown] +# get rid of patient with missing label + +# %% +print(len(demographic)) +demographic = demographic.dropna(axis=0, how='any', subset=['IDINGRESO', 'F_INGRESO_ING', 'F_ALTA_ING', 'MOTIVO_ALTA_ING']) +print(len(demographic)) + +# %% +def outcome2num(x): + if x == 'Fallecimiento': + return 1 + else: + return 0 + +def to_one_hot(x, feature): + if x == feature: + return 1 + else: + return 0 + +# %% +# select necessary columns from demographic +demographic = demographic[ + [ + 'IDINGRESO', + 'EDAD', + 'SEX', + 'F_INGRESO_ING', + 'F_ALTA_ING', + 'MOTIVO_ALTA_ING', + 'ESPECIALIDAD_URGENCIA', + 'DIAG_URG' + ] + ] + +# rename column +demographic = demographic.rename(columns={ + 'IDINGRESO': 'PATIENT_ID', + 'EDAD': 'AGE', + 'SEX': 'SEX', + 'F_INGRESO_ING': 'ADMISSION_DATE', + 'F_ALTA_ING': 'DEPARTURE_DATE', + 'MOTIVO_ALTA_ING': 'OUTCOME', + 'ESPECIALIDAD_URGENCIA': 'DEPARTMENT_OF_EMERGENCY', + 'DIAG_URG': 'DIAGNOSIS_AT_EMERGENCY_VISIT' +}) + +# SEX: male: 1; female: 0 +demographic['SEX'].replace('MALE', 1, inplace=True) +demographic['SEX'].replace('FEMALE', 0, inplace=True) + +# outcome: Fallecimiento(dead): 1; others: 0 +demographic['OUTCOME'] = demographic['OUTCOME'].map(outcome2num) + +# diagnosis at emergency visit (loss rate < 10%) +# demographic['DIFFICULTY_BREATHING'] = demographic['DIAGNOSIS_AT_EMERGENCY_VISIT'].map(lambda x: to_one_hot(x, 'DIFICULTAD RESPIRATORIA')) # 1674 +# demographic['SUSPECT_COVID'] = demographic['DIAGNOSIS_AT_EMERGENCY_VISIT'].map(lambda x: to_one_hot(x, 'SOSPECHA COVID-19')) # 960 +# demographic['FEVER'] = demographic['DIAGNOSIS_AT_EMERGENCY_VISIT'].map(lambda x: to_one_hot(x, 'FIEBRE')) # 455 + +# department of emergency (loss rate < 10%) +# demographic['EMERGENCY'] = demographic['DEPARTMENT_OF_EMERGENCY'].map(lambda x: to_one_hot(x, 'Medicina de Urgencias')) # 3914 + +# %% +# del useless data +demographic = demographic[ + [ + 'PATIENT_ID', + 'AGE', + 'SEX', + 'ADMISSION_DATE', + 'DEPARTURE_DATE', + 'OUTCOME', + # 'DIFFICULTY_BREATHING', + # 'SUSPECT_COVID', + # 'FEVER', + # 'EMERGENCY' + ] + ] + +# %% +demographic.describe().to_csv('demographic_overview.csv', mode='w', index=False) +demographic.describe() + +# %% [markdown] +# ### Analyze data + +# %% +plt.scatter(demographic['PATIENT_ID'], demographic['AGE'], s=1) +plt.xlabel('Patient Id') +plt.ylabel('Age') +plt.title('Patient-Age Scatter Plot') + +# %% +plt.scatter(demographic['PATIENT_ID'], demographic['AGE'], s=1) +plt.xlabel('Patient Id') +plt.ylabel('Age') +plt.title('Patient-Age Scatter Plot') + +# %% +demographic.to_csv('demographic.csv', mode='w', index=False) +demographic.head() + +# %% [markdown] +# ## Vital Signal + +# %% +vital_signs = pd.read_csv('./raw_data/19_04_2021/COVID_DSL_02.CSV', encoding='ISO-8859-1', sep='|') +print(len(vital_signs)) +vital_signs.head() + +# %% +vital_signs = vital_signs.rename(columns={ + 'IDINGRESO': 'PATIENT_ID', + 'CONSTANTS_ING_DATE': 'RECORD_DATE', + 'CONSTANTS_ING_TIME': 'RECORD_TIME', + 'FC_HR_ING': 'HEART_RATE', + 'GLU_GLY_ING': 'BLOOD_GLUCOSE', + 'SAT_02_ING': 'OXYGEN_SATURATION', + 'TA_MAX_ING': 'MAX_BLOOD_PRESSURE', + 'TA_MIN_ING': 'MIN_BLOOD_PRESSURE', + 'TEMP_ING': 'TEMPERATURE' +}) +vital_signs['RECORD_TIME'] = vital_signs['RECORD_DATE'] + ' ' + vital_signs['RECORD_TIME'] +vital_signs['RECORD_TIME'] = vital_signs['RECORD_TIME'].map(lambda x: str(datetime.datetime.strptime(x, '%Y-%m-%d %H:%M'))) +vital_signs = vital_signs.drop(['RECORD_DATE', 'SAT_02_ING_OBS', 'BLOOD_GLUCOSE'], axis=1) + +# %% +vital_signs.describe() + +# %% +vital_signs.head() + +# %% +def format_temperature(x): + if type(x) == str: + return float(x.replace(',', '.')) + else: + return float(x) + +def format_oxygen(x): + x = float(x) + if x > 100: + return np.nan + else: + return x + +def format_heart_rate(x): + x = int(x) + if x > 220: + return np.nan + else: + return x + +vital_signs['TEMPERATURE'] = vital_signs['TEMPERATURE'].map(lambda x: format_temperature(x)) +vital_signs['OXYGEN_SATURATION'] = vital_signs['OXYGEN_SATURATION'].map(lambda x: format_oxygen(x)) +vital_signs['HEART_RATE'] = vital_signs['HEART_RATE'].map(lambda x: format_heart_rate(x)) + +# %% +vital_signs = vital_signs.replace(0, np.NAN) + +# %% +vital_signs = vital_signs.groupby(['PATIENT_ID', 'RECORD_TIME'], dropna=True, as_index = False).mean() +vital_signs.head() + +# %% +vital_signs.describe() + +# %% +vital_signs.describe().to_csv('vital_signs_overview.csv', index=False, mode='w') +vital_signs.describe() + +# %% +""" +#plt.rcParams['figure.figsize'] = [10, 5] +fig=plt.figure(figsize=(16,10), dpi= 100, facecolor='w', edgecolor='k') + +plt.subplot(2, 3, 1) +plt.scatter(vital_signs.index, vital_signs['MAX_BLOOD_PRESSURE'], s=1) +plt.xlabel('Index') +plt.ylabel('Max Blood Pressure') +plt.title('Visit-Max Blood Pressure Scatter Plot') + +plt.subplot(2, 3, 2) +plt.scatter(vital_signs.index, vital_signs['MIN_BLOOD_PRESSURE'], s=1) +plt.xlabel('Index') +plt.ylabel('Min Blood Pressure') +plt.title('Visit-Min Blood Pressure Scatter Plot') + +plt.subplot(2, 3, 3) +plt.scatter(vital_signs.index, vital_signs['TEMPERATURE'], s=1) +plt.xlabel('Index') +plt.ylabel('Temperature') +plt.title('Visit-Temperature Scatter Plot') + +plt.subplot(2, 3, 4) +plt.scatter(vital_signs.index, vital_signs['HEART_RATE'], s=1) +plt.xlabel('Index') +plt.ylabel('Heart Rate') +plt.title('Visit-Heart Rate Scatter Plot') + +plt.subplot(2, 3, 5) +plt.scatter(vital_signs.index, vital_signs['OXYGEN_SATURATION'], s=1) +plt.xlabel('Index') +plt.ylabel('Oxygen Saturation') +plt.title('Visit-Oxygen Saturation Scatter Plot') + +plt.show() +""" +# %% +""" +#plt.rcParams['figure.figsize'] = [10, 5] +fig=plt.figure(figsize=(16,10), dpi= 100, facecolor='w', edgecolor='k') + +plt.subplot(2, 3, 1) +plt.hist(vital_signs['MAX_BLOOD_PRESSURE'], bins=30) +plt.xlabel('Index') +plt.ylabel('Max Blood Pressure') +plt.title('Visit-Max Blood Pressure Histogram') + +plt.subplot(2, 3, 2) +plt.hist(vital_signs['MIN_BLOOD_PRESSURE'], bins=30) +plt.xlabel('Index') +plt.ylabel('Min Blood Pressure') +plt.title('Visit-Min Blood Pressure Histogram') + +plt.subplot(2, 3, 3) +plt.hist(vital_signs['TEMPERATURE'], bins=30) +plt.xlabel('Index') +plt.ylabel('Temperature') +plt.title('Visit-Temperature Histogram') + +plt.subplot(2, 3, 4) +plt.hist(vital_signs['HEART_RATE'], bins=30) +plt.xlabel('Index') +plt.ylabel('Heart Rate') +plt.title('Visit-Heart Rate Histogram') + +plt.subplot(2, 3, 5) +plt.hist(vital_signs['OXYGEN_SATURATION'], bins=30) +plt.xlabel('Index') +plt.ylabel('Oxygen Saturation') +plt.title('Visit-Oxygen Saturation Histogram') + +plt.show() +""" +# %% [markdown] +# ### Missing rate of each visit + +# %% +sum(vital_signs.T.isnull().sum()) / ((len(vital_signs.T) - 2) * len(vital_signs)) + +# %% [markdown] +# ### Normalize data + +# %% +""" +for key in vital_signs.keys()[2:]: + vital_signs[key] = (vital_signs[key] - vital_signs[key].mean()) / (vital_signs[key].std() + 1e-12) + +vital_signs.describe() +""" + +# %% +vital_signs.to_csv('visual_signs.csv', mode='w', index=False) + +# %% +len(vital_signs) / len(vital_signs['PATIENT_ID'].unique()) + +# %% [markdown] +# ## Lab Tests + +# %% +lab_tests = pd.read_csv('./raw_data/19_04_2021/COVID_DSL_06_v2.CSV', encoding='ISO-8859-1', sep=';') +lab_tests = lab_tests.rename(columns={'IDINGRESO': 'PATIENT_ID'}) +print(len(lab_tests)) + +# del useless data +lab_tests = lab_tests[ + [ + 'PATIENT_ID', + 'LAB_NUMBER', + 'LAB_DATE', + 'TIME_LAB', + 'ITEM_LAB', + 'VAL_RESULT' + # UD_RESULT: unit + # REF_VALUES: reference values + ] + ] + +lab_tests.head() + +# %% +lab_tests = lab_tests.groupby(['PATIENT_ID', 'LAB_NUMBER', 'LAB_DATE', 'TIME_LAB', 'ITEM_LAB'], dropna=True, as_index = False).first() +lab_tests = lab_tests.set_index(['PATIENT_ID', 'LAB_NUMBER', 'LAB_DATE', 'TIME_LAB', 'ITEM_LAB'], drop = True).unstack('ITEM_LAB')['VAL_RESULT'].reset_index() + +lab_tests = lab_tests.drop([ + 'CFLAG -- ALARMA HEMOGRAMA', + 'CORONA -- PCR CORONAVIRUS 2019nCoV', + 'CRIOGLO -- CRIOGLOBULINAS', + 'EGCOVID -- ESTUDIO GENETICO COVID-19', + 'FRO1 -- ', + 'FRO1 -- FROTIS EN SANGRE PERIFERICA', + 'FRO2 -- ', + 'FRO2 -- FROTIS EN SANGRE PERIFERICA', + 'FRO3 -- ', + 'FRO3 -- FROTIS EN SANGRE PERIFERICA', + 'FRO_COMEN -- ', + 'FRO_COMEN -- FROTIS EN SANGRE PERIFERICA', + 'G-CORONAV (RT-PCR) -- Tipo de muestra: ASPIRADO BRONCOALVEOLAR', + 'G-CORONAV (RT-PCR) -- Tipo de muestra: EXUDADO', + 'GRRH -- GRUPO SANGUÖNEO Y FACTOR Rh', + 'HEML -- RECUENTO CELULAR LIQUIDO', + 'HEML -- Recuento Hemat¡es', + 'IFSUERO -- INMUNOFIJACION EN SUERO', + 'OBS_BIOMOL -- OBSERVACIONES GENETICA MOLECULAR', + 'OBS_BIOO -- Observaciones Bioqu¡mica Orina', + 'OBS_CB -- Observaciones Coagulaci¢n', + 'OBS_GASES -- Observaciones Gasometr¡a Arterial', + 'OBS_GASV -- Observaciones Gasometr¡a Venosa', + 'OBS_GEN2 -- OBSERVACIONES GENETICA', + 'OBS_HOR -- Observaciones Hormonas', + 'OBS_MICRO -- Observaciones Microbiolog¡a', + 'OBS_NULA2 -- Observaciones Bioqu¡mica', + 'OBS_NULA3 -- Observaciones Hematolog¡a', + 'OBS_PESP -- Observaciones Pruebas especiales', + 'OBS_SERO -- Observaciones Serolog¡a', + 'OBS_SIS -- Observaciones Orina', + 'PCR VIRUS RESPIRATORIOS -- Tipo de muestra: ASPIRADO BRONCOALVEOLAR', + 'PCR VIRUS RESPIRATORIOS -- Tipo de muestra: BAS', + 'PCR VIRUS RESPIRATORIOS -- Tipo de muestra: ESPUTO', + 'PCR VIRUS RESPIRATORIOS -- Tipo de muestra: EXUDADO', + 'PCR VIRUS RESPIRATORIOS -- Tipo de muestra: LAVADO BRONCOALVEOLAR', + 'PCR VIRUS RESPIRATORIOS -- Tipo de muestra: LAVADO NASOFARÖNGEO', + 'PTGOR -- PROTEINOGRAMA ORINA', + 'RESUL_IFT -- ESTUDIO DE INMUNOFENOTIPO', + 'RESUL_IFT -- Resultado', + 'Resultado -- Resultado', + 'SED1 -- ', + 'SED1 -- SEDIMENTO', + 'SED2 -- ', + 'SED2 -- SEDIMENTO', + 'SED3 -- ', + 'SED3 -- SEDIMENTO', + 'TIPOL -- TIPO DE LIQUIDO', + 'Tecnica -- T\x82cnica', + 'TpMues -- Tipo de muestra', + 'VHCBLOT -- INMUNOBLOT VIRUS HEPATITIS C', + 'VIR_TM -- VIRUS TIPO DE MUESTRA', + 'LEGIORI -- AG. LEGIONELA PNEUMOPHILA EN ORINA', + 'NEUMOORI -- AG NEUMOCOCO EN ORINA', + 'VIHAC -- VIH AC' + ], axis=1) + + +lab_tests.head() + +# %% +lab_tests = lab_tests.replace('Sin resultado.', np.nan) +lab_tests = lab_tests.replace('Sin resultado', np.nan) +lab_tests = lab_tests.replace('----', np.nan).replace('---', np.nan) +lab_tests = lab_tests.replace('> ', '').replace('< ', '') + +def change_format(x): + if x is None: + return np.nan + elif type(x) == str: + if x.startswith('Negativo ('): + return x.replace('Negativo (', '-')[:-1] + elif x.startswith('Positivo ('): + return x.replace('Positivo (', '')[:-1] + elif x.startswith('Zona limite ('): + return x.replace('Zona limite (', '')[:-1] + elif x.startswith('>'): + return x.replace('> ', '').replace('>', '') + elif x.startswith('<'): + return x.replace('< ', '').replace('<', '') + elif x.endswith(' mg/dl'): + return x.replace(' mg/dl', '') + elif x.endswith('/æl'): + return x.replace('/æl', '') + elif x.endswith(' copias/mL'): + return x.replace(' copias/mL', '') + elif x == 'Numerosos': + return 1.5 + elif x == 'Aislados': + return 0.5 + elif x == 'Se detecta' or x == 'Se observan' or x == 'Normal' or x == 'Positivo': + return 1 + elif x == 'No se detecta' or x == 'No se observan' or x == 'Negativo': + return 0 + elif x == 'Indeterminado': + return np.nan + else: + num = re.findall("[-+]?\d+\.\d+", x) + if len(num) == 0: + return np.nan + else: + return num[0] + else: + return x + +feature_value_dict = dict() + +for k in tqdm(lab_tests.keys()[4:]): + lab_tests[k] = lab_tests[k].map(lambda x: change_format(change_format(x))) + feature_value_dict[k] = lab_tests[k].unique() + +# %% +def nan_and_not_nan(x): + if x == x: + return 1 + else: # nan + return 0 + +def is_float(num): + try: + float(num) + return True + except ValueError: + return False + +def is_all_float(x): + for i in x: + if i == i and (i != None): + if not is_float(i): + return False + return True + +def to_float(x): + if x != None: + return float(x) + else: + return np.nan + +other_feature_dict = dict() + +for feature in tqdm(feature_value_dict.keys()): + values = feature_value_dict[feature] + if is_all_float(values): + lab_tests[feature] = lab_tests[feature].map(lambda x: to_float(x)) + elif len(values) == 2: + lab_tests[feature] = lab_tests[feature].map(lambda x: nan_and_not_nan(x)) + else: + other_feature_dict[feature] = values + +# %% +other_feature_dict + +# %% +def format_time(t): + if '/' in t: + return str(datetime.datetime.strptime(t, '%d/%m/%Y %H:%M')) + else: + return str(datetime.datetime.strptime(t, '%d-%m-%Y %H:%M')) + +lab_tests['RECORD_TIME'] = lab_tests['LAB_DATE'] + ' ' + lab_tests['TIME_LAB'] +lab_tests['RECORD_TIME'] = lab_tests['RECORD_TIME'].map(lambda x: format_time(x)) +lab_tests = lab_tests.drop(['LAB_NUMBER', 'LAB_DATE', 'TIME_LAB'], axis=1) +# lab_tests = lab_tests.drop(['LAB_NUMBER', 'TIME_LAB'], axis=1) +lab_tests.head() + +# %% +lab_tests_patient = lab_tests.groupby(['PATIENT_ID'], dropna=True, as_index = False).mean() +print(len(lab_tests_patient)) +count = [i for i in lab_tests_patient.count()[1:]] +plt.hist(count) + +# %% +patient_total = len(lab_tests_patient) +threshold = patient_total * 0.1 +reserved_keys = [] + +for key in lab_tests_patient.keys(): + if lab_tests_patient[key].count() > threshold: + reserved_keys.append(key) + +print(len(reserved_keys)) +reserved_keys + +# %% +reserved_keys.insert(1, 'RECORD_TIME') + +lab_tests = lab_tests.groupby(['PATIENT_ID', 'RECORD_TIME'], dropna=True, as_index = False).mean() + +lab_tests = lab_tests[reserved_keys] +lab_tests.head() +""" +# %% [markdown] +# ### Missing rate of each visit + +# %% +sum(lab_tests.T.isnull().sum()) / ((len(lab_tests.T) - 2) * len(lab_tests)) + +# %% [markdown] +# ### Scatter Plot + +# %% +fig=plt.figure(figsize=(16,200), dpi= 100, facecolor='w', edgecolor='k') + +i = 1 +for key in lab_tests.keys()[2:]: + plt.subplot(33, 3, i) + plt.scatter(lab_tests.index, lab_tests[key], s=1) + plt.ylabel(key) + i += 1 + +plt.show() + +# %% +fig=plt.figure(figsize=(20,120), dpi= 100, facecolor='w', edgecolor='k') + +i = 1 +for key in lab_tests.keys()[2:]: + plt.subplot(23, 4, i) + plt.hist(lab_tests[key], bins=30) + q3 = lab_tests[key].quantile(0.75) + q1 = lab_tests[key].quantile(0.25) + qh = q3 + 3 * (q3 - q1) + ql = q1 - 3 * (q3 - q1) + sigma = 5 + plt.axline([sigma*lab_tests[key].std() + lab_tests[key].mean(), 0], [sigma*lab_tests[key].std() + lab_tests[key].mean(), 1], color = "r", linestyle=(0, (5, 5))) + plt.axline([-sigma*lab_tests[key].std() + lab_tests[key].mean(), 0], [-sigma*lab_tests[key].std() + lab_tests[key].mean(), 1], color = "r", linestyle=(0, (5, 5))) + #plt.axline([lab_tests[key].quantile(0.25), 0], [lab_tests[key].quantile(0.25), 1], color = "k", linestyle=(0, (5, 5))) + #plt.axline([lab_tests[key].quantile(0.75), 0], [lab_tests[key].quantile(0.75), 1], color = "k", linestyle=(0, (5, 5))) + plt.axline([qh, 0], [qh, 1], color='k', linestyle=(0, (5, 5))) + plt.axline([ql, 0], [ql, 1], color='k', linestyle=(0, (5, 5))) + plt.ylabel(key) + i += 1 + +plt.show() +""" +# %% [markdown] +# ### Normalize data + +# %% +""" +for key in lab_tests.keys()[2:]: + lab_tests[key] = (lab_tests[key] - lab_tests[key].mean()) / (lab_tests[key].std() + 1e-12) + +lab_tests.describe() +""" + +# %% +# 【del normalization】 +# for key in lab_tests.keys()[2:]: +# r = lab_tests[lab_tests[key].between(lab_tests[key].quantile(0.05), lab_tests[key].quantile(0.95))] +# lab_tests[key] = (lab_tests[key] - r[key].mean()) / (r[key].std() + 1e-12) + +# %% +lab_tests.to_csv('lab_test.csv', mode='w', index=False) + +# %% [markdown] +# # Concat data + +# %% +demographic['PATIENT_ID'] = demographic['PATIENT_ID'].map(lambda x: str(int(x))) +vital_signs['PATIENT_ID'] = vital_signs['PATIENT_ID'].map(lambda x: str(int(x))) +lab_tests['PATIENT_ID'] = lab_tests['PATIENT_ID'].map(lambda x: str(int(x))) + +# %% +len(demographic['PATIENT_ID'].unique()), len(vital_signs['PATIENT_ID'].unique()), len(lab_tests['PATIENT_ID'].unique()) + +# %% +train_df = pd.merge(vital_signs, lab_tests, on=['PATIENT_ID', 'RECORD_TIME'], how='outer') + +train_df = train_df.groupby(['PATIENT_ID', 'RECORD_TIME'], dropna=True, as_index = False).mean() + +train_df = pd.merge(demographic, train_df, on=['PATIENT_ID'], how='left') + +train_df.head() + +# %% +# del rows without patient_id, admission_date, record_time, or outcome +train_df = train_df.dropna(axis=0, how='any', subset=['PATIENT_ID', 'ADMISSION_DATE', 'RECORD_TIME', 'OUTCOME']) + +# %% +train_df.to_csv('train.csv', mode='w', index=False) +train_df.describe() + +# %% [markdown] +# ## Missing rate of each visit + +# %% +sum(train_df.T.isnull().sum()) / ((len(train_df.T) - 2) * len(train_df)) + +# %% [markdown] +# # Split and save data + +# %% [markdown] +# * demo: demographic data +# * x: lab test & vital signs +# * y: outcome & length of stay + +# %% +patient_ids = train_df['PATIENT_ID'].unique() + +demo_cols = ['AGE', 'SEX'] # , 'DIFFICULTY_BREATHING', 'FEVER', 'SUSPECT_COVID', 'EMERGENCY' +test_cols = [] + +# get column names +for k in train_df.keys(): + if not k in demographic.keys(): + if not k == 'RECORD_TIME': + test_cols.append(k) + +test_median = train_df[test_cols].median() + +# %% +test_cols + +# %% +train_df['RECORD_TIME_DAY'] = train_df['RECORD_TIME'].map(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%d')) +train_df['RECORD_TIME_HOUR'] = train_df['RECORD_TIME'].map(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%d %H')) +train_df.head() + +# %% +train_df_day = train_df.groupby(['PATIENT_ID', 'ADMISSION_DATE', 'DEPARTURE_DATE', 'RECORD_TIME_DAY'], dropna=True, as_index = False).mean() +train_df_hour = train_df.groupby(['PATIENT_ID', 'ADMISSION_DATE', 'DEPARTURE_DATE', 'RECORD_TIME_HOUR'], dropna=True, as_index = False).mean() + +len(train_df), len(train_df_day), len(train_df_hour) + +# %% [markdown] +# +# ``` +# number of visits (total) +# - Original data: 168777 +# - Merge by hour: 130141 +# - Merge by day: 42204 +# ``` + +# %% +len(train_df['PATIENT_ID'].unique()) + +# %% +def get_visit_intervals(df): + ls = [] + for pat in df['PATIENT_ID'].unique(): + ls.append(len(df[df['PATIENT_ID'] == pat])) + return ls + +# %% +ls_org = get_visit_intervals(train_df) +ls_hour = get_visit_intervals(train_df_hour) +ls_day = get_visit_intervals(train_df_day) + +# %% +import matplotlib.pyplot as plt +from matplotlib.ticker import PercentFormatter +import matplotlib.font_manager as font_manager +import pandas as pd +import numpy as np +""" +csfont = {'fontname':'Times New Roman', 'fontsize': 18} +font = 'Times New Roman' +fig=plt.figure(figsize=(18,4), dpi= 100, facecolor='w', edgecolor='k') +plt.style.use('seaborn-whitegrid') +color = 'cornflowerblue' +ec = 'None' +alpha=0.5 + +ax = plt.subplot(1, 3, 1) +ax.hist(ls_org, bins=20, weights=np.ones(len(ls_org)) / len(ls_org), color=color, ec=ec, alpha=alpha, label='overall') +plt.xlabel('Num of visits (org)',**csfont) +plt.ylabel('Percentage',**csfont) +plt.gca().yaxis.set_major_formatter(PercentFormatter(1)) +plt.xticks(**csfont) +plt.yticks(**csfont) + +ax = plt.subplot(1, 3, 2) +ax.hist(ls_hour, bins=20, weights=np.ones(len(ls_hour)) / len(ls_hour), color=color, ec=ec, alpha=alpha, label='overall') +plt.xlabel('Num of visits (hour)',**csfont) +plt.ylabel('Percentage',**csfont) +plt.gca().yaxis.set_major_formatter(PercentFormatter(1)) +plt.xticks(**csfont) +plt.yticks(**csfont) + +ax = plt.subplot(1, 3, 3) +ax.hist(ls_day, bins=20, weights=np.ones(len(ls_day)) / len(ls_day), color=color, ec=ec, alpha=alpha, label='overall') +plt.xlabel('Num of visits (day)',**csfont) +plt.ylabel('Percentage',**csfont) +plt.gca().yaxis.set_major_formatter(PercentFormatter(1)) +plt.xticks(**csfont) +plt.yticks(**csfont) + +plt.show() +""" +# %% +def get_statistic(lst, name): + print(f'[{name}]\tMax:\t{max(lst)}, Min:\t{min(lst)}, Median:\t{np.median(lst)}, Mean:\t{np.mean(lst)}, 80%:\t{np.quantile(lst, 0.8)}, 90%:\t{np.quantile(lst, 0.9)}, 95%:\t{np.quantile(lst, 0.95)}') + +# %% +get_statistic(ls_org, 'ls_org') +get_statistic(ls_hour, 'ls_hour') +get_statistic(ls_day, 'ls_day') + +# %% +train_df_hour['LOS'] = train_df_hour['ADMISSION_DATE'] +train_df_hour['LOS_HOUR'] = train_df_hour['ADMISSION_DATE'] + +# %% +train_df_hour = train_df_hour.reset_index() + +# %% +for idx in tqdm(range(len(train_df_hour))): + info = train_df_hour.loc[idx] + admission = datetime.datetime.strptime(info['ADMISSION_DATE'], '%Y-%m-%d %H:%M:%S') + departure = datetime.datetime.strptime(info['DEPARTURE_DATE'], '%Y-%m-%d %H:%M:%S') + visit_hour = datetime.datetime.strptime(info['RECORD_TIME_HOUR'], '%Y-%m-%d %H') + hour = (departure - visit_hour).seconds / (24 * 60 * 60) + (departure - visit_hour).days + los = (departure - admission).seconds / (24 * 60 * 60) + (departure - admission).days + train_df_hour.at[idx, 'LOS'] = float(los) + train_df_hour.at[idx, 'LOS_HOUR'] = float(hour) + +# %% +train_df_hour['LOS'] + +# %% +los = [] +for pat in tqdm(train_df_hour['PATIENT_ID'].unique()): + los.append(float(train_df_hour[train_df_hour['PATIENT_ID'] == pat]['LOS'].head(1))) + +# %% +get_statistic(los, 'los') + +# %% +import matplotlib.pyplot as plt +from matplotlib.ticker import PercentFormatter +import matplotlib.font_manager as font_manager +import pandas as pd +import numpy as np +""" +csfont = {'fontname':'Times New Roman', 'fontsize': 18} +font = 'Times New Roman' +fig=plt.figure(figsize=(6,6), dpi= 100, facecolor='w', edgecolor='k') +plt.style.use('seaborn-whitegrid') +color = 'cornflowerblue' +ec = 'None' +alpha=0.5 + +ax = plt.subplot(1, 1, 1) +ax.hist(los, bins=20, weights=np.ones(len(los)) / len(los), color=color, ec=ec, alpha=alpha, label='overall') +plt.xlabel('Length of stay',**csfont) +plt.ylabel('Percentage',**csfont) +plt.gca().yaxis.set_major_formatter(PercentFormatter(1)) +plt.xticks(**csfont) +plt.yticks(**csfont) + +plt.show() +""" +# %% +train_df_hour_idx = train_df_hour.reset_index() + +# %% +train_df_hour_idx['LOS'] = train_df_hour_idx['ADMISSION_DATE'] + +for idx in tqdm(range(len(train_df_hour_idx))): + info = train_df_hour_idx.loc[idx] + # admission = datetime.datetime.strptime(info['ADMISSION_DATE'], '%Y-%m-%d %H:%M:%S') + departure = datetime.datetime.strptime(info['DEPARTURE_DATE'], '%Y-%m-%d %H:%M:%S') + visit_hour = datetime.datetime.strptime(info['RECORD_TIME_HOUR'], '%Y-%m-%d %H') + hour = (departure - visit_hour).seconds / (24 * 60 * 60) + (departure - visit_hour).days + train_df_hour_idx.at[idx, 'LOS'] = float(hour) + +# %% +train_df_hour['LOS'] = train_df_hour['LOS_HOUR'] +train_df_hour.drop(columns=['LOS_HOUR']) + +# %% +# los_threshold = 13.0 + +# visit_num_hour = [] + +# for pat in tqdm(train_df_hour_idx['PATIENT_ID'].unique()): +# pat_records = train_df_hour_idx[train_df_hour_idx['PATIENT_ID'] == pat] +# hour = 0 +# for vis in pat_records.index: +# pat_visit = pat_records.loc[vis] +# if pat_visit['LOS_HOUR'] <= los_threshold: +# hour += 1 +# visit_num_hour.append(hour) +# if hour == 0: +# print(pat) + +# %% +# import matplotlib.pyplot as plt +# from matplotlib.ticker import PercentFormatter +# import matplotlib.font_manager as font_manager +# import pandas as pd +# import numpy as np +# csfont = {'fontname':'Times New Roman', 'fontsize': 18} +# font = 'Times New Roman' +# fig=plt.figure(figsize=(6,6), dpi= 100, facecolor='w', edgecolor='k') +# plt.style.use('seaborn-whitegrid') +# color = 'cornflowerblue' +# ec = 'None' +# alpha=0.5 + +# ax = plt.subplot(1, 1, 1) +# ax.hist(visit_num_hour, bins=20, weights=np.ones(len(visit_num_hour)) / len(visit_num_hour), color=color, ec=ec, alpha=alpha, label='overall') +# plt.xlabel('Visit num (80% los)',**csfont) +# plt.ylabel('Percentage',**csfont) +# plt.gca().yaxis.set_major_formatter(PercentFormatter(1)) +# plt.xticks(**csfont) +# plt.yticks(**csfont) + +# plt.show() + +# %% +train_df = train_df_hour +train_df.head() + +# %% +train_df.describe() + +# %% +get_statistic(train_df['LOS'], 'los') + +# %% +train_df[train_df['PATIENT_ID'] == '1']['HEART_RATE'].count() + +# %% +cols = train_df.columns[5:] +pats = train_df['PATIENT_ID'].unique() +all_pat_cnt = len(pats) +missing_rate = dict() +# for col in tqdm(cols): +# miss = 0 +# for pat in pats: +# if train_df[train_df['PATIENT_ID'] == pat][col].count() == 0: +# miss += 1 +# missing_rate[col] = miss / all_pat_cnt + +for col in cols: + missing_rate[col] = 0 +for pat in tqdm(pats): + p = train_df[train_df['PATIENT_ID'] == pat] + for col in cols: + if p[col].count() == 0: + missing_rate[col] += 1 +for col in cols: + missing_rate[col] = missing_rate[col] / all_pat_cnt + +missing_rate + +# %% +with open('missing_rate.csv', mode='w', encoding='utf-8') as file: + for col in cols: + file.write(f'"{col}", {100 * missing_rate[col]}\n') + +# %% +train_df['LOS'] = train_df['LOS'].clip(lower=0) + +# %% +get_statistic(train_df['LOS'], 'los') + +# %% +# the first visit of each person +def init_prev(prev): + miss = [] + l = len(prev) + for idx in range(l): + #print(prev[idx]) + #print(type(prev[idx])) + if np.isnan(prev[idx]): # there is no previous record + prev[idx] = test_median[idx] # replace nan to median + miss.append(1) # mark miss as 1 + else: # there is a previous record + miss.append(0) + return miss + +# the rest of the visits +def fill_nan(cur, prev): + l = len(prev) + miss = [] + for idx in range(l): + #print(cur[idx]) + if np.isnan(cur[idx]): # there is no record in current timestep + cur[idx] = prev[idx] # cur <- prev + miss.append(1) + else: # there is a record in current timestep + miss.append(0) + return miss + +# %% +x, y, demo, x_lab_len, missing_mask = [], [], [], [], [] + +for pat in tqdm(patient_ids): # for all patients + # get visits for pat.id == PATIENT_ID + info = train_df[train_df['PATIENT_ID'] == pat] + info = info[max(0, len(info) - 76):] + indexes = info.index + visit = info.loc[indexes[0]] # get the first visit + + # demographic data + demo.append([visit[k] for k in demo_cols]) + + # label + outcome = visit['OUTCOME'] + los = [] + + # lab test & vital signs + tests = [] + prev = visit[test_cols] + miss = [] # missing matrix + miss.append(init_prev(prev)) # fill nan for the first visit for every patient and add missing status to missing matrix + # leave = datetime.datetime.strptime(visit['DEPARTURE_DATE'], '%Y-%m-%d %H:%M:%S') + + first = True + for i in indexes: + visit = info.loc[i] + # now = datetime.datetime.strptime(visit['RECORD_TIME'], '%Y-%m-%d %H') + cur = visit[test_cols] + tmp = fill_nan(cur, prev) # fill nan for the rest of the visits + if not first: + miss.append(tmp) # add missing status to missing matrix + tests.append(cur) + # los_visit = (leave - now).days + # if los_visit < 0: + # los_visit = 0 + los.append(visit['LOS']) + prev = cur + first = False + + valid_visit = len(los) + # outcome = [outcome] * valid_visit + x_lab_len.append(valid_visit) + missing_mask.append(miss) # append the patient's missing matrix to the total missing matrix + + # tests = np.pad(tests, ((0, max_visit - valid_visit), (0, 0))) + # outcome = np.pad(outcome, (0, max_visit - valid_visit)) + # los = np.pad(los, (0, max_visit - valid_visit)) + + y.append([outcome, los]) + x.append(tests) + +# %% +all_x = x +all_x_demo = demo +all_y = y +all_missing_mask = missing_mask + +# %% +all_x_labtest = np.array(all_x, dtype=object) +x_lab_length = [len(_) for _ in all_x_labtest] +x_lab_length = torch.tensor(x_lab_length, dtype=torch.int) +max_length = int(x_lab_length.max()) +all_x_labtest = [torch.tensor(_) for _ in all_x_labtest] +all_x_labtest = torch.nn.utils.rnn.pad_sequence((all_x_labtest), batch_first=True) +all_x_demographic = torch.tensor(all_x_demo) +batch_size, demo_dim = all_x_demographic.shape +all_x_demographic = torch.reshape(all_x_demographic.repeat(1, max_length), (batch_size, max_length, demo_dim)) +all_x = torch.cat((all_x_demographic, all_x_labtest), 2) + +all_y = np.array(all_y, dtype=object) +patient_list = [] +for pat in all_y: + visits = [] + for i in pat[1]: + visits.append([pat[0], i]) + patient_list.append(visits) +new_all_y = np.array(patient_list, dtype=object) +output_all_y = [torch.Tensor(_) for _ in new_all_y] +output_all_y = torch.nn.utils.rnn.pad_sequence((output_all_y), batch_first=True) + +# %% +all_missing_mask = np.array(all_missing_mask, dtype=object) +all_missing_mask = [torch.tensor(_) for _ in all_missing_mask] +all_missing_mask = torch.nn.utils.rnn.pad_sequence((all_missing_mask), batch_first=True) + +# %% +all_x.shape + +# %% +all_missing_mask.shape + +# %% +# save pickle format dataset (torch) +pd.to_pickle(all_x,f'./processed_data/x.pkl' ) +pd.to_pickle(all_missing_mask,f'./processed_data/missing_mask.pkl' ) +pd.to_pickle(output_all_y,f'./processed_data/y.pkl' ) +pd.to_pickle(x_lab_length,f'./processed_data/visits_length.pkl' ) + +# %% +# Calculate patients' outcome statistics (patients-wise) +outcome_list = [] +y_outcome = output_all_y[:, :, 0] +indices = torch.arange(len(x_lab_length), dtype=torch.int64) +for i in indices: + outcome_list.append(y_outcome[i][0].item()) +outcome_list = np.array(outcome_list) +print(len(outcome_list)) +unique, count=np.unique(outcome_list,return_counts=True) +data_count=dict(zip(unique,count)) +print(data_count) + +# %% +# Calculate patients' outcome statistics (records-wise) +outcome_records_list = [] +y_outcome = output_all_y[:, :, 0] +indices = torch.arange(len(x_lab_length), dtype=torch.int64) +for i in indices: + outcome_records_list.extend(y_outcome[i][0:x_lab_length[i]].tolist()) +outcome_records_list = np.array(outcome_records_list) +print(len(outcome_records_list)) +unique, count=np.unique(outcome_records_list,return_counts=True) +data_count=dict(zip(unique,count)) +print(data_count) + +# %% +# Calculate patients' mean los and 95% percentile los +los_list = [] +y_los = output_all_y[:, :, 1] +indices = torch.arange(len(x_lab_length), dtype=torch.int64) +for i in indices: + # los_list.extend(y_los[i][: x_lab_length[i].long()].tolist()) + los_list.append(y_los[i][0].item()) +los_list = np.array(los_list) +print(los_list.mean() * 0.5) +print(np.median(los_list) * 0.5) +print(np.percentile(los_list, 95)) + +print('median:', np.median(los_list)) +print('Q1:', np.percentile(los_list, 25)) +print('Q3:', np.percentile(los_list, 75)) + +# %% +los_alive_list = np.array([los_list[i] for i in range(len(los_list)) if outcome_list[i] == 0]) +los_dead_list = np.array([los_list[i] for i in range(len(los_list)) if outcome_list[i] == 1]) +print(len(los_alive_list)) +print(len(los_dead_list)) + +print('[Alive]') +print('median:', np.median(los_alive_list)) +print('Q1:', np.percentile(los_alive_list, 25)) +print('Q3:', np.percentile(los_alive_list, 75)) + +print('[Dead]') +print('median:', np.median(los_dead_list)) +print('Q1:', np.percentile(los_dead_list, 25)) +print('Q3:', np.percentile(los_dead_list, 75)) + +# %% +cdsl_los_statistics = { + 'overall': los_list, + 'alive': los_alive_list, + 'dead': los_dead_list +} +pd.to_pickle(cdsl_los_statistics, 'cdsl_los_statistics.pkl') + +# %% +# calculate visits length Median [Q1, Q3] +visits_list = np.array(x_lab_length) +visits_alive_list = np.array([x_lab_length[i] for i in range(len(x_lab_length)) if outcome_list[i] == 0]) +visits_dead_list = np.array([x_lab_length[i] for i in range(len(x_lab_length)) if outcome_list[i] == 1]) +print(len(visits_alive_list)) +print(len(visits_dead_list)) + +print('[Total]') +print('median:', np.median(visits_list)) +print('Q1:', np.percentile(visits_list, 25)) +print('Q3:', np.percentile(visits_list, 75)) + +print('[Alive]') +print('median:', np.median(visits_alive_list)) +print('Q1:', np.percentile(visits_alive_list, 25)) +print('Q3:', np.percentile(visits_alive_list, 75)) + +print('[Dead]') +print('median:', np.median(visits_dead_list)) +print('Q1:', np.percentile(visits_dead_list, 25)) +print('Q3:', np.percentile(visits_dead_list, 75)) + +# %% +def check_nan(x): + if np.isnan(np.sum(x.cpu().numpy())): + print("some values from input are nan") + else: + print("no nan") + +# %% +check_nan(all_x) + +# %% [markdown] +# # Draw Charts + +# %% [markdown] +# ## Import packages + +# %% +import matplotlib.pyplot as plt +from matplotlib.ticker import PercentFormatter +import matplotlib.font_manager as font_manager +import pandas as pd +import numpy as np + +plt.style.use('seaborn-whitegrid') +color = 'cornflowerblue' +ec = 'None' +alpha=0.5 +alive_color = 'olivedrab' +dead_color = 'orchid' + +# %% [markdown] +# ## Read data + +# %% +demographic.head() + +# %% +train = pd.read_csv('./train.csv') +train['PATIENT_ID']=train['PATIENT_ID'].astype(str) +demographic['PATIENT_ID']=demographic['PATIENT_ID'].astype(str) +pat = { + 'PATIENT_ID': train['PATIENT_ID'].unique() +} +pat = pd.DataFrame(pat) +demo = pd.merge(demographic, pat, on='PATIENT_ID', how='inner') + +demo_alive = demo.loc[demo['OUTCOME'] == 0] +demo_dead = demo.loc[demo['OUTCOME'] == 1] +demo_overall = demo + +# %% +demo.to_csv('demo_overall.csv', index=False) +demo_alive.to_csv('demo_alive.csv', index=False) +demo_dead.to_csv('demo_dead.csv', index=False) + +# %% +patient = pd.DataFrame({"PATIENT_ID": (demo_alive['PATIENT_ID'].unique())}) +lab_tests_alive = pd.merge(lab_tests, patient, how='inner', on='PATIENT_ID') +print(len(lab_tests_alive['PATIENT_ID'].unique())) + +patient = pd.DataFrame({"PATIENT_ID": (demo_dead['PATIENT_ID'].unique())}) +lab_tests_dead = pd.merge(lab_tests, patient, how='inner', on='PATIENT_ID') +print(len(lab_tests_dead['PATIENT_ID'].unique())) + +patient = pd.DataFrame({"PATIENT_ID": (demo_overall['PATIENT_ID'].unique())}) +lab_tests_overall = pd.merge(lab_tests, patient, how='inner', on='PATIENT_ID') +print(len(lab_tests_overall['PATIENT_ID'].unique())) + +# %% +patient = pd.DataFrame({"PATIENT_ID": (demo_alive['PATIENT_ID'].unique())}) +vital_signs_alive = pd.merge(vital_signs, patient, how='inner', on='PATIENT_ID') +len(vital_signs_alive['PATIENT_ID'].unique()) + +# %% +patient = pd.DataFrame({"PATIENT_ID": (demo_dead['PATIENT_ID'].unique())}) +vital_signs_dead = pd.merge(vital_signs, patient, how='inner', on='PATIENT_ID') +len(vital_signs_dead['PATIENT_ID'].unique()) + +# %% +patient = pd.DataFrame({"PATIENT_ID": (demo_overall['PATIENT_ID'].unique())}) +vital_signs_overall = pd.merge(vital_signs, patient, how='inner', on='PATIENT_ID') +len(vital_signs_overall['PATIENT_ID'].unique()) + +# %% +""" +limit = 0.05 + +csfont = {'fontname':'Times New Roman', 'fontsize': 18} +font = 'Times New Roman' +fig=plt.figure(figsize=(16,12), dpi= 100, facecolor='w', edgecolor='k') + +idx = 1 + +key = 'AGE' +low = demo_overall[key].quantile(limit) +high = demo_overall[key].quantile(1 - limit) +demo_AGE_overall = demo_overall[demo_overall[key].between(low, high)] +demo_AGE_dead = demo_dead[demo_dead[key].between(low, high)] +demo_AGE_alive = demo_alive[demo_alive[key].between(low, high)] +ax = plt.subplot(4, 4, idx) +ax.hist(demo_AGE_overall[key], bins=20, weights=np.ones(len(demo_AGE_overall[key])) / len(demo_AGE_overall), color=color, ec=ec, alpha=alpha, label='overall') +plt.xlabel('Age',**csfont) +plt.ylabel('Percentage',**csfont) +plt.gca().yaxis.set_major_formatter(PercentFormatter(1)) +# ax.title('Age Histogram', **csfont) +ax.hist(demo_AGE_alive[key], bins=20, weights=np.ones(len(demo_AGE_alive[key])) / len(demo_AGE_alive), color='green', ec=alive_color, alpha=1, histtype="step", linewidth=2, label='alive') +ax.hist(demo_AGE_dead[key], bins=20, weights=np.ones(len(demo_AGE_dead[key])) / len(demo_AGE_dead), color='green', ec=dead_color, alpha=1, histtype="step", linewidth=2, label='dead') +plt.xticks(**csfont) +plt.yticks(**csfont) +idx += 1 + +key = 'TEMPERATURE' +low = vital_signs_overall[key].quantile(limit) +high = vital_signs_overall[key].quantile(1 - limit) +vs_TEMPERATURE_overall = vital_signs_overall[vital_signs_overall[key].between(low, high)] +vs_TEMPERATURE_dead = vital_signs_dead[vital_signs_dead[key].between(low, high)] +vs_TEMPERATURE_alive = vital_signs_alive[vital_signs_alive[key].between(low, high)] +plt.subplot(4, 4, idx) +plt.hist(vs_TEMPERATURE_overall['TEMPERATURE'], bins=20, weights=np.ones(len(vs_TEMPERATURE_overall)) / len(vs_TEMPERATURE_overall), color=color, ec=ec, alpha=alpha) +plt.xlabel('Temperature',**csfont) +plt.ylabel('Percentage',**csfont) +plt.gca().yaxis.set_major_formatter(PercentFormatter(1)) +# plt.title('Temperature Histogram', **csfont) +plt.hist(vs_TEMPERATURE_alive['TEMPERATURE'], bins=20, weights=np.ones(len(vs_TEMPERATURE_alive)) / len(vs_TEMPERATURE_alive), color='green', ec=alive_color, alpha=1, histtype="step", linewidth=2) +plt.hist(vs_TEMPERATURE_dead['TEMPERATURE'], bins=20, weights=np.ones(len(vs_TEMPERATURE_dead)) / len(vs_TEMPERATURE_dead), color='green', ec=dead_color, alpha=1, histtype="step", linewidth=2) +plt.xticks(**csfont) +plt.yticks(**csfont) +idx += 1 + +# plt.subplot(4, 4, 3) +# plt.hist(lab_tests_overall['CREA -- CREATININA'], bins=20, density=True, color=color, ec=ec, alpha=alpha) +# plt.xlabel('CREA -- CREATININA',**csfont) +# plt.ylabel('Percentage',**csfont) +# plt.gca().yaxis.set_major_formatter(PercentFormatter(1)) +# # plt.title('Temperature Histogram', **csfont) +# plt.hist(lab_tests_alive['CREA -- CREATININA'], bins=20, density=True, color='green', ec=alive_color, alpha=1, histtype="step", linewidth=2) +# plt.hist(lab_tests_dead['CREA -- CREATININA'], bins=20, density=True, color='green', ec=dead_color, alpha=1, histtype="step", linewidth=2) +# plt.xticks(**csfont) +# plt.yticks(**csfont) + +key = 'CREA -- CREATININA' +low = lab_tests_overall[key].quantile(limit) +high = lab_tests_overall[key].quantile(1 - limit) +lt_key_overall = lab_tests_overall[lab_tests_overall[key].between(low, high)] +lt_key_dead = lab_tests_dead[lab_tests_dead[key].between(low, high)] +lt_key_alive = lab_tests_alive[lab_tests_alive[key].between(low, high)] +plt.subplot(4, 4, idx) +plt.hist(lt_key_overall[key], bins=20, weights=np.ones(len(lt_key_overall[key])) / len(lt_key_overall[key]), color=color, ec=ec, alpha=alpha) +plt.xlabel('CREA -- CREATININA',**csfont) +plt.ylabel('Percentage',**csfont) +plt.gca().yaxis.set_major_formatter(PercentFormatter(1)) +# plt.title('Temperature Histogram', **csfont) +plt.hist(lt_key_alive[key], bins=20, weights=np.ones(len(lt_key_alive[key])) / len(lt_key_alive[key]), color='green', ec=alive_color, alpha=1, histtype="step", linewidth=2) +plt.hist(lt_key_dead[key], bins=20, weights=np.ones(len(lt_key_dead[key])) / len(lt_key_dead[key]), color='green', ec=dead_color, alpha=1, histtype="step", linewidth=2) +plt.xticks(**csfont) +plt.yticks(**csfont) +idx += 1 + +key = 'HEM -- Hemat¡es' +low = lab_tests_overall[key].quantile(limit) +high = lab_tests_overall[key].quantile(1 - limit) +lt_key_overall = lab_tests_overall[lab_tests_overall[key].between(low, high)] +lt_key_dead = lab_tests_dead[lab_tests_dead[key].between(low, high)] +lt_key_alive = lab_tests_alive[lab_tests_alive[key].between(low, high)] +plt.subplot(4, 4, idx) +plt.hist(lt_key_overall[key], bins=20, weights=np.ones(len(lt_key_overall[key])) / len(lt_key_overall[key]), color=color, ec=ec, alpha=alpha) +plt.xlabel('HEM -- Hemat¡es',**csfont) +plt.ylabel('Percentage',**csfont) +plt.gca().yaxis.set_major_formatter(PercentFormatter(1)) +# plt.title('Temperature Histogram', **csfont) +plt.hist(lt_key_alive[key], bins=20, weights=np.ones(len(lt_key_alive[key])) / len(lt_key_alive[key]), color='green', ec=alive_color, alpha=1, histtype="step", linewidth=2) +plt.hist(lt_key_dead[key], bins=20, weights=np.ones(len(lt_key_dead[key])) / len(lt_key_dead[key]), color='green', ec=dead_color, alpha=1, histtype="step", linewidth=2) +plt.xticks(**csfont) +plt.yticks(**csfont) +idx += 1 + +key = 'LEUC -- Leucocitos' +low = lab_tests_overall[key].quantile(limit) +high = lab_tests_overall[key].quantile(1 - limit) +lt_key_overall = lab_tests_overall[lab_tests_overall[key].between(low, high)] +lt_key_dead = lab_tests_dead[lab_tests_dead[key].between(low, high)] +lt_key_alive = lab_tests_alive[lab_tests_alive[key].between(low, high)] +plt.subplot(4, 4, idx) +plt.hist(lt_key_overall[key], bins=20, weights=np.ones(len(lt_key_overall[key])) / len(lt_key_overall[key]), color=color, ec=ec, alpha=alpha) +plt.xlabel('LEUC -- Leucocitos',**csfont) +plt.ylabel('Percentage',**csfont) +plt.gca().yaxis.set_major_formatter(PercentFormatter(1)) +# plt.title('Temperature Histogram', **csfont) +plt.hist(lt_key_alive[key], bins=20, weights=np.ones(len(lt_key_alive[key])) / len(lt_key_alive[key]), color='green', ec=alive_color, alpha=1, histtype="step", linewidth=2) +plt.hist(lt_key_dead[key], bins=20, weights=np.ones(len(lt_key_dead[key])) / len(lt_key_dead[key]), color='green', ec=dead_color, alpha=1, histtype="step", linewidth=2) +plt.xticks(**csfont) +plt.yticks(**csfont) +idx += 1 + +key = 'PLAQ -- Recuento de plaquetas' +low = lab_tests_overall[key].quantile(limit) +high = lab_tests_overall[key].quantile(1 - limit) +lt_key_overall = lab_tests_overall[lab_tests_overall[key].between(low, high)] +lt_key_dead = lab_tests_dead[lab_tests_dead[key].between(low, high)] +lt_key_alive = lab_tests_alive[lab_tests_alive[key].between(low, high)] +plt.subplot(4, 4, idx) +plt.hist(lt_key_overall[key], bins=20, weights=np.ones(len(lt_key_overall[key])) / len(lt_key_overall[key]), color=color, ec=ec, alpha=alpha) +plt.xlabel('PLAQ -- Recuento de plaquetas',**csfont) +plt.ylabel('Percentage',**csfont) +plt.gca().yaxis.set_major_formatter(PercentFormatter(1)) +# plt.title('Temperature Histogram', **csfont) +plt.hist(lt_key_alive[key], bins=20, weights=np.ones(len(lt_key_alive[key])) / len(lt_key_alive[key]), color='green', ec=alive_color, alpha=1, histtype="step", linewidth=2) +plt.hist(lt_key_dead[key], bins=20, weights=np.ones(len(lt_key_dead[key])) / len(lt_key_dead[key]), color='green', ec=dead_color, alpha=1, histtype="step", linewidth=2) +plt.xticks(**csfont) +plt.yticks(**csfont) +idx += 1 + +key = 'CHCM -- Conc. Hemoglobina Corpuscular Media' +low = lab_tests_overall[key].quantile(limit) +high = lab_tests_overall[key].quantile(1 - limit) +lt_key_overall = lab_tests_overall[lab_tests_overall[key].between(low, high)] +lt_key_dead = lab_tests_dead[lab_tests_dead[key].between(low, high)] +lt_key_alive = lab_tests_alive[lab_tests_alive[key].between(low, high)] +plt.subplot(4, 4, idx) +plt.hist(lt_key_overall[key], bins=20, weights=np.ones(len(lt_key_overall[key])) / len(lt_key_overall[key]), color=color, ec=ec, alpha=alpha) +plt.xlabel('CHCM',**csfont) +plt.ylabel('Percentage',**csfont) +plt.gca().yaxis.set_major_formatter(PercentFormatter(1)) +# plt.title('Temperature Histogram', **csfont) +plt.hist(lt_key_alive[key], bins=20, weights=np.ones(len(lt_key_alive[key])) / len(lt_key_alive[key]), color='green', ec=alive_color, alpha=1, histtype="step", linewidth=2) +plt.hist(lt_key_dead[key], bins=20, weights=np.ones(len(lt_key_dead[key])) / len(lt_key_dead[key]), color='green', ec=dead_color, alpha=1, histtype="step", linewidth=2) +plt.xticks(**csfont) +plt.yticks(**csfont) +idx += 1 + +key = 'HCTO -- Hematocrito' +low = lab_tests_overall[key].quantile(limit) +high = lab_tests_overall[key].quantile(1 - limit) +lt_key_overall = lab_tests_overall[lab_tests_overall[key].between(low, high)] +lt_key_dead = lab_tests_dead[lab_tests_dead[key].between(low, high)] +lt_key_alive = lab_tests_alive[lab_tests_alive[key].between(low, high)] +plt.subplot(4, 4, idx) +plt.hist(lt_key_overall[key], bins=20, weights=np.ones(len(lt_key_overall[key])) / len(lt_key_overall[key]), color=color, ec=ec, alpha=alpha) +plt.xlabel('HCTO -- Hematocrito',**csfont) +plt.ylabel('Percentage',**csfont) +plt.gca().yaxis.set_major_formatter(PercentFormatter(1)) +# plt.title('Temperature Histogram', **csfont) +plt.hist(lt_key_alive[key], bins=20, weights=np.ones(len(lt_key_alive[key])) / len(lt_key_alive[key]), color='green', ec=alive_color, alpha=1, histtype="step", linewidth=2) +plt.hist(lt_key_dead[key], bins=20, weights=np.ones(len(lt_key_dead[key])) / len(lt_key_dead[key]), color='green', ec=dead_color, alpha=1, histtype="step", linewidth=2) +plt.xticks(**csfont) +plt.yticks(**csfont) +idx += 1 + +key = 'VCM -- Volumen Corpuscular Medio' +low = lab_tests_overall[key].quantile(limit) +high = lab_tests_overall[key].quantile(1 - limit) +lt_key_overall = lab_tests_overall[lab_tests_overall[key].between(low, high)] +lt_key_dead = lab_tests_dead[lab_tests_dead[key].between(low, high)] +lt_key_alive = lab_tests_alive[lab_tests_alive[key].between(low, high)] +plt.subplot(4, 4, idx) +plt.hist(lt_key_overall[key], bins=20, weights=np.ones(len(lt_key_overall[key])) / len(lt_key_overall[key]), color=color, ec=ec, alpha=alpha) +plt.xlabel('VCM -- Volumen Corpuscular Medio',**csfont) +plt.ylabel('Percentage',**csfont) +plt.gca().yaxis.set_major_formatter(PercentFormatter(1)) +# plt.title('Temperature Histogram', **csfont) +plt.hist(lt_key_alive[key], bins=20, weights=np.ones(len(lt_key_alive[key])) / len(lt_key_alive[key]), color='green', ec=alive_color, alpha=1, histtype="step", linewidth=2) +plt.hist(lt_key_dead[key], bins=20, weights=np.ones(len(lt_key_dead[key])) / len(lt_key_dead[key]), color='green', ec=dead_color, alpha=1, histtype="step", linewidth=2) +plt.xticks(**csfont) +plt.yticks(**csfont) +idx += 1 + +key = 'HGB -- Hemoglobina' +low = lab_tests_overall[key].quantile(limit) +high = lab_tests_overall[key].quantile(1 - limit) +lt_key_overall = lab_tests_overall[lab_tests_overall[key].between(low, high)] +lt_key_dead = lab_tests_dead[lab_tests_dead[key].between(low, high)] +lt_key_alive = lab_tests_alive[lab_tests_alive[key].between(low, high)] +plt.subplot(4, 4, idx) +plt.hist(lt_key_overall[key], bins=20, weights=np.ones(len(lt_key_overall[key])) / len(lt_key_overall[key]), color=color, ec=ec, alpha=alpha) +plt.xlabel('HGB -- Hemoglobina',**csfont) +plt.ylabel('Percentage',**csfont) +plt.gca().yaxis.set_major_formatter(PercentFormatter(1)) +# plt.title('Temperature Histogram', **csfont) +plt.hist(lt_key_alive[key], bins=20, weights=np.ones(len(lt_key_alive[key])) / len(lt_key_alive[key]), color='green', ec=alive_color, alpha=1, histtype="step", linewidth=2) +plt.hist(lt_key_dead[key], bins=20, weights=np.ones(len(lt_key_dead[key])) / len(lt_key_dead[key]), color='green', ec=dead_color, alpha=1, histtype="step", linewidth=2) +plt.xticks(**csfont) +plt.yticks(**csfont) +idx += 1 + +key = 'HCM -- Hemoglobina Corpuscular Media' +low = lab_tests_overall[key].quantile(limit) +high = lab_tests_overall[key].quantile(1 - limit) +lt_key_overall = lab_tests_overall[lab_tests_overall[key].between(low, high)] +lt_key_dead = lab_tests_dead[lab_tests_dead[key].between(low, high)] +lt_key_alive = lab_tests_alive[lab_tests_alive[key].between(low, high)] +plt.subplot(4, 4, idx) +plt.hist(lt_key_overall[key], bins=20, weights=np.ones(len(lt_key_overall[key])) / len(lt_key_overall[key]), color=color, ec=ec, alpha=alpha) +plt.xlabel('HCM -- Hemoglobina Corpuscular Media',**csfont) +plt.ylabel('Percentage',**csfont) +plt.gca().yaxis.set_major_formatter(PercentFormatter(1)) +# plt.title('Temperature Histogram', **csfont) +plt.hist(lt_key_alive[key], bins=20, weights=np.ones(len(lt_key_alive[key])) / len(lt_key_alive[key]), color='green', ec=alive_color, alpha=1, histtype="step", linewidth=2) +plt.hist(lt_key_dead[key], bins=20, weights=np.ones(len(lt_key_dead[key])) / len(lt_key_dead[key]), color='green', ec=dead_color, alpha=1, histtype="step", linewidth=2) +plt.xticks(**csfont) +plt.yticks(**csfont) +idx += 1 + +key = 'NEU -- Neutr¢filos' +low = lab_tests_overall[key].quantile(limit) +high = lab_tests_overall[key].quantile(1 - limit) +lt_key_overall = lab_tests_overall[lab_tests_overall[key].between(low, high)] +lt_key_dead = lab_tests_dead[lab_tests_dead[key].between(low, high)] +lt_key_alive = lab_tests_alive[lab_tests_alive[key].between(low, high)] +plt.subplot(4, 4, idx) +plt.hist(lt_key_overall[key], bins=20, weights=np.ones(len(lt_key_overall[key])) / len(lt_key_overall[key]), color=color, ec=ec, alpha=alpha) +plt.xlabel('NEU -- Neutr¢filos',**csfont) +plt.ylabel('Percentage',**csfont) +plt.gca().yaxis.set_major_formatter(PercentFormatter(1)) +# plt.title('Temperature Histogram', **csfont) +plt.hist(lt_key_alive[key], bins=20, weights=np.ones(len(lt_key_alive[key])) / len(lt_key_alive[key]), color='green', ec=alive_color, alpha=1, histtype="step", linewidth=2) +plt.hist(lt_key_dead[key], bins=20, weights=np.ones(len(lt_key_dead[key])) / len(lt_key_dead[key]), color='green', ec=dead_color, alpha=1, histtype="step", linewidth=2) +plt.xticks(**csfont) +plt.yticks(**csfont) +idx += 1 + +key = 'NEU% -- Neutr¢filos %' +low = lab_tests_overall[key].quantile(limit) +high = lab_tests_overall[key].quantile(1 - limit) +lt_key_overall = lab_tests_overall[lab_tests_overall[key].between(low, high)] +lt_key_dead = lab_tests_dead[lab_tests_dead[key].between(low, high)] +lt_key_alive = lab_tests_alive[lab_tests_alive[key].between(low, high)] +plt.subplot(4, 4, idx) +plt.hist(lt_key_overall[key], bins=20, weights=np.ones(len(lt_key_overall[key])) / len(lt_key_overall[key]), color=color, ec=ec, alpha=alpha) +plt.xlabel('NEU% -- Neutr¢filos%',**csfont) +plt.ylabel('Percentage',**csfont) +plt.gca().yaxis.set_major_formatter(PercentFormatter(1)) +# plt.title('Temperature Histogram', **csfont) +plt.hist(lt_key_alive[key], bins=20, weights=np.ones(len(lt_key_alive[key])) / len(lt_key_alive[key]), color='green', ec=alive_color, alpha=1, histtype="step", linewidth=2) +plt.hist(lt_key_dead[key], bins=20, weights=np.ones(len(lt_key_dead[key])) / len(lt_key_dead[key]), color='green', ec=dead_color, alpha=1, histtype="step", linewidth=2) +plt.xticks(**csfont) +plt.yticks(**csfont) +idx += 1 + +key = 'LIN -- Linfocitos' +low = lab_tests_overall[key].quantile(limit) +high = lab_tests_overall[key].quantile(1 - limit) +lt_key_overall = lab_tests_overall[lab_tests_overall[key].between(low, high)] +lt_key_dead = lab_tests_dead[lab_tests_dead[key].between(low, high)] +lt_key_alive = lab_tests_alive[lab_tests_alive[key].between(low, high)] +plt.subplot(4, 4, idx) +plt.hist(lt_key_overall[key], bins=20, weights=np.ones(len(lt_key_overall[key])) / len(lt_key_overall[key]), color=color, ec=ec, alpha=alpha) +plt.xlabel('LIN -- Linfocitos',**csfont) +plt.ylabel('Percentage',**csfont) +plt.gca().yaxis.set_major_formatter(PercentFormatter(1)) +# plt.title('Temperature Histogram', **csfont) +plt.hist(lt_key_alive[key], bins=20, weights=np.ones(len(lt_key_alive[key])) / len(lt_key_alive[key]), color='green', ec=alive_color, alpha=1, histtype="step", linewidth=2) +plt.hist(lt_key_dead[key], bins=20, weights=np.ones(len(lt_key_dead[key])) / len(lt_key_dead[key]), color='green', ec=dead_color, alpha=1, histtype="step", linewidth=2) +plt.xticks(**csfont) +plt.yticks(**csfont) +idx += 1 + +key = 'LIN% -- Linfocitos %' +low = lab_tests_overall[key].quantile(limit) +high = lab_tests_overall[key].quantile(1 - limit) +lt_key_overall = lab_tests_overall[lab_tests_overall[key].between(low, high)] +lt_key_dead = lab_tests_dead[lab_tests_dead[key].between(low, high)] +lt_key_alive = lab_tests_alive[lab_tests_alive[key].between(low, high)] +plt.subplot(4, 4, idx) +plt.hist(lt_key_overall[key], bins=20, weights=np.ones(len(lt_key_overall[key])) / len(lt_key_overall[key]), color=color, ec=ec, alpha=alpha) +plt.xlabel('LIN% -- Linfocitos%',**csfont) +plt.ylabel('Percentage',**csfont) +plt.gca().yaxis.set_major_formatter(PercentFormatter(1)) +# plt.title('Temperature Histogram', **csfont) +plt.hist(lt_key_alive[key], bins=20, weights=np.ones(len(lt_key_alive[key])) / len(lt_key_alive[key]), color='green', ec=alive_color, alpha=1, histtype="step", linewidth=2) +plt.hist(lt_key_dead[key], bins=20, weights=np.ones(len(lt_key_dead[key])) / len(lt_key_dead[key]), color='green', ec=dead_color, alpha=1, histtype="step", linewidth=2) +plt.xticks(**csfont) +plt.yticks(**csfont) +idx += 1 + +key = 'ADW -- Coeficiente de anisocitosis' +low = lab_tests_overall[key].quantile(limit) +high = lab_tests_overall[key].quantile(1 - limit) +lt_key_overall = lab_tests_overall[lab_tests_overall[key].between(low, high)] +lt_key_dead = lab_tests_dead[lab_tests_dead[key].between(low, high)] +lt_key_alive = lab_tests_alive[lab_tests_alive[key].between(low, high)] +plt.subplot(4, 4, idx) +plt.hist(lt_key_overall[key], bins=20, weights=np.ones(len(lt_key_overall[key])) / len(lt_key_overall[key]), color=color, ec=ec, alpha=alpha) +plt.xlabel('ADW -- Coeficiente de anisocitosis',**csfont) +plt.ylabel('Percentage',**csfont) +plt.gca().yaxis.set_major_formatter(PercentFormatter(1)) +# plt.title('Temperature Histogram', **csfont) +plt.hist(lt_key_alive[key], bins=20, weights=np.ones(len(lt_key_alive[key])) / len(lt_key_alive[key]), color='green', ec=alive_color, alpha=1, histtype="step", linewidth=2) +plt.hist(lt_key_dead[key], bins=20, weights=np.ones(len(lt_key_dead[key])) / len(lt_key_dead[key]), color='green', ec=dead_color, alpha=1, histtype="step", linewidth=2) +plt.xticks(**csfont) +plt.yticks(**csfont) +idx += 1 + +handles, labels = ax.get_legend_handles_labels() +print(handles, labels) +# fig.legend(handles, labels, loc='upper center') +plt.figlegend(handles, labels, loc='upper center', ncol=5, fontsize=18, bbox_to_anchor=(0.5, 1.05), prop=font_manager.FontProperties(family='Times New Roman', + style='normal', size=18)) +# fig.legend(, [], loc='upper center') + +fig.tight_layout() +plt.show() +""" +