# NOTE: extracted from a side-by-side diff view ("Switch to unified view").

# Source path: datasets/cdsl/preprocess.py
1
# %% [markdown]
2
# # hm dataset pre-processing
3
# 
4
# import packages
5
6
# %%
7
import os
8
import pandas as pd
9
import numpy as np
10
import matplotlib.pyplot as plt
11
import pickle as pkl
12
import torch
13
import math
14
import datetime
15
from tqdm import tqdm
16
import datetime
17
import re
18
from functools import reduce
19
20
# %% [markdown]
21
# ## Demographic data
22
23
# %%
24
demographic = pd.read_csv('./raw_data/19_04_2021/COVID_DSL_01.CSV', encoding='ISO-8859-1', sep='|')
25
print(len(demographic))
26
demographic.head()
27
28
# %%
29
med = pd.read_csv('./raw_data/19_04_2021/COVID_DSL_04.CSV', encoding='ISO-8859-1', sep='|')
30
print(len(med))
31
med.head()
32
33
# %%
34
len(med['ID_ATC7'].unique())
35
36
# %% [markdown]
37
# Remove patients with missing labels
38
39
# %%
40
print(len(demographic))
41
demographic = demographic.dropna(axis=0, how='any', subset=['IDINGRESO', 'F_INGRESO_ING', 'F_ALTA_ING', 'MOTIVO_ALTA_ING'])
42
print(len(demographic))
43
44
# %%
45
def outcome2num(x):
    """Encode a discharge reason as a binary outcome label.

    Returns 1 when ``x`` equals ``'Fallecimiento'`` (marked as "dead" in the
    surrounding script) and 0 for every other value, including missing ones.
    """
    return 1 if x == 'Fallecimiento' else 0
50
51
def to_one_hot(x, feature):
    """Return 1 if ``x`` equals ``feature``, otherwise 0.

    Used to turn one category of a categorical column into an indicator
    column.
    """
    return 1 if x == feature else 0
56
57
# %%
58
# select necessary columns from demographic
demographic = demographic[
    [
        'IDINGRESO',
        'EDAD',
        'SEX',
        'F_INGRESO_ING',
        'F_ALTA_ING',
        'MOTIVO_ALTA_ING',
        'ESPECIALIDAD_URGENCIA',
        'DIAG_URG',
    ]
]

# rename column
demographic = demographic.rename(columns={
    'IDINGRESO': 'PATIENT_ID',
    'EDAD': 'AGE',
    'SEX': 'SEX',
    'F_INGRESO_ING': 'ADMISSION_DATE',
    'F_ALTA_ING': 'DEPARTURE_DATE',
    'MOTIVO_ALTA_ING': 'OUTCOME',
    'ESPECIALIDAD_URGENCIA': 'DEPARTMENT_OF_EMERGENCY',
    'DIAG_URG': 'DIAGNOSIS_AT_EMERGENCY_VISIT',
})

# SEX: male: 1; female: 0
# Assign the result back to the column. Calling replace(..., inplace=True) on
# `demographic['SEX']` mutates a temporary Series, which leaves the DataFrame
# untouched when pandas Copy-on-Write is active. infer_objects() restores a
# numeric dtype on pandas versions where replace() no longer downcasts.
demographic['SEX'] = (
    demographic['SEX'].replace({'MALE': 1, 'FEMALE': 0}).infer_objects()
)

# outcome: Fallecimiento(dead): 1; others: 0
demographic['OUTCOME'] = demographic['OUTCOME'].map(outcome2num)
90
91
# diagnosis at emergency visit (loss rate < 10%)
92
# demographic['DIFFICULTY_BREATHING'] = demographic['DIAGNOSIS_AT_EMERGENCY_VISIT'].map(lambda x: to_one_hot(x, 'DIFICULTAD RESPIRATORIA')) # 1674
93
# demographic['SUSPECT_COVID'] = demographic['DIAGNOSIS_AT_EMERGENCY_VISIT'].map(lambda x: to_one_hot(x, 'SOSPECHA COVID-19')) # 960
94
# demographic['FEVER'] = demographic['DIAGNOSIS_AT_EMERGENCY_VISIT'].map(lambda x: to_one_hot(x, 'FIEBRE')) # 455
95
96
# department of emergency (loss rate < 10%)
97
# demographic['EMERGENCY'] = demographic['DEPARTMENT_OF_EMERGENCY'].map(lambda x: to_one_hot(x, 'Medicina de Urgencias')) # 3914
98
99
# %%
100
# del useless data
101
demographic = demographic[
102
        [
103
            'PATIENT_ID',
104
            'AGE',
105
            'SEX',
106
            'ADMISSION_DATE',
107
            'DEPARTURE_DATE',
108
            'OUTCOME',
109
            # 'DIFFICULTY_BREATHING',
110
            # 'SUSPECT_COVID',
111
            # 'FEVER',
112
            # 'EMERGENCY'
113
        ]
114
    ]
115
116
# %%
117
# Save the summary statistics. The index of describe() holds the statistic
# names (count, mean, std, ...); writing it keeps the rows of the CSV labelled
# (index=False produced anonymous rows).
demographic_overview = demographic.describe()
demographic_overview.to_csv('demographic_overview.csv', mode='w')
demographic_overview
119
120
# %% [markdown]
121
# ### Analyze data
122
123
# %%
124
plt.scatter(demographic['PATIENT_ID'], demographic['AGE'], s=1)
125
plt.xlabel('Patient Id')
126
plt.ylabel('Age')
127
plt.title('Patient-Age Scatter Plot')
128
129
# %%
130
plt.scatter(demographic['PATIENT_ID'], demographic['AGE'], s=1)
131
plt.xlabel('Patient Id')
132
plt.ylabel('Age')
133
plt.title('Patient-Age Scatter Plot')
134
135
# %%
136
demographic.to_csv('demographic.csv', mode='w', index=False)
137
demographic.head()
138
139
# %% [markdown]
140
# ## Vital Signs
141
142
# %%
143
vital_signs = pd.read_csv('./raw_data/19_04_2021/COVID_DSL_02.CSV', encoding='ISO-8859-1', sep='|')
144
print(len(vital_signs))
145
vital_signs.head()
146
147
# %%
148
vital_signs = vital_signs.rename(columns={
149
    'IDINGRESO': 'PATIENT_ID',
150
    'CONSTANTS_ING_DATE': 'RECORD_DATE',
151
    'CONSTANTS_ING_TIME': 'RECORD_TIME',
152
    'FC_HR_ING': 'HEART_RATE',
153
    'GLU_GLY_ING': 'BLOOD_GLUCOSE',
154
    'SAT_02_ING': 'OXYGEN_SATURATION',
155
    'TA_MAX_ING': 'MAX_BLOOD_PRESSURE',
156
    'TA_MIN_ING': 'MIN_BLOOD_PRESSURE',
157
    'TEMP_ING': 'TEMPERATURE'
158
})
159
vital_signs['RECORD_TIME'] = vital_signs['RECORD_DATE'] + ' ' + vital_signs['RECORD_TIME']
160
vital_signs['RECORD_TIME'] = vital_signs['RECORD_TIME'].map(lambda x: str(datetime.datetime.strptime(x, '%Y-%m-%d %H:%M')))
161
vital_signs = vital_signs.drop(['RECORD_DATE', 'SAT_02_ING_OBS', 'BLOOD_GLUCOSE'], axis=1)
162
163
# %%
164
vital_signs.describe()
165
166
# %%
167
vital_signs.head()
168
169
# %%
170
def format_temperature(x):
    """Convert a raw temperature reading to ``float``.

    String readings may use a decimal comma (e.g. ``'36,5'``); commas are
    replaced by decimal points before conversion. Non-string values are
    passed straight to ``float``.

    Raises:
        ValueError: if the (normalised) string is not a valid number.
    """
    if isinstance(x, str):
        x = x.replace(',', '.')
    return float(x)
175
176
def format_oxygen(x):
    """Convert an oxygen-saturation reading to ``float``.

    Readings above 100 are treated as invalid and returned as NaN; every
    other value (including NaN itself) is returned as a float.
    """
    value = float(x)
    return np.nan if value > 100 else value
182
183
def format_heart_rate(x):
    """Convert a heart-rate reading to ``int``.

    Readings above 220 are treated as invalid and returned as NaN. Missing
    readings (``None`` or a float NaN) are returned as NaN instead of being
    passed to ``int()``, which would raise ``ValueError`` for NaN.
    """
    # NaN is the only float that is not equal to itself.
    if x is None or (isinstance(x, float) and x != x):
        return np.nan
    rate = int(x)
    return np.nan if rate > 220 else rate
189
190
vital_signs['TEMPERATURE'] = vital_signs['TEMPERATURE'].map(lambda x: format_temperature(x))
191
vital_signs['OXYGEN_SATURATION'] = vital_signs['OXYGEN_SATURATION'].map(lambda x: format_oxygen(x))
192
vital_signs['HEART_RATE'] = vital_signs['HEART_RATE'].map(lambda x: format_heart_rate(x))
193
194
# %%
195
# Zeros are treated as missing readings. Use the canonical `np.nan` spelling
# (as the rest of this file does); the legacy `np.NAN` alias is not exposed
# by newer NumPy releases.
vital_signs = vital_signs.replace(0, np.nan)
196
197
# %%
198
vital_signs = vital_signs.groupby(['PATIENT_ID', 'RECORD_TIME'], dropna=True, as_index = False).mean()
199
vital_signs.head()
200
201
# %%
202
vital_signs.describe()
203
204
# %%
205
# Save the summary statistics. The index of describe() holds the statistic
# names (count, mean, std, ...); writing it keeps the rows of the CSV labelled
# (index=False produced anonymous rows).
vital_signs_overview = vital_signs.describe()
vital_signs_overview.to_csv('vital_signs_overview.csv', mode='w')
vital_signs_overview
207
208
# %%
209
"""
210
#plt.rcParams['figure.figsize'] = [10, 5]
211
fig=plt.figure(figsize=(16,10), dpi= 100, facecolor='w', edgecolor='k')
212
213
plt.subplot(2, 3, 1)
214
plt.scatter(vital_signs.index, vital_signs['MAX_BLOOD_PRESSURE'], s=1)
215
plt.xlabel('Index')
216
plt.ylabel('Max Blood Pressure')
217
plt.title('Visit-Max Blood Pressure Scatter Plot')
218
219
plt.subplot(2, 3, 2)
220
plt.scatter(vital_signs.index, vital_signs['MIN_BLOOD_PRESSURE'], s=1)
221
plt.xlabel('Index')
222
plt.ylabel('Min Blood Pressure')
223
plt.title('Visit-Min Blood Pressure Scatter Plot')
224
225
plt.subplot(2, 3, 3)
226
plt.scatter(vital_signs.index, vital_signs['TEMPERATURE'], s=1)
227
plt.xlabel('Index')
228
plt.ylabel('Temperature')
229
plt.title('Visit-Temperature Scatter Plot')
230
231
plt.subplot(2, 3, 4)
232
plt.scatter(vital_signs.index, vital_signs['HEART_RATE'], s=1)
233
plt.xlabel('Index')
234
plt.ylabel('Heart Rate')
235
plt.title('Visit-Heart Rate Scatter Plot')
236
237
plt.subplot(2, 3, 5)
238
plt.scatter(vital_signs.index, vital_signs['OXYGEN_SATURATION'], s=1)
239
plt.xlabel('Index')
240
plt.ylabel('Oxygen Saturation')
241
plt.title('Visit-Oxygen Saturation Scatter Plot')
242
243
plt.show()
244
"""
245
# %%
246
"""
247
#plt.rcParams['figure.figsize'] = [10, 5]
248
fig=plt.figure(figsize=(16,10), dpi= 100, facecolor='w', edgecolor='k')
249
250
plt.subplot(2, 3, 1)
251
plt.hist(vital_signs['MAX_BLOOD_PRESSURE'], bins=30)
252
plt.xlabel('Index')
253
plt.ylabel('Max Blood Pressure')
254
plt.title('Visit-Max Blood Pressure Histogram')
255
256
plt.subplot(2, 3, 2)
257
plt.hist(vital_signs['MIN_BLOOD_PRESSURE'], bins=30)
258
plt.xlabel('Index')
259
plt.ylabel('Min Blood Pressure')
260
plt.title('Visit-Min Blood Pressure Histogram')
261
262
plt.subplot(2, 3, 3)
263
plt.hist(vital_signs['TEMPERATURE'], bins=30)
264
plt.xlabel('Index')
265
plt.ylabel('Temperature')
266
plt.title('Visit-Temperature Histogram')
267
268
plt.subplot(2, 3, 4)
269
plt.hist(vital_signs['HEART_RATE'], bins=30)
270
plt.xlabel('Index')
271
plt.ylabel('Heart Rate')
272
plt.title('Visit-Heart Rate Histogram')
273
274
plt.subplot(2, 3, 5)
275
plt.hist(vital_signs['OXYGEN_SATURATION'], bins=30)
276
plt.xlabel('Index')
277
plt.ylabel('Oxygen Saturation')
278
plt.title('Visit-Oxygen Saturation Histogram')
279
280
plt.show()
281
"""
282
# %% [markdown]
283
# ### Missing rate of each visit
284
285
# %%
286
sum(vital_signs.T.isnull().sum()) / ((len(vital_signs.T) - 2) * len(vital_signs))
287
288
# %% [markdown]
289
# ### Normalize data
290
291
# %%
292
"""
293
for key in vital_signs.keys()[2:]:
294
    vital_signs[key] = (vital_signs[key] - vital_signs[key].mean()) / (vital_signs[key].std() + 1e-12)
295
296
vital_signs.describe()
297
"""
298
299
# %%
300
vital_signs.to_csv('visual_signs.csv', mode='w', index=False)
301
302
# %%
303
len(vital_signs) / len(vital_signs['PATIENT_ID'].unique())
304
305
# %% [markdown]
306
# ## Lab Tests
307
308
# %%
309
lab_tests = pd.read_csv('./raw_data/19_04_2021/COVID_DSL_06_v2.CSV', encoding='ISO-8859-1', sep=';')
310
lab_tests = lab_tests.rename(columns={'IDINGRESO': 'PATIENT_ID'})
311
print(len(lab_tests))
312
313
# del useless data
314
lab_tests = lab_tests[
315
        [
316
            'PATIENT_ID',
317
            'LAB_NUMBER',
318
            'LAB_DATE',
319
            'TIME_LAB',
320
            'ITEM_LAB',
321
            'VAL_RESULT'
322
            # UD_RESULT: unit
323
            # REF_VALUES: reference values
324
        ]
325
    ]
326
327
lab_tests.head()
328
329
# %%
330
lab_tests = lab_tests.groupby(['PATIENT_ID', 'LAB_NUMBER', 'LAB_DATE', 'TIME_LAB', 'ITEM_LAB'], dropna=True, as_index = False).first()
331
lab_tests = lab_tests.set_index(['PATIENT_ID', 'LAB_NUMBER', 'LAB_DATE', 'TIME_LAB', 'ITEM_LAB'], drop = True).unstack('ITEM_LAB')['VAL_RESULT'].reset_index()
332
333
lab_tests = lab_tests.drop([
334
    'CFLAG -- ALARMA HEMOGRAMA', 
335
    'CORONA -- PCR CORONAVIRUS 2019nCoV', 
336
    'CRIOGLO -- CRIOGLOBULINAS',
337
    'EGCOVID -- ESTUDIO GENETICO COVID-19',
338
    'FRO1 -- ',
339
    'FRO1 -- FROTIS EN SANGRE PERIFERICA',
340
    'FRO2 -- ',
341
    'FRO2 -- FROTIS EN SANGRE PERIFERICA',
342
    'FRO3 -- ',
343
    'FRO3 -- FROTIS EN SANGRE PERIFERICA',
344
    'FRO_COMEN -- ',
345
    'FRO_COMEN -- FROTIS EN SANGRE PERIFERICA',
346
    'G-CORONAV (RT-PCR) -- Tipo de muestra: ASPIRADO BRONCOALVEOLAR',
347
    'G-CORONAV (RT-PCR) -- Tipo de muestra: EXUDADO',
348
    'GRRH -- GRUPO SANGUÖNEO Y FACTOR Rh',
349
    'HEML -- RECUENTO CELULAR LIQUIDO',
350
    'HEML -- Recuento Hemat¡es',
351
    'IFSUERO -- INMUNOFIJACION EN SUERO',
352
    'OBS_BIOMOL -- OBSERVACIONES GENETICA MOLECULAR',
353
    'OBS_BIOO -- Observaciones Bioqu¡mica Orina',
354
    'OBS_CB -- Observaciones Coagulaci¢n',
355
    'OBS_GASES -- Observaciones Gasometr¡a Arterial',
356
    'OBS_GASV -- Observaciones Gasometr¡a Venosa',
357
    'OBS_GEN2 -- OBSERVACIONES GENETICA',
358
    'OBS_HOR -- Observaciones Hormonas',
359
    'OBS_MICRO -- Observaciones Microbiolog¡a',
360
    'OBS_NULA2 -- Observaciones Bioqu¡mica',
361
    'OBS_NULA3 -- Observaciones Hematolog¡a',
362
    'OBS_PESP -- Observaciones Pruebas especiales',
363
    'OBS_SERO -- Observaciones Serolog¡a',
364
    'OBS_SIS -- Observaciones Orina',
365
    'PCR VIRUS RESPIRATORIOS -- Tipo de muestra: ASPIRADO BRONCOALVEOLAR',
366
    'PCR VIRUS RESPIRATORIOS -- Tipo de muestra: BAS',
367
    'PCR VIRUS RESPIRATORIOS -- Tipo de muestra: ESPUTO',
368
    'PCR VIRUS RESPIRATORIOS -- Tipo de muestra: EXUDADO',
369
    'PCR VIRUS RESPIRATORIOS -- Tipo de muestra: LAVADO BRONCOALVEOLAR',
370
    'PCR VIRUS RESPIRATORIOS -- Tipo de muestra: LAVADO NASOFARÖNGEO',
371
    'PTGOR -- PROTEINOGRAMA ORINA',
372
    'RESUL_IFT -- ESTUDIO DE INMUNOFENOTIPO',
373
    'RESUL_IFT -- Resultado',
374
    'Resultado -- Resultado',
375
    'SED1 -- ',
376
    'SED1 -- SEDIMENTO',
377
    'SED2 -- ',
378
    'SED2 -- SEDIMENTO',
379
    'SED3 -- ',
380
    'SED3 -- SEDIMENTO',
381
    'TIPOL -- TIPO DE LIQUIDO',
382
    'Tecnica -- T\x82cnica',
383
    'TpMues -- Tipo de muestra',
384
    'VHCBLOT -- INMUNOBLOT VIRUS HEPATITIS C',
385
    'VIR_TM -- VIRUS TIPO DE MUESTRA',
386
    'LEGIORI -- AG. LEGIONELA PNEUMOPHILA EN ORINA',
387
    'NEUMOORI -- AG NEUMOCOCO EN ORINA',
388
    'VIHAC -- VIH AC'
389
    ], axis=1)
390
391
    
392
lab_tests.head()
393
394
# %%
395
lab_tests = lab_tests.replace('Sin resultado.', np.nan)
396
lab_tests = lab_tests.replace('Sin resultado', np.nan)
397
lab_tests = lab_tests.replace('----', np.nan).replace('---', np.nan)
398
lab_tests = lab_tests.replace('> ', '').replace('< ', '')
399
400
# Compiled once: first decimal number (digits '.' digits) with optional sign.
_DECIMAL_PATTERN = re.compile(r"[-+]?\d+\.\d+")

# Qualitative lab results and the numeric codes they are mapped to.
_QUALITATIVE_RESULTS = {
    'Numerosos': 1.5,
    'Aislados': 0.5,
    'Se detecta': 1,
    'Se observan': 1,
    'Normal': 1,
    'Positivo': 1,
    'No se detecta': 0,
    'No se observan': 0,
    'Negativo': 0,
    'Indeterminado': np.nan,
}


def change_format(x):
    """Normalise one raw lab-test result towards a numeric representation.

    Rules, applied in this order to string values:

    * ``'Negativo (v)'`` -> ``'-v'``; ``'Positivo (v)'`` and
      ``'Zona limite (v)'`` -> ``'v'`` (wrapper and closing parenthesis
      removed).
    * Values starting with ``'>'`` or ``'<'`` lose the comparison sign.
    * Trailing units ``' mg/dl'``, ``'/æl'`` and ``' copias/mL'`` are removed.
    * Known qualitative words are mapped to fixed codes (see
      ``_QUALITATIVE_RESULTS``).
    * Anything else: the first decimal number found in the text is returned
      as a string, or NaN if there is none.

    ``None`` becomes NaN and non-string values are returned unchanged.

    NOTE(review): the fallback pattern requires a decimal point, so a bare
    integer string such as ``'12'`` falls through to NaN. This matches the
    previous behaviour and is kept deliberately; confirm it is intended.
    """
    if x is None:
        return np.nan
    if not isinstance(x, str):
        return x

    # Qualitative label wrapping a number, e.g. 'Negativo (0.5)' -> '-0.5'.
    if x.startswith('Negativo ('):
        return x.replace('Negativo (', '-')[:-1]
    for prefix in ('Positivo (', 'Zona limite ('):
        if x.startswith(prefix):
            return x.replace(prefix, '')[:-1]

    # Readings with a comparison sign: keep only the bound itself.
    if x.startswith('>'):
        return x.replace('> ', '').replace('>', '')
    if x.startswith('<'):
        return x.replace('< ', '').replace('<', '')

    # Values carrying a trailing unit.
    for suffix in (' mg/dl', '/æl', ' copias/mL'):
        if x.endswith(suffix):
            return x.replace(suffix, '')

    if x in _QUALITATIVE_RESULTS:
        return _QUALITATIVE_RESULTS[x]

    # Fallback: first decimal number embedded in free text, if any.
    match = _DECIMAL_PATTERN.search(x)
    return match.group(0) if match else np.nan
438
439
feature_value_dict = dict()
440
441
for k in tqdm(lab_tests.keys()[4:]):
442
    lab_tests[k] = lab_tests[k].map(lambda x: change_format(change_format(x)))
443
    feature_value_dict[k] = lab_tests[k].unique()
444
445
# %%
446
def nan_and_not_nan(x):
    """Return 1 if ``x`` is present and 0 if it is NaN.

    Relies on NaN being the only value that does not compare equal to itself.
    """
    return 1 if x == x else 0
451
452
def is_float(num):
    """Return True if ``float(num)`` succeeds, otherwise False.

    Both ``ValueError`` (unparsable string) and ``TypeError`` (unsupported
    type such as ``None``) count as "not a float" rather than propagating.
    """
    try:
        float(num)
    except (TypeError, ValueError):
        return False
    return True
458
459
def is_all_float(x):
    """Return True if every present value in ``x`` can be converted to float.

    NaN (detected via ``i != i``) and ``None`` entries are skipped; an empty
    iterable, or one holding only missing entries, yields True.
    """
    return all(
        is_float(i)
        for i in x
        if i == i and i is not None
    )
465
466
def to_float(x):
    """Convert ``x`` to ``float``, mapping ``None`` to NaN.

    Raises:
        ValueError: if ``x`` is a string that is not a valid number.
    """
    return np.nan if x is None else float(x)
471
472
other_feature_dict = dict()
473
474
for feature in tqdm(feature_value_dict.keys()):
475
    values = feature_value_dict[feature]
476
    if is_all_float(values):
477
        lab_tests[feature] = lab_tests[feature].map(lambda x: to_float(x))
478
    elif len(values) == 2:
479
        lab_tests[feature] = lab_tests[feature].map(lambda x: nan_and_not_nan(x))
480
    else:
481
        other_feature_dict[feature] = values
482
483
# %%
484
other_feature_dict
485
486
# %%
487
def format_time(t):
    """Normalise a day-first timestamp string to ``'YYYY-MM-DD HH:MM:SS'``.

    Accepts ``'DD/MM/YYYY HH:MM'`` (when the text contains ``'/'``) or
    ``'DD-MM-YYYY HH:MM'`` otherwise.

    Raises:
        ValueError: if ``t`` matches neither layout.
    """
    layout = '%d/%m/%Y %H:%M' if '/' in t else '%d-%m-%Y %H:%M'
    return str(datetime.datetime.strptime(t, layout))
492
493
lab_tests['RECORD_TIME'] = lab_tests['LAB_DATE'] + ' ' + lab_tests['TIME_LAB']
494
lab_tests['RECORD_TIME'] = lab_tests['RECORD_TIME'].map(lambda x: format_time(x))
495
lab_tests = lab_tests.drop(['LAB_NUMBER', 'LAB_DATE', 'TIME_LAB'], axis=1)
496
# lab_tests = lab_tests.drop(['LAB_NUMBER', 'TIME_LAB'], axis=1)
497
lab_tests.head()
498
499
# %%
500
# Per-patient mean of every numeric lab feature.
# numeric_only=True: lab_tests still holds string columns (RECORD_TIME and the
# non-numeric features collected in other_feature_dict). pandas >= 2.0 raises
# TypeError on those instead of silently dropping them as older releases did.
lab_tests_patient = (
    lab_tests
    .groupby(['PATIENT_ID'], dropna=True, as_index=False)
    .mean(numeric_only=True)
)
print(len(lab_tests_patient))

# Number of patients with at least one value, per feature (PATIENT_ID skipped).
count = lab_tests_patient.count().iloc[1:].tolist()
plt.hist(count)
504
505
# %%
506
patient_total = len(lab_tests_patient)
507
threshold = patient_total * 0.1
508
reserved_keys = []
509
510
for key in lab_tests_patient.keys():
511
    if lab_tests_patient[key].count() > threshold:
512
        reserved_keys.append(key)
513
514
print(len(reserved_keys))
515
reserved_keys
516
517
# %%
518
# RECORD_TIME is a grouping key rather than a feature, so it was not part of
# the per-patient frame used to build reserved_keys; add it back after
# PATIENT_ID.
reserved_keys.insert(1, 'RECORD_TIME')

# Average duplicate measurements taken by the same patient at the same time.
# numeric_only=True skips the remaining string-valued features, which
# pandas >= 2.0 would otherwise reject with TypeError.
lab_tests = (
    lab_tests
    .groupby(['PATIENT_ID', 'RECORD_TIME'], dropna=True, as_index=False)
    .mean(numeric_only=True)
)

lab_tests = lab_tests[reserved_keys]
lab_tests.head()
524
"""
525
# %% [markdown]
526
# ### Missing rate of each visit
527
528
# %%
529
sum(lab_tests.T.isnull().sum()) / ((len(lab_tests.T) - 2) * len(lab_tests))
530
531
# %% [markdown]
532
# ### Scatter Plot
533
534
# %%
535
fig=plt.figure(figsize=(16,200), dpi= 100, facecolor='w', edgecolor='k')
536
537
i = 1
538
for key in lab_tests.keys()[2:]:
539
    plt.subplot(33, 3, i)
540
    plt.scatter(lab_tests.index, lab_tests[key], s=1)
541
    plt.ylabel(key)
542
    i += 1
543
544
plt.show()
545
546
# %%
547
fig=plt.figure(figsize=(20,120), dpi= 100, facecolor='w', edgecolor='k')
548
549
i = 1
550
for key in lab_tests.keys()[2:]:
551
    plt.subplot(23, 4, i)
552
    plt.hist(lab_tests[key], bins=30)
553
    q3 = lab_tests[key].quantile(0.75)
554
    q1 = lab_tests[key].quantile(0.25)
555
    qh = q3 + 3 * (q3 - q1)
556
    ql = q1 - 3 * (q3 - q1)
557
    sigma = 5
558
    plt.axline([sigma*lab_tests[key].std() + lab_tests[key].mean(), 0], [sigma*lab_tests[key].std() + lab_tests[key].mean(), 1], color = "r", linestyle=(0, (5, 5)))
559
    plt.axline([-sigma*lab_tests[key].std() + lab_tests[key].mean(), 0], [-sigma*lab_tests[key].std() + lab_tests[key].mean(), 1], color = "r", linestyle=(0, (5, 5)))
560
    #plt.axline([lab_tests[key].quantile(0.25), 0], [lab_tests[key].quantile(0.25), 1], color = "k", linestyle=(0, (5, 5)))
561
    #plt.axline([lab_tests[key].quantile(0.75), 0], [lab_tests[key].quantile(0.75), 1], color = "k", linestyle=(0, (5, 5)))
562
    plt.axline([qh, 0], [qh, 1], color='k', linestyle=(0, (5, 5)))
563
    plt.axline([ql, 0], [ql, 1], color='k', linestyle=(0, (5, 5)))
564
    plt.ylabel(key)
565
    i += 1
566
567
plt.show()
568
"""
569
# %% [markdown]
570
# ### Normalize data
571
572
# %%
573
"""
574
for key in lab_tests.keys()[2:]:
575
    lab_tests[key] = (lab_tests[key] - lab_tests[key].mean()) / (lab_tests[key].std() + 1e-12)
576
577
lab_tests.describe()
578
"""
579
580
# %%
581
# 【del normalization】
582
# for key in lab_tests.keys()[2:]:
583
#     r = lab_tests[lab_tests[key].between(lab_tests[key].quantile(0.05), lab_tests[key].quantile(0.95))]
584
#     lab_tests[key] = (lab_tests[key] - r[key].mean()) / (r[key].std() + 1e-12)
585
586
# %%
587
lab_tests.to_csv('lab_test.csv', mode='w', index=False)
588
589
# %% [markdown]
590
# # Concat data
591
592
# %%
593
demographic['PATIENT_ID'] = demographic['PATIENT_ID'].map(lambda x: str(int(x)))
594
vital_signs['PATIENT_ID'] = vital_signs['PATIENT_ID'].map(lambda x: str(int(x)))
595
lab_tests['PATIENT_ID'] = lab_tests['PATIENT_ID'].map(lambda x: str(int(x)))
596
597
# %%
598
len(demographic['PATIENT_ID'].unique()), len(vital_signs['PATIENT_ID'].unique()), len(lab_tests['PATIENT_ID'].unique())
599
600
# %%
601
train_df = pd.merge(vital_signs, lab_tests, on=['PATIENT_ID', 'RECORD_TIME'], how='outer')
602
603
train_df = train_df.groupby(['PATIENT_ID', 'RECORD_TIME'], dropna=True, as_index = False).mean()
604
605
train_df = pd.merge(demographic, train_df, on=['PATIENT_ID'], how='left')
606
607
train_df.head()
608
609
# %%
610
# del rows without patient_id, admission_date, record_time, or outcome
611
train_df = train_df.dropna(axis=0, how='any', subset=['PATIENT_ID', 'ADMISSION_DATE', 'RECORD_TIME', 'OUTCOME'])
612
613
# %%
614
train_df.to_csv('train.csv', mode='w', index=False)
615
train_df.describe()
616
617
# %% [markdown]
618
# ## Missing rate of each visit
619
620
# %%
621
sum(train_df.T.isnull().sum()) / ((len(train_df.T) - 2) * len(train_df))
622
623
# %% [markdown]
624
# # Split and save data
625
626
# %% [markdown]
627
# * demo: demographic data
628
# * x: lab test & vital signs
629
# * y: outcome & length of stay
630
631
# %%
632
patient_ids = train_df['PATIENT_ID'].unique()
633
634
demo_cols = ['AGE', 'SEX'] # , 'DIFFICULTY_BREATHING', 'FEVER', 'SUSPECT_COVID', 'EMERGENCY'
635
test_cols = []
636
637
# get column names
638
for k in train_df.keys():
639
    if not k in demographic.keys():
640
        if not k == 'RECORD_TIME':
641
            test_cols.append(k)
642
643
test_median = train_df[test_cols].median()
644
645
# %%
646
test_cols
647
648
# %%
649
train_df['RECORD_TIME_DAY'] = train_df['RECORD_TIME'].map(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%d'))
650
train_df['RECORD_TIME_HOUR'] = train_df['RECORD_TIME'].map(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%d %H'))
651
train_df.head()
652
653
# %%
654
# Aggregate the records of each stay at day and at hour resolution.
# numeric_only=True: train_df still carries string columns that are not
# grouping keys (RECORD_TIME and the other RECORD_TIME_* column). pandas >= 2.0
# raises TypeError on them instead of silently dropping them.
stay_keys = ['PATIENT_ID', 'ADMISSION_DATE', 'DEPARTURE_DATE']
train_df_day = (
    train_df
    .groupby(stay_keys + ['RECORD_TIME_DAY'], dropna=True, as_index=False)
    .mean(numeric_only=True)
)
train_df_hour = (
    train_df
    .groupby(stay_keys + ['RECORD_TIME_HOUR'], dropna=True, as_index=False)
    .mean(numeric_only=True)
)

len(train_df), len(train_df_day), len(train_df_hour)
658
659
# %% [markdown]
660
# 
661
# ```
662
# number of visits (total)
663
# - Original data: 168777
664
# - Merge by hour: 130141
665
# - Merge by day:  42204
666
# ```
667
668
# %%
669
len(train_df['PATIENT_ID'].unique())
670
671
# %%
672
def get_visit_intervals(df):
    """Count the rows (records) belonging to each patient.

    Args:
        df: DataFrame with a ``PATIENT_ID`` column.

    Returns:
        A list of ``int`` row counts, one per distinct ``PATIENT_ID``, in
        order of first appearance.

    A single grouped count replaces filtering the whole frame once per
    patient. Rows whose ``PATIENT_ID`` is missing are ignored.
    """
    ids = df['PATIENT_ID']
    # sort=False keeps groups in order of first appearance, like unique().
    return ids.groupby(ids, sort=False).size().tolist()
677
678
# %%
679
ls_org = get_visit_intervals(train_df)
680
ls_hour = get_visit_intervals(train_df_hour)
681
ls_day = get_visit_intervals(train_df_day)
682
683
# %%
684
import matplotlib.pyplot as plt
685
from matplotlib.ticker import PercentFormatter
686
import matplotlib.font_manager as font_manager
687
import pandas as pd
688
import numpy as np
689
"""
690
csfont = {'fontname':'Times New Roman', 'fontsize': 18}
691
font = 'Times New Roman'
692
fig=plt.figure(figsize=(18,4), dpi= 100, facecolor='w', edgecolor='k')
693
plt.style.use('seaborn-whitegrid')
694
color = 'cornflowerblue'
695
ec = 'None'
696
alpha=0.5
697
698
ax = plt.subplot(1, 3, 1)
699
ax.hist(ls_org, bins=20, weights=np.ones(len(ls_org)) / len(ls_org), color=color, ec=ec, alpha=alpha, label='overall')
700
plt.xlabel('Num of visits (org)',**csfont)
701
plt.ylabel('Percentage',**csfont)
702
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
703
plt.xticks(**csfont)
704
plt.yticks(**csfont)
705
706
ax = plt.subplot(1, 3, 2)
707
ax.hist(ls_hour, bins=20, weights=np.ones(len(ls_hour)) / len(ls_hour), color=color, ec=ec, alpha=alpha, label='overall')
708
plt.xlabel('Num of visits (hour)',**csfont)
709
plt.ylabel('Percentage',**csfont)
710
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
711
plt.xticks(**csfont)
712
plt.yticks(**csfont)
713
714
ax = plt.subplot(1, 3, 3)
715
ax.hist(ls_day, bins=20, weights=np.ones(len(ls_day)) / len(ls_day), color=color, ec=ec, alpha=alpha, label='overall')
716
plt.xlabel('Num of visits (day)',**csfont)
717
plt.ylabel('Percentage',**csfont)
718
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
719
plt.xticks(**csfont)
720
plt.yticks(**csfont)
721
722
plt.show()
723
"""
724
# %%
725
def get_statistic(lst, name):
    """Print summary statistics of ``lst`` on a single tab-separated line.

    The line is prefixed with ``[name]`` and reports max, min, median, mean
    and the 80th / 90th / 95th percentiles. Nothing is returned.
    """
    print(
        f'[{name}]\tMax:\t{max(lst)}, Min:\t{min(lst)}, '
        f'Median:\t{np.median(lst)}, Mean:\t{np.mean(lst)}, '
        f'80%:\t{np.quantile(lst, 0.8)}, 90%:\t{np.quantile(lst, 0.9)}, '
        f'95%:\t{np.quantile(lst, 0.95)}'
    )
727
728
# %%
729
get_statistic(ls_org, 'ls_org')
730
get_statistic(ls_hour, 'ls_hour')
731
get_statistic(ls_day, 'ls_day')
732
733
# %%
734
train_df_hour['LOS'] = train_df_hour['ADMISSION_DATE']
735
train_df_hour['LOS_HOUR'] = train_df_hour['ADMISSION_DATE']
736
737
# %%
738
train_df_hour = train_df_hour.reset_index()
739
740
# %%
741
# Length-of-stay targets in fractional days, computed for all rows at once:
#   LOS      = departure - admission
#   LOS_HOUR = departure - start of the hour the record belongs to
# The arithmetic (seconds / 86400 + days) is the same as in the former
# row-by-row loop, but the columns end up as float64 instead of floats written
# one at a time into columns that were initialised with date strings.
SECONDS_PER_DAY = 24 * 60 * 60
admission = pd.to_datetime(train_df_hour['ADMISSION_DATE'], format='%Y-%m-%d %H:%M:%S')
departure = pd.to_datetime(train_df_hour['DEPARTURE_DATE'], format='%Y-%m-%d %H:%M:%S')
visit_hour = pd.to_datetime(train_df_hour['RECORD_TIME_HOUR'], format='%Y-%m-%d %H')

stay = departure - admission
remaining = departure - visit_hour
train_df_hour['LOS'] = stay.dt.seconds / SECONDS_PER_DAY + stay.dt.days
train_df_hour['LOS_HOUR'] = remaining.dt.seconds / SECONDS_PER_DAY + remaining.dt.days
750
751
# %%
752
train_df_hour['LOS']
753
754
# %%
755
# One length-of-stay value per patient: the LOS of the patient's first row,
# in order of first appearance. drop_duplicates avoids filtering the frame
# once per patient and calling float() on a one-element Series (deprecated).
los = (
    train_df_hour
    .drop_duplicates(subset='PATIENT_ID', keep='first')['LOS']
    .astype(float)
    .tolist()
)
758
759
# %%
760
get_statistic(los, 'los')
761
762
# %%
763
import matplotlib.pyplot as plt
764
from matplotlib.ticker import PercentFormatter
765
import matplotlib.font_manager as font_manager
766
import pandas as pd
767
import numpy as np
768
"""
769
csfont = {'fontname':'Times New Roman', 'fontsize': 18}
770
font = 'Times New Roman'
771
fig=plt.figure(figsize=(6,6), dpi= 100, facecolor='w', edgecolor='k')
772
plt.style.use('seaborn-whitegrid')
773
color = 'cornflowerblue'
774
ec = 'None'
775
alpha=0.5
776
777
ax = plt.subplot(1, 1, 1)
778
ax.hist(los, bins=20, weights=np.ones(len(los)) / len(los), color=color, ec=ec, alpha=alpha, label='overall')
779
plt.xlabel('Length of stay',**csfont)
780
plt.ylabel('Percentage',**csfont)
781
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
782
plt.xticks(**csfont)
783
plt.yticks(**csfont)
784
785
plt.show()
786
"""
787
# %%
788
train_df_hour_idx = train_df_hour.reset_index()
789
790
# %%
791
train_df_hour_idx['LOS'] = train_df_hour_idx['ADMISSION_DATE']
792
793
for idx in tqdm(range(len(train_df_hour_idx))):
794
    info = train_df_hour_idx.loc[idx]
795
    # admission = datetime.datetime.strptime(info['ADMISSION_DATE'], '%Y-%m-%d %H:%M:%S')
796
    departure = datetime.datetime.strptime(info['DEPARTURE_DATE'], '%Y-%m-%d %H:%M:%S')
797
    visit_hour = datetime.datetime.strptime(info['RECORD_TIME_HOUR'], '%Y-%m-%d %H')
798
    hour = (departure - visit_hour).seconds / (24 * 60 * 60) + (departure - visit_hour).days
799
    train_df_hour_idx.at[idx, 'LOS'] = float(hour)
800
801
# %%
802
train_df_hour['LOS'] = train_df_hour['LOS_HOUR']
803
train_df_hour.drop(columns=['LOS_HOUR'])
804
805
# %%
806
# los_threshold = 13.0
807
808
# visit_num_hour = []
809
810
# for pat in tqdm(train_df_hour_idx['PATIENT_ID'].unique()):
811
#     pat_records = train_df_hour_idx[train_df_hour_idx['PATIENT_ID'] == pat]
812
#     hour = 0
813
#     for vis in pat_records.index:
814
#         pat_visit = pat_records.loc[vis]
815
#         if pat_visit['LOS_HOUR'] <= los_threshold:
816
#             hour += 1
817
#     visit_num_hour.append(hour)
818
#     if hour == 0:
819
#         print(pat)
820
821
# %%
822
# import matplotlib.pyplot as plt
823
# from matplotlib.ticker import PercentFormatter
824
# import matplotlib.font_manager as font_manager
825
# import pandas as pd
826
# import numpy as np
827
# csfont = {'fontname':'Times New Roman', 'fontsize': 18}
828
# font = 'Times New Roman'
829
# fig=plt.figure(figsize=(6,6), dpi= 100, facecolor='w', edgecolor='k')
830
# plt.style.use('seaborn-whitegrid')
831
# color = 'cornflowerblue'
832
# ec = 'None'
833
# alpha=0.5
834
835
# ax = plt.subplot(1, 1, 1)
836
# ax.hist(visit_num_hour, bins=20, weights=np.ones(len(visit_num_hour)) / len(visit_num_hour), color=color, ec=ec, alpha=alpha, label='overall')
837
# plt.xlabel('Visit num (80% los)',**csfont)
838
# plt.ylabel('Percentage',**csfont)
839
# plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
840
# plt.xticks(**csfont)
841
# plt.yticks(**csfont)
842
843
# plt.show()
844
845
# %%
846
train_df = train_df_hour
847
train_df.head()
848
849
# %%
850
train_df.describe()
851
852
# %%
853
get_statistic(train_df['LOS'], 'los')
854
855
# %%
856
train_df[train_df['PATIENT_ID'] == '1']['HEART_RATE'].count()
857
858
# %%
859
# Per-feature missing rate: the share of patients that have no value at all
# for a feature across their records.
cols = train_df.columns[5:]
pats = train_df['PATIENT_ID'].unique()
all_pat_cnt = len(pats)

# One row per patient, True where the patient has at least one non-null value.
# A single grouped reduction replaces filtering the frame once per patient.
has_value = (
    train_df[cols]
    .notna()
    .groupby(train_df['PATIENT_ID'], sort=False)
    .any()
)

missing_rate = {
    col: int((~has_value[col]).sum()) / all_pat_cnt
    for col in cols
}

missing_rate
881
882
# %%
883
with open('missing_rate.csv', mode='w', encoding='utf-8') as file:
884
    for col in cols:
885
        file.write(f'"{col}", {100 * missing_rate[col]}\n')
886
887
# %%
888
train_df['LOS'] = train_df['LOS'].clip(lower=0)
889
890
# %%
891
get_statistic(train_df['LOS'], 'los')
892
893
# %%
894
# the first visit of each person
895
def init_prev(prev):
    """Fill the missing entries of a patient's first visit with medians.

    Every NaN in ``prev`` is replaced **in place** by the value at the same
    position in the module-level ``test_median``.

    Args:
        prev: feature values of the first visit (pandas Series, list or
            array), aligned by position with ``test_median``.

    Returns:
        A list of the same length as ``prev`` holding 1 where the value was
        missing (and has been filled) and 0 where it was present.
    """
    # Plain integer [] on a label-indexed Series relies on a deprecated
    # positional fallback; go through .iloc when it exists so lists and
    # arrays keep working unchanged.
    values = prev.iloc if hasattr(prev, 'iloc') else prev
    medians = test_median.iloc if hasattr(test_median, 'iloc') else test_median

    miss = []
    for idx in range(len(prev)):
        if np.isnan(values[idx]):  # no previous record: fall back to median
            values[idx] = medians[idx]
            miss.append(1)
        else:
            miss.append(0)
    return miss
907
908
# the rest of the visits
909
def fill_nan(cur, prev):
    """Forward-fill the NaN entries of ``cur`` from ``prev`` in place.

    Args:
        cur: Mutable, positionally ordered feature values of the current
            record (list, NumPy array or pandas Series). NaN entries are
            overwritten in place.
        prev: Feature values of the previous record, in the same order.
            Its length determines how many positions are examined.

    Returns:
        A list with one flag per feature: 1 where ``cur`` was missing and has
        been copied from ``prev``, 0 where ``cur`` already held a value.
    """
    # Access by *position*. Integer keys on a label-indexed pandas Series are
    # only treated as positions through a deprecated fallback, so go through
    # ``.iloc`` whenever the container offers it.
    cur_values = cur.iloc if hasattr(cur, 'iloc') else cur
    prev_values = prev.iloc if hasattr(prev, 'iloc') else prev

    miss = []
    for idx in range(len(prev)):
        if np.isnan(cur_values[idx]):  # there is no record in current timestep
            cur_values[idx] = prev_values[idx]  # cur <- prev
            miss.append(1)
        else:  # there is a record in current timestep
            miss.append(0)
    return miss
920
921
# %%
922
# Per-patient accumulators: feature sequences, labels, demographics,
# sequence lengths and missing-value masks.
x, y, demo, x_lab_len, missing_mask = [], [], [], [], []

for pat in tqdm(patient_ids):
    # Rows of this patient, truncated to at most the last 76 rows.
    info = train_df[train_df['PATIENT_ID'] == pat]
    info = info[max(0, len(info) - 76):]
    indexes = info.index
    visit = info.loc[indexes[0]]  # first retained record

    # Demographics and the outcome label are read from the first retained record.
    demo.append([visit[k] for k in demo_cols])
    outcome = visit['OUTCOME']
    los = []

    # Lab tests & vital signs.
    tests = []
    prev = visit[test_cols]
    # Mask row 0: init_prev() fills the NaNs of the first record inside `prev`
    # and reports which features were missing.
    miss = [init_prev(prev)]

    for position, i in enumerate(indexes):
        visit = info.loc[i]
        cur = visit[test_cols]
        # Carry values forward from the previous (already filled) record.
        tmp = fill_nan(cur, prev)
        # The first record's mask row was already produced by init_prev().
        if position > 0:
            miss.append(tmp)
        tests.append(cur)
        los.append(visit['LOS'])
        prev = cur

    valid_visit = len(los)
    x_lab_len.append(valid_visit)
    missing_mask.append(miss)  # one mask matrix per patient

    y.append([outcome, los])
    x.append(tests)
972
973
# %%
974
# Keep the per-patient Python lists reachable under their `all_*` names.
all_x, all_x_demo, all_y, all_missing_mask = x, demo, y, missing_mask

# %%
# Features: pad every patient's record sequence to the longest one.
all_x_labtest = np.array(all_x, dtype=object)
x_lab_length = torch.tensor([len(seq) for seq in all_x_labtest], dtype=torch.int)
max_length = int(x_lab_length.max())
all_x_labtest = torch.nn.utils.rnn.pad_sequence(
    [torch.tensor(seq) for seq in all_x_labtest], batch_first=True
)

# Repeat each patient's demographic vector for every time step, giving a
# (batch_size, max_length, demo_dim) tensor, and put it in front of the
# lab-test / vital-sign features along the last axis.
all_x_demographic = torch.tensor(all_x_demo)
batch_size, demo_dim = all_x_demographic.shape
all_x_demographic = all_x_demographic.repeat(1, max_length).reshape(
    batch_size, max_length, demo_dim
)
all_x = torch.cat((all_x_demographic, all_x_labtest), dim=2)

# Labels: one [outcome, los] pair per record, padded like the features.
all_y = np.array(all_y, dtype=object)
patient_list = [
    [[entry[0], los_value] for los_value in entry[1]]
    for entry in all_y
]
new_all_y = np.array(patient_list, dtype=object)
output_all_y = torch.nn.utils.rnn.pad_sequence(
    [torch.Tensor(pairs) for pairs in new_all_y], batch_first=True
)

# %%
# Missing-value masks, padded the same way.
all_missing_mask = np.array(all_missing_mask, dtype=object)
all_missing_mask = torch.nn.utils.rnn.pad_sequence(
    [torch.tensor(mask) for mask in all_missing_mask], batch_first=True
)
1006
1007
# %%
1008
# Shape of the padded feature tensor.
all_x.shape

# %%
# Shape of the padded missing-value mask.
all_missing_mask.shape

# %%
# save pickle format dataset (torch)
for obj, path in (
    (all_x, './processed_data/x.pkl'),
    (all_missing_mask, './processed_data/missing_mask.pkl'),
    (output_all_y, './processed_data/y.pkl'),
    (x_lab_length, './processed_data/visits_length.pkl'),
):
    pd.to_pickle(obj, path)
1019
1020
# %%
1021
# Calculate patients' outcome statistics (patients-wise)
# The outcome stored at each patient's first time step is taken as that
# patient's label.
y_outcome = output_all_y[:, :, 0]
indices = torch.arange(len(x_lab_length), dtype=torch.int64)
outcome_list = np.array([y_outcome[i][0].item() for i in indices])
print(len(outcome_list))
unique, count = np.unique(outcome_list, return_counts=True)
data_count = dict(zip(unique, count))
print(data_count)

# %%
# Calculate patients' outcome statistics (records-wise)
# Only the first x_lab_length[i] (un-padded) time steps of patient i count.
y_outcome = output_all_y[:, :, 0]
indices = torch.arange(len(x_lab_length), dtype=torch.int64)
outcome_records_list = np.array([
    label
    for i in indices
    for label in y_outcome[i][0:x_lab_length[i]].tolist()
])
print(len(outcome_records_list))
unique, count = np.unique(outcome_records_list, return_counts=True)
data_count = dict(zip(unique, count))
print(data_count)
1045
1046
# %%
1047
# Calculate patients' mean los and 95% percentile los
# One value per patient: the LOS stored at the first retained time step.
y_los = output_all_y[:, :, 1]
indices = torch.arange(len(x_lab_length), dtype=torch.int64)
los_list = np.array([y_los[i][0].item() for i in indices])

# NOTE(review): mean and median are printed scaled by 0.5 while the
# percentiles below are not -- presumably a unit conversion; confirm.
print(los_list.mean() * 0.5)
print(np.median(los_list) * 0.5)
print(np.percentile(los_list, 95))

print('median:', np.median(los_list))
print('Q1:', np.percentile(los_list, 25))
print('Q3:', np.percentile(los_list, 75))
1062
1063
# %%
1064
# Split the per-patient LOS values by outcome (0 = alive, 1 = dead).
los_alive_list = los_list[outcome_list == 0]
los_dead_list = los_list[outcome_list == 1]
print(len(los_alive_list))
print(len(los_dead_list))

# Median [Q1, Q3] of the LOS for each outcome group.
for group_label, group_los in (('[Alive]', los_alive_list), ('[Dead]', los_dead_list)):
    print(group_label)
    print('median:', np.median(group_los))
    print('Q1:', np.percentile(group_los, 25))
    print('Q3:', np.percentile(group_los, 75))

# %%
# Persist the three LOS samples.
cdsl_los_statistics = {
    'overall': los_list,
    'alive': los_alive_list,
    'dead': los_dead_list
}
pd.to_pickle(cdsl_los_statistics, 'cdsl_los_statistics.pkl')
1086
1087
# %%
1088
# calculate visits length Median [Q1, Q3]
visits_list = np.array(x_lab_length)
visits_alive_list = np.array(
    [x_lab_length[k] for k in range(len(x_lab_length)) if outcome_list[k] == 0]
)
visits_dead_list = np.array(
    [x_lab_length[k] for k in range(len(x_lab_length)) if outcome_list[k] == 1]
)
print(len(visits_alive_list))
print(len(visits_dead_list))

# Same three statistics for the whole cohort and for each outcome group.
for group_label, group_visits in (
    ('[Total]', visits_list),
    ('[Alive]', visits_alive_list),
    ('[Dead]', visits_dead_list),
):
    print(group_label)
    print('median:', np.median(group_visits))
    print('Q1:', np.percentile(group_visits, 25))
    print('Q3:', np.percentile(group_visits, 75))
1109
1110
# %%
1111
def check_nan(x):
    """Print whether the tensor ``x`` contains at least one NaN element.

    The check is element-wise. Testing ``isnan(sum(x))`` instead would also
    report NaN for a tensor that holds both ``+inf`` and ``-inf`` (their sum
    is NaN) although no element is NaN.

    Args:
        x: A ``torch.Tensor``.

    Returns:
        None. The result is reported on stdout.
    """
    if bool(torch.isnan(x).any()):
        print("some values from input are nan")
    else:
        print("no nan")
1116
1117
# %%
1118
check_nan(all_x)
1119
1120
# %% [markdown]
1121
# # Draw Charts
1122
1123
# %% [markdown]
1124
# ## Import packages
1125
1126
# %%
1127
import matplotlib.pyplot as plt
1128
from matplotlib.ticker import PercentFormatter
1129
import matplotlib.font_manager as font_manager
1130
import pandas as pd
1131
import numpy as np
1132
1133
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases no longer accept the old names. Use the new name when this
# installation provides it and fall back to the legacy one otherwise.
plt.style.use(
    'seaborn-v0_8-whitegrid'
    if 'seaborn-v0_8-whitegrid' in plt.style.available
    else 'seaborn-whitegrid'
)

# Shared drawing constants for the histograms.
color = 'cornflowerblue'     # fill colour of the overall histogram
ec = 'None'                  # edge colour of the overall histogram (no edge)
alpha = 0.5                  # opacity of the overall histogram
alive_color = 'olivedrab'    # outline colour for the alive cohort
dead_color = 'orchid'        # outline colour for the dead cohort
1139
1140
# %% [markdown]
1141
# ## Read data
1142
1143
# %%
1144
demographic.head()

# %%
# Keep only the demographic rows of patients that appear in train.csv.
# PATIENT_ID is cast to str on both sides so the join keys compare equal.
# Note that `pat` and `demo` are rebound here to new DataFrames.
train = pd.read_csv('./train.csv')
train['PATIENT_ID'] = train['PATIENT_ID'].astype(str)
demographic['PATIENT_ID'] = demographic['PATIENT_ID'].astype(str)
pat = pd.DataFrame({'PATIENT_ID': train['PATIENT_ID'].unique()})
demo = demographic.merge(pat, on='PATIENT_ID', how='inner')

# Cohorts by outcome (0 = alive, 1 = dead) plus an alias for everyone.
demo_alive = demo[demo['OUTCOME'] == 0]
demo_dead = demo[demo['OUTCOME'] == 1]
demo_overall = demo

# %%
# Export the three cohort tables.
demo.to_csv('demo_overall.csv', index=False)
demo_alive.to_csv('demo_alive.csv', index=False)
demo_dead.to_csv('demo_dead.csv', index=False)
1164
1165
# %%
1166
# Restrict the lab-test table to each cohort by inner-joining on the
# cohort's unique patient ids, and print how many patients remain.
patient = pd.DataFrame({"PATIENT_ID": demo_alive['PATIENT_ID'].unique()})
lab_tests_alive = lab_tests.merge(patient, how='inner', on='PATIENT_ID')
print(len(lab_tests_alive['PATIENT_ID'].unique()))

patient = pd.DataFrame({"PATIENT_ID": demo_dead['PATIENT_ID'].unique()})
lab_tests_dead = lab_tests.merge(patient, how='inner', on='PATIENT_ID')
print(len(lab_tests_dead['PATIENT_ID'].unique()))

patient = pd.DataFrame({"PATIENT_ID": demo_overall['PATIENT_ID'].unique()})
lab_tests_overall = lab_tests.merge(patient, how='inner', on='PATIENT_ID')
print(len(lab_tests_overall['PATIENT_ID'].unique()))

# %%
# Same restriction for the vital-sign table: alive cohort.
patient = pd.DataFrame({"PATIENT_ID": demo_alive['PATIENT_ID'].unique()})
vital_signs_alive = vital_signs.merge(patient, how='inner', on='PATIENT_ID')
len(vital_signs_alive['PATIENT_ID'].unique())

# %%
# Vital signs: dead cohort.
patient = pd.DataFrame({"PATIENT_ID": demo_dead['PATIENT_ID'].unique()})
vital_signs_dead = vital_signs.merge(patient, how='inner', on='PATIENT_ID')
len(vital_signs_dead['PATIENT_ID'].unique())

# %%
# Vital signs: all patients.
patient = pd.DataFrame({"PATIENT_ID": demo_overall['PATIENT_ID'].unique()})
vital_signs_overall = vital_signs.merge(patient, how='inner', on='PATIENT_ID')
len(vital_signs_overall['PATIENT_ID'].unique())
1192
1193
# %%
1194
"""
1195
limit = 0.05
1196
1197
csfont = {'fontname':'Times New Roman', 'fontsize': 18}
1198
font = 'Times New Roman'
1199
fig=plt.figure(figsize=(16,12), dpi= 100, facecolor='w', edgecolor='k')
1200
1201
idx = 1
1202
1203
key = 'AGE'
1204
low = demo_overall[key].quantile(limit)
1205
high = demo_overall[key].quantile(1 - limit)
1206
demo_AGE_overall = demo_overall[demo_overall[key].between(low, high)]
1207
demo_AGE_dead = demo_dead[demo_dead[key].between(low, high)]
1208
demo_AGE_alive = demo_alive[demo_alive[key].between(low, high)]
1209
ax = plt.subplot(4, 4, idx)
1210
ax.hist(demo_AGE_overall[key], bins=20, weights=np.ones(len(demo_AGE_overall[key])) / len(demo_AGE_overall), color=color, ec=ec, alpha=alpha, label='overall')
1211
plt.xlabel('Age',**csfont)
1212
plt.ylabel('Percentage',**csfont)
1213
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
1214
# ax.title('Age Histogram', **csfont)
1215
ax.hist(demo_AGE_alive[key], bins=20, weights=np.ones(len(demo_AGE_alive[key])) / len(demo_AGE_alive), color='green', ec=alive_color, alpha=1, histtype="step", linewidth=2, label='alive')
1216
ax.hist(demo_AGE_dead[key], bins=20, weights=np.ones(len(demo_AGE_dead[key])) / len(demo_AGE_dead), color='green', ec=dead_color, alpha=1, histtype="step", linewidth=2, label='dead')
1217
plt.xticks(**csfont)
1218
plt.yticks(**csfont)
1219
idx += 1
1220
1221
key = 'TEMPERATURE'
1222
low = vital_signs_overall[key].quantile(limit)
1223
high = vital_signs_overall[key].quantile(1 - limit)
1224
vs_TEMPERATURE_overall = vital_signs_overall[vital_signs_overall[key].between(low, high)]
1225
vs_TEMPERATURE_dead = vital_signs_dead[vital_signs_dead[key].between(low, high)]
1226
vs_TEMPERATURE_alive = vital_signs_alive[vital_signs_alive[key].between(low, high)]
1227
plt.subplot(4, 4, idx)
1228
plt.hist(vs_TEMPERATURE_overall['TEMPERATURE'], bins=20, weights=np.ones(len(vs_TEMPERATURE_overall)) / len(vs_TEMPERATURE_overall), color=color, ec=ec, alpha=alpha)
1229
plt.xlabel('Temperature',**csfont)
1230
plt.ylabel('Percentage',**csfont)
1231
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
1232
# plt.title('Temperature Histogram', **csfont)
1233
plt.hist(vs_TEMPERATURE_alive['TEMPERATURE'], bins=20, weights=np.ones(len(vs_TEMPERATURE_alive)) / len(vs_TEMPERATURE_alive), color='green', ec=alive_color, alpha=1, histtype="step", linewidth=2)
1234
plt.hist(vs_TEMPERATURE_dead['TEMPERATURE'], bins=20, weights=np.ones(len(vs_TEMPERATURE_dead)) / len(vs_TEMPERATURE_dead), color='green', ec=dead_color, alpha=1, histtype="step", linewidth=2)
1235
plt.xticks(**csfont)
1236
plt.yticks(**csfont)
1237
idx += 1
1238
1239
# plt.subplot(4, 4, 3)
1240
# plt.hist(lab_tests_overall['CREA -- CREATININA'], bins=20, density=True, color=color, ec=ec, alpha=alpha)
1241
# plt.xlabel('CREA -- CREATININA',**csfont)
1242
# plt.ylabel('Percentage',**csfont)
1243
# plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
1244
# # plt.title('Temperature Histogram', **csfont)
1245
# plt.hist(lab_tests_alive['CREA -- CREATININA'], bins=20, density=True, color='green', ec=alive_color, alpha=1, histtype="step", linewidth=2)
1246
# plt.hist(lab_tests_dead['CREA -- CREATININA'], bins=20, density=True, color='green', ec=dead_color, alpha=1, histtype="step", linewidth=2)
1247
# plt.xticks(**csfont)
1248
# plt.yticks(**csfont)
1249
1250
key = 'CREA -- CREATININA'
1251
low = lab_tests_overall[key].quantile(limit)
1252
high = lab_tests_overall[key].quantile(1 - limit)
1253
lt_key_overall = lab_tests_overall[lab_tests_overall[key].between(low, high)]
1254
lt_key_dead = lab_tests_dead[lab_tests_dead[key].between(low, high)]
1255
lt_key_alive = lab_tests_alive[lab_tests_alive[key].between(low, high)]
1256
plt.subplot(4, 4, idx)
1257
plt.hist(lt_key_overall[key], bins=20, weights=np.ones(len(lt_key_overall[key])) / len(lt_key_overall[key]), color=color, ec=ec, alpha=alpha)
1258
plt.xlabel('CREA -- CREATININA',**csfont)
1259
plt.ylabel('Percentage',**csfont)
1260
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
1261
# plt.title('Temperature Histogram', **csfont)
1262
plt.hist(lt_key_alive[key], bins=20, weights=np.ones(len(lt_key_alive[key])) / len(lt_key_alive[key]), color='green', ec=alive_color, alpha=1, histtype="step", linewidth=2)
1263
plt.hist(lt_key_dead[key], bins=20, weights=np.ones(len(lt_key_dead[key])) / len(lt_key_dead[key]), color='green', ec=dead_color, alpha=1, histtype="step", linewidth=2)
1264
plt.xticks(**csfont)
1265
plt.yticks(**csfont)
1266
idx += 1
1267
1268
key = 'HEM -- Hemat¡es'
1269
low = lab_tests_overall[key].quantile(limit)
1270
high = lab_tests_overall[key].quantile(1 - limit)
1271
lt_key_overall = lab_tests_overall[lab_tests_overall[key].between(low, high)]
1272
lt_key_dead = lab_tests_dead[lab_tests_dead[key].between(low, high)]
1273
lt_key_alive = lab_tests_alive[lab_tests_alive[key].between(low, high)]
1274
plt.subplot(4, 4, idx)
1275
plt.hist(lt_key_overall[key], bins=20, weights=np.ones(len(lt_key_overall[key])) / len(lt_key_overall[key]), color=color, ec=ec, alpha=alpha)
1276
plt.xlabel('HEM -- Hemat¡es',**csfont)
1277
plt.ylabel('Percentage',**csfont)
1278
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
1279
# plt.title('Temperature Histogram', **csfont)
1280
plt.hist(lt_key_alive[key], bins=20, weights=np.ones(len(lt_key_alive[key])) / len(lt_key_alive[key]), color='green', ec=alive_color, alpha=1, histtype="step", linewidth=2)
1281
plt.hist(lt_key_dead[key], bins=20, weights=np.ones(len(lt_key_dead[key])) / len(lt_key_dead[key]), color='green', ec=dead_color, alpha=1, histtype="step", linewidth=2)
1282
plt.xticks(**csfont)
1283
plt.yticks(**csfont)
1284
idx += 1
1285
1286
key = 'LEUC -- Leucocitos'
1287
low = lab_tests_overall[key].quantile(limit)
1288
high = lab_tests_overall[key].quantile(1 - limit)
1289
lt_key_overall = lab_tests_overall[lab_tests_overall[key].between(low, high)]
1290
lt_key_dead = lab_tests_dead[lab_tests_dead[key].between(low, high)]
1291
lt_key_alive = lab_tests_alive[lab_tests_alive[key].between(low, high)]
1292
plt.subplot(4, 4, idx)
1293
plt.hist(lt_key_overall[key], bins=20, weights=np.ones(len(lt_key_overall[key])) / len(lt_key_overall[key]), color=color, ec=ec, alpha=alpha)
1294
plt.xlabel('LEUC -- Leucocitos',**csfont)
1295
plt.ylabel('Percentage',**csfont)
1296
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
1297
# plt.title('Temperature Histogram', **csfont)
1298
plt.hist(lt_key_alive[key], bins=20, weights=np.ones(len(lt_key_alive[key])) / len(lt_key_alive[key]), color='green', ec=alive_color, alpha=1, histtype="step", linewidth=2)
1299
plt.hist(lt_key_dead[key], bins=20, weights=np.ones(len(lt_key_dead[key])) / len(lt_key_dead[key]), color='green', ec=dead_color, alpha=1, histtype="step", linewidth=2)
1300
plt.xticks(**csfont)
1301
plt.yticks(**csfont)
1302
idx += 1
1303
1304
key = 'PLAQ -- Recuento de plaquetas'
1305
low = lab_tests_overall[key].quantile(limit)
1306
high = lab_tests_overall[key].quantile(1 - limit)
1307
lt_key_overall = lab_tests_overall[lab_tests_overall[key].between(low, high)]
1308
lt_key_dead = lab_tests_dead[lab_tests_dead[key].between(low, high)]
1309
lt_key_alive = lab_tests_alive[lab_tests_alive[key].between(low, high)]
1310
plt.subplot(4, 4, idx)
1311
plt.hist(lt_key_overall[key], bins=20, weights=np.ones(len(lt_key_overall[key])) / len(lt_key_overall[key]), color=color, ec=ec, alpha=alpha)
1312
plt.xlabel('PLAQ -- Recuento de plaquetas',**csfont)
1313
plt.ylabel('Percentage',**csfont)
1314
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
1315
# plt.title('Temperature Histogram', **csfont)
1316
plt.hist(lt_key_alive[key], bins=20, weights=np.ones(len(lt_key_alive[key])) / len(lt_key_alive[key]), color='green', ec=alive_color, alpha=1, histtype="step", linewidth=2)
1317
plt.hist(lt_key_dead[key], bins=20, weights=np.ones(len(lt_key_dead[key])) / len(lt_key_dead[key]), color='green', ec=dead_color, alpha=1, histtype="step", linewidth=2)
1318
plt.xticks(**csfont)
1319
plt.yticks(**csfont)
1320
idx += 1
1321
1322
key = 'CHCM -- Conc. Hemoglobina Corpuscular Media'
1323
low = lab_tests_overall[key].quantile(limit)
1324
high = lab_tests_overall[key].quantile(1 - limit)
1325
lt_key_overall = lab_tests_overall[lab_tests_overall[key].between(low, high)]
1326
lt_key_dead = lab_tests_dead[lab_tests_dead[key].between(low, high)]
1327
lt_key_alive = lab_tests_alive[lab_tests_alive[key].between(low, high)]
1328
plt.subplot(4, 4, idx)
1329
plt.hist(lt_key_overall[key], bins=20, weights=np.ones(len(lt_key_overall[key])) / len(lt_key_overall[key]), color=color, ec=ec, alpha=alpha)
1330
plt.xlabel('CHCM',**csfont)
1331
plt.ylabel('Percentage',**csfont)
1332
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
1333
# plt.title('Temperature Histogram', **csfont)
1334
plt.hist(lt_key_alive[key], bins=20, weights=np.ones(len(lt_key_alive[key])) / len(lt_key_alive[key]), color='green', ec=alive_color, alpha=1, histtype="step", linewidth=2)
1335
plt.hist(lt_key_dead[key], bins=20, weights=np.ones(len(lt_key_dead[key])) / len(lt_key_dead[key]), color='green', ec=dead_color, alpha=1, histtype="step", linewidth=2)
1336
plt.xticks(**csfont)
1337
plt.yticks(**csfont)
1338
idx += 1
1339
1340
key = 'HCTO -- Hematocrito'
1341
low = lab_tests_overall[key].quantile(limit)
1342
high = lab_tests_overall[key].quantile(1 - limit)
1343
lt_key_overall = lab_tests_overall[lab_tests_overall[key].between(low, high)]
1344
lt_key_dead = lab_tests_dead[lab_tests_dead[key].between(low, high)]
1345
lt_key_alive = lab_tests_alive[lab_tests_alive[key].between(low, high)]
1346
plt.subplot(4, 4, idx)
1347
plt.hist(lt_key_overall[key], bins=20, weights=np.ones(len(lt_key_overall[key])) / len(lt_key_overall[key]), color=color, ec=ec, alpha=alpha)
1348
plt.xlabel('HCTO -- Hematocrito',**csfont)
1349
plt.ylabel('Percentage',**csfont)
1350
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
1351
# plt.title('Temperature Histogram', **csfont)
1352
plt.hist(lt_key_alive[key], bins=20, weights=np.ones(len(lt_key_alive[key])) / len(lt_key_alive[key]), color='green', ec=alive_color, alpha=1, histtype="step", linewidth=2)
1353
plt.hist(lt_key_dead[key], bins=20, weights=np.ones(len(lt_key_dead[key])) / len(lt_key_dead[key]), color='green', ec=dead_color, alpha=1, histtype="step", linewidth=2)
1354
plt.xticks(**csfont)
1355
plt.yticks(**csfont)
1356
idx += 1
1357
1358
key = 'VCM -- Volumen Corpuscular Medio'
1359
low = lab_tests_overall[key].quantile(limit)
1360
high = lab_tests_overall[key].quantile(1 - limit)
1361
lt_key_overall = lab_tests_overall[lab_tests_overall[key].between(low, high)]
1362
lt_key_dead = lab_tests_dead[lab_tests_dead[key].between(low, high)]
1363
lt_key_alive = lab_tests_alive[lab_tests_alive[key].between(low, high)]
1364
plt.subplot(4, 4, idx)
1365
plt.hist(lt_key_overall[key], bins=20, weights=np.ones(len(lt_key_overall[key])) / len(lt_key_overall[key]), color=color, ec=ec, alpha=alpha)
1366
plt.xlabel('VCM -- Volumen Corpuscular Medio',**csfont)
1367
plt.ylabel('Percentage',**csfont)
1368
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
1369
# plt.title('Temperature Histogram', **csfont)
1370
plt.hist(lt_key_alive[key], bins=20, weights=np.ones(len(lt_key_alive[key])) / len(lt_key_alive[key]), color='green', ec=alive_color, alpha=1, histtype="step", linewidth=2)
1371
plt.hist(lt_key_dead[key], bins=20, weights=np.ones(len(lt_key_dead[key])) / len(lt_key_dead[key]), color='green', ec=dead_color, alpha=1, histtype="step", linewidth=2)
1372
plt.xticks(**csfont)
1373
plt.yticks(**csfont)
1374
idx += 1
1375
1376
key = 'HGB -- Hemoglobina'
1377
low = lab_tests_overall[key].quantile(limit)
1378
high = lab_tests_overall[key].quantile(1 - limit)
1379
lt_key_overall = lab_tests_overall[lab_tests_overall[key].between(low, high)]
1380
lt_key_dead = lab_tests_dead[lab_tests_dead[key].between(low, high)]
1381
lt_key_alive = lab_tests_alive[lab_tests_alive[key].between(low, high)]
1382
plt.subplot(4, 4, idx)
1383
plt.hist(lt_key_overall[key], bins=20, weights=np.ones(len(lt_key_overall[key])) / len(lt_key_overall[key]), color=color, ec=ec, alpha=alpha)
1384
plt.xlabel('HGB -- Hemoglobina',**csfont)
1385
plt.ylabel('Percentage',**csfont)
1386
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
1387
# plt.title('Temperature Histogram', **csfont)
1388
plt.hist(lt_key_alive[key], bins=20, weights=np.ones(len(lt_key_alive[key])) / len(lt_key_alive[key]), color='green', ec=alive_color, alpha=1, histtype="step", linewidth=2)
1389
plt.hist(lt_key_dead[key], bins=20, weights=np.ones(len(lt_key_dead[key])) / len(lt_key_dead[key]), color='green', ec=dead_color, alpha=1, histtype="step", linewidth=2)
1390
plt.xticks(**csfont)
1391
plt.yticks(**csfont)
1392
idx += 1
1393
1394
key = 'HCM -- Hemoglobina Corpuscular Media'
1395
low = lab_tests_overall[key].quantile(limit)
1396
high = lab_tests_overall[key].quantile(1 - limit)
1397
lt_key_overall = lab_tests_overall[lab_tests_overall[key].between(low, high)]
1398
lt_key_dead = lab_tests_dead[lab_tests_dead[key].between(low, high)]
1399
lt_key_alive = lab_tests_alive[lab_tests_alive[key].between(low, high)]
1400
plt.subplot(4, 4, idx)
1401
plt.hist(lt_key_overall[key], bins=20, weights=np.ones(len(lt_key_overall[key])) / len(lt_key_overall[key]), color=color, ec=ec, alpha=alpha)
1402
plt.xlabel('HCM -- Hemoglobina Corpuscular Media',**csfont)
1403
plt.ylabel('Percentage',**csfont)
1404
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
1405
# plt.title('Temperature Histogram', **csfont)
1406
plt.hist(lt_key_alive[key], bins=20, weights=np.ones(len(lt_key_alive[key])) / len(lt_key_alive[key]), color='green', ec=alive_color, alpha=1, histtype="step", linewidth=2)
1407
plt.hist(lt_key_dead[key], bins=20, weights=np.ones(len(lt_key_dead[key])) / len(lt_key_dead[key]), color='green', ec=dead_color, alpha=1, histtype="step", linewidth=2)
1408
plt.xticks(**csfont)
1409
plt.yticks(**csfont)
1410
idx += 1
1411
1412
key = 'NEU -- Neutr¢filos'
1413
low = lab_tests_overall[key].quantile(limit)
1414
high = lab_tests_overall[key].quantile(1 - limit)
1415
lt_key_overall = lab_tests_overall[lab_tests_overall[key].between(low, high)]
1416
lt_key_dead = lab_tests_dead[lab_tests_dead[key].between(low, high)]
1417
lt_key_alive = lab_tests_alive[lab_tests_alive[key].between(low, high)]
1418
plt.subplot(4, 4, idx)
1419
plt.hist(lt_key_overall[key], bins=20, weights=np.ones(len(lt_key_overall[key])) / len(lt_key_overall[key]), color=color, ec=ec, alpha=alpha)
1420
plt.xlabel('NEU -- Neutr¢filos',**csfont)
1421
plt.ylabel('Percentage',**csfont)
1422
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
1423
# plt.title('Temperature Histogram', **csfont)
1424
plt.hist(lt_key_alive[key], bins=20, weights=np.ones(len(lt_key_alive[key])) / len(lt_key_alive[key]), color='green', ec=alive_color, alpha=1, histtype="step", linewidth=2)
1425
plt.hist(lt_key_dead[key], bins=20, weights=np.ones(len(lt_key_dead[key])) / len(lt_key_dead[key]), color='green', ec=dead_color, alpha=1, histtype="step", linewidth=2)
1426
plt.xticks(**csfont)
1427
plt.yticks(**csfont)
1428
idx += 1
1429
1430
key = 'NEU% -- Neutr¢filos %'
1431
low = lab_tests_overall[key].quantile(limit)
1432
high = lab_tests_overall[key].quantile(1 - limit)
1433
lt_key_overall = lab_tests_overall[lab_tests_overall[key].between(low, high)]
1434
lt_key_dead = lab_tests_dead[lab_tests_dead[key].between(low, high)]
1435
lt_key_alive = lab_tests_alive[lab_tests_alive[key].between(low, high)]
1436
plt.subplot(4, 4, idx)
1437
plt.hist(lt_key_overall[key], bins=20, weights=np.ones(len(lt_key_overall[key])) / len(lt_key_overall[key]), color=color, ec=ec, alpha=alpha)
1438
plt.xlabel('NEU% -- Neutr¢filos%',**csfont)
1439
plt.ylabel('Percentage',**csfont)
1440
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
1441
# plt.title('Temperature Histogram', **csfont)
1442
plt.hist(lt_key_alive[key], bins=20, weights=np.ones(len(lt_key_alive[key])) / len(lt_key_alive[key]), color='green', ec=alive_color, alpha=1, histtype="step", linewidth=2)
1443
plt.hist(lt_key_dead[key], bins=20, weights=np.ones(len(lt_key_dead[key])) / len(lt_key_dead[key]), color='green', ec=dead_color, alpha=1, histtype="step", linewidth=2)
1444
plt.xticks(**csfont)
1445
plt.yticks(**csfont)
1446
idx += 1
1447
1448
key = 'LIN -- Linfocitos'
1449
low = lab_tests_overall[key].quantile(limit)
1450
high = lab_tests_overall[key].quantile(1 - limit)
1451
lt_key_overall = lab_tests_overall[lab_tests_overall[key].between(low, high)]
1452
lt_key_dead = lab_tests_dead[lab_tests_dead[key].between(low, high)]
1453
lt_key_alive = lab_tests_alive[lab_tests_alive[key].between(low, high)]
1454
plt.subplot(4, 4, idx)
1455
plt.hist(lt_key_overall[key], bins=20, weights=np.ones(len(lt_key_overall[key])) / len(lt_key_overall[key]), color=color, ec=ec, alpha=alpha)
1456
plt.xlabel('LIN -- Linfocitos',**csfont)
1457
plt.ylabel('Percentage',**csfont)
1458
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
1459
# plt.title('Temperature Histogram', **csfont)
1460
plt.hist(lt_key_alive[key], bins=20, weights=np.ones(len(lt_key_alive[key])) / len(lt_key_alive[key]), color='green', ec=alive_color, alpha=1, histtype="step", linewidth=2)
1461
plt.hist(lt_key_dead[key], bins=20, weights=np.ones(len(lt_key_dead[key])) / len(lt_key_dead[key]), color='green', ec=dead_color, alpha=1, histtype="step", linewidth=2)
1462
plt.xticks(**csfont)
1463
plt.yticks(**csfont)
1464
idx += 1
1465
1466
key = 'LIN% -- Linfocitos %'
1467
low = lab_tests_overall[key].quantile(limit)
1468
high = lab_tests_overall[key].quantile(1 - limit)
1469
lt_key_overall = lab_tests_overall[lab_tests_overall[key].between(low, high)]
1470
lt_key_dead = lab_tests_dead[lab_tests_dead[key].between(low, high)]
1471
lt_key_alive = lab_tests_alive[lab_tests_alive[key].between(low, high)]
1472
plt.subplot(4, 4, idx)
1473
plt.hist(lt_key_overall[key], bins=20, weights=np.ones(len(lt_key_overall[key])) / len(lt_key_overall[key]), color=color, ec=ec, alpha=alpha)
1474
plt.xlabel('LIN% -- Linfocitos%',**csfont)
1475
plt.ylabel('Percentage',**csfont)
1476
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
1477
# plt.title('Temperature Histogram', **csfont)
1478
plt.hist(lt_key_alive[key], bins=20, weights=np.ones(len(lt_key_alive[key])) / len(lt_key_alive[key]), color='green', ec=alive_color, alpha=1, histtype="step", linewidth=2)
1479
plt.hist(lt_key_dead[key], bins=20, weights=np.ones(len(lt_key_dead[key])) / len(lt_key_dead[key]), color='green', ec=dead_color, alpha=1, histtype="step", linewidth=2)
1480
plt.xticks(**csfont)
1481
plt.yticks(**csfont)
1482
idx += 1
1483
1484
key = 'ADW -- Coeficiente de anisocitosis'
1485
low = lab_tests_overall[key].quantile(limit)
1486
high = lab_tests_overall[key].quantile(1 - limit)
1487
lt_key_overall = lab_tests_overall[lab_tests_overall[key].between(low, high)]
1488
lt_key_dead = lab_tests_dead[lab_tests_dead[key].between(low, high)]
1489
lt_key_alive = lab_tests_alive[lab_tests_alive[key].between(low, high)]
1490
plt.subplot(4, 4, idx)
1491
plt.hist(lt_key_overall[key], bins=20, weights=np.ones(len(lt_key_overall[key])) / len(lt_key_overall[key]), color=color, ec=ec, alpha=alpha)
1492
plt.xlabel('ADW -- Coeficiente de anisocitosis',**csfont)
1493
plt.ylabel('Percentage',**csfont)
1494
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
1495
# plt.title('Temperature Histogram', **csfont)
1496
plt.hist(lt_key_alive[key], bins=20, weights=np.ones(len(lt_key_alive[key])) / len(lt_key_alive[key]), color='green', ec=alive_color, alpha=1, histtype="step", linewidth=2)
1497
plt.hist(lt_key_dead[key], bins=20, weights=np.ones(len(lt_key_dead[key])) / len(lt_key_dead[key]), color='green', ec=dead_color, alpha=1, histtype="step", linewidth=2)
1498
plt.xticks(**csfont)
1499
plt.yticks(**csfont)
1500
idx += 1
1501
1502
handles, labels = ax.get_legend_handles_labels()
1503
print(handles, labels)
1504
# fig.legend(handles, labels, loc='upper center')
1505
plt.figlegend(handles, labels, loc='upper center', ncol=5, fontsize=18, bbox_to_anchor=(0.5, 1.05), prop=font_manager.FontProperties(family='Times New Roman',
1506
                                   style='normal', size=18))
1507
# fig.legend(, [], loc='upper center')
1508
1509
fig.tight_layout()
1510
plt.show()
1511
"""
1512