|
a |
|
b/datasets/cdsl/preprocess.py |
|
|
1 |
# %% [markdown] |
|
|
2 |
# # hm dataset pre-processing |
|
|
3 |
# |
|
|
4 |
# import packages |
|
|
5 |
|
|
|
6 |
# %% |
|
|
7 |
import os |
|
|
8 |
import pandas as pd |
|
|
9 |
import numpy as np |
|
|
10 |
import matplotlib.pyplot as plt |
|
|
11 |
import pickle as pkl |
|
|
12 |
import torch |
|
|
13 |
import math |
|
|
14 |
import datetime |
|
|
15 |
from tqdm import tqdm |
|
|
16 |
import datetime |
|
|
17 |
import re |
|
|
18 |
from functools import reduce |
|
|
19 |
|
|
|
20 |
# %% [markdown] |
|
|
21 |
# ## Demographic data |
|
|
22 |
|
|
|
23 |
# %% |
|
|
24 |
demographic = pd.read_csv('./raw_data/19_04_2021/COVID_DSL_01.CSV', encoding='ISO-8859-1', sep='|') |
|
|
25 |
print(len(demographic)) |
|
|
26 |
demographic.head() |
|
|
27 |
|
|
|
28 |
# %% |
|
|
29 |
med = pd.read_csv('./raw_data/19_04_2021/COVID_DSL_04.CSV', encoding='ISO-8859-1', sep='|') |
|
|
30 |
print(len(med)) |
|
|
31 |
med.head() |
|
|
32 |
|
|
|
33 |
# %% |
|
|
34 |
len(med['ID_ATC7'].unique()) |
|
|
35 |
|
|
|
36 |
# %% [markdown] |
|
|
37 |
# get rid of patients with missing labels |
|
|
38 |
|
|
|
39 |
# %% |
|
|
40 |
print(len(demographic)) |
|
|
41 |
demographic = demographic.dropna(axis=0, how='any', subset=['IDINGRESO', 'F_INGRESO_ING', 'F_ALTA_ING', 'MOTIVO_ALTA_ING']) |
|
|
42 |
print(len(demographic)) |
|
|
43 |
|
|
|
44 |
# %% |
|
|
45 |
def outcome2num(x):
    """Encode a discharge reason as a binary label.

    Returns 1 when ``x`` equals ``'Fallecimiento'`` and 0 for every other
    value (including missing values, which never compare equal to the string).
    """
    return 1 if x == 'Fallecimiento' else 0
|
|
50 |
|
|
|
51 |
def to_one_hot(x, feature):
    """Return 1 if ``x`` equals ``feature``, otherwise 0."""
    return 1 if x == feature else 0
|
|
56 |
|
|
|
57 |
# %% |
|
|
58 |
# select necessary columns from demographic |
|
|
59 |
demographic = demographic[ |
|
|
60 |
[ |
|
|
61 |
'IDINGRESO', |
|
|
62 |
'EDAD', |
|
|
63 |
'SEX', |
|
|
64 |
'F_INGRESO_ING', |
|
|
65 |
'F_ALTA_ING', |
|
|
66 |
'MOTIVO_ALTA_ING', |
|
|
67 |
'ESPECIALIDAD_URGENCIA', |
|
|
68 |
'DIAG_URG' |
|
|
69 |
] |
|
|
70 |
] |
|
|
71 |
|
|
|
72 |
# rename column |
|
|
73 |
demographic = demographic.rename(columns={ |
|
|
74 |
'IDINGRESO': 'PATIENT_ID', |
|
|
75 |
'EDAD': 'AGE', |
|
|
76 |
'SEX': 'SEX', |
|
|
77 |
'F_INGRESO_ING': 'ADMISSION_DATE', |
|
|
78 |
'F_ALTA_ING': 'DEPARTURE_DATE', |
|
|
79 |
'MOTIVO_ALTA_ING': 'OUTCOME', |
|
|
80 |
'ESPECIALIDAD_URGENCIA': 'DEPARTMENT_OF_EMERGENCY', |
|
|
81 |
'DIAG_URG': 'DIAGNOSIS_AT_EMERGENCY_VISIT' |
|
|
82 |
}) |
|
|
83 |
|
|
|
84 |
# SEX: male: 1; female: 0 |
|
|
85 |
# Encode SEX with a single mapping and assign the result back to the column.
# Calling replace(..., inplace=True) on the selected column is a chained
# in-place mutation: it emits a warning on recent pandas versions and does not
# modify the DataFrame when copy-on-write is enabled.
demographic['SEX'] = demographic['SEX'].replace({'MALE': 1, 'FEMALE': 0})
|
|
87 |
|
|
|
88 |
# outcome: Fallecimiento(dead): 1; others: 0 |
|
|
89 |
demographic['OUTCOME'] = demographic['OUTCOME'].map(outcome2num) |
|
|
90 |
|
|
|
91 |
# diagnosis at emergency visit (loss rate < 10%) |
|
|
92 |
# demographic['DIFFICULTY_BREATHING'] = demographic['DIAGNOSIS_AT_EMERGENCY_VISIT'].map(lambda x: to_one_hot(x, 'DIFICULTAD RESPIRATORIA')) # 1674 |
|
|
93 |
# demographic['SUSPECT_COVID'] = demographic['DIAGNOSIS_AT_EMERGENCY_VISIT'].map(lambda x: to_one_hot(x, 'SOSPECHA COVID-19')) # 960 |
|
|
94 |
# demographic['FEVER'] = demographic['DIAGNOSIS_AT_EMERGENCY_VISIT'].map(lambda x: to_one_hot(x, 'FIEBRE')) # 455 |
|
|
95 |
|
|
|
96 |
# department of emergency (loss rate < 10%) |
|
|
97 |
# demographic['EMERGENCY'] = demographic['DEPARTMENT_OF_EMERGENCY'].map(lambda x: to_one_hot(x, 'Medicina de Urgencias')) # 3914 |
|
|
98 |
|
|
|
99 |
# %% |
|
|
100 |
# del useless data |
|
|
101 |
demographic = demographic[ |
|
|
102 |
[ |
|
|
103 |
'PATIENT_ID', |
|
|
104 |
'AGE', |
|
|
105 |
'SEX', |
|
|
106 |
'ADMISSION_DATE', |
|
|
107 |
'DEPARTURE_DATE', |
|
|
108 |
'OUTCOME', |
|
|
109 |
# 'DIFFICULTY_BREATHING', |
|
|
110 |
# 'SUSPECT_COVID', |
|
|
111 |
# 'FEVER', |
|
|
112 |
# 'EMERGENCY' |
|
|
113 |
] |
|
|
114 |
] |
|
|
115 |
|
|
|
116 |
# %% |
|
|
117 |
demographic.describe().to_csv('demographic_overview.csv', mode='w', index=False) |
|
|
118 |
demographic.describe() |
|
|
119 |
|
|
|
120 |
# %% [markdown] |
|
|
121 |
# ### Analyze data |
|
|
122 |
|
|
|
123 |
# %% |
|
|
124 |
plt.scatter(demographic['PATIENT_ID'], demographic['AGE'], s=1) |
|
|
125 |
plt.xlabel('Patient Id') |
|
|
126 |
plt.ylabel('Age') |
|
|
127 |
plt.title('Patient-Age Scatter Plot') |
|
|
128 |
|
|
|
129 |
# %% |
|
|
130 |
plt.scatter(demographic['PATIENT_ID'], demographic['AGE'], s=1) |
|
|
131 |
plt.xlabel('Patient Id') |
|
|
132 |
plt.ylabel('Age') |
|
|
133 |
plt.title('Patient-Age Scatter Plot') |
|
|
134 |
|
|
|
135 |
# %% |
|
|
136 |
demographic.to_csv('demographic.csv', mode='w', index=False) |
|
|
137 |
demographic.head() |
|
|
138 |
|
|
|
139 |
# %% [markdown] |
|
|
140 |
# ## Vital Signs |
|
|
141 |
|
|
|
142 |
# %% |
|
|
143 |
vital_signs = pd.read_csv('./raw_data/19_04_2021/COVID_DSL_02.CSV', encoding='ISO-8859-1', sep='|') |
|
|
144 |
print(len(vital_signs)) |
|
|
145 |
vital_signs.head() |
|
|
146 |
|
|
|
147 |
# %% |
|
|
148 |
vital_signs = vital_signs.rename(columns={ |
|
|
149 |
'IDINGRESO': 'PATIENT_ID', |
|
|
150 |
'CONSTANTS_ING_DATE': 'RECORD_DATE', |
|
|
151 |
'CONSTANTS_ING_TIME': 'RECORD_TIME', |
|
|
152 |
'FC_HR_ING': 'HEART_RATE', |
|
|
153 |
'GLU_GLY_ING': 'BLOOD_GLUCOSE', |
|
|
154 |
'SAT_02_ING': 'OXYGEN_SATURATION', |
|
|
155 |
'TA_MAX_ING': 'MAX_BLOOD_PRESSURE', |
|
|
156 |
'TA_MIN_ING': 'MIN_BLOOD_PRESSURE', |
|
|
157 |
'TEMP_ING': 'TEMPERATURE' |
|
|
158 |
}) |
|
|
159 |
vital_signs['RECORD_TIME'] = vital_signs['RECORD_DATE'] + ' ' + vital_signs['RECORD_TIME'] |
|
|
160 |
vital_signs['RECORD_TIME'] = vital_signs['RECORD_TIME'].map(lambda x: str(datetime.datetime.strptime(x, '%Y-%m-%d %H:%M'))) |
|
|
161 |
vital_signs = vital_signs.drop(['RECORD_DATE', 'SAT_02_ING_OBS', 'BLOOD_GLUCOSE'], axis=1) |
|
|
162 |
|
|
|
163 |
# %% |
|
|
164 |
vital_signs.describe() |
|
|
165 |
|
|
|
166 |
# %% |
|
|
167 |
vital_signs.head() |
|
|
168 |
|
|
|
169 |
# %% |
|
|
170 |
def format_temperature(x):
    """Convert a temperature reading to ``float``.

    String readings may use a decimal comma (e.g. ``'36,5'``); the comma is
    replaced with a dot before conversion. Any other value is passed straight
    to ``float``.

    Raises:
        ValueError: if a string reading is not numeric after the replacement.
    """
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses such as numpy.str_, which would otherwise skip the comma fix.
    if isinstance(x, str):
        return float(x.replace(',', '.'))
    return float(x)
|
|
175 |
|
|
|
176 |
def format_oxygen(x):
    """Convert an oxygen-saturation reading to ``float``.

    Readings strictly greater than 100 are replaced with NaN; every other
    reading is returned as a float.
    """
    reading = float(x)
    return np.nan if reading > 100 else reading
|
|
182 |
|
|
|
183 |
def format_heart_rate(x):
    """Convert a heart-rate reading to ``int``, discarding unusable values.

    Returns:
        ``int(x)`` for readings up to and including 220; NaN for readings
        above 220 and for missing readings (``None`` or NaN).
    """
    # int(NaN) raises ValueError and int(None) raises TypeError, so missing
    # readings are passed through as NaN instead of aborting the whole map().
    if x is None or x != x:
        return np.nan
    rate = int(x)
    if rate > 220:
        return np.nan
    return rate
|
|
189 |
|
|
|
190 |
vital_signs['TEMPERATURE'] = vital_signs['TEMPERATURE'].map(lambda x: format_temperature(x)) |
|
|
191 |
vital_signs['OXYGEN_SATURATION'] = vital_signs['OXYGEN_SATURATION'].map(lambda x: format_oxygen(x)) |
|
|
192 |
vital_signs['HEART_RATE'] = vital_signs['HEART_RATE'].map(lambda x: format_heart_rate(x)) |
|
|
193 |
|
|
|
194 |
# %% |
|
|
195 |
vital_signs = vital_signs.replace(0, np.NAN) |
|
|
196 |
|
|
|
197 |
# %% |
|
|
198 |
vital_signs = vital_signs.groupby(['PATIENT_ID', 'RECORD_TIME'], dropna=True, as_index = False).mean() |
|
|
199 |
vital_signs.head() |
|
|
200 |
|
|
|
201 |
# %% |
|
|
202 |
vital_signs.describe() |
|
|
203 |
|
|
|
204 |
# %% |
|
|
205 |
vital_signs.describe().to_csv('vital_signs_overview.csv', index=False, mode='w') |
|
|
206 |
vital_signs.describe() |
|
|
207 |
|
|
|
208 |
# %% |
|
|
209 |
""" |
|
|
210 |
#plt.rcParams['figure.figsize'] = [10, 5] |
|
|
211 |
fig=plt.figure(figsize=(16,10), dpi= 100, facecolor='w', edgecolor='k') |
|
|
212 |
|
|
|
213 |
plt.subplot(2, 3, 1) |
|
|
214 |
plt.scatter(vital_signs.index, vital_signs['MAX_BLOOD_PRESSURE'], s=1) |
|
|
215 |
plt.xlabel('Index') |
|
|
216 |
plt.ylabel('Max Blood Pressure') |
|
|
217 |
plt.title('Visit-Max Blood Pressure Scatter Plot') |
|
|
218 |
|
|
|
219 |
plt.subplot(2, 3, 2) |
|
|
220 |
plt.scatter(vital_signs.index, vital_signs['MIN_BLOOD_PRESSURE'], s=1) |
|
|
221 |
plt.xlabel('Index') |
|
|
222 |
plt.ylabel('Min Blood Pressure') |
|
|
223 |
plt.title('Visit-Min Blood Pressure Scatter Plot') |
|
|
224 |
|
|
|
225 |
plt.subplot(2, 3, 3) |
|
|
226 |
plt.scatter(vital_signs.index, vital_signs['TEMPERATURE'], s=1) |
|
|
227 |
plt.xlabel('Index') |
|
|
228 |
plt.ylabel('Temperature') |
|
|
229 |
plt.title('Visit-Temperature Scatter Plot') |
|
|
230 |
|
|
|
231 |
plt.subplot(2, 3, 4) |
|
|
232 |
plt.scatter(vital_signs.index, vital_signs['HEART_RATE'], s=1) |
|
|
233 |
plt.xlabel('Index') |
|
|
234 |
plt.ylabel('Heart Rate') |
|
|
235 |
plt.title('Visit-Heart Rate Scatter Plot') |
|
|
236 |
|
|
|
237 |
plt.subplot(2, 3, 5) |
|
|
238 |
plt.scatter(vital_signs.index, vital_signs['OXYGEN_SATURATION'], s=1) |
|
|
239 |
plt.xlabel('Index') |
|
|
240 |
plt.ylabel('Oxygen Saturation') |
|
|
241 |
plt.title('Visit-Oxygen Saturation Scatter Plot') |
|
|
242 |
|
|
|
243 |
plt.show() |
|
|
244 |
""" |
|
|
245 |
# %% |
|
|
246 |
""" |
|
|
247 |
#plt.rcParams['figure.figsize'] = [10, 5] |
|
|
248 |
fig=plt.figure(figsize=(16,10), dpi= 100, facecolor='w', edgecolor='k') |
|
|
249 |
|
|
|
250 |
plt.subplot(2, 3, 1) |
|
|
251 |
plt.hist(vital_signs['MAX_BLOOD_PRESSURE'], bins=30) |
|
|
252 |
plt.xlabel('Index') |
|
|
253 |
plt.ylabel('Max Blood Pressure') |
|
|
254 |
plt.title('Visit-Max Blood Pressure Histogram') |
|
|
255 |
|
|
|
256 |
plt.subplot(2, 3, 2) |
|
|
257 |
plt.hist(vital_signs['MIN_BLOOD_PRESSURE'], bins=30) |
|
|
258 |
plt.xlabel('Index') |
|
|
259 |
plt.ylabel('Min Blood Pressure') |
|
|
260 |
plt.title('Visit-Min Blood Pressure Histogram') |
|
|
261 |
|
|
|
262 |
plt.subplot(2, 3, 3) |
|
|
263 |
plt.hist(vital_signs['TEMPERATURE'], bins=30) |
|
|
264 |
plt.xlabel('Index') |
|
|
265 |
plt.ylabel('Temperature') |
|
|
266 |
plt.title('Visit-Temperature Histogram') |
|
|
267 |
|
|
|
268 |
plt.subplot(2, 3, 4) |
|
|
269 |
plt.hist(vital_signs['HEART_RATE'], bins=30) |
|
|
270 |
plt.xlabel('Index') |
|
|
271 |
plt.ylabel('Heart Rate') |
|
|
272 |
plt.title('Visit-Heart Rate Histogram') |
|
|
273 |
|
|
|
274 |
plt.subplot(2, 3, 5) |
|
|
275 |
plt.hist(vital_signs['OXYGEN_SATURATION'], bins=30) |
|
|
276 |
plt.xlabel('Index') |
|
|
277 |
plt.ylabel('Oxygen Saturation') |
|
|
278 |
plt.title('Visit-Oxygen Saturation Histogram') |
|
|
279 |
|
|
|
280 |
plt.show() |
|
|
281 |
""" |
|
|
282 |
# %% [markdown] |
|
|
283 |
# ### Missing rate of each visit |
|
|
284 |
|
|
|
285 |
# %% |
|
|
286 |
sum(vital_signs.T.isnull().sum()) / ((len(vital_signs.T) - 2) * len(vital_signs)) |
|
|
287 |
|
|
|
288 |
# %% [markdown] |
|
|
289 |
# ### Normalize data |
|
|
290 |
|
|
|
291 |
# %% |
|
|
292 |
""" |
|
|
293 |
for key in vital_signs.keys()[2:]: |
|
|
294 |
vital_signs[key] = (vital_signs[key] - vital_signs[key].mean()) / (vital_signs[key].std() + 1e-12) |
|
|
295 |
|
|
|
296 |
vital_signs.describe() |
|
|
297 |
""" |
|
|
298 |
|
|
|
299 |
# %% |
|
|
300 |
vital_signs.to_csv('visual_signs.csv', mode='w', index=False) |
|
|
301 |
|
|
|
302 |
# %% |
|
|
303 |
len(vital_signs) / len(vital_signs['PATIENT_ID'].unique()) |
|
|
304 |
|
|
|
305 |
# %% [markdown] |
|
|
306 |
# ## Lab Tests |
|
|
307 |
|
|
|
308 |
# %% |
|
|
309 |
lab_tests = pd.read_csv('./raw_data/19_04_2021/COVID_DSL_06_v2.CSV', encoding='ISO-8859-1', sep=';') |
|
|
310 |
lab_tests = lab_tests.rename(columns={'IDINGRESO': 'PATIENT_ID'}) |
|
|
311 |
print(len(lab_tests)) |
|
|
312 |
|
|
|
313 |
# del useless data |
|
|
314 |
lab_tests = lab_tests[ |
|
|
315 |
[ |
|
|
316 |
'PATIENT_ID', |
|
|
317 |
'LAB_NUMBER', |
|
|
318 |
'LAB_DATE', |
|
|
319 |
'TIME_LAB', |
|
|
320 |
'ITEM_LAB', |
|
|
321 |
'VAL_RESULT' |
|
|
322 |
# UD_RESULT: unit |
|
|
323 |
# REF_VALUES: reference values |
|
|
324 |
] |
|
|
325 |
] |
|
|
326 |
|
|
|
327 |
lab_tests.head() |
|
|
328 |
|
|
|
329 |
# %% |
|
|
330 |
lab_tests = lab_tests.groupby(['PATIENT_ID', 'LAB_NUMBER', 'LAB_DATE', 'TIME_LAB', 'ITEM_LAB'], dropna=True, as_index = False).first() |
|
|
331 |
lab_tests = lab_tests.set_index(['PATIENT_ID', 'LAB_NUMBER', 'LAB_DATE', 'TIME_LAB', 'ITEM_LAB'], drop = True).unstack('ITEM_LAB')['VAL_RESULT'].reset_index() |
|
|
332 |
|
|
|
333 |
lab_tests = lab_tests.drop([ |
|
|
334 |
'CFLAG -- ALARMA HEMOGRAMA', |
|
|
335 |
'CORONA -- PCR CORONAVIRUS 2019nCoV', |
|
|
336 |
'CRIOGLO -- CRIOGLOBULINAS', |
|
|
337 |
'EGCOVID -- ESTUDIO GENETICO COVID-19', |
|
|
338 |
'FRO1 -- ', |
|
|
339 |
'FRO1 -- FROTIS EN SANGRE PERIFERICA', |
|
|
340 |
'FRO2 -- ', |
|
|
341 |
'FRO2 -- FROTIS EN SANGRE PERIFERICA', |
|
|
342 |
'FRO3 -- ', |
|
|
343 |
'FRO3 -- FROTIS EN SANGRE PERIFERICA', |
|
|
344 |
'FRO_COMEN -- ', |
|
|
345 |
'FRO_COMEN -- FROTIS EN SANGRE PERIFERICA', |
|
|
346 |
'G-CORONAV (RT-PCR) -- Tipo de muestra: ASPIRADO BRONCOALVEOLAR', |
|
|
347 |
'G-CORONAV (RT-PCR) -- Tipo de muestra: EXUDADO', |
|
|
348 |
'GRRH -- GRUPO SANGUÖNEO Y FACTOR Rh', |
|
|
349 |
'HEML -- RECUENTO CELULAR LIQUIDO', |
|
|
350 |
'HEML -- Recuento Hemat¡es', |
|
|
351 |
'IFSUERO -- INMUNOFIJACION EN SUERO', |
|
|
352 |
'OBS_BIOMOL -- OBSERVACIONES GENETICA MOLECULAR', |
|
|
353 |
'OBS_BIOO -- Observaciones Bioqu¡mica Orina', |
|
|
354 |
'OBS_CB -- Observaciones Coagulaci¢n', |
|
|
355 |
'OBS_GASES -- Observaciones Gasometr¡a Arterial', |
|
|
356 |
'OBS_GASV -- Observaciones Gasometr¡a Venosa', |
|
|
357 |
'OBS_GEN2 -- OBSERVACIONES GENETICA', |
|
|
358 |
'OBS_HOR -- Observaciones Hormonas', |
|
|
359 |
'OBS_MICRO -- Observaciones Microbiolog¡a', |
|
|
360 |
'OBS_NULA2 -- Observaciones Bioqu¡mica', |
|
|
361 |
'OBS_NULA3 -- Observaciones Hematolog¡a', |
|
|
362 |
'OBS_PESP -- Observaciones Pruebas especiales', |
|
|
363 |
'OBS_SERO -- Observaciones Serolog¡a', |
|
|
364 |
'OBS_SIS -- Observaciones Orina', |
|
|
365 |
'PCR VIRUS RESPIRATORIOS -- Tipo de muestra: ASPIRADO BRONCOALVEOLAR', |
|
|
366 |
'PCR VIRUS RESPIRATORIOS -- Tipo de muestra: BAS', |
|
|
367 |
'PCR VIRUS RESPIRATORIOS -- Tipo de muestra: ESPUTO', |
|
|
368 |
'PCR VIRUS RESPIRATORIOS -- Tipo de muestra: EXUDADO', |
|
|
369 |
'PCR VIRUS RESPIRATORIOS -- Tipo de muestra: LAVADO BRONCOALVEOLAR', |
|
|
370 |
'PCR VIRUS RESPIRATORIOS -- Tipo de muestra: LAVADO NASOFARÖNGEO', |
|
|
371 |
'PTGOR -- PROTEINOGRAMA ORINA', |
|
|
372 |
'RESUL_IFT -- ESTUDIO DE INMUNOFENOTIPO', |
|
|
373 |
'RESUL_IFT -- Resultado', |
|
|
374 |
'Resultado -- Resultado', |
|
|
375 |
'SED1 -- ', |
|
|
376 |
'SED1 -- SEDIMENTO', |
|
|
377 |
'SED2 -- ', |
|
|
378 |
'SED2 -- SEDIMENTO', |
|
|
379 |
'SED3 -- ', |
|
|
380 |
'SED3 -- SEDIMENTO', |
|
|
381 |
'TIPOL -- TIPO DE LIQUIDO', |
|
|
382 |
'Tecnica -- T\x82cnica', |
|
|
383 |
'TpMues -- Tipo de muestra', |
|
|
384 |
'VHCBLOT -- INMUNOBLOT VIRUS HEPATITIS C', |
|
|
385 |
'VIR_TM -- VIRUS TIPO DE MUESTRA', |
|
|
386 |
'LEGIORI -- AG. LEGIONELA PNEUMOPHILA EN ORINA', |
|
|
387 |
'NEUMOORI -- AG NEUMOCOCO EN ORINA', |
|
|
388 |
'VIHAC -- VIH AC' |
|
|
389 |
], axis=1) |
|
|
390 |
|
|
|
391 |
|
|
|
392 |
lab_tests.head() |
|
|
393 |
|
|
|
394 |
# %% |
|
|
395 |
lab_tests = lab_tests.replace('Sin resultado.', np.nan) |
|
|
396 |
lab_tests = lab_tests.replace('Sin resultado', np.nan) |
|
|
397 |
lab_tests = lab_tests.replace('----', np.nan).replace('---', np.nan) |
|
|
398 |
lab_tests = lab_tests.replace('> ', '').replace('< ', '') |
|
|
399 |
|
|
|
400 |
def change_format(x):
    """Normalise one raw lab-test result towards a numeric representation.

    Non-string values are returned unchanged and ``None`` becomes NaN.
    Strings are handled by the first matching rule:

    * ``'Negativo (v)'`` -> ``'-v'``; ``'Positivo (v)'`` and
      ``'Zona limite (v)'`` -> ``'v'`` (wrapper text and final character
      removed).
    * leading ``'>'`` / ``'<'`` markers are stripped.
    * trailing ``' mg/dl'``, ``'/æl'`` and ``' copias/mL'`` units are removed.
    * qualitative words map to fixed numbers (``'Numerosos'`` -> 1.5,
      ``'Aislados'`` -> 0.5, detected/normal/positive -> 1,
      not detected/negative -> 0, ``'Indeterminado'`` -> NaN).
    * a string that is a bare integer is kept as-is (whitespace-stripped).
    * otherwise the first decimal number found in the text is returned, or
      NaN when there is none.

    The result may still be a string; the caller applies this function twice
    so that a value produced by a stripping rule is normalised again.
    """
    if x is None:
        return np.nan
    if not isinstance(x, str):
        return x

    if x.startswith('Negativo ('):
        return x.replace('Negativo (', '-')[:-1]
    if x.startswith('Positivo ('):
        return x.replace('Positivo (', '')[:-1]
    if x.startswith('Zona limite ('):
        return x.replace('Zona limite (', '')[:-1]
    if x.startswith('>'):
        return x.replace('> ', '').replace('>', '')
    if x.startswith('<'):
        return x.replace('< ', '').replace('<', '')
    if x.endswith(' mg/dl'):
        return x.replace(' mg/dl', '')
    if x.endswith('/æl'):
        return x.replace('/æl', '')
    if x.endswith(' copias/mL'):
        return x.replace(' copias/mL', '')
    if x == 'Numerosos':
        return 1.5
    if x == 'Aislados':
        return 0.5
    if x in ('Se detecta', 'Se observan', 'Normal', 'Positivo'):
        return 1
    if x in ('No se detecta', 'No se observan', 'Negativo'):
        return 0
    if x == 'Indeterminado':
        return np.nan

    # A bare integer such as '100' (e.g. what is left of '> 100' after the
    # marker is stripped) is already a usable number. Without this check the
    # decimal-only pattern below would turn it into NaN on the second pass.
    stripped = x.strip()
    if re.fullmatch(r'[-+]?\d+', stripped):
        return stripped

    # Raw string: '\d' in a normal string literal is an invalid escape.
    decimals = re.findall(r'[-+]?\d+\.\d+', x)
    return decimals[0] if decimals else np.nan
|
|
438 |
|
|
|
439 |
feature_value_dict = dict() |
|
|
440 |
|
|
|
441 |
for k in tqdm(lab_tests.keys()[4:]): |
|
|
442 |
lab_tests[k] = lab_tests[k].map(lambda x: change_format(change_format(x))) |
|
|
443 |
feature_value_dict[k] = lab_tests[k].unique() |
|
|
444 |
|
|
|
445 |
# %% |
|
|
446 |
def nan_and_not_nan(x):
    """Return 1 if ``x`` is present and 0 if it is NaN.

    Relies on NaN being a value that does not compare equal to itself.
    """
    return 1 if x == x else 0
|
|
451 |
|
|
|
452 |
def is_float(num):
    """Report whether ``float(num)`` succeeds.

    Only ``ValueError`` is treated as "not a float"; any other exception
    raised by ``float`` (e.g. ``TypeError`` for ``None``) propagates.
    """
    try:
        float(num)
    except ValueError:
        return False
    else:
        return True
|
|
458 |
|
|
|
459 |
def is_all_float(x):
    """Return True when every present value in ``x`` converts to float.

    NaN entries (values not equal to themselves) and ``None`` entries are
    skipped; an iterable with no remaining entries yields True.
    """
    return all(is_float(value) for value in x if value == value and value is not None)
|
|
465 |
|
|
|
466 |
def to_float(x):
    """Convert ``x`` to ``float``, mapping ``None`` to NaN."""
    return np.nan if x is None else float(x)
|
|
471 |
|
|
|
472 |
other_feature_dict = dict() |
|
|
473 |
|
|
|
474 |
for feature in tqdm(feature_value_dict.keys()): |
|
|
475 |
values = feature_value_dict[feature] |
|
|
476 |
if is_all_float(values): |
|
|
477 |
lab_tests[feature] = lab_tests[feature].map(lambda x: to_float(x)) |
|
|
478 |
elif len(values) == 2: |
|
|
479 |
lab_tests[feature] = lab_tests[feature].map(lambda x: nan_and_not_nan(x)) |
|
|
480 |
else: |
|
|
481 |
other_feature_dict[feature] = values |
|
|
482 |
|
|
|
483 |
# %% |
|
|
484 |
other_feature_dict |
|
|
485 |
|
|
|
486 |
# %% |
|
|
487 |
def format_time(t):
    """Parse a day-first timestamp and return it as ``'YYYY-MM-DD HH:MM:SS'``.

    ``t`` is parsed as ``'%d/%m/%Y %H:%M'`` when it contains a slash and as
    ``'%d-%m-%Y %H:%M'`` otherwise.

    Raises:
        ValueError: if ``t`` does not match the selected format.
    """
    pattern = '%d/%m/%Y %H:%M' if '/' in t else '%d-%m-%Y %H:%M'
    return str(datetime.datetime.strptime(t, pattern))
|
|
492 |
|
|
|
493 |
lab_tests['RECORD_TIME'] = lab_tests['LAB_DATE'] + ' ' + lab_tests['TIME_LAB'] |
|
|
494 |
lab_tests['RECORD_TIME'] = lab_tests['RECORD_TIME'].map(lambda x: format_time(x)) |
|
|
495 |
lab_tests = lab_tests.drop(['LAB_NUMBER', 'LAB_DATE', 'TIME_LAB'], axis=1) |
|
|
496 |
# lab_tests = lab_tests.drop(['LAB_NUMBER', 'TIME_LAB'], axis=1) |
|
|
497 |
lab_tests.head() |
|
|
498 |
|
|
|
499 |
# %% |
|
|
500 |
# Per-patient mean of every numeric lab feature. numeric_only=True leaves out
# the string columns (RECORD_TIME and any lab item that could not be made
# numeric); on pandas >= 2.0 the default would raise a TypeError for them.
lab_tests_patient = lab_tests.groupby(['PATIENT_ID'], dropna=True, as_index=False).mean(numeric_only=True)
print(len(lab_tests_patient))
# Non-null count of each column after the first (PATIENT_ID), i.e. how many
# patients have at least one value for that feature.
count = list(lab_tests_patient.count().iloc[1:])
plt.hist(count)
|
|
504 |
|
|
|
505 |
# %% |
|
|
506 |
patient_total = len(lab_tests_patient) |
|
|
507 |
threshold = patient_total * 0.1 |
|
|
508 |
reserved_keys = [] |
|
|
509 |
|
|
|
510 |
for key in lab_tests_patient.keys(): |
|
|
511 |
if lab_tests_patient[key].count() > threshold: |
|
|
512 |
reserved_keys.append(key) |
|
|
513 |
|
|
|
514 |
print(len(reserved_keys)) |
|
|
515 |
reserved_keys |
|
|
516 |
|
|
|
517 |
# %% |
|
|
518 |
# RECORD_TIME is a grouping key of the per-record table, so it has to be part
# of the retained columns (placed right after the first entry).
reserved_keys.insert(1, 'RECORD_TIME')

# Average records sharing the same patient and timestamp. numeric_only=True
# skips the non-numeric lab columns, which would make mean() raise on
# pandas >= 2.0.
lab_tests = lab_tests.groupby(['PATIENT_ID', 'RECORD_TIME'], dropna=True, as_index=False).mean(numeric_only=True)

lab_tests = lab_tests[reserved_keys]
lab_tests.head()
|
|
524 |
""" |
|
|
525 |
# %% [markdown] |
|
|
526 |
# ### Missing rate of each visit |
|
|
527 |
|
|
|
528 |
# %% |
|
|
529 |
sum(lab_tests.T.isnull().sum()) / ((len(lab_tests.T) - 2) * len(lab_tests)) |
|
|
530 |
|
|
|
531 |
# %% [markdown] |
|
|
532 |
# ### Scatter Plot |
|
|
533 |
|
|
|
534 |
# %% |
|
|
535 |
fig=plt.figure(figsize=(16,200), dpi= 100, facecolor='w', edgecolor='k') |
|
|
536 |
|
|
|
537 |
i = 1 |
|
|
538 |
for key in lab_tests.keys()[2:]: |
|
|
539 |
plt.subplot(33, 3, i) |
|
|
540 |
plt.scatter(lab_tests.index, lab_tests[key], s=1) |
|
|
541 |
plt.ylabel(key) |
|
|
542 |
i += 1 |
|
|
543 |
|
|
|
544 |
plt.show() |
|
|
545 |
|
|
|
546 |
# %% |
|
|
547 |
fig=plt.figure(figsize=(20,120), dpi= 100, facecolor='w', edgecolor='k') |
|
|
548 |
|
|
|
549 |
i = 1 |
|
|
550 |
for key in lab_tests.keys()[2:]: |
|
|
551 |
plt.subplot(23, 4, i) |
|
|
552 |
plt.hist(lab_tests[key], bins=30) |
|
|
553 |
q3 = lab_tests[key].quantile(0.75) |
|
|
554 |
q1 = lab_tests[key].quantile(0.25) |
|
|
555 |
qh = q3 + 3 * (q3 - q1) |
|
|
556 |
ql = q1 - 3 * (q3 - q1) |
|
|
557 |
sigma = 5 |
|
|
558 |
plt.axline([sigma*lab_tests[key].std() + lab_tests[key].mean(), 0], [sigma*lab_tests[key].std() + lab_tests[key].mean(), 1], color = "r", linestyle=(0, (5, 5))) |
|
|
559 |
plt.axline([-sigma*lab_tests[key].std() + lab_tests[key].mean(), 0], [-sigma*lab_tests[key].std() + lab_tests[key].mean(), 1], color = "r", linestyle=(0, (5, 5))) |
|
|
560 |
#plt.axline([lab_tests[key].quantile(0.25), 0], [lab_tests[key].quantile(0.25), 1], color = "k", linestyle=(0, (5, 5))) |
|
|
561 |
#plt.axline([lab_tests[key].quantile(0.75), 0], [lab_tests[key].quantile(0.75), 1], color = "k", linestyle=(0, (5, 5))) |
|
|
562 |
plt.axline([qh, 0], [qh, 1], color='k', linestyle=(0, (5, 5))) |
|
|
563 |
plt.axline([ql, 0], [ql, 1], color='k', linestyle=(0, (5, 5))) |
|
|
564 |
plt.ylabel(key) |
|
|
565 |
i += 1 |
|
|
566 |
|
|
|
567 |
plt.show() |
|
|
568 |
""" |
|
|
569 |
# %% [markdown] |
|
|
570 |
# ### Normalize data |
|
|
571 |
|
|
|
572 |
# %% |
|
|
573 |
""" |
|
|
574 |
for key in lab_tests.keys()[2:]: |
|
|
575 |
lab_tests[key] = (lab_tests[key] - lab_tests[key].mean()) / (lab_tests[key].std() + 1e-12) |
|
|
576 |
|
|
|
577 |
lab_tests.describe() |
|
|
578 |
""" |
|
|
579 |
|
|
|
580 |
# %% |
|
|
581 |
# 【del normalization】 |
|
|
582 |
# for key in lab_tests.keys()[2:]: |
|
|
583 |
# r = lab_tests[lab_tests[key].between(lab_tests[key].quantile(0.05), lab_tests[key].quantile(0.95))] |
|
|
584 |
# lab_tests[key] = (lab_tests[key] - r[key].mean()) / (r[key].std() + 1e-12) |
|
|
585 |
|
|
|
586 |
# %% |
|
|
587 |
lab_tests.to_csv('lab_test.csv', mode='w', index=False) |
|
|
588 |
|
|
|
589 |
# %% [markdown] |
|
|
590 |
# # Concat data |
|
|
591 |
|
|
|
592 |
# %% |
|
|
593 |
demographic['PATIENT_ID'] = demographic['PATIENT_ID'].map(lambda x: str(int(x))) |
|
|
594 |
vital_signs['PATIENT_ID'] = vital_signs['PATIENT_ID'].map(lambda x: str(int(x))) |
|
|
595 |
lab_tests['PATIENT_ID'] = lab_tests['PATIENT_ID'].map(lambda x: str(int(x))) |
|
|
596 |
|
|
|
597 |
# %% |
|
|
598 |
len(demographic['PATIENT_ID'].unique()), len(vital_signs['PATIENT_ID'].unique()), len(lab_tests['PATIENT_ID'].unique()) |
|
|
599 |
|
|
|
600 |
# %% |
|
|
601 |
train_df = pd.merge(vital_signs, lab_tests, on=['PATIENT_ID', 'RECORD_TIME'], how='outer') |
|
|
602 |
|
|
|
603 |
train_df = train_df.groupby(['PATIENT_ID', 'RECORD_TIME'], dropna=True, as_index = False).mean() |
|
|
604 |
|
|
|
605 |
train_df = pd.merge(demographic, train_df, on=['PATIENT_ID'], how='left') |
|
|
606 |
|
|
|
607 |
train_df.head() |
|
|
608 |
|
|
|
609 |
# %% |
|
|
610 |
# del rows without patient_id, admission_date, record_time, or outcome |
|
|
611 |
train_df = train_df.dropna(axis=0, how='any', subset=['PATIENT_ID', 'ADMISSION_DATE', 'RECORD_TIME', 'OUTCOME']) |
|
|
612 |
|
|
|
613 |
# %% |
|
|
614 |
train_df.to_csv('train.csv', mode='w', index=False) |
|
|
615 |
train_df.describe() |
|
|
616 |
|
|
|
617 |
# %% [markdown] |
|
|
618 |
# ## Missing rate of each visit |
|
|
619 |
|
|
|
620 |
# %% |
|
|
621 |
sum(train_df.T.isnull().sum()) / ((len(train_df.T) - 2) * len(train_df)) |
|
|
622 |
|
|
|
623 |
# %% [markdown] |
|
|
624 |
# # Split and save data |
|
|
625 |
|
|
|
626 |
# %% [markdown] |
|
|
627 |
# * demo: demographic data |
|
|
628 |
# * x: lab test & vital signs |
|
|
629 |
# * y: outcome & length of stay |
|
|
630 |
|
|
|
631 |
# %% |
|
|
632 |
patient_ids = train_df['PATIENT_ID'].unique() |
|
|
633 |
|
|
|
634 |
demo_cols = ['AGE', 'SEX'] # , 'DIFFICULTY_BREATHING', 'FEVER', 'SUSPECT_COVID', 'EMERGENCY' |
|
|
635 |
test_cols = [] |
|
|
636 |
|
|
|
637 |
# get column names |
|
|
638 |
for k in train_df.keys(): |
|
|
639 |
if not k in demographic.keys(): |
|
|
640 |
if not k == 'RECORD_TIME': |
|
|
641 |
test_cols.append(k) |
|
|
642 |
|
|
|
643 |
test_median = train_df[test_cols].median() |
|
|
644 |
|
|
|
645 |
# %% |
|
|
646 |
test_cols |
|
|
647 |
|
|
|
648 |
# %% |
|
|
649 |
train_df['RECORD_TIME_DAY'] = train_df['RECORD_TIME'].map(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%d')) |
|
|
650 |
train_df['RECORD_TIME_HOUR'] = train_df['RECORD_TIME'].map(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%d %H')) |
|
|
651 |
train_df.head() |
|
|
652 |
|
|
|
653 |
# %% |
|
|
654 |
# Aggregate the records of each patient per calendar day and per hour.
# numeric_only=True is needed because train_df still carries string columns
# that are not grouping keys (RECORD_TIME and the other derived time column);
# mean() raises a TypeError on them with pandas >= 2.0.
train_df_day = train_df.groupby(
    ['PATIENT_ID', 'ADMISSION_DATE', 'DEPARTURE_DATE', 'RECORD_TIME_DAY'],
    dropna=True, as_index=False,
).mean(numeric_only=True)
train_df_hour = train_df.groupby(
    ['PATIENT_ID', 'ADMISSION_DATE', 'DEPARTURE_DATE', 'RECORD_TIME_HOUR'],
    dropna=True, as_index=False,
).mean(numeric_only=True)

len(train_df), len(train_df_day), len(train_df_hour)
|
|
658 |
|
|
|
659 |
# %% [markdown] |
|
|
660 |
# |
|
|
661 |
# ``` |
|
|
662 |
# number of visits (total) |
|
|
663 |
# - Original data: 168777 |
|
|
664 |
# - Merge by hour: 130141 |
|
|
665 |
# - Merge by day: 42204 |
|
|
666 |
# ``` |
|
|
667 |
|
|
|
668 |
# %% |
|
|
669 |
len(train_df['PATIENT_ID'].unique()) |
|
|
670 |
|
|
|
671 |
# %% |
|
|
672 |
def get_visit_intervals(df):
    """Return the number of rows recorded for each patient.

    Args:
        df: DataFrame with a ``PATIENT_ID`` column.

    Returns:
        A list of row counts, one per distinct ``PATIENT_ID``, in order of
        first appearance. Rows whose ``PATIENT_ID`` is missing are not
        counted.
    """
    # One grouped pass instead of a full boolean-mask scan per patient;
    # sort=False keeps the first-appearance order of the patients.
    return df.groupby('PATIENT_ID', sort=False).size().tolist()
|
|
677 |
|
|
|
678 |
# %% |
|
|
679 |
ls_org = get_visit_intervals(train_df) |
|
|
680 |
ls_hour = get_visit_intervals(train_df_hour) |
|
|
681 |
ls_day = get_visit_intervals(train_df_day) |
|
|
682 |
|
|
|
683 |
# %% |
|
|
684 |
import matplotlib.pyplot as plt |
|
|
685 |
from matplotlib.ticker import PercentFormatter |
|
|
686 |
import matplotlib.font_manager as font_manager |
|
|
687 |
import pandas as pd |
|
|
688 |
import numpy as np |
|
|
689 |
""" |
|
|
690 |
csfont = {'fontname':'Times New Roman', 'fontsize': 18} |
|
|
691 |
font = 'Times New Roman' |
|
|
692 |
fig=plt.figure(figsize=(18,4), dpi= 100, facecolor='w', edgecolor='k') |
|
|
693 |
plt.style.use('seaborn-whitegrid') |
|
|
694 |
color = 'cornflowerblue' |
|
|
695 |
ec = 'None' |
|
|
696 |
alpha=0.5 |
|
|
697 |
|
|
|
698 |
ax = plt.subplot(1, 3, 1) |
|
|
699 |
ax.hist(ls_org, bins=20, weights=np.ones(len(ls_org)) / len(ls_org), color=color, ec=ec, alpha=alpha, label='overall') |
|
|
700 |
plt.xlabel('Num of visits (org)',**csfont) |
|
|
701 |
plt.ylabel('Percentage',**csfont) |
|
|
702 |
plt.gca().yaxis.set_major_formatter(PercentFormatter(1)) |
|
|
703 |
plt.xticks(**csfont) |
|
|
704 |
plt.yticks(**csfont) |
|
|
705 |
|
|
|
706 |
ax = plt.subplot(1, 3, 2) |
|
|
707 |
ax.hist(ls_hour, bins=20, weights=np.ones(len(ls_hour)) / len(ls_hour), color=color, ec=ec, alpha=alpha, label='overall') |
|
|
708 |
plt.xlabel('Num of visits (hour)',**csfont) |
|
|
709 |
plt.ylabel('Percentage',**csfont) |
|
|
710 |
plt.gca().yaxis.set_major_formatter(PercentFormatter(1)) |
|
|
711 |
plt.xticks(**csfont) |
|
|
712 |
plt.yticks(**csfont) |
|
|
713 |
|
|
|
714 |
ax = plt.subplot(1, 3, 3) |
|
|
715 |
ax.hist(ls_day, bins=20, weights=np.ones(len(ls_day)) / len(ls_day), color=color, ec=ec, alpha=alpha, label='overall') |
|
|
716 |
plt.xlabel('Num of visits (day)',**csfont) |
|
|
717 |
plt.ylabel('Percentage',**csfont) |
|
|
718 |
plt.gca().yaxis.set_major_formatter(PercentFormatter(1)) |
|
|
719 |
plt.xticks(**csfont) |
|
|
720 |
plt.yticks(**csfont) |
|
|
721 |
|
|
|
722 |
plt.show() |
|
|
723 |
""" |
|
|
724 |
# %% |
|
|
725 |
def get_statistic(lst, name):
    """Print one line of summary statistics for ``lst``, labelled ``name``.

    The line lists the maximum, minimum, median, mean and the 0.8 / 0.9 /
    0.95 quantiles. Nothing is returned.
    """
    summary = [
        ('Max', max(lst)),
        ('Min', min(lst)),
        ('Median', np.median(lst)),
        ('Mean', np.mean(lst)),
        ('80%', np.quantile(lst, 0.8)),
        ('90%', np.quantile(lst, 0.9)),
        ('95%', np.quantile(lst, 0.95)),
    ]
    body = ', '.join(f'{label}:\t{value}' for label, value in summary)
    print(f'[{name}]\t{body}')
|
|
727 |
|
|
|
728 |
# %% |
|
|
729 |
get_statistic(ls_org, 'ls_org') |
|
|
730 |
get_statistic(ls_hour, 'ls_hour') |
|
|
731 |
get_statistic(ls_day, 'ls_day') |
|
|
732 |
|
|
|
733 |
# %% |
|
|
734 |
train_df_hour['LOS'] = train_df_hour['ADMISSION_DATE'] |
|
|
735 |
train_df_hour['LOS_HOUR'] = train_df_hour['ADMISSION_DATE'] |
|
|
736 |
|
|
|
737 |
# %% |
|
|
738 |
train_df_hour = train_df_hour.reset_index() |
|
|
739 |
|
|
|
740 |
# %% |
|
|
741 |
for idx in tqdm(range(len(train_df_hour))): |
|
|
742 |
info = train_df_hour.loc[idx] |
|
|
743 |
admission = datetime.datetime.strptime(info['ADMISSION_DATE'], '%Y-%m-%d %H:%M:%S') |
|
|
744 |
departure = datetime.datetime.strptime(info['DEPARTURE_DATE'], '%Y-%m-%d %H:%M:%S') |
|
|
745 |
visit_hour = datetime.datetime.strptime(info['RECORD_TIME_HOUR'], '%Y-%m-%d %H') |
|
|
746 |
hour = (departure - visit_hour).seconds / (24 * 60 * 60) + (departure - visit_hour).days |
|
|
747 |
los = (departure - admission).seconds / (24 * 60 * 60) + (departure - admission).days |
|
|
748 |
train_df_hour.at[idx, 'LOS'] = float(los) |
|
|
749 |
train_df_hour.at[idx, 'LOS_HOUR'] = float(hour) |
|
|
750 |
|
|
|
751 |
# %% |
|
|
752 |
train_df_hour['LOS'] |
|
|
753 |
|
|
|
754 |
# %% |
|
|
755 |
# One LOS value per patient, taken from that patient's first row and kept in
# order of first appearance. drop_duplicates does this in a single pass; the
# previous per-patient boolean mask rescanned the whole frame for every
# patient and relied on float() of a one-element Series, which is deprecated.
los = (
    train_df_hour
    .drop_duplicates(subset='PATIENT_ID')['LOS']
    .astype(float)
    .tolist()
)
|
|
758 |
|
|
|
759 |
# %% |
|
|
760 |
get_statistic(los, 'los') |
|
|
761 |
|
|
|
762 |
# %% |
|
|
763 |
import matplotlib.pyplot as plt |
|
|
764 |
from matplotlib.ticker import PercentFormatter |
|
|
765 |
import matplotlib.font_manager as font_manager |
|
|
766 |
import pandas as pd |
|
|
767 |
import numpy as np |
|
|
768 |
""" |
|
|
769 |
csfont = {'fontname':'Times New Roman', 'fontsize': 18} |
|
|
770 |
font = 'Times New Roman' |
|
|
771 |
fig=plt.figure(figsize=(6,6), dpi= 100, facecolor='w', edgecolor='k') |
|
|
772 |
plt.style.use('seaborn-whitegrid') |
|
|
773 |
color = 'cornflowerblue' |
|
|
774 |
ec = 'None' |
|
|
775 |
alpha=0.5 |
|
|
776 |
|
|
|
777 |
ax = plt.subplot(1, 1, 1) |
|
|
778 |
ax.hist(los, bins=20, weights=np.ones(len(los)) / len(los), color=color, ec=ec, alpha=alpha, label='overall') |
|
|
779 |
plt.xlabel('Length of stay',**csfont) |
|
|
780 |
plt.ylabel('Percentage',**csfont) |
|
|
781 |
plt.gca().yaxis.set_major_formatter(PercentFormatter(1)) |
|
|
782 |
plt.xticks(**csfont) |
|
|
783 |
plt.yticks(**csfont) |
|
|
784 |
|
|
|
785 |
plt.show() |
|
|
786 |
""" |
|
|
787 |
# %% |
|
|
788 |
train_df_hour_idx = train_df_hour.reset_index()

# %%
# LOS on this copy is the time from the record's hour to departure, in days.
# Values are gathered in a float list and assigned as one column instead of
# seeding the column with date strings and overwriting it cell by cell.
_strptime = datetime.datetime.strptime
_seconds_per_day = 24 * 60 * 60
_remaining_days = []
_timestamps = zip(train_df_hour_idx['DEPARTURE_DATE'], train_df_hour_idx['RECORD_TIME_HOUR'])
for _dep, _rec in tqdm(_timestamps, total=len(train_df_hour_idx)):
    departure = _strptime(_dep, '%Y-%m-%d %H:%M:%S')
    visit_hour = _strptime(_rec, '%Y-%m-%d %H')
    _delta = departure - visit_hour
    # whole days plus the fractional day held in .seconds
    _remaining_days.append(float(_delta.seconds / _seconds_per_day + _delta.days))
train_df_hour_idx['LOS'] = _remaining_days
|
|
800 |
|
|
|
801 |
# %% |
|
|
802 |
# From here on LOS carries the per-record value previously held in LOS_HOUR.
_los_hour_column = 'LOS_HOUR'
train_df_hour['LOS'] = train_df_hour[_los_hour_column]
# NOTE(review): drop() returns a new frame and the result is not assigned,
# so LOS_HOUR stays in train_df_hour; this line only displays the frame
# without that column. Confirm whether the column was meant to be removed.
train_df_hour.drop(columns=[_los_hour_column])
|
|
804 |
|
|
|
805 |
# %% |
|
|
806 |
# los_threshold = 13.0 |
|
|
807 |
|
|
|
808 |
# visit_num_hour = [] |
|
|
809 |
|
|
|
810 |
# for pat in tqdm(train_df_hour_idx['PATIENT_ID'].unique()): |
|
|
811 |
# pat_records = train_df_hour_idx[train_df_hour_idx['PATIENT_ID'] == pat] |
|
|
812 |
# hour = 0 |
|
|
813 |
# for vis in pat_records.index: |
|
|
814 |
# pat_visit = pat_records.loc[vis] |
|
|
815 |
# if pat_visit['LOS_HOUR'] <= los_threshold: |
|
|
816 |
# hour += 1 |
|
|
817 |
# visit_num_hour.append(hour) |
|
|
818 |
# if hour == 0: |
|
|
819 |
# print(pat) |
|
|
820 |
|
|
|
821 |
# %% |
|
|
822 |
# import matplotlib.pyplot as plt |
|
|
823 |
# from matplotlib.ticker import PercentFormatter |
|
|
824 |
# import matplotlib.font_manager as font_manager |
|
|
825 |
# import pandas as pd |
|
|
826 |
# import numpy as np |
|
|
827 |
# csfont = {'fontname':'Times New Roman', 'fontsize': 18} |
|
|
828 |
# font = 'Times New Roman' |
|
|
829 |
# fig=plt.figure(figsize=(6,6), dpi= 100, facecolor='w', edgecolor='k') |
|
|
830 |
# plt.style.use('seaborn-whitegrid') |
|
|
831 |
# color = 'cornflowerblue' |
|
|
832 |
# ec = 'None' |
|
|
833 |
# alpha=0.5 |
|
|
834 |
|
|
|
835 |
# ax = plt.subplot(1, 1, 1) |
|
|
836 |
# ax.hist(visit_num_hour, bins=20, weights=np.ones(len(visit_num_hour)) / len(visit_num_hour), color=color, ec=ec, alpha=alpha, label='overall') |
|
|
837 |
# plt.xlabel('Visit num (80% los)',**csfont) |
|
|
838 |
# plt.ylabel('Percentage',**csfont) |
|
|
839 |
# plt.gca().yaxis.set_major_formatter(PercentFormatter(1)) |
|
|
840 |
# plt.xticks(**csfont) |
|
|
841 |
# plt.yticks(**csfont) |
|
|
842 |
|
|
|
843 |
# plt.show() |
|
|
844 |
|
|
|
845 |
# %% |
|
|
846 |
# train_df is a second name for the hourly frame (same object, no copy).
train_df = train_df_hour
train_df.head()

# %%
train_df.describe()

# %%
get_statistic(train_df['LOS'], 'los')

# %%
# Number of non-null HEART_RATE values on the rows of patient '1'.
_patient_one_rows = train_df[train_df['PATIENT_ID'] == '1']
_patient_one_rows['HEART_RATE'].count()
|
|
857 |
|
|
|
858 |
# %% |
|
|
859 |
# Per column (from the sixth column on): the fraction of patients that have
# no non-null value at all in that column.
cols = train_df.columns[5:]
pats = train_df['PATIENT_ID'].unique()
all_pat_cnt = len(pats)

# First count, per column, the patients without any value ...
missing_rate = dict.fromkeys(cols, 0)
for pat in tqdm(pats):
    p = train_df[train_df['PATIENT_ID'] == pat]
    for col in cols:
        missing_rate[col] += int(p[col].count() == 0)

# ... then turn the counts into rates.
missing_rate = {name: absent / all_pat_cnt for name, absent in missing_rate.items()}

missing_rate
|
|
881 |
|
|
|
882 |
# %% |
|
|
883 |
# One line per column: quoted column name, then the missing rate in percent.
with open('missing_rate.csv', mode='w', encoding='utf-8') as file:
    file.writelines(f'"{col}", {100 * missing_rate[col]}\n' for col in cols)
|
|
886 |
|
|
|
887 |
# %% |
|
|
888 |
# Negative LOS values are raised to zero.
_los_column = train_df['LOS']
train_df['LOS'] = _los_column.clip(lower=0)

# %%
get_statistic(train_df['LOS'], 'los')
|
|
892 |
|
|
|
893 |
# %% |
|
|
894 |
# the first visit of each person |
|
|
895 |
def init_prev(prev):
    """Fill NaN entries of a first record in place and flag what was filled.

    Each NaN position in ``prev`` is replaced by the value at the same
    position of the module-level ``test_median``.

    Args:
        prev: Feature values of one record (pandas Series, NumPy array or
            list). Modified in place.

    Returns:
        A list with one flag per position: 1 where the value was NaN and got
        replaced, 0 where a value was already present.
    """
    # Elements are addressed by position. On a pandas Series an integer key
    # is a label lookup (the positional fallback is deprecated), so go
    # through .iloc for Series and use plain indexing otherwise.
    values = prev.iloc if isinstance(prev, pd.Series) else prev
    medians = test_median.iloc if isinstance(test_median, pd.Series) else test_median

    miss = []
    for position in range(len(prev)):
        if np.isnan(values[position]):
            # nothing recorded yet: fall back to the median
            values[position] = medians[position]
            miss.append(1)
        else:
            miss.append(0)
    return miss
|
|
907 |
|
|
|
908 |
# the rest of the visits |
|
|
909 |
def fill_nan(cur, prev):
    """Forward-fill NaN entries of ``cur`` from ``prev`` and flag them.

    Args:
        cur: Feature values of the current record (pandas Series, NumPy
            array or list). Modified in place.
        prev: Feature values of the preceding record; its length sets the
            number of positions processed.

    Returns:
        A list with one flag per position: 1 where ``cur`` was NaN and took
        the value from ``prev``, 0 where ``cur`` already had a value.
    """
    # Elements are addressed by position. On a pandas Series an integer key
    # is a label lookup (the positional fallback is deprecated), so go
    # through .iloc for Series and use plain indexing otherwise.
    cur_values = cur.iloc if isinstance(cur, pd.Series) else cur
    prev_values = prev.iloc if isinstance(prev, pd.Series) else prev

    miss = []
    for position in range(len(prev)):
        if np.isnan(cur_values[position]):
            # no value at this record: carry the previous one forward
            cur_values[position] = prev_values[position]
            miss.append(1)
        else:
            miss.append(0)
    return miss
|
|
920 |
|
|
|
921 |
# %% |
|
|
922 |
# Per patient: demographic vector, [outcome, per-record LOS] label, the
# forward-filled feature sequence and the matching missing-indicator matrix.
x, y, demo, x_lab_len, missing_mask = [], [], [], [], []

for pat in tqdm(patient_ids):
    # rows of this patient, limited to the last 76
    info = train_df[train_df['PATIENT_ID'] == pat].tail(76)
    indexes = info.index
    visit = info.loc[indexes[0]]  # first kept record

    # demographic values and outcome are read from the first kept record
    demo.append([visit[k] for k in demo_cols])
    outcome = visit['OUTCOME']

    # the first record's gaps are filled by init_prev; its flags form the
    # first row of the missing matrix
    prev = visit[test_cols]
    miss = [init_prev(prev)]

    tests, los = [], []
    for position, i in enumerate(indexes):
        visit = info.loc[i]
        cur = visit[test_cols]
        tmp = fill_nan(cur, prev)  # gaps take the previous record's values
        if position > 0:
            # the first record's flags were already stored above
            miss.append(tmp)
        tests.append(cur)
        los.append(visit['LOS'])
        prev = cur

    valid_visit = len(los)
    x_lab_len.append(valid_visit)
    missing_mask.append(miss)
    y.append([outcome, los])
    x.append(tests)
|
|
972 |
|
|
|
973 |
# %% |
|
|
974 |
all_x, all_x_demo, all_y, all_missing_mask = x, demo, y, missing_mask

# %%
# Features: pad the per-patient sequences to a common length and prepend
# the demographic vector, repeated at every timestep, on the last axis.
all_x_labtest = np.array(all_x, dtype=object)
x_lab_length = torch.tensor([len(seq) for seq in all_x_labtest], dtype=torch.int)
max_length = int(x_lab_length.max())
all_x_labtest = torch.nn.utils.rnn.pad_sequence(
    [torch.tensor(seq) for seq in all_x_labtest],
    batch_first=True,
)
all_x_demographic = torch.tensor(all_x_demo)
batch_size, demo_dim = all_x_demographic.shape
all_x_demographic = all_x_demographic.repeat(1, max_length).reshape(batch_size, max_length, demo_dim)
all_x = torch.cat((all_x_demographic, all_x_labtest), dim=2)

# Labels: one [outcome, los] pair per record, padded like the features.
all_y = np.array(all_y, dtype=object)
patient_list = [[[entry[0], stay] for stay in entry[1]] for entry in all_y]
new_all_y = np.array(patient_list, dtype=object)
output_all_y = torch.nn.utils.rnn.pad_sequence(
    [torch.Tensor(pairs) for pairs in new_all_y],
    batch_first=True,
)
|
|
1001 |
|
|
|
1002 |
# %% |
|
|
1003 |
# Turn every patient's missing-indicator matrix into a tensor and pad them
# to a common number of records (batch dimension first).
all_missing_mask = torch.nn.utils.rnn.pad_sequence(
    [torch.tensor(mask) for mask in np.array(all_missing_mask, dtype=object)],
    batch_first=True,
)

# %%
all_x.shape

# %%
all_missing_mask.shape
|
|
1012 |
|
|
|
1013 |
# %% |
|
|
1014 |
# save pickle format dataset (torch) |
|
|
1015 |
# Target file -> object, written in this order.
_outputs = {
    './processed_data/x.pkl': all_x,
    './processed_data/missing_mask.pkl': all_missing_mask,
    './processed_data/y.pkl': output_all_y,
    './processed_data/visits_length.pkl': x_lab_length,
}
for _path, _obj in _outputs.items():
    pd.to_pickle(_obj, _path)
|
|
1019 |
|
|
|
1020 |
# %% |
|
|
1021 |
# Calculate patients' outcome statistics (patients-wise) |
|
|
1022 |
# Channel 0 of the label tensor is the outcome; take it at the first
# timestep of every patient and count the distinct values.
y_outcome = output_all_y[:, :, 0]
indices = torch.arange(len(x_lab_length), dtype=torch.int64)
outcome_list = np.array([y_outcome[i][0].item() for i in indices])
print(len(outcome_list))
unique, count = np.unique(outcome_list, return_counts=True)
data_count = dict(zip(unique, count))
print(data_count)
|
|
1032 |
|
|
|
1033 |
# %% |
|
|
1034 |
# Calculate patients' outcome statistics (records-wise) |
|
|
1035 |
# Same count over records: for every patient only the first
# x_lab_length[i] timesteps are taken, so padding is left out.
y_outcome = output_all_y[:, :, 0]
indices = torch.arange(len(x_lab_length), dtype=torch.int64)
outcome_records_list = np.array([
    value
    for i in indices
    for value in y_outcome[i][0:x_lab_length[i]].tolist()
])
print(len(outcome_records_list))
unique, count = np.unique(outcome_records_list, return_counts=True)
data_count = dict(zip(unique, count))
print(data_count)
|
|
1045 |
|
|
|
1046 |
# %% |
|
|
1047 |
# Calculate patients' mean los and 95% percentile los |
|
|
1048 |
# Channel 1 of the label tensor is the LOS; use each patient's value at the
# first timestep.
y_los = output_all_y[:, :, 1]
indices = torch.arange(len(x_lab_length), dtype=torch.int64)
los_list = np.array([y_los[i][0].item() for i in indices])

# NOTE(review): mean and median are printed scaled by 0.5 while the
# percentiles are not; the reason is not visible here - confirm the unit.
print(los_list.mean() * 0.5)
print(np.median(los_list) * 0.5)
print(np.percentile(los_list, 95))

print('median:', np.median(los_list))
for _label, _q in (('Q1:', 25), ('Q3:', 75)):
    print(_label, np.percentile(los_list, _q))
|
|
1062 |
|
|
|
1063 |
# %% |
|
|
1064 |
# Split the per-patient LOS values by outcome (0 = alive, 1 = dead).
los_alive_list = np.array([stay for stay, flag in zip(los_list, outcome_list) if flag == 0])
los_dead_list = np.array([stay for stay, flag in zip(los_list, outcome_list) if flag == 1])
print(len(los_alive_list))
print(len(los_dead_list))

for _title, _values in (('[Alive]', los_alive_list), ('[Dead]', los_dead_list)):
    print(_title)
    print('median:', np.median(_values))
    print('Q1:', np.percentile(_values, 25))
    print('Q3:', np.percentile(_values, 75))
|
|
1078 |
|
|
|
1079 |
# %% |
|
|
1080 |
# Keep the three LOS arrays together in one pickle file.
cdsl_los_statistics = dict(
    overall=los_list,
    alive=los_alive_list,
    dead=los_dead_list,
)
pd.to_pickle(cdsl_los_statistics, 'cdsl_los_statistics.pkl')
|
|
1086 |
|
|
|
1087 |
# %% |
|
|
1088 |
# calculate visits length Median [Q1, Q3] |
|
|
1089 |
# Number of records per patient: median and quartiles, overall and split by
# outcome (0 = alive, 1 = dead).
visits_list = np.array(x_lab_length)
visits_alive_list = np.array([n for n, flag in zip(x_lab_length, outcome_list) if flag == 0])
visits_dead_list = np.array([n for n, flag in zip(x_lab_length, outcome_list) if flag == 1])
print(len(visits_alive_list))
print(len(visits_dead_list))

_groups = (
    ('[Total]', visits_list),
    ('[Alive]', visits_alive_list),
    ('[Dead]', visits_dead_list),
)
for _title, _values in _groups:
    print(_title)
    print('median:', np.median(_values))
    print('Q1:', np.percentile(_values, 25))
    print('Q3:', np.percentile(_values, 75))
|
|
1109 |
|
|
|
1110 |
# %% |
|
|
1111 |
def check_nan(x):
    """Print whether ``x`` contains at least one NaN element.

    Args:
        x: Object converted with ``x.cpu().numpy()`` (called on a torch
            tensor in this file).

    Returns:
        None. Prints "some values from input are nan" or "no nan".
    """
    # Test the elements themselves. Checking the sum instead reports a NaN
    # that is not in the data whenever +inf and -inf are both present,
    # because their sum is NaN.
    if np.isnan(x.cpu().numpy()).any():
        print("some values from input are nan")
    else:
        print("no nan")
|
|
1116 |
|
|
|
1117 |
# %% |
|
|
1118 |
check_nan(all_x) |
|
|
1119 |
|
|
|
1120 |
# %% [markdown] |
|
|
1121 |
# # Draw Charts |
|
|
1122 |
|
|
|
1123 |
# %% [markdown] |
|
|
1124 |
# ## Import packages |
|
|
1125 |
|
|
|
1126 |
# %% |
|
|
1127 |
import matplotlib.pyplot as plt |
|
|
1128 |
from matplotlib.ticker import PercentFormatter |
|
|
1129 |
import matplotlib.font_manager as font_manager |
|
|
1130 |
import pandas as pd |
|
|
1131 |
import numpy as np |
|
|
1132 |
|
|
|
1133 |
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases no longer accept the old names. Use whichever spelling this
# installation provides; if neither is listed, the original name is passed
# through so the failure is reported exactly as before.
_whitegrid_styles = ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
plt.style.use(
    next((s for s in _whitegrid_styles if s in plt.style.available), 'seaborn-whitegrid')
)

# Shared histogram styling.
color = 'cornflowerblue'
ec = 'None'
alpha = 0.5
alive_color = 'olivedrab'
dead_color = 'orchid'
|
|
1139 |
|
|
|
1140 |
# %% [markdown] |
|
|
1141 |
# ## Read data |
|
|
1142 |
|
|
|
1143 |
# %% |
|
|
1144 |
demographic.head() |
|
|
1145 |
|
|
|
1146 |
# %% |
|
|
1147 |
# Restrict the demographic table to the patients present in train.csv;
# IDs are compared as strings on both sides.
train = pd.read_csv('./train.csv')
train['PATIENT_ID'] = train['PATIENT_ID'].astype(str)
demographic['PATIENT_ID'] = demographic['PATIENT_ID'].astype(str)
pat = pd.DataFrame({'PATIENT_ID': train['PATIENT_ID'].unique()})
demo = demographic.merge(pat, on='PATIENT_ID', how='inner')

# Split by outcome (0 = alive, 1 = dead); "overall" is the unsplit table.
_outcome = demo['OUTCOME']
demo_alive = demo.loc[_outcome == 0]
demo_dead = demo.loc[_outcome == 1]
demo_overall = demo
|
|
1159 |
|
|
|
1160 |
# %% |
|
|
1161 |
# Target file -> frame, written without the index column.
_exports = {
    'demo_overall.csv': demo,
    'demo_alive.csv': demo_alive,
    'demo_dead.csv': demo_dead,
}
for _filename, _frame in _exports.items():
    _frame.to_csv(_filename, index=False)
|
|
1164 |
|
|
|
1165 |
# %% |
|
|
1166 |
# Lab-test rows restricted to each patient group; after every merge the
# number of distinct patients left is printed.
patient = pd.DataFrame({"PATIENT_ID": demo_alive['PATIENT_ID'].unique()})
lab_tests_alive = lab_tests.merge(patient, how='inner', on='PATIENT_ID')
print(len(lab_tests_alive['PATIENT_ID'].unique()))

patient = pd.DataFrame({"PATIENT_ID": demo_dead['PATIENT_ID'].unique()})
lab_tests_dead = lab_tests.merge(patient, how='inner', on='PATIENT_ID')
print(len(lab_tests_dead['PATIENT_ID'].unique()))

patient = pd.DataFrame({"PATIENT_ID": demo_overall['PATIENT_ID'].unique()})
lab_tests_overall = lab_tests.merge(patient, how='inner', on='PATIENT_ID')
print(len(lab_tests_overall['PATIENT_ID'].unique()))
|
|
1177 |
|
|
|
1178 |
# %% |
|
|
1179 |
# Vital-sign rows restricted to each patient group; every cell ends with the
# number of distinct patients left after the merge.
patient = pd.DataFrame({"PATIENT_ID": demo_alive['PATIENT_ID'].unique()})
vital_signs_alive = vital_signs.merge(patient, how='inner', on='PATIENT_ID')
len(vital_signs_alive['PATIENT_ID'].unique())

# %%
patient = pd.DataFrame({"PATIENT_ID": demo_dead['PATIENT_ID'].unique()})
vital_signs_dead = vital_signs.merge(patient, how='inner', on='PATIENT_ID')
len(vital_signs_dead['PATIENT_ID'].unique())

# %%
patient = pd.DataFrame({"PATIENT_ID": demo_overall['PATIENT_ID'].unique()})
vital_signs_overall = vital_signs.merge(patient, how='inner', on='PATIENT_ID')
len(vital_signs_overall['PATIENT_ID'].unique())
|
|
1192 |
|
|
|
1193 |
# %% |
|
|
1194 |
""" |
|
|
1195 |
limit = 0.05 |
|
|
1196 |
|
|
|
1197 |
csfont = {'fontname':'Times New Roman', 'fontsize': 18} |
|
|
1198 |
font = 'Times New Roman' |
|
|
1199 |
fig=plt.figure(figsize=(16,12), dpi= 100, facecolor='w', edgecolor='k') |
|
|
1200 |
|
|
|
1201 |
idx = 1 |
|
|
1202 |
|
|
|
1203 |
key = 'AGE' |
|
|
1204 |
low = demo_overall[key].quantile(limit) |
|
|
1205 |
high = demo_overall[key].quantile(1 - limit) |
|
|
1206 |
demo_AGE_overall = demo_overall[demo_overall[key].between(low, high)] |
|
|
1207 |
demo_AGE_dead = demo_dead[demo_dead[key].between(low, high)] |
|
|
1208 |
demo_AGE_alive = demo_alive[demo_alive[key].between(low, high)] |
|
|
1209 |
ax = plt.subplot(4, 4, idx) |
|
|
1210 |
ax.hist(demo_AGE_overall[key], bins=20, weights=np.ones(len(demo_AGE_overall[key])) / len(demo_AGE_overall), color=color, ec=ec, alpha=alpha, label='overall') |
|
|
1211 |
plt.xlabel('Age',**csfont) |
|
|
1212 |
plt.ylabel('Percentage',**csfont) |
|
|
1213 |
plt.gca().yaxis.set_major_formatter(PercentFormatter(1)) |
|
|
1214 |
# ax.title('Age Histogram', **csfont) |
|
|
1215 |
ax.hist(demo_AGE_alive[key], bins=20, weights=np.ones(len(demo_AGE_alive[key])) / len(demo_AGE_alive), color='green', ec=alive_color, alpha=1, histtype="step", linewidth=2, label='alive') |
|
|
1216 |
ax.hist(demo_AGE_dead[key], bins=20, weights=np.ones(len(demo_AGE_dead[key])) / len(demo_AGE_dead), color='green', ec=dead_color, alpha=1, histtype="step", linewidth=2, label='dead') |
|
|
1217 |
plt.xticks(**csfont) |
|
|
1218 |
plt.yticks(**csfont) |
|
|
1219 |
idx += 1 |
|
|
1220 |
|
|
|
1221 |
key = 'TEMPERATURE' |
|
|
1222 |
low = vital_signs_overall[key].quantile(limit) |
|
|
1223 |
high = vital_signs_overall[key].quantile(1 - limit) |
|
|
1224 |
vs_TEMPERATURE_overall = vital_signs_overall[vital_signs_overall[key].between(low, high)] |
|
|
1225 |
vs_TEMPERATURE_dead = vital_signs_dead[vital_signs_dead[key].between(low, high)] |
|
|
1226 |
vs_TEMPERATURE_alive = vital_signs_alive[vital_signs_alive[key].between(low, high)] |
|
|
1227 |
plt.subplot(4, 4, idx) |
|
|
1228 |
plt.hist(vs_TEMPERATURE_overall['TEMPERATURE'], bins=20, weights=np.ones(len(vs_TEMPERATURE_overall)) / len(vs_TEMPERATURE_overall), color=color, ec=ec, alpha=alpha) |
|
|
1229 |
plt.xlabel('Temperature',**csfont) |
|
|
1230 |
plt.ylabel('Percentage',**csfont) |
|
|
1231 |
plt.gca().yaxis.set_major_formatter(PercentFormatter(1)) |
|
|
1232 |
# plt.title('Temperature Histogram', **csfont) |
|
|
1233 |
plt.hist(vs_TEMPERATURE_alive['TEMPERATURE'], bins=20, weights=np.ones(len(vs_TEMPERATURE_alive)) / len(vs_TEMPERATURE_alive), color='green', ec=alive_color, alpha=1, histtype="step", linewidth=2) |
|
|
1234 |
plt.hist(vs_TEMPERATURE_dead['TEMPERATURE'], bins=20, weights=np.ones(len(vs_TEMPERATURE_dead)) / len(vs_TEMPERATURE_dead), color='green', ec=dead_color, alpha=1, histtype="step", linewidth=2) |
|
|
1235 |
plt.xticks(**csfont) |
|
|
1236 |
plt.yticks(**csfont) |
|
|
1237 |
idx += 1 |
|
|
1238 |
|
|
|
1239 |
# plt.subplot(4, 4, 3) |
|
|
1240 |
# plt.hist(lab_tests_overall['CREA -- CREATININA'], bins=20, density=True, color=color, ec=ec, alpha=alpha) |
|
|
1241 |
# plt.xlabel('CREA -- CREATININA',**csfont) |
|
|
1242 |
# plt.ylabel('Percentage',**csfont) |
|
|
1243 |
# plt.gca().yaxis.set_major_formatter(PercentFormatter(1)) |
|
|
1244 |
# # plt.title('Temperature Histogram', **csfont) |
|
|
1245 |
# plt.hist(lab_tests_alive['CREA -- CREATININA'], bins=20, density=True, color='green', ec=alive_color, alpha=1, histtype="step", linewidth=2) |
|
|
1246 |
# plt.hist(lab_tests_dead['CREA -- CREATININA'], bins=20, density=True, color='green', ec=dead_color, alpha=1, histtype="step", linewidth=2) |
|
|
1247 |
# plt.xticks(**csfont) |
|
|
1248 |
# plt.yticks(**csfont) |
|
|
1249 |
|
|
|
1250 |
key = 'CREA -- CREATININA' |
|
|
1251 |
low = lab_tests_overall[key].quantile(limit) |
|
|
1252 |
high = lab_tests_overall[key].quantile(1 - limit) |
|
|
1253 |
lt_key_overall = lab_tests_overall[lab_tests_overall[key].between(low, high)] |
|
|
1254 |
lt_key_dead = lab_tests_dead[lab_tests_dead[key].between(low, high)] |
|
|
1255 |
lt_key_alive = lab_tests_alive[lab_tests_alive[key].between(low, high)] |
|
|
1256 |
plt.subplot(4, 4, idx) |
|
|
1257 |
plt.hist(lt_key_overall[key], bins=20, weights=np.ones(len(lt_key_overall[key])) / len(lt_key_overall[key]), color=color, ec=ec, alpha=alpha) |
|
|
1258 |
plt.xlabel('CREA -- CREATININA',**csfont) |
|
|
1259 |
plt.ylabel('Percentage',**csfont) |
|
|
1260 |
plt.gca().yaxis.set_major_formatter(PercentFormatter(1)) |
|
|
1261 |
# plt.title('Temperature Histogram', **csfont) |
|
|
1262 |
plt.hist(lt_key_alive[key], bins=20, weights=np.ones(len(lt_key_alive[key])) / len(lt_key_alive[key]), color='green', ec=alive_color, alpha=1, histtype="step", linewidth=2) |
|
|
1263 |
plt.hist(lt_key_dead[key], bins=20, weights=np.ones(len(lt_key_dead[key])) / len(lt_key_dead[key]), color='green', ec=dead_color, alpha=1, histtype="step", linewidth=2) |
|
|
1264 |
plt.xticks(**csfont) |
|
|
1265 |
plt.yticks(**csfont) |
|
|
1266 |
idx += 1 |
|
|
1267 |
|
|
|
1268 |
key = 'HEM -- Hemat¡es' |
|
|
1269 |
low = lab_tests_overall[key].quantile(limit) |
|
|
1270 |
high = lab_tests_overall[key].quantile(1 - limit) |
|
|
1271 |
lt_key_overall = lab_tests_overall[lab_tests_overall[key].between(low, high)] |
|
|
1272 |
lt_key_dead = lab_tests_dead[lab_tests_dead[key].between(low, high)] |
|
|
1273 |
lt_key_alive = lab_tests_alive[lab_tests_alive[key].between(low, high)] |
|
|
1274 |
plt.subplot(4, 4, idx) |
|
|
1275 |
plt.hist(lt_key_overall[key], bins=20, weights=np.ones(len(lt_key_overall[key])) / len(lt_key_overall[key]), color=color, ec=ec, alpha=alpha) |
|
|
1276 |
plt.xlabel('HEM -- Hemat¡es',**csfont) |
|
|
1277 |
plt.ylabel('Percentage',**csfont) |
|
|
1278 |
plt.gca().yaxis.set_major_formatter(PercentFormatter(1)) |
|
|
1279 |
# plt.title('Temperature Histogram', **csfont) |
|
|
1280 |
plt.hist(lt_key_alive[key], bins=20, weights=np.ones(len(lt_key_alive[key])) / len(lt_key_alive[key]), color='green', ec=alive_color, alpha=1, histtype="step", linewidth=2) |
|
|
1281 |
plt.hist(lt_key_dead[key], bins=20, weights=np.ones(len(lt_key_dead[key])) / len(lt_key_dead[key]), color='green', ec=dead_color, alpha=1, histtype="step", linewidth=2) |
|
|
1282 |
plt.xticks(**csfont) |
|
|
1283 |
plt.yticks(**csfont) |
|
|
1284 |
idx += 1 |
|
|
1285 |
|
|
|
1286 |
key = 'LEUC -- Leucocitos' |
|
|
1287 |
low = lab_tests_overall[key].quantile(limit) |
|
|
1288 |
high = lab_tests_overall[key].quantile(1 - limit) |
|
|
1289 |
lt_key_overall = lab_tests_overall[lab_tests_overall[key].between(low, high)] |
|
|
1290 |
lt_key_dead = lab_tests_dead[lab_tests_dead[key].between(low, high)] |
|
|
1291 |
lt_key_alive = lab_tests_alive[lab_tests_alive[key].between(low, high)] |
|
|
1292 |
plt.subplot(4, 4, idx) |
|
|
1293 |
plt.hist(lt_key_overall[key], bins=20, weights=np.ones(len(lt_key_overall[key])) / len(lt_key_overall[key]), color=color, ec=ec, alpha=alpha) |
|
|
1294 |
plt.xlabel('LEUC -- Leucocitos',**csfont) |
|
|
1295 |
plt.ylabel('Percentage',**csfont) |
|
|
1296 |
plt.gca().yaxis.set_major_formatter(PercentFormatter(1)) |
|
|
1297 |
# plt.title('Temperature Histogram', **csfont) |
|
|
1298 |
plt.hist(lt_key_alive[key], bins=20, weights=np.ones(len(lt_key_alive[key])) / len(lt_key_alive[key]), color='green', ec=alive_color, alpha=1, histtype="step", linewidth=2) |
|
|
1299 |
plt.hist(lt_key_dead[key], bins=20, weights=np.ones(len(lt_key_dead[key])) / len(lt_key_dead[key]), color='green', ec=dead_color, alpha=1, histtype="step", linewidth=2) |
|
|
1300 |
plt.xticks(**csfont) |
|
|
1301 |
plt.yticks(**csfont) |
|
|
1302 |
idx += 1 |
|
|
1303 |
|
|
|
1304 |
key = 'PLAQ -- Recuento de plaquetas' |
|
|
1305 |
low = lab_tests_overall[key].quantile(limit) |
|
|
1306 |
high = lab_tests_overall[key].quantile(1 - limit) |
|
|
1307 |
lt_key_overall = lab_tests_overall[lab_tests_overall[key].between(low, high)] |
|
|
1308 |
lt_key_dead = lab_tests_dead[lab_tests_dead[key].between(low, high)] |
|
|
1309 |
lt_key_alive = lab_tests_alive[lab_tests_alive[key].between(low, high)] |
|
|
1310 |
plt.subplot(4, 4, idx) |
|
|
1311 |
plt.hist(lt_key_overall[key], bins=20, weights=np.ones(len(lt_key_overall[key])) / len(lt_key_overall[key]), color=color, ec=ec, alpha=alpha) |
|
|
1312 |
plt.xlabel('PLAQ -- Recuento de plaquetas',**csfont) |
|
|
1313 |
plt.ylabel('Percentage',**csfont) |
|
|
1314 |
plt.gca().yaxis.set_major_formatter(PercentFormatter(1)) |
|
|
1315 |
# plt.title('Temperature Histogram', **csfont) |
|
|
1316 |
plt.hist(lt_key_alive[key], bins=20, weights=np.ones(len(lt_key_alive[key])) / len(lt_key_alive[key]), color='green', ec=alive_color, alpha=1, histtype="step", linewidth=2) |
|
|
1317 |
plt.hist(lt_key_dead[key], bins=20, weights=np.ones(len(lt_key_dead[key])) / len(lt_key_dead[key]), color='green', ec=dead_color, alpha=1, histtype="step", linewidth=2) |
|
|
1318 |
plt.xticks(**csfont) |
|
|
1319 |
plt.yticks(**csfont) |
|
|
1320 |
idx += 1 |
|
|
1321 |
|
|
|
1322 |
key = 'CHCM -- Conc. Hemoglobina Corpuscular Media' |
|
|
1323 |
low = lab_tests_overall[key].quantile(limit) |
|
|
1324 |
high = lab_tests_overall[key].quantile(1 - limit) |
|
|
1325 |
lt_key_overall = lab_tests_overall[lab_tests_overall[key].between(low, high)] |
|
|
1326 |
lt_key_dead = lab_tests_dead[lab_tests_dead[key].between(low, high)] |
|
|
1327 |
lt_key_alive = lab_tests_alive[lab_tests_alive[key].between(low, high)] |
|
|
1328 |
plt.subplot(4, 4, idx) |
|
|
1329 |
plt.hist(lt_key_overall[key], bins=20, weights=np.ones(len(lt_key_overall[key])) / len(lt_key_overall[key]), color=color, ec=ec, alpha=alpha) |
|
|
1330 |
plt.xlabel('CHCM',**csfont) |
|
|
1331 |
plt.ylabel('Percentage',**csfont) |
|
|
1332 |
plt.gca().yaxis.set_major_formatter(PercentFormatter(1)) |
|
|
1333 |
# plt.title('Temperature Histogram', **csfont) |
|
|
1334 |
plt.hist(lt_key_alive[key], bins=20, weights=np.ones(len(lt_key_alive[key])) / len(lt_key_alive[key]), color='green', ec=alive_color, alpha=1, histtype="step", linewidth=2) |
|
|
1335 |
plt.hist(lt_key_dead[key], bins=20, weights=np.ones(len(lt_key_dead[key])) / len(lt_key_dead[key]), color='green', ec=dead_color, alpha=1, histtype="step", linewidth=2) |
|
|
1336 |
plt.xticks(**csfont) |
|
|
1337 |
plt.yticks(**csfont) |
|
|
1338 |
idx += 1 |
|
|
1339 |
|
|
|
1340 |
key = 'HCTO -- Hematocrito' |
|
|
1341 |
low = lab_tests_overall[key].quantile(limit) |
|
|
1342 |
high = lab_tests_overall[key].quantile(1 - limit) |
|
|
1343 |
lt_key_overall = lab_tests_overall[lab_tests_overall[key].between(low, high)] |
|
|
1344 |
lt_key_dead = lab_tests_dead[lab_tests_dead[key].between(low, high)] |
|
|
1345 |
lt_key_alive = lab_tests_alive[lab_tests_alive[key].between(low, high)] |
|
|
1346 |
plt.subplot(4, 4, idx) |
|
|
1347 |
plt.hist(lt_key_overall[key], bins=20, weights=np.ones(len(lt_key_overall[key])) / len(lt_key_overall[key]), color=color, ec=ec, alpha=alpha) |
|
|
1348 |
plt.xlabel('HCTO -- Hematocrito',**csfont) |
|
|
1349 |
plt.ylabel('Percentage',**csfont) |
|
|
1350 |
plt.gca().yaxis.set_major_formatter(PercentFormatter(1)) |
|
|
1351 |
# plt.title('Temperature Histogram', **csfont) |
|
|
1352 |
plt.hist(lt_key_alive[key], bins=20, weights=np.ones(len(lt_key_alive[key])) / len(lt_key_alive[key]), color='green', ec=alive_color, alpha=1, histtype="step", linewidth=2) |
|
|
1353 |
plt.hist(lt_key_dead[key], bins=20, weights=np.ones(len(lt_key_dead[key])) / len(lt_key_dead[key]), color='green', ec=dead_color, alpha=1, histtype="step", linewidth=2) |
|
|
1354 |
plt.xticks(**csfont) |
|
|
1355 |
plt.yticks(**csfont) |
|
|
1356 |
idx += 1 |
|
|
1357 |
|
|
|
1358 |
key = 'VCM -- Volumen Corpuscular Medio' |
|
|
1359 |
low = lab_tests_overall[key].quantile(limit) |
|
|
1360 |
high = lab_tests_overall[key].quantile(1 - limit) |
|
|
1361 |
lt_key_overall = lab_tests_overall[lab_tests_overall[key].between(low, high)] |
|
|
1362 |
lt_key_dead = lab_tests_dead[lab_tests_dead[key].between(low, high)] |
|
|
1363 |
lt_key_alive = lab_tests_alive[lab_tests_alive[key].between(low, high)] |
|
|
1364 |
plt.subplot(4, 4, idx) |
|
|
1365 |
plt.hist(lt_key_overall[key], bins=20, weights=np.ones(len(lt_key_overall[key])) / len(lt_key_overall[key]), color=color, ec=ec, alpha=alpha) |
|
|
1366 |
plt.xlabel('VCM -- Volumen Corpuscular Medio',**csfont) |
|
|
1367 |
plt.ylabel('Percentage',**csfont) |
|
|
1368 |
plt.gca().yaxis.set_major_formatter(PercentFormatter(1)) |
|
|
1369 |
# plt.title('Temperature Histogram', **csfont) |
|
|
1370 |
plt.hist(lt_key_alive[key], bins=20, weights=np.ones(len(lt_key_alive[key])) / len(lt_key_alive[key]), color='green', ec=alive_color, alpha=1, histtype="step", linewidth=2) |
|
|
1371 |
plt.hist(lt_key_dead[key], bins=20, weights=np.ones(len(lt_key_dead[key])) / len(lt_key_dead[key]), color='green', ec=dead_color, alpha=1, histtype="step", linewidth=2) |
|
|
1372 |
plt.xticks(**csfont) |
|
|
1373 |
plt.yticks(**csfont) |
|
|
1374 |
idx += 1 |
|
|
1375 |
|
|
|
1376 |
# Remaining lab-test panels. Every panel follows the same recipe, so the
# recipe is written once and driven by a table instead of being copy-pasted
# per analyte.
#
# Each entry is (DataFrame column name, x-axis label). The two are kept
# separate because the label differs slightly from the column name for the
# percentage columns.
# NOTE(review): the column names are reproduced verbatim, including the
# mis-decoded character in 'Neutr¢filos'; presumably they have to match the
# headers of the loaded data exactly -- confirm before "correcting" them.
lab_test_panels = [
    ('HGB -- Hemoglobina', 'HGB -- Hemoglobina'),
    ('HCM -- Hemoglobina Corpuscular Media', 'HCM -- Hemoglobina Corpuscular Media'),
    ('NEU -- Neutr¢filos', 'NEU -- Neutr¢filos'),
    ('NEU% -- Neutr¢filos %', 'NEU% -- Neutr¢filos%'),
    ('LIN -- Linfocitos', 'LIN -- Linfocitos'),
    ('LIN% -- Linfocitos %', 'LIN% -- Linfocitos%'),
    ('ADW -- Coeficiente de anisocitosis', 'ADW -- Coeficiente de anisocitosis'),
]

for key, x_label in lab_test_panels:
    # Trimming bounds are taken from the overall frame only and then applied
    # unchanged to all three frames, so the three histograms of a panel cover
    # the same value range.
    low = lab_tests_overall[key].quantile(limit)
    high = lab_tests_overall[key].quantile(1 - limit)
    lt_key_overall = lab_tests_overall[lab_tests_overall[key].between(low, high)]
    lt_key_dead = lab_tests_dead[lab_tests_dead[key].between(low, high)]
    lt_key_alive = lab_tests_alive[lab_tests_alive[key].between(low, high)]

    # `idx` is the running 1-based slot in the 4x4 subplot grid.
    plt.subplot(4, 4, idx)

    # Filled histogram of the overall frame. A weight of 1/N per sample makes
    # the bar heights sum to 1, which PercentFormatter(1) then renders as
    # percentages on the y axis.
    plt.hist(
        lt_key_overall[key],
        bins=20,
        weights=np.ones(len(lt_key_overall[key])) / len(lt_key_overall[key]),
        color=color,
        ec=ec,
        alpha=alpha,
    )
    plt.xlabel(x_label, **csfont)
    plt.ylabel('Percentage', **csfont)
    plt.gca().yaxis.set_major_formatter(PercentFormatter(1))

    # Outline-only ("step") histograms drawn on top, alive first and then
    # dead, each normalised by its own sample count.
    for subset, edge_colour in ((lt_key_alive, alive_color), (lt_key_dead, dead_color)):
        plt.hist(
            subset[key],
            bins=20,
            weights=np.ones(len(subset[key])) / len(subset[key]),
            color='green',
            ec=edge_colour,
            alpha=1,
            histtype="step",
            linewidth=2,
        )

    plt.xticks(**csfont)
    plt.yticks(**csfont)
    idx += 1
|
|
1501 |
|
|
|
1502 |
# Gather the legend entries registered on `ax` and echo them to stdout so
# they can be inspected.
# NOTE(review): `ax` is defined outside this section; confirm it is the axes
# that actually carries the labelled artists.
handles, labels = ax.get_legend_handles_labels()
print(handles, labels)

# A single legend for the whole figure, laid out in five columns with its
# upper-centre point anchored at (0.5, 1.05).
legend_font = font_manager.FontProperties(
    family='Times New Roman',
    style='normal',
    size=18,
)
plt.figlegend(
    handles,
    labels,
    loc='upper center',
    ncol=5,
    fontsize=18,
    bbox_to_anchor=(0.5, 1.05),
    prop=legend_font,
)

# Final layout pass, then display the figure.
fig.tight_layout()
plt.show()
|
|
1511 |
""" |
|
|
1512 |
|