Switch to unified view

a b/datasets/cdsl/preprocess.ipynb
1
{
2
 "cells": [
3
  {
4
   "cell_type": "markdown",
5
   "metadata": {},
6
   "source": [
7
    "# HM dataset pre-processing\n",
8
    "\n",
9
    "import packages"
10
   ]
11
  },
12
  {
13
   "cell_type": "code",
14
   "execution_count": null,
15
   "metadata": {},
16
   "outputs": [],
17
   "source": [
18
    "import os\n",
19
    "import pandas as pd\n",
20
    "import numpy as np\n",
21
    "import matplotlib.pyplot as plt\n",
22
    "import pickle as pkl\n",
23
    "import torch\n",
24
    "import math\n",
25
    "import datetime\n",
26
    "from tqdm import tqdm\n",
27
    "import datetime\n",
28
    "import re\n",
29
    "from functools import reduce"
30
   ]
31
  },
32
  {
33
   "cell_type": "markdown",
34
   "metadata": {},
35
   "source": [
36
    "## Demographic data"
37
   ]
38
  },
39
  {
40
   "cell_type": "code",
41
   "execution_count": null,
42
   "metadata": {},
43
   "outputs": [],
44
   "source": [
45
    "demographic = pd.read_csv('./raw_data/19_04_2021/COVID_DSL_01.CSV', encoding='ISO-8859-1', sep='|')\n",
46
    "print(len(demographic))\n",
47
    "demographic.head()"
48
   ]
49
  },
50
  {
51
   "cell_type": "code",
52
   "execution_count": null,
53
   "metadata": {},
54
   "outputs": [],
55
   "source": [
56
    "med = pd.read_csv('./raw_data/19_04_2021/COVID_DSL_04.CSV', encoding='ISO-8859-1', sep='|')\n",
57
    "print(len(med))\n",
58
    "med.head()"
59
   ]
60
  },
61
  {
62
   "cell_type": "code",
63
   "execution_count": null,
64
   "metadata": {},
65
   "outputs": [],
66
   "source": [
67
    "len(med['ID_ATC7'].unique())"
68
   ]
69
  },
70
  {
71
   "cell_type": "markdown",
72
   "metadata": {},
73
   "source": [
74
    "Drop records that are missing the admission ID, admission date, discharge date, or outcome label."
75
   ]
76
  },
77
  {
78
   "cell_type": "code",
79
   "execution_count": null,
80
   "metadata": {},
81
   "outputs": [],
82
   "source": [
83
    "print(len(demographic))\n",
84
    "demographic = demographic.dropna(axis=0, how='any', subset=['IDINGRESO', 'F_INGRESO_ING', 'F_ALTA_ING', 'MOTIVO_ALTA_ING'])\n",
85
    "print(len(demographic))"
86
   ]
87
  },
88
  {
89
   "cell_type": "code",
90
   "execution_count": null,
91
   "metadata": {},
92
   "outputs": [],
93
   "source": [
94
    "def outcome2num(x):\n",
95
    "    if x == 'Fallecimiento':\n",
96
    "        return 1\n",
97
    "    else:\n",
98
    "        return 0\n",
99
    "\n",
100
    "def to_one_hot(x, feature):\n",
101
    "    if x == feature:\n",
102
    "        return 1\n",
103
    "    else:\n",
104
    "        return 0"
105
   ]
106
  },
107
  {
108
   "cell_type": "code",
109
   "execution_count": null,
110
   "metadata": {},
111
   "outputs": [],
112
   "source": [
113
    "# select necessary columns from demographic\n",
114
    "demographic = demographic[\n",
115
    "        [\n",
116
    "            'IDINGRESO', \n",
117
    "            'EDAD',\n",
118
    "            'SEX',\n",
119
    "            'F_INGRESO_ING', \n",
120
    "            'F_ALTA_ING', \n",
121
    "            'MOTIVO_ALTA_ING', \n",
122
    "            'ESPECIALIDAD_URGENCIA', \n",
123
    "            'DIAG_URG'\n",
124
    "        ]\n",
125
    "    ]\n",
126
    "\n",
127
    "# rename column\n",
128
    "demographic = demographic.rename(columns={\n",
129
    "    'IDINGRESO': 'PATIENT_ID',\n",
130
    "    'EDAD': 'AGE',\n",
131
    "    'SEX': 'SEX',\n",
132
    "    'F_INGRESO_ING': 'ADMISSION_DATE',\n",
133
    "    'F_ALTA_ING': 'DEPARTURE_DATE',\n",
134
    "    'MOTIVO_ALTA_ING': 'OUTCOME',\n",
135
    "    'ESPECIALIDAD_URGENCIA': 'DEPARTMENT_OF_EMERGENCY',\n",
136
    "    'DIAG_URG': 'DIAGNOSIS_AT_EMERGENCY_VISIT'\n",
137
    "})\n",
138
    "\n",
139
    "# SEX: male: 1; female: 0\n",
140
    "demographic['SEX'] = demographic['SEX'].replace('MALE', 1)  # assign back: in-place replace on a selected column does not update the frame under copy-on-write\n",
141
    "demographic['SEX'] = demographic['SEX'].replace('FEMALE', 0)\n",
142
    "\n",
143
    "# outcome: Fallecimiento(dead): 1; others: 0\n",
144
    "demographic['OUTCOME'] = demographic['OUTCOME'].map(outcome2num)\n",
145
    "\n",
146
    "# diagnosis at emergency visit (loss rate < 10%)\n",
147
    "# demographic['DIFFICULTY_BREATHING'] = demographic['DIAGNOSIS_AT_EMERGENCY_VISIT'].map(lambda x: to_one_hot(x, 'DIFICULTAD RESPIRATORIA')) # 1674\n",
148
    "# demographic['SUSPECT_COVID'] = demographic['DIAGNOSIS_AT_EMERGENCY_VISIT'].map(lambda x: to_one_hot(x, 'SOSPECHA COVID-19')) # 960\n",
149
    "# demographic['FEVER'] = demographic['DIAGNOSIS_AT_EMERGENCY_VISIT'].map(lambda x: to_one_hot(x, 'FIEBRE')) # 455\n",
150
    "\n",
151
    "# department of emergency (loss rate < 10%)\n",
152
    "# demographic['EMERGENCY'] = demographic['DEPARTMENT_OF_EMERGENCY'].map(lambda x: to_one_hot(x, 'Medicina de Urgencias')) # 3914"
153
   ]
154
  },
155
  {
156
   "cell_type": "code",
157
   "execution_count": null,
158
   "metadata": {},
159
   "outputs": [],
160
   "source": [
161
    "# del useless data\n",
162
    "demographic = demographic[\n",
163
    "        [\n",
164
    "            'PATIENT_ID',\n",
165
    "            'AGE',\n",
166
    "            'SEX',\n",
167
    "            'ADMISSION_DATE',\n",
168
    "            'DEPARTURE_DATE',\n",
169
    "            'OUTCOME',\n",
170
    "            # 'DIFFICULTY_BREATHING',\n",
171
    "            # 'SUSPECT_COVID',\n",
172
    "            # 'FEVER',\n",
173
    "            # 'EMERGENCY'\n",
174
    "        ]\n",
175
    "    ]"
176
   ]
177
  },
178
  {
179
   "cell_type": "code",
180
   "execution_count": null,
181
   "metadata": {},
182
   "outputs": [],
183
   "source": [
184
    "demographic.describe().to_csv('demographic_overview.csv', mode='w')  # keep the index: it carries the statistic names (count, mean, std, ...)\n",
185
    "demographic.describe()"
186
   ]
187
  },
188
  {
189
   "cell_type": "markdown",
190
   "metadata": {},
191
   "source": [
192
    "### Analyze data"
193
   ]
194
  },
195
  {
196
   "cell_type": "code",
197
   "execution_count": null,
198
   "metadata": {},
199
   "outputs": [],
200
   "source": [
201
    "plt.scatter(demographic['PATIENT_ID'], demographic['AGE'], s=1)\n",
202
    "plt.xlabel('Patient Id')\n",
203
    "plt.ylabel('Age')\n",
204
    "plt.title('Patient-Age Scatter Plot')"
205
   ]
206
  },
207
  {
208
   "cell_type": "code",
209
   "execution_count": null,
210
   "metadata": {},
211
   "outputs": [],
212
   "source": [
213
    "plt.scatter(demographic['PATIENT_ID'], demographic['AGE'], s=1)\n",
214
    "plt.xlabel('Patient Id')\n",
215
    "plt.ylabel('Age')\n",
216
    "plt.title('Patient-Age Scatter Plot')"
217
   ]
218
  },
219
  {
220
   "cell_type": "code",
221
   "execution_count": null,
222
   "metadata": {},
223
   "outputs": [],
224
   "source": [
225
    "demographic.to_csv('demographic.csv', mode='w', index=False)\n",
226
    "demographic.head()"
227
   ]
228
  },
229
  {
230
   "cell_type": "markdown",
231
   "metadata": {},
232
   "source": [
233
    "## Vital Signs"
234
   ]
235
  },
236
  {
237
   "cell_type": "code",
238
   "execution_count": null,
239
   "metadata": {},
240
   "outputs": [],
241
   "source": [
242
    "vital_signs = pd.read_csv('./raw_data/19_04_2021/COVID_DSL_02.CSV', encoding='ISO-8859-1', sep='|')\n",
243
    "print(len(vital_signs))\n",
244
    "vital_signs.head()"
245
   ]
246
  },
247
  {
248
   "cell_type": "code",
249
   "execution_count": null,
250
   "metadata": {},
251
   "outputs": [],
252
   "source": [
253
    "vital_signs = vital_signs.rename(columns={\n",
254
    "    'IDINGRESO': 'PATIENT_ID',\n",
255
    "    'CONSTANTS_ING_DATE': 'RECORD_DATE',\n",
256
    "    'CONSTANTS_ING_TIME': 'RECORD_TIME',\n",
257
    "    'FC_HR_ING': 'HEART_RATE',\n",
258
    "    'GLU_GLY_ING': 'BLOOD_GLUCOSE',\n",
259
    "    'SAT_02_ING': 'OXYGEN_SATURATION',\n",
260
    "    'TA_MAX_ING': 'MAX_BLOOD_PRESSURE',\n",
261
    "    'TA_MIN_ING': 'MIN_BLOOD_PRESSURE',\n",
262
    "    'TEMP_ING': 'TEMPERATURE'\n",
263
    "})\n",
264
    "vital_signs['RECORD_TIME'] = vital_signs['RECORD_DATE'] + ' ' + vital_signs['RECORD_TIME']\n",
265
    "vital_signs['RECORD_TIME'] = vital_signs['RECORD_TIME'].map(lambda x: str(datetime.datetime.strptime(x, '%Y-%m-%d %H:%M')))\n",
266
    "vital_signs = vital_signs.drop(['RECORD_DATE', 'SAT_02_ING_OBS', 'BLOOD_GLUCOSE'], axis=1)"
267
   ]
268
  },
269
  {
270
   "cell_type": "code",
271
   "execution_count": null,
272
   "metadata": {},
273
   "outputs": [],
274
   "source": [
275
    "vital_signs.describe()"
276
   ]
277
  },
278
  {
279
   "cell_type": "code",
280
   "execution_count": null,
281
   "metadata": {},
282
   "outputs": [],
283
   "source": [
284
    "vital_signs.head()"
285
   ]
286
  },
287
  {
288
   "cell_type": "code",
289
   "execution_count": null,
290
   "metadata": {},
291
   "outputs": [],
292
   "source": [
293
    "def format_temperature(x):\n",
294
    "    if type(x) == str:\n",
295
    "        return float(x.replace(',', '.'))\n",
296
    "    else:\n",
297
    "        return float(x)\n",
298
    "\n",
299
    "def format_oxygen(x):\n",
300
    "    x = float(x)\n",
301
    "    if x > 100:\n",
302
    "        return np.nan\n",
303
    "    else:\n",
304
    "        return x\n",
305
    "\n",
306
    "def format_heart_rate(x):  # returns the reading as an int, or NaN when it is missing or greater than 220\n",
307
    "    x = np.nan if x != x else int(x)  # int(NaN) raises ValueError, so missing readings are passed through\n",
308
    "    if x != x or x > 220:  # x != x is true only for NaN\n",
309
    "        return np.nan\n",
310
    "    else:\n",
311
    "        return x\n",
312
    "\n",
313
    "vital_signs['TEMPERATURE'] = vital_signs['TEMPERATURE'].map(lambda x: format_temperature(x))\n",
314
    "vital_signs['OXYGEN_SATURATION'] = vital_signs['OXYGEN_SATURATION'].map(lambda x: format_oxygen(x))\n",
315
    "vital_signs['HEART_RATE'] = vital_signs['HEART_RATE'].map(lambda x: format_heart_rate(x))"
316
   ]
317
  },
318
  {
319
   "cell_type": "code",
320
   "execution_count": null,
321
   "metadata": {},
322
   "outputs": [],
323
   "source": [
324
    "vital_signs = vital_signs.replace(0, np.NAN)"
325
   ]
326
  },
327
  {
328
   "cell_type": "code",
329
   "execution_count": null,
330
   "metadata": {},
331
   "outputs": [],
332
   "source": [
333
    "vital_signs = vital_signs.groupby(['PATIENT_ID', 'RECORD_TIME'], dropna=True, as_index = False).mean()\n",
334
    "vital_signs.head()"
335
   ]
336
  },
337
  {
338
   "cell_type": "code",
339
   "execution_count": null,
340
   "metadata": {},
341
   "outputs": [],
342
   "source": [
343
    "vital_signs.describe()"
344
   ]
345
  },
346
  {
347
   "cell_type": "code",
348
   "execution_count": null,
349
   "metadata": {},
350
   "outputs": [],
351
   "source": [
352
    "vital_signs.describe().to_csv('vital_signs_overview.csv', mode='w')  # keep the index: it carries the statistic names (count, mean, std, ...)\n",
353
    "vital_signs.describe()"
354
   ]
355
  },
356
  {
357
   "cell_type": "code",
358
   "execution_count": null,
359
   "metadata": {},
360
   "outputs": [],
361
   "source": [
362
    "#plt.rcParams['figure.figsize'] = [10, 5]\n",
363
    "fig=plt.figure(figsize=(16,10), dpi= 100, facecolor='w', edgecolor='k')\n",
364
    "\n",
365
    "plt.subplot(2, 3, 1)\n",
366
    "plt.scatter(vital_signs.index, vital_signs['MAX_BLOOD_PRESSURE'], s=1)\n",
367
    "plt.xlabel('Index')\n",
368
    "plt.ylabel('Max Blood Pressure')\n",
369
    "plt.title('Visit-Max Blood Pressure Scatter Plot')\n",
370
    "\n",
371
    "plt.subplot(2, 3, 2)\n",
372
    "plt.scatter(vital_signs.index, vital_signs['MIN_BLOOD_PRESSURE'], s=1)\n",
373
    "plt.xlabel('Index')\n",
374
    "plt.ylabel('Min Blood Pressure')\n",
375
    "plt.title('Visit-Min Blood Pressure Scatter Plot')\n",
376
    "\n",
377
    "plt.subplot(2, 3, 3)\n",
378
    "plt.scatter(vital_signs.index, vital_signs['TEMPERATURE'], s=1)\n",
379
    "plt.xlabel('Index')\n",
380
    "plt.ylabel('Temperature')\n",
381
    "plt.title('Visit-Temperature Scatter Plot')\n",
382
    "\n",
383
    "plt.subplot(2, 3, 4)\n",
384
    "plt.scatter(vital_signs.index, vital_signs['HEART_RATE'], s=1)\n",
385
    "plt.xlabel('Index')\n",
386
    "plt.ylabel('Heart Rate')\n",
387
    "plt.title('Visit-Heart Rate Scatter Plot')\n",
388
    "\n",
389
    "plt.subplot(2, 3, 5)\n",
390
    "plt.scatter(vital_signs.index, vital_signs['OXYGEN_SATURATION'], s=1)\n",
391
    "plt.xlabel('Index')\n",
392
    "plt.ylabel('Oxygen Saturation')\n",
393
    "plt.title('Visit-Oxygen Saturation Scatter Plot')\n",
394
    "\n",
395
    "plt.show()"
396
   ]
397
  },
398
  {
399
   "cell_type": "code",
400
   "execution_count": null,
401
   "metadata": {},
402
   "outputs": [],
403
   "source": [
404
    "#plt.rcParams['figure.figsize'] = [10, 5]\n",
405
    "fig=plt.figure(figsize=(16,10), dpi= 100, facecolor='w', edgecolor='k')\n",
406
    "\n",
407
    "plt.subplot(2, 3, 1)\n",
408
    "plt.hist(vital_signs['MAX_BLOOD_PRESSURE'], bins=30)\n",
409
    "plt.xlabel('Index')\n",
410
    "plt.ylabel('Max Blood Pressure')\n",
411
    "plt.title('Visit-Max Blood Pressure Histogram')\n",
412
    "\n",
413
    "plt.subplot(2, 3, 2)\n",
414
    "plt.hist(vital_signs['MIN_BLOOD_PRESSURE'], bins=30)\n",
415
    "plt.xlabel('Index')\n",
416
    "plt.ylabel('Min Blood Pressure')\n",
417
    "plt.title('Visit-Min Blood Pressure Histogram')\n",
418
    "\n",
419
    "plt.subplot(2, 3, 3)\n",
420
    "plt.hist(vital_signs['TEMPERATURE'], bins=30)\n",
421
    "plt.xlabel('Index')\n",
422
    "plt.ylabel('Temperature')\n",
423
    "plt.title('Visit-Temperature Histogram')\n",
424
    "\n",
425
    "plt.subplot(2, 3, 4)\n",
426
    "plt.hist(vital_signs['HEART_RATE'], bins=30)\n",
427
    "plt.xlabel('Index')\n",
428
    "plt.ylabel('Heart Rate')\n",
429
    "plt.title('Visit-Heart Rate Histogram')\n",
430
    "\n",
431
    "plt.subplot(2, 3, 5)\n",
432
    "plt.hist(vital_signs['OXYGEN_SATURATION'], bins=30)\n",
433
    "plt.xlabel('Index')\n",
434
    "plt.ylabel('Oxygen Saturation')\n",
435
    "plt.title('Visit-Oxygen Saturation Histogram')\n",
436
    "\n",
437
    "plt.show()"
438
   ]
439
  },
440
  {
441
   "cell_type": "markdown",
442
   "metadata": {},
443
   "source": [
444
    "### Missing rate of each visit"
445
   ]
446
  },
447
  {
448
   "cell_type": "code",
449
   "execution_count": null,
450
   "metadata": {},
451
   "outputs": [],
452
   "source": [
453
    "sum(vital_signs.T.isnull().sum()) / ((len(vital_signs.T) - 2) * len(vital_signs))"
454
   ]
455
  },
456
  {
457
   "cell_type": "markdown",
458
   "metadata": {},
459
   "source": [
460
    "### Normalize data"
461
   ]
462
  },
463
  {
464
   "cell_type": "code",
465
   "execution_count": null,
466
   "metadata": {},
467
   "outputs": [],
468
   "source": [
469
    "\"\"\"\n",
470
    "for key in vital_signs.keys()[2:]:\n",
471
    "    vital_signs[key] = (vital_signs[key] - vital_signs[key].mean()) / (vital_signs[key].std() + 1e-12)\n",
472
    "\n",
473
    "vital_signs.describe()\n",
474
    "\"\"\""
475
   ]
476
  },
477
  {
478
   "cell_type": "code",
479
   "execution_count": null,
480
   "metadata": {},
481
   "outputs": [],
482
   "source": [
483
    "vital_signs.to_csv('visual_signs.csv', mode='w', index=False)"
484
   ]
485
  },
486
  {
487
   "cell_type": "code",
488
   "execution_count": null,
489
   "metadata": {},
490
   "outputs": [],
491
   "source": [
492
    "len(vital_signs) / len(vital_signs['PATIENT_ID'].unique())"
493
   ]
494
  },
495
  {
496
   "cell_type": "markdown",
497
   "metadata": {},
498
   "source": [
499
    "## Lab Tests"
500
   ]
501
  },
502
  {
503
   "cell_type": "code",
504
   "execution_count": null,
505
   "metadata": {},
506
   "outputs": [],
507
   "source": [
508
    "lab_tests = pd.read_csv('./raw_data/19_04_2021/COVID_DSL_06_v2.CSV', encoding='ISO-8859-1', sep=';')\n",
509
    "lab_tests = lab_tests.rename(columns={'IDINGRESO': 'PATIENT_ID'})\n",
510
    "print(len(lab_tests))\n",
511
    "\n",
512
    "# del useless data\n",
513
    "lab_tests = lab_tests[\n",
514
    "        [\n",
515
    "            'PATIENT_ID',\n",
516
    "            'LAB_NUMBER',\n",
517
    "            'LAB_DATE',\n",
518
    "            'TIME_LAB',\n",
519
    "            'ITEM_LAB',\n",
520
    "            'VAL_RESULT'\n",
521
    "            # UD_RESULT: unit\n",
522
    "            # REF_VALUES: reference values\n",
523
    "        ]\n",
524
    "    ]\n",
525
    "\n",
526
    "lab_tests.head()"
527
   ]
528
  },
529
  {
530
   "cell_type": "code",
531
   "execution_count": null,
532
   "metadata": {},
533
   "outputs": [],
534
   "source": [
535
    "lab_tests = lab_tests.groupby(['PATIENT_ID', 'LAB_NUMBER', 'LAB_DATE', 'TIME_LAB', 'ITEM_LAB'], dropna=True, as_index = False).first()\n",
536
    "lab_tests = lab_tests.set_index(['PATIENT_ID', 'LAB_NUMBER', 'LAB_DATE', 'TIME_LAB', 'ITEM_LAB'], drop = True).unstack('ITEM_LAB')['VAL_RESULT'].reset_index()\n",
537
    "\n",
538
    "lab_tests = lab_tests.drop([\n",
539
    "    'CFLAG -- ALARMA HEMOGRAMA', \n",
540
    "    'CORONA -- PCR CORONAVIRUS 2019nCoV', \n",
541
    "    'CRIOGLO -- CRIOGLOBULINAS',\n",
542
    "    'EGCOVID -- ESTUDIO GENETICO COVID-19',\n",
543
    "    'FRO1 -- ',\n",
544
    "    'FRO1 -- FROTIS EN SANGRE PERIFERICA',\n",
545
    "    'FRO2 -- ',\n",
546
    "    'FRO2 -- FROTIS EN SANGRE PERIFERICA',\n",
547
    "    'FRO3 -- ',\n",
548
    "    'FRO3 -- FROTIS EN SANGRE PERIFERICA',\n",
549
    "    'FRO_COMEN -- ',\n",
550
    "    'FRO_COMEN -- FROTIS EN SANGRE PERIFERICA',\n",
551
    "    'G-CORONAV (RT-PCR) -- Tipo de muestra: ASPIRADO BRONCOALVEOLAR',\n",
552
    "    'G-CORONAV (RT-PCR) -- Tipo de muestra: EXUDADO',\n",
553
    "    'GRRH -- GRUPO SANGUÖNEO Y FACTOR Rh',\n",
554
    "    'HEML -- RECUENTO CELULAR LIQUIDO',\n",
555
    "    'HEML -- Recuento Hemat¡es',\n",
556
    "    'IFSUERO -- INMUNOFIJACION EN SUERO',\n",
557
    "    'OBS_BIOMOL -- OBSERVACIONES GENETICA MOLECULAR',\n",
558
    "    'OBS_BIOO -- Observaciones Bioqu¡mica Orina',\n",
559
    "    'OBS_CB -- Observaciones Coagulaci¢n',\n",
560
    "    'OBS_GASES -- Observaciones Gasometr¡a Arterial',\n",
561
    "    'OBS_GASV -- Observaciones Gasometr¡a Venosa',\n",
562
    "    'OBS_GEN2 -- OBSERVACIONES GENETICA',\n",
563
    "    'OBS_HOR -- Observaciones Hormonas',\n",
564
    "    'OBS_MICRO -- Observaciones Microbiolog¡a',\n",
565
    "    'OBS_NULA2 -- Observaciones Bioqu¡mica',\n",
566
    "    'OBS_NULA3 -- Observaciones Hematolog¡a',\n",
567
    "    'OBS_PESP -- Observaciones Pruebas especiales',\n",
568
    "    'OBS_SERO -- Observaciones Serolog¡a',\n",
569
    "    'OBS_SIS -- Observaciones Orina',\n",
570
    "    'PCR VIRUS RESPIRATORIOS -- Tipo de muestra: ASPIRADO BRONCOALVEOLAR',\n",
571
    "    'PCR VIRUS RESPIRATORIOS -- Tipo de muestra: BAS',\n",
572
    "    'PCR VIRUS RESPIRATORIOS -- Tipo de muestra: ESPUTO',\n",
573
    "    'PCR VIRUS RESPIRATORIOS -- Tipo de muestra: EXUDADO',\n",
574
    "    'PCR VIRUS RESPIRATORIOS -- Tipo de muestra: LAVADO BRONCOALVEOLAR',\n",
575
    "    'PCR VIRUS RESPIRATORIOS -- Tipo de muestra: LAVADO NASOFARÖNGEO',\n",
576
    "    'PTGOR -- PROTEINOGRAMA ORINA',\n",
577
    "    'RESUL_IFT -- ESTUDIO DE INMUNOFENOTIPO',\n",
578
    "    'RESUL_IFT -- Resultado',\n",
579
    "    'Resultado -- Resultado',\n",
580
    "    'SED1 -- ',\n",
581
    "    'SED1 -- SEDIMENTO',\n",
582
    "    'SED2 -- ',\n",
583
    "    'SED2 -- SEDIMENTO',\n",
584
    "    'SED3 -- ',\n",
585
    "    'SED3 -- SEDIMENTO',\n",
586
    "    'TIPOL -- TIPO DE LIQUIDO',\n",
587
    "    'Tecnica -- T\\x82cnica',\n",
588
    "    'TpMues -- Tipo de muestra',\n",
589
    "    'VHCBLOT -- INMUNOBLOT VIRUS HEPATITIS C',\n",
590
    "    'VIR_TM -- VIRUS TIPO DE MUESTRA',\n",
591
    "    'LEGIORI -- AG. LEGIONELA PNEUMOPHILA EN ORINA',\n",
592
    "    'NEUMOORI -- AG NEUMOCOCO EN ORINA',\n",
593
    "    'VIHAC -- VIH AC'\n",
594
    "    ], axis=1)\n",
595
    "\n",
596
    "    \n",
597
    "lab_tests.head()"
598
   ]
599
  },
600
  {
601
   "cell_type": "code",
602
   "execution_count": null,
603
   "metadata": {},
604
   "outputs": [],
605
   "source": [
606
    "lab_tests = lab_tests.replace('Sin resultado.', np.nan)\n",
607
    "lab_tests = lab_tests.replace('Sin resultado', np.nan)\n",
608
    "lab_tests = lab_tests.replace('----', np.nan).replace('---', np.nan)\n",
609
    "lab_tests = lab_tests.replace('> ', '').replace('< ', '')\n",
610
    "\n",
611
    "def change_format(x):  # normalise one raw lab result: None -> NaN, known text patterns -> number-like value, non-strings returned unchanged\n",
612
    "    if x is None:\n",
613
    "        return np.nan\n",
614
    "    elif type(x) == str:\n",
615
    "        if x.startswith('Negativo ('):  # wrapped value: strip the label and closing bracket, prefix a minus sign\n",
616
    "            return x.replace('Negativo (', '-')[:-1]\n",
617
    "        elif x.startswith('Positivo ('):\n",
618
    "            return x.replace('Positivo (', '')[:-1]\n",
619
    "        elif x.startswith('Zona limite ('):\n",
620
    "            return x.replace('Zona limite (', '')[:-1]\n",
621
    "        elif x.startswith('>'):  # comparison markers are dropped, keeping the bound itself\n",
622
    "            return x.replace('> ', '').replace('>', '')\n",
623
    "        elif x.startswith('<'):\n",
624
    "            return x.replace('< ', '').replace('<', '')\n",
625
    "        elif x.endswith(' mg/dl'):  # unit suffixes are removed\n",
626
    "            return x.replace(' mg/dl', '')\n",
627
    "        elif x.endswith('/æl'):\n",
628
    "            return x.replace('/æl', '')\n",
629
    "        elif x.endswith(' copias/mL'):\n",
630
    "            return x.replace(' copias/mL', '')\n",
631
    "        elif x == 'Numerosos':  # qualitative results are mapped to fixed numeric codes\n",
632
    "            return 1.5\n",
633
    "        elif x == 'Aislados':\n",
634
    "            return 0.5\n",
635
    "        elif x == 'Se detecta' or x == 'Se observan' or x == 'Normal' or x == 'Positivo':\n",
636
    "            return 1\n",
637
    "        elif x == 'No se detecta' or x == 'No se observan' or x == 'Negativo':\n",
638
    "            return 0\n",
639
    "        elif x == 'Indeterminado':\n",
640
    "            return np.nan\n",
641
    "        else:\n",
642
    "            num = re.findall(r\"[-+]?\\d+\\.\\d+\", x)  # raw string avoids invalid-escape warnings; NOTE(review): pattern needs a decimal point, so integer-only strings become NaN - confirm intended\n",
643
    "            if len(num) == 0:\n",
644
    "                return np.nan\n",
645
    "            else:\n",
646
    "                return num[0]\n",
647
    "    else:\n",
648
    "        return x\n",
649
    "\n",
650
    "feature_value_dict = dict()\n",
651
    "\n",
652
    "for k in tqdm(lab_tests.keys()[4:]):\n",
653
    "    lab_tests[k] = lab_tests[k].map(lambda x: change_format(change_format(x)))\n",
654
    "    feature_value_dict[k] = lab_tests[k].unique()"
655
   ]
656
  },
657
  {
658
   "cell_type": "code",
659
   "execution_count": null,
660
   "metadata": {},
661
   "outputs": [],
662
   "source": [
663
    "def nan_and_not_nan(x):\n",
664
    "    if x == x:\n",
665
    "        return 1\n",
666
    "    else: # nan\n",
667
    "        return 0\n",
668
    "\n",
669
    "def is_float(num):\n",
670
    "    try:\n",
671
    "        float(num)\n",
672
    "        return True\n",
673
    "    except ValueError:\n",
674
    "        return False\n",
675
    "\n",
676
    "def is_all_float(x):\n",
677
    "    for i in x:\n",
678
    "        if i == i and (i != None):\n",
679
    "            if not is_float(i):\n",
680
    "                return False\n",
681
    "    return True\n",
682
    "\n",
683
    "def to_float(x):\n",
684
    "    if x != None:\n",
685
    "        return float(x)\n",
686
    "    else:\n",
687
    "        return np.nan\n",
688
    "\n",
689
    "other_feature_dict = dict()\n",
690
    "\n",
691
    "for feature in tqdm(feature_value_dict.keys()):\n",
692
    "    values = feature_value_dict[feature]\n",
693
    "    if is_all_float(values):\n",
694
    "        lab_tests[feature] = lab_tests[feature].map(lambda x: to_float(x))\n",
695
    "    elif len(values) == 2:\n",
696
    "        lab_tests[feature] = lab_tests[feature].map(lambda x: nan_and_not_nan(x))\n",
697
    "    else:\n",
698
    "        other_feature_dict[feature] = values"
699
   ]
700
  },
701
  {
702
   "cell_type": "code",
703
   "execution_count": null,
704
   "metadata": {},
705
   "outputs": [],
706
   "source": [
707
    "other_feature_dict"
708
   ]
709
  },
710
  {
711
   "cell_type": "code",
712
   "execution_count": null,
713
   "metadata": {},
714
   "outputs": [],
715
   "source": [
716
    "def format_time(t):\n",
717
    "    if '/' in t:\n",
718
    "        return str(datetime.datetime.strptime(t, '%d/%m/%Y %H:%M'))\n",
719
    "    else:\n",
720
    "        return str(datetime.datetime.strptime(t, '%d-%m-%Y %H:%M'))\n",
721
    "\n",
722
    "lab_tests['RECORD_TIME'] = lab_tests['LAB_DATE'] + ' ' + lab_tests['TIME_LAB']\n",
723
    "lab_tests['RECORD_TIME'] = lab_tests['RECORD_TIME'].map(lambda x: format_time(x))\n",
724
    "lab_tests = lab_tests.drop(['LAB_NUMBER', 'LAB_DATE', 'TIME_LAB'], axis=1)\n",
725
    "# lab_tests = lab_tests.drop(['LAB_NUMBER', 'TIME_LAB'], axis=1)\n",
726
    "lab_tests.head()"
727
   ]
728
  },
729
  {
730
   "cell_type": "code",
731
   "execution_count": null,
732
   "metadata": {},
733
   "outputs": [],
734
   "source": [
735
    "lab_tests_patient = lab_tests.groupby(['PATIENT_ID'], dropna=True, as_index = False).mean(numeric_only=True)  # RECORD_TIME is a string column; average numeric columns only\n",
736
    "print(len(lab_tests_patient))\n",
737
    "count = [i for i in lab_tests_patient.count()[1:]]\n",
738
    "plt.hist(count)"
739
   ]
740
  },
741
  {
742
   "cell_type": "code",
743
   "execution_count": null,
744
   "metadata": {},
745
   "outputs": [],
746
   "source": [
747
    "patient_total = len(lab_tests_patient)\n",
748
    "threshold = patient_total * 0.1\n",
749
    "reserved_keys = []\n",
750
    "\n",
751
    "for key in lab_tests_patient.keys():\n",
752
    "    if lab_tests_patient[key].count() > threshold:\n",
753
    "        reserved_keys.append(key)\n",
754
    "\n",
755
    "print(len(reserved_keys))\n",
756
    "reserved_keys"
757
   ]
758
  },
759
  {
760
   "cell_type": "code",
761
   "execution_count": null,
762
   "metadata": {},
763
   "outputs": [],
764
   "source": [
765
    "reserved_keys.insert(1, 'RECORD_TIME')\n",
766
    "\n",
767
    "lab_tests = lab_tests.groupby(['PATIENT_ID', 'RECORD_TIME'], dropna=True, as_index = False).mean(numeric_only=True)  # columns left unconverted earlier may still hold text; average numeric columns only\n",
768
    "\n",
769
    "lab_tests = lab_tests[reserved_keys]\n",
770
    "lab_tests.head()"
771
   ]
772
  },
773
  {
774
   "cell_type": "markdown",
775
   "metadata": {},
776
   "source": [
777
    "### Missing rate of each visit"
778
   ]
779
  },
780
  {
781
   "cell_type": "code",
782
   "execution_count": null,
783
   "metadata": {},
784
   "outputs": [],
785
   "source": [
786
    "sum(lab_tests.T.isnull().sum()) / ((len(lab_tests.T) - 2) * len(lab_tests))"
787
   ]
788
  },
789
  {
790
   "cell_type": "markdown",
791
   "metadata": {},
792
   "source": [
793
    "### Scatter Plot"
794
   ]
795
  },
796
  {
797
   "cell_type": "code",
798
   "execution_count": null,
799
   "metadata": {},
800
   "outputs": [],
801
   "source": [
802
    "fig=plt.figure(figsize=(16,200), dpi= 100, facecolor='w', edgecolor='k')\n",
803
    "\n",
804
    "i = 1\n",
805
    "for key in lab_tests.keys()[2:]:\n",
806
    "    plt.subplot(33, 3, i)\n",
807
    "    plt.scatter(lab_tests.index, lab_tests[key], s=1)\n",
808
    "    plt.ylabel(key)\n",
809
    "    i += 1\n",
810
    "\n",
811
    "plt.show()"
812
   ]
813
  },
814
  {
815
   "cell_type": "code",
816
   "execution_count": null,
817
   "metadata": {},
818
   "outputs": [],
819
   "source": [
820
    "fig=plt.figure(figsize=(20,120), dpi= 100, facecolor='w', edgecolor='k')\n",
821
    "\n",
822
    "i = 1\n",
823
    "for key in lab_tests.keys()[2:]:\n",
824
    "    plt.subplot(23, 4, i)\n",
825
    "    plt.hist(lab_tests[key], bins=30)\n",
826
    "    q3 = lab_tests[key].quantile(0.75)\n",
827
    "    q1 = lab_tests[key].quantile(0.25)\n",
828
    "    qh = q3 + 3 * (q3 - q1)\n",
829
    "    ql = q1 - 3 * (q3 - q1)\n",
830
    "    sigma = 5\n",
831
    "    plt.axline([sigma*lab_tests[key].std() + lab_tests[key].mean(), 0], [sigma*lab_tests[key].std() + lab_tests[key].mean(), 1], color = \"r\", linestyle=(0, (5, 5)))\n",
832
    "    plt.axline([-sigma*lab_tests[key].std() + lab_tests[key].mean(), 0], [-sigma*lab_tests[key].std() + lab_tests[key].mean(), 1], color = \"r\", linestyle=(0, (5, 5)))\n",
833
    "    #plt.axline([lab_tests[key].quantile(0.25), 0], [lab_tests[key].quantile(0.25), 1], color = \"k\", linestyle=(0, (5, 5)))\n",
834
    "    #plt.axline([lab_tests[key].quantile(0.75), 0], [lab_tests[key].quantile(0.75), 1], color = \"k\", linestyle=(0, (5, 5)))\n",
835
    "    plt.axline([qh, 0], [qh, 1], color='k', linestyle=(0, (5, 5)))\n",
836
    "    plt.axline([ql, 0], [ql, 1], color='k', linestyle=(0, (5, 5)))\n",
837
    "    plt.ylabel(key)\n",
838
    "    i += 1\n",
839
    "\n",
840
    "plt.show()"
841
   ]
842
  },
843
  {
844
   "cell_type": "markdown",
845
   "metadata": {},
846
   "source": [
847
    "### Normalize data"
848
   ]
849
  },
850
  {
851
   "cell_type": "code",
852
   "execution_count": null,
853
   "metadata": {},
854
   "outputs": [],
855
   "source": [
856
    "\"\"\"\n",
857
    "for key in lab_tests.keys()[2:]:\n",
858
    "    lab_tests[key] = (lab_tests[key] - lab_tests[key].mean()) / (lab_tests[key].std() + 1e-12)\n",
859
    "\n",
860
    "lab_tests.describe()\n",
861
    "\"\"\""
862
   ]
863
  },
864
  {
865
   "cell_type": "code",
866
   "execution_count": null,
867
   "metadata": {},
868
   "outputs": [],
869
   "source": [
870
    "# 【del normalization】\n",
871
    "# for key in lab_tests.keys()[2:]:\n",
872
    "#     r = lab_tests[lab_tests[key].between(lab_tests[key].quantile(0.05), lab_tests[key].quantile(0.95))]\n",
873
    "#     lab_tests[key] = (lab_tests[key] - r[key].mean()) / (r[key].std() + 1e-12)"
874
   ]
875
  },
876
  {
877
   "cell_type": "code",
878
   "execution_count": null,
879
   "metadata": {},
880
   "outputs": [],
881
   "source": [
882
    "lab_tests.to_csv('lab_test.csv', mode='w', index=False)"
883
   ]
884
  },
885
  {
886
   "cell_type": "markdown",
887
   "metadata": {},
888
   "source": [
889
    "# Concat data"
890
   ]
891
  },
892
  {
893
   "cell_type": "code",
894
   "execution_count": null,
895
   "metadata": {},
896
   "outputs": [],
897
   "source": [
898
    "demographic['PATIENT_ID'] = demographic['PATIENT_ID'].map(lambda x: str(int(x)))\n",
899
    "vital_signs['PATIENT_ID'] = vital_signs['PATIENT_ID'].map(lambda x: str(int(x)))\n",
900
    "lab_tests['PATIENT_ID'] = lab_tests['PATIENT_ID'].map(lambda x: str(int(x)))"
901
   ]
902
  },
903
  {
904
   "cell_type": "code",
905
   "execution_count": null,
906
   "metadata": {},
907
   "outputs": [],
908
   "source": [
909
    "len(demographic['PATIENT_ID'].unique()), len(vital_signs['PATIENT_ID'].unique()), len(lab_tests['PATIENT_ID'].unique())"
910
   ]
911
  },
912
  {
913
   "cell_type": "code",
914
   "execution_count": null,
915
   "metadata": {},
916
   "outputs": [],
917
   "source": [
918
    "train_df = pd.merge(vital_signs, lab_tests, on=['PATIENT_ID', 'RECORD_TIME'], how='outer')\n",
919
    "\n",
920
    "train_df = train_df.groupby(['PATIENT_ID', 'RECORD_TIME'], dropna=True, as_index = False).mean()\n",
921
    "\n",
922
    "train_df = pd.merge(demographic, train_df, on=['PATIENT_ID'], how='left')\n",
923
    "\n",
924
    "train_df.head()"
925
   ]
926
  },
927
  {
928
   "cell_type": "code",
929
   "execution_count": null,
930
   "metadata": {},
931
   "outputs": [],
932
   "source": [
933
    "# del rows without patient_id, admission_date, record_time, or outcome\n",
934
    "train_df = train_df.dropna(axis=0, how='any', subset=['PATIENT_ID', 'ADMISSION_DATE', 'RECORD_TIME', 'OUTCOME'])"
935
   ]
936
  },
937
  {
938
   "cell_type": "code",
939
   "execution_count": null,
940
   "metadata": {},
941
   "outputs": [],
942
   "source": [
943
    "train_df.to_csv('train.csv', mode='w', index=False)\n",
944
    "train_df.describe()"
945
   ]
946
  },
947
  {
948
   "cell_type": "markdown",
949
   "metadata": {},
950
   "source": [
951
    "## Missing rate of each visit"
952
   ]
953
  },
954
  {
955
   "cell_type": "code",
956
   "execution_count": null,
957
   "metadata": {},
958
   "outputs": [],
959
   "source": [
960
    "# share of NaN cells in train_df; .isnull().sum().sum() counts them without transposing the whole frame,\n",
    "# and the denominator leaves two of the columns out\n",
    "train_df.isnull().sum().sum() / ((train_df.shape[1] - 2) * len(train_df))"
961
   ]
962
  },
963
  {
964
   "cell_type": "markdown",
965
   "metadata": {},
966
   "source": [
967
    "# Split and save data"
968
   ]
969
  },
970
  {
971
   "cell_type": "markdown",
972
   "metadata": {},
973
   "source": [
974
    "* demo: demographic data\n",
975
    "* x: lab test & vital signs\n",
976
    "* y: outcome & length of stay"
977
   ]
978
  },
979
  {
980
   "cell_type": "code",
981
   "execution_count": null,
982
   "metadata": {},
983
   "outputs": [],
984
   "source": [
985
    "patient_ids = train_df['PATIENT_ID'].unique()\n",
986
    "\n",
987
    "demo_cols = ['AGE', 'SEX'] # , 'DIFFICULTY_BREATHING', 'FEVER', 'SUSPECT_COVID', 'EMERGENCY'\n",
988
    "test_cols = []\n",
989
    "\n",
990
    "# get column names\n",
991
    "for k in train_df.keys():\n",
992
    "    if not k in demographic.keys():\n",
993
    "        if not k == 'RECORD_TIME':\n",
994
    "            test_cols.append(k)\n",
995
    "\n",
996
    "test_median = train_df[test_cols].median()"
997
   ]
998
  },
999
  {
1000
   "cell_type": "code",
1001
   "execution_count": null,
1002
   "metadata": {},
1003
   "outputs": [],
1004
   "source": [
1005
    "test_cols"
1006
   ]
1007
  },
1008
  {
1009
   "cell_type": "code",
1010
   "execution_count": null,
1011
   "metadata": {},
1012
   "outputs": [],
1013
   "source": [
1014
    "train_df['RECORD_TIME_DAY'] = train_df['RECORD_TIME'].map(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%d'))\n",
1015
    "train_df['RECORD_TIME_HOUR'] = train_df['RECORD_TIME'].map(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%d %H'))\n",
1016
    "train_df.head()"
1017
   ]
1018
  },
1019
  {
1020
   "cell_type": "code",
1021
   "execution_count": null,
1022
   "metadata": {},
1023
   "outputs": [],
1024
   "source": [
1025
    "# numeric_only=True: train_df still carries text columns (e.g. RECORD_TIME) that cannot be averaged;\n",
    "# pandas >= 2.0 raises on them instead of silently leaving them out of the result\n",
    "train_df_day = train_df.groupby(['PATIENT_ID', 'ADMISSION_DATE', 'DEPARTURE_DATE', 'RECORD_TIME_DAY'], dropna=True, as_index = False).mean(numeric_only=True)\n",
1026
    "train_df_hour = train_df.groupby(['PATIENT_ID', 'ADMISSION_DATE', 'DEPARTURE_DATE', 'RECORD_TIME_HOUR'], dropna=True, as_index = False).mean(numeric_only=True)\n",
1027
    "\n",
1028
    "len(train_df), len(train_df_day), len(train_df_hour)"
1029
   ]
1030
  },
1031
  {
1032
   "attachments": {},
1033
   "cell_type": "markdown",
1034
   "metadata": {},
1035
   "source": [
1036
    "\n",
1037
    "```\n",
1038
    "number of visits (total)\n",
1039
    "- Original data: 168777\n",
1040
    "- Merge by hour: 130141\n",
1041
    "- Merge by day:  42204\n",
1042
    "```"
1043
   ]
1044
  },
1045
  {
1046
   "cell_type": "code",
1047
   "execution_count": null,
1048
   "metadata": {},
1049
   "outputs": [],
1050
   "source": [
1051
    "len(train_df['PATIENT_ID'].unique())"
1052
   ]
1053
  },
1054
  {
1055
   "cell_type": "code",
1056
   "execution_count": null,
1057
   "metadata": {},
1058
   "outputs": [],
1059
   "source": [
1060
    "def get_visit_intervals(df):\n",
1061
    "    \"\"\"Return the number of rows (visits) of each patient, ordered by first appearance of PATIENT_ID in `df`.\"\"\"\n",
1062
    "    # one grouped pass instead of re-filtering the whole frame once per patient;\n",
1063
    "    # rows whose PATIENT_ID is missing are not counted as a patient\n",
1064
    "    return df.groupby('PATIENT_ID', sort=False).size().tolist()"
1065
   ]
1066
  },
1067
  {
1068
   "cell_type": "code",
1069
   "execution_count": null,
1070
   "metadata": {},
1071
   "outputs": [],
1072
   "source": [
1073
    "ls_org = get_visit_intervals(train_df)\n",
1074
    "ls_hour = get_visit_intervals(train_df_hour)\n",
1075
    "ls_day = get_visit_intervals(train_df_day)"
1076
   ]
1077
  },
1078
  {
1079
   "cell_type": "code",
1080
   "execution_count": null,
1081
   "metadata": {},
1082
   "outputs": [],
1083
   "source": [
1084
    "import matplotlib.pyplot as plt\n",
1085
    "from matplotlib.ticker import PercentFormatter\n",
1086
    "import matplotlib.font_manager as font_manager\n",
1087
    "import pandas as pd\n",
1088
    "import numpy as np\n",
1089
    "csfont = {'fontname':'Times New Roman', 'fontsize': 18}\n",
1090
    "font = 'Times New Roman'\n",
1091
    "fig=plt.figure(figsize=(18,4), dpi= 100, facecolor='w', edgecolor='k')\n",
1092
    "plt.style.use('seaborn-whitegrid')\n",
1093
    "color = 'cornflowerblue'\n",
1094
    "ec = 'None'\n",
1095
    "alpha=0.5\n",
1096
    "\n",
1097
    "ax = plt.subplot(1, 3, 1)\n",
1098
    "ax.hist(ls_org, bins=20, weights=np.ones(len(ls_org)) / len(ls_org), color=color, ec=ec, alpha=alpha, label='overall')\n",
1099
    "plt.xlabel('Num of visits (org)',**csfont)\n",
1100
    "plt.ylabel('Percentage',**csfont)\n",
1101
    "plt.gca().yaxis.set_major_formatter(PercentFormatter(1))\n",
1102
    "plt.xticks(**csfont)\n",
1103
    "plt.yticks(**csfont)\n",
1104
    "\n",
1105
    "ax = plt.subplot(1, 3, 2)\n",
1106
    "ax.hist(ls_hour, bins=20, weights=np.ones(len(ls_hour)) / len(ls_hour), color=color, ec=ec, alpha=alpha, label='overall')\n",
1107
    "plt.xlabel('Num of visits (hour)',**csfont)\n",
1108
    "plt.ylabel('Percentage',**csfont)\n",
1109
    "plt.gca().yaxis.set_major_formatter(PercentFormatter(1))\n",
1110
    "plt.xticks(**csfont)\n",
1111
    "plt.yticks(**csfont)\n",
1112
    "\n",
1113
    "ax = plt.subplot(1, 3, 3)\n",
1114
    "ax.hist(ls_day, bins=20, weights=np.ones(len(ls_day)) / len(ls_day), color=color, ec=ec, alpha=alpha, label='overall')\n",
1115
    "plt.xlabel('Num of visits (day)',**csfont)\n",
1116
    "plt.ylabel('Percentage',**csfont)\n",
1117
    "plt.gca().yaxis.set_major_formatter(PercentFormatter(1))\n",
1118
    "plt.xticks(**csfont)\n",
1119
    "plt.yticks(**csfont)\n",
1120
    "\n",
1121
    "plt.show()"
1122
   ]
1123
  },
1124
  {
1125
   "cell_type": "code",
1126
   "execution_count": null,
1127
   "metadata": {},
1128
   "outputs": [],
1129
   "source": [
1130
    "def get_statistic(lst, name):\n",
    "    \"\"\"Print max, min, median, mean and the 80%/90%/95% quantiles of `lst` on one line prefixed with `[name]`.\"\"\"\n",
    "    q80, q90, q95 = [np.quantile(lst, q) for q in (0.8, 0.9, 0.95)]\n",
    "    summary = f'[{name}]\\tMax:\\t{max(lst)}, Min:\\t{min(lst)}, Median:\\t{np.median(lst)}, Mean:\\t{np.mean(lst)}, 80%:\\t{q80}, 90%:\\t{q90}, 95%:\\t{q95}'\n",
1131
    "    print(summary)"
1132
   ]
1133
  },
1134
  {
1135
   "cell_type": "code",
1136
   "execution_count": null,
1137
   "metadata": {},
1138
   "outputs": [],
1139
   "source": [
1140
    "get_statistic(ls_org, 'ls_org')\n",
1141
    "get_statistic(ls_hour, 'ls_hour')\n",
1142
    "get_statistic(ls_day, 'ls_day')"
1143
   ]
1144
  },
1145
  {
1146
   "cell_type": "code",
1147
   "execution_count": null,
1148
   "metadata": {},
1149
   "outputs": [],
1150
   "source": [
1151
    "train_df_hour['LOS'] = train_df_hour['ADMISSION_DATE']\n",
1152
    "train_df_hour['LOS_HOUR'] = train_df_hour['ADMISSION_DATE']"
1153
   ]
1154
  },
1155
  {
1156
   "cell_type": "code",
1157
   "execution_count": null,
1158
   "metadata": {},
1159
   "outputs": [],
1160
   "source": [
1161
    "train_df_hour = train_df_hour.reset_index()"
1162
   ]
1163
  },
1164
  {
1165
   "cell_type": "code",
1166
   "execution_count": null,
1167
   "metadata": {},
1168
   "outputs": [],
1169
   "source": [
1170
    "for idx in tqdm(range(len(train_df_hour))):\n",
1171
    "    info = train_df_hour.loc[idx]\n",
1172
    "    admission = datetime.datetime.strptime(info['ADMISSION_DATE'], '%Y-%m-%d %H:%M:%S')\n",
1173
    "    departure = datetime.datetime.strptime(info['DEPARTURE_DATE'], '%Y-%m-%d %H:%M:%S')\n",
1174
    "    visit_hour = datetime.datetime.strptime(info['RECORD_TIME_HOUR'], '%Y-%m-%d %H')\n",
1175
    "    hour = (departure - visit_hour).seconds / (24 * 60 * 60) + (departure - visit_hour).days\n",
1176
    "    los = (departure - admission).seconds / (24 * 60 * 60) + (departure - admission).days\n",
1177
    "    train_df_hour.at[idx, 'LOS'] = float(los)\n",
1178
    "    train_df_hour.at[idx, 'LOS_HOUR'] = float(hour)"
1179
   ]
1180
  },
1181
  {
1182
   "cell_type": "code",
1183
   "execution_count": null,
1184
   "metadata": {},
1185
   "outputs": [],
1186
   "source": [
1187
    "train_df_hour['LOS']"
1188
   ]
1189
  },
1190
  {
1191
   "cell_type": "code",
1192
   "execution_count": null,
1193
   "metadata": {},
1194
   "outputs": [],
1195
   "source": [
1196
    "# LOS taken from each patient's first row, patients in order of first appearance; one vectorised pass replaces\n",
1197
    "# the per-patient re-filtering and avoids float(<one-element Series>), which newer pandas deprecates\n",
1198
    "los = train_df_hour.drop_duplicates(subset='PATIENT_ID')['LOS'].astype(float).tolist()"
1199
   ]
1200
  },
1201
  {
1202
   "cell_type": "code",
1203
   "execution_count": null,
1204
   "metadata": {},
1205
   "outputs": [],
1206
   "source": [
1207
    "get_statistic(los, 'los')"
1208
   ]
1209
  },
1210
  {
1211
   "cell_type": "code",
1212
   "execution_count": null,
1213
   "metadata": {},
1214
   "outputs": [],
1215
   "source": [
1216
    "import matplotlib.pyplot as plt\n",
1217
    "from matplotlib.ticker import PercentFormatter\n",
1218
    "import matplotlib.font_manager as font_manager\n",
1219
    "import pandas as pd\n",
1220
    "import numpy as np\n",
1221
    "csfont = {'fontname':'Times New Roman', 'fontsize': 18}\n",
1222
    "font = 'Times New Roman'\n",
1223
    "fig=plt.figure(figsize=(6,6), dpi= 100, facecolor='w', edgecolor='k')\n",
1224
    "plt.style.use('seaborn-whitegrid')\n",
1225
    "color = 'cornflowerblue'\n",
1226
    "ec = 'None'\n",
1227
    "alpha=0.5\n",
1228
    "\n",
1229
    "ax = plt.subplot(1, 1, 1)\n",
1230
    "ax.hist(los, bins=20, weights=np.ones(len(los)) / len(los), color=color, ec=ec, alpha=alpha, label='overall')\n",
1231
    "plt.xlabel('Length of stay',**csfont)\n",
1232
    "plt.ylabel('Percentage',**csfont)\n",
1233
    "plt.gca().yaxis.set_major_formatter(PercentFormatter(1))\n",
1234
    "plt.xticks(**csfont)\n",
1235
    "plt.yticks(**csfont)\n",
1236
    "\n",
1237
    "plt.show()"
1238
   ]
1239
  },
1240
  {
1241
   "cell_type": "code",
1242
   "execution_count": null,
1243
   "metadata": {},
1244
   "outputs": [],
1245
   "source": [
1246
    "train_df_hour_idx = train_df_hour.reset_index()"
1247
   ]
1248
  },
1249
  {
1250
   "cell_type": "code",
1251
   "execution_count": null,
1252
   "metadata": {},
1253
   "outputs": [],
1254
   "source": [
1255
    "train_df_hour_idx['LOS'] = train_df_hour_idx['ADMISSION_DATE']\n",
1256
    "\n",
1257
    "for idx in tqdm(range(len(train_df_hour_idx))):\n",
1258
    "    info = train_df_hour_idx.loc[idx]\n",
1259
    "    # admission = datetime.datetime.strptime(info['ADMISSION_DATE'], '%Y-%m-%d %H:%M:%S')\n",
1260
    "    departure = datetime.datetime.strptime(info['DEPARTURE_DATE'], '%Y-%m-%d %H:%M:%S')\n",
1261
    "    visit_hour = datetime.datetime.strptime(info['RECORD_TIME_HOUR'], '%Y-%m-%d %H')\n",
1262
    "    hour = (departure - visit_hour).seconds / (24 * 60 * 60) + (departure - visit_hour).days\n",
1263
    "    train_df_hour_idx.at[idx, 'LOS'] = float(hour)"
1264
   ]
1265
  },
1266
  {
1267
   "cell_type": "code",
1268
   "execution_count": null,
1269
   "metadata": {},
1270
   "outputs": [],
1271
   "source": [
1272
    "# overwrite LOS with the per-record LOS_HOUR value\n",
    "train_df_hour['LOS'] = train_df_hour['LOS_HOUR']\n",
1273
    "# drop() returns a new frame that is only shown as the cell output; it is not assigned back,\n",
    "# so train_df_hour itself keeps LOS_HOUR (the CSV export cell further down still reads visit['LOS_HOUR'])\n",
    "train_df_hour.drop(columns=['LOS_HOUR'])"
1274
   ]
1275
  },
1276
  {
1277
   "cell_type": "code",
1278
   "execution_count": null,
1279
   "metadata": {},
1280
   "outputs": [],
1281
   "source": [
1282
    "# los_threshold = 13.0\n",
1283
    "\n",
1284
    "# visit_num_hour = []\n",
1285
    "\n",
1286
    "# for pat in tqdm(train_df_hour_idx['PATIENT_ID'].unique()):\n",
1287
    "#     pat_records = train_df_hour_idx[train_df_hour_idx['PATIENT_ID'] == pat]\n",
1288
    "#     hour = 0\n",
1289
    "#     for vis in pat_records.index:\n",
1290
    "#         pat_visit = pat_records.loc[vis]\n",
1291
    "#         if pat_visit['LOS_HOUR'] <= los_threshold:\n",
1292
    "#             hour += 1\n",
1293
    "#     visit_num_hour.append(hour)\n",
1294
    "#     if hour == 0:\n",
1295
    "#         print(pat)"
1296
   ]
1297
  },
1298
  {
1299
   "cell_type": "code",
1300
   "execution_count": null,
1301
   "metadata": {},
1302
   "outputs": [],
1303
   "source": [
1304
    "# import matplotlib.pyplot as plt\n",
1305
    "# from matplotlib.ticker import PercentFormatter\n",
1306
    "# import matplotlib.font_manager as font_manager\n",
1307
    "# import pandas as pd\n",
1308
    "# import numpy as np\n",
1309
    "# csfont = {'fontname':'Times New Roman', 'fontsize': 18}\n",
1310
    "# font = 'Times New Roman'\n",
1311
    "# fig=plt.figure(figsize=(6,6), dpi= 100, facecolor='w', edgecolor='k')\n",
1312
    "# plt.style.use('seaborn-whitegrid')\n",
1313
    "# color = 'cornflowerblue'\n",
1314
    "# ec = 'None'\n",
1315
    "# alpha=0.5\n",
1316
    "\n",
1317
    "# ax = plt.subplot(1, 1, 1)\n",
1318
    "# ax.hist(visit_num_hour, bins=20, weights=np.ones(len(visit_num_hour)) / len(visit_num_hour), color=color, ec=ec, alpha=alpha, label='overall')\n",
1319
    "# plt.xlabel('Visit num (80% los)',**csfont)\n",
1320
    "# plt.ylabel('Percentage',**csfont)\n",
1321
    "# plt.gca().yaxis.set_major_formatter(PercentFormatter(1))\n",
1322
    "# plt.xticks(**csfont)\n",
1323
    "# plt.yticks(**csfont)\n",
1324
    "\n",
1325
    "# plt.show()"
1326
   ]
1327
  },
1328
  {
1329
   "cell_type": "code",
1330
   "execution_count": null,
1331
   "metadata": {},
1332
   "outputs": [],
1333
   "source": [
1334
    "train_df = train_df_hour\n",
1335
    "train_df.head()"
1336
   ]
1337
  },
1338
  {
1339
   "cell_type": "code",
1340
   "execution_count": null,
1341
   "metadata": {},
1342
   "outputs": [],
1343
   "source": [
1344
    "train_df.describe()"
1345
   ]
1346
  },
1347
  {
1348
   "cell_type": "code",
1349
   "execution_count": null,
1350
   "metadata": {},
1351
   "outputs": [],
1352
   "source": [
1353
    "get_statistic(train_df['LOS'], 'los')"
1354
   ]
1355
  },
1356
  {
1357
   "cell_type": "code",
1358
   "execution_count": null,
1359
   "metadata": {},
1360
   "outputs": [],
1361
   "source": [
1362
    "train_df['LOS'] = train_df['LOS'].clip(lower=0)"
1363
   ]
1364
  },
1365
  {
1366
   "cell_type": "code",
1367
   "execution_count": null,
1368
   "metadata": {},
1369
   "outputs": [],
1370
   "source": [
1371
    "get_statistic(train_df['LOS'], 'los')"
1372
   ]
1373
  },
1374
  {
1375
   "cell_type": "code",
1376
   "execution_count": null,
1377
   "metadata": {},
1378
   "outputs": [],
1379
   "source": [
1380
    "# the first visit of each person\n",
1381
    "def init_prev(prev):\n",
1382
    "    \"\"\"Fill NaN entries of a first visit in place with the global `test_median`; return the 0/1 missing flags.\"\"\"\n",
1383
    "    # .iloc keeps the access explicitly positional: the callers pass label-indexed Series, and bare\n",
1384
    "    # integer keys on such a Series are a deprecated fallback that newer pandas treats as labels\n",
1385
    "    miss = []\n",
1386
    "    for idx in range(len(prev)):\n",
1387
    "        if np.isnan(prev.iloc[idx]): # there is no previous record\n",
1388
    "            prev.iloc[idx] = test_median.iloc[idx] # replace nan to median\n",
1389
    "            miss.append(1) # mark miss as 1\n",
1390
    "        else: # there is a previous record\n",
1391
    "            miss.append(0)\n",
1392
    "    return miss\n",
1393
    "\n",
1394
    "# the rest of the visits\n",
1395
    "def fill_nan(cur, prev):\n",
1396
    "    \"\"\"Fill NaN entries of `cur` in place from the same position of `prev`; return the 0/1 missing flags.\"\"\"\n",
1397
    "    miss = []\n",
1398
    "    # positional .iloc access, for the same reason as in init_prev\n",
1399
    "    for idx in range(len(prev)):\n",
1400
    "        if np.isnan(cur.iloc[idx]): # there is no record in current timestep\n",
1401
    "            cur.iloc[idx] = prev.iloc[idx] # cur <- prev\n",
1402
    "            miss.append(1)\n",
1403
    "        else: # there is a record in current timestep\n",
1404
    "            miss.append(0)\n",
1405
    "    return miss"
1406
   ]
1407
  },
1408
  {
1409
   "cell_type": "code",
1410
   "execution_count": null,
1411
   "metadata": {},
1412
   "outputs": [],
1413
   "source": [
1414
    "index = train_df.loc[0].index\n",
1415
    "\n",
1416
    "csv = dict()\n",
1417
    "for key in ['PatientID', 'RecordTime', 'AdmissionTime', 'DischargeTime', 'Outcome', 'LOS', 'Sex', 'Age']:\n",
1418
    "    csv[key] = []\n",
1419
    "for key in index[8:-2]:\n",
1420
    "    csv[key] = []\n",
1421
    "    \n",
1422
    "for pat in tqdm(patient_ids): # for all patients\n",
1423
    "    # get visits for pat.id == PATIENT_ID\n",
1424
    "    info = train_df[train_df['PATIENT_ID'] == pat]\n",
1425
    "    info = info[max(0, len(info) - 76):]\n",
1426
    "    idxs = info.index\n",
1427
    "    for i in idxs:\n",
1428
    "        visit = info.loc[i]\n",
1429
    "        for key in index[8:-2]:\n",
1430
    "            csv[key].append(visit[key])\n",
1431
    "        # ['PatientID', 'RecordTime', 'AdmissionTime', 'DischargeTime', 'Outcome', 'LOS', 'Sex', 'Age']\n",
1432
    "        csv['PatientID'].append(visit['PATIENT_ID'])\n",
1433
    "        t, h = visit['RECORD_TIME_HOUR'].split()\n",
1434
    "        t = t.split('-')\n",
1435
    "        csv['RecordTime'].append(t[1]+'/'+t[2]+'/'+t[0]+' '+h) # 2020-04-06 10 -> 04/06/2020 10\n",
1436
    "        t = visit['ADMISSION_DATE'][:10].split('-')\n",
1437
    "        csv['AdmissionTime'].append(t[1]+'/'+t[2]+'/'+t[0])\n",
1438
    "        t = visit['DEPARTURE_DATE'][:10].split('-')\n",
1439
    "        csv['DischargeTime'].append(t[1]+'/'+t[2]+'/'+t[0])\n",
1440
    "        csv['Outcome'].append(visit['OUTCOME'])\n",
1441
    "        csv['LOS'].append(visit['LOS_HOUR'])\n",
1442
    "        csv['Sex'].append(visit['SEX'])\n",
1443
    "        csv['Age'].append(visit['AGE'])\n",
1444
    "    \n",
1445
    "pd.DataFrame(csv).to_csv('processed_data/CDSL.csv')"
1446
   ]
1447
  },
1448
  {
1449
   "cell_type": "code",
1450
   "execution_count": null,
1451
   "metadata": {},
1452
   "outputs": [],
1453
   "source": [
1454
    "x, y, demo, x_lab_len, missing_mask = [], [], [], [], []\n",
1455
    "\n",
1456
    "for pat in tqdm(patient_ids): # for all patients\n",
1457
    "    # get visits for pat.id == PATIENT_ID\n",
1458
    "    info = train_df[train_df['PATIENT_ID'] == pat]\n",
1459
    "    info = info[max(0, len(info) - 76):]\n",
1460
    "    indexes = info.index\n",
1461
    "    visit = info.loc[indexes[0]] # get the first visit\n",
1462
    "\n",
1463
    "    # demographic data\n",
1464
    "    demo.append([visit[k] for k in demo_cols])\n",
1465
    "    \n",
1466
    "    # label\n",
1467
    "    outcome = visit['OUTCOME']\n",
1468
    "    los = []\n",
1469
    "\n",
1470
    "    # lab test & vital signs\n",
1471
    "    tests = []\n",
1472
    "    prev = visit[test_cols]\n",
1473
    "    miss = [] # missing matrix\n",
1474
    "    miss.append(init_prev(prev)) # fill nan for the first visit for every patient and add missing status to missing matrix\n",
1475
    "    # leave = datetime.datetime.strptime(visit['DEPARTURE_DATE'], '%Y-%m-%d %H:%M:%S')\n",
1476
    "    \n",
1477
    "    first = True\n",
1478
    "    for i in indexes:\n",
1479
    "        visit = info.loc[i]\n",
1480
    "        # now = datetime.datetime.strptime(visit['RECORD_TIME'], '%Y-%m-%d %H')\n",
1481
    "        cur = visit[test_cols]\n",
1482
    "        tmp = fill_nan(cur, prev) # fill nan for the rest of the visits\n",
1483
    "        if not first:\n",
1484
    "            miss.append(tmp) # add missing status to missing matrix\n",
1485
    "        tests.append(cur)\n",
1486
    "        # los_visit = (leave - now).days\n",
1487
    "        # if los_visit < 0:\n",
1488
    "        #     los_visit = 0\n",
1489
    "        los.append(visit['LOS'])\n",
1490
    "        prev = cur\n",
1491
    "        first = False\n",
1492
    "\n",
1493
    "    valid_visit = len(los)\n",
1494
    "    # outcome = [outcome] * valid_visit\n",
1495
    "    x_lab_len.append(valid_visit)\n",
1496
    "    missing_mask.append(miss) # append the patient's missing matrix to the total missing matrix\n",
1497
    "\n",
1498
    "    # tests = np.pad(tests, ((0, max_visit - valid_visit), (0, 0)))\n",
1499
    "    # outcome = np.pad(outcome, (0, max_visit - valid_visit))\n",
1500
    "    # los = np.pad(los, (0, max_visit - valid_visit))\n",
1501
    "    \n",
1502
    "    y.append([outcome, los])\n",
1503
    "    x.append(tests)"
1504
   ]
1505
  },
1506
  {
1507
   "cell_type": "code",
1508
   "execution_count": null,
1509
   "metadata": {},
1510
   "outputs": [],
1511
   "source": [
1512
    "all_x = x\n",
1513
    "all_x_demo = demo\n",
1514
    "all_y = y\n",
1515
    "all_missing_mask = missing_mask"
1516
   ]
1517
  },
1518
  {
1519
   "cell_type": "code",
1520
   "execution_count": null,
1521
   "metadata": {},
1522
   "outputs": [],
1523
   "source": [
1524
    "all_x_labtest = np.array(all_x, dtype=object)\n",
1525
    "x_lab_length = [len(_) for _ in all_x_labtest]\n",
1526
    "x_lab_length = torch.tensor(x_lab_length, dtype=torch.int)\n",
1527
    "max_length = int(x_lab_length.max())\n",
1528
    "all_x_labtest = [torch.tensor(_) for _ in all_x_labtest]\n",
1529
    "all_x_labtest = torch.nn.utils.rnn.pad_sequence((all_x_labtest), batch_first=True)\n",
1530
    "all_x_demographic = torch.tensor(all_x_demo)\n",
1531
    "batch_size, demo_dim = all_x_demographic.shape\n",
1532
    "all_x_demographic = torch.reshape(all_x_demographic.repeat(1, max_length), (batch_size, max_length, demo_dim))\n",
1533
    "all_x = torch.cat((all_x_demographic, all_x_labtest), 2)\n",
1534
    "\n",
1535
    "all_y = np.array(all_y, dtype=object)\n",
1536
    "patient_list = []\n",
1537
    "for pat in all_y:\n",
1538
    "    visits = []\n",
1539
    "    for i in pat[1]:\n",
1540
    "        visits.append([pat[0], i])\n",
1541
    "    patient_list.append(visits)\n",
1542
    "new_all_y = np.array(patient_list, dtype=object)\n",
1543
    "output_all_y = [torch.Tensor(_) for _ in new_all_y]\n",
1544
    "output_all_y = torch.nn.utils.rnn.pad_sequence((output_all_y), batch_first=True)"
1545
   ]
1546
  },
1547
  {
1548
   "cell_type": "code",
1549
   "execution_count": null,
1550
   "metadata": {},
1551
   "outputs": [],
1552
   "source": [
1553
    "all_missing_mask = np.array(all_missing_mask, dtype=object)\n",
1554
    "all_missing_mask = [torch.tensor(_) for _ in all_missing_mask]\n",
1555
    "all_missing_mask = torch.nn.utils.rnn.pad_sequence((all_missing_mask), batch_first=True)"
1556
   ]
1557
  },
1558
  {
1559
   "cell_type": "code",
1560
   "execution_count": null,
1561
   "metadata": {},
1562
   "outputs": [],
1563
   "source": [
1564
    "all_x.shape"
1565
   ]
1566
  },
1567
  {
1568
   "cell_type": "code",
1569
   "execution_count": null,
1570
   "metadata": {},
1571
   "outputs": [],
1572
   "source": [
1573
    "all_missing_mask.shape"
1574
   ]
1575
  },
1576
  {
1577
   "cell_type": "code",
1578
   "execution_count": null,
1579
   "metadata": {},
1580
   "outputs": [],
1581
   "source": [
1582
    "# save pickle format dataset (torch)\n",
1583
    "pd.to_pickle(all_x,f'./processed_data/x.pkl' )\n",
1584
    "pd.to_pickle(all_missing_mask,f'./processed_data/missing_mask.pkl' )\n",
1585
    "pd.to_pickle(output_all_y,f'./processed_data/y.pkl' )\n",
1586
    "pd.to_pickle(x_lab_length,f'./processed_data/visits_length.pkl' )"
1587
   ]
1588
  },
1589
  {
1590
   "cell_type": "code",
1591
   "execution_count": null,
1592
   "metadata": {},
1593
   "outputs": [],
1594
   "source": [
1595
    "# Calculate patients' outcome statistics (patients-wise)\n",
1596
    "outcome_list = []\n",
1597
    "y_outcome = output_all_y[:, :, 0]\n",
1598
    "indices = torch.arange(len(x_lab_length), dtype=torch.int64)\n",
1599
    "for i in indices:\n",
1600
    "    outcome_list.append(y_outcome[i][0].item())\n",
1601
    "outcome_list = np.array(outcome_list)\n",
1602
    "print(len(outcome_list))\n",
1603
    "unique, count=np.unique(outcome_list,return_counts=True)\n",
1604
    "data_count=dict(zip(unique,count))\n",
1605
    "print(data_count)"
1606
   ]
1607
  },
1608
  {
1609
   "cell_type": "code",
1610
   "execution_count": null,
1611
   "metadata": {},
1612
   "outputs": [],
1613
   "source": [
1614
    "# Calculate patients' outcome statistics (records-wise)\n",
1615
    "outcome_records_list = []\n",
1616
    "y_outcome = output_all_y[:, :, 0]\n",
1617
    "indices = torch.arange(len(x_lab_length), dtype=torch.int64)\n",
1618
    "for i in indices:\n",
1619
    "    outcome_records_list.extend(y_outcome[i][0:x_lab_length[i]].tolist())\n",
1620
    "outcome_records_list = np.array(outcome_records_list)\n",
1621
    "print(len(outcome_records_list))\n",
1622
    "unique, count=np.unique(outcome_records_list,return_counts=True)\n",
1623
    "data_count=dict(zip(unique,count))\n",
1624
    "print(data_count)"
1625
   ]
1626
  },
1627
  {
1628
   "cell_type": "code",
1629
   "execution_count": null,
1630
   "metadata": {},
1631
   "outputs": [],
1632
   "source": [
1633
    "# Calculate patients' mean los and 95% percentile los\n",
1634
    "los_list = []\n",
1635
    "y_los = output_all_y[:, :, 1]\n",
1636
    "indices = torch.arange(len(x_lab_length), dtype=torch.int64)\n",
1637
    "for i in indices:\n",
1638
    "    # los_list.extend(y_los[i][: x_lab_length[i].long()].tolist())\n",
1639
    "    los_list.append(y_los[i][0].item())\n",
1640
    "los_list = np.array(los_list)\n",
1641
    "print(los_list.mean() * 0.5)\n",
1642
    "print(np.median(los_list) * 0.5)\n",
1643
    "print(np.percentile(los_list, 95))\n",
1644
    "\n",
1645
    "print('median:', np.median(los_list))\n",
1646
    "print('Q1:', np.percentile(los_list, 25))\n",
1647
    "print('Q3:', np.percentile(los_list, 75))"
1648
   ]
1649
  },
1650
  {
1651
   "cell_type": "code",
1652
   "execution_count": null,
1653
   "metadata": {},
1654
   "outputs": [],
1655
   "source": [
1656
    "los_alive_list = np.array([los_list[i] for i in range(len(los_list)) if outcome_list[i] == 0])\n",
1657
    "los_dead_list = np.array([los_list[i] for i in range(len(los_list)) if outcome_list[i] == 1])\n",
1658
    "print(len(los_alive_list))\n",
1659
    "print(len(los_dead_list))\n",
1660
    "\n",
1661
    "print('[Alive]')\n",
1662
    "print('median:', np.median(los_alive_list))\n",
1663
    "print('Q1:', np.percentile(los_alive_list, 25))\n",
1664
    "print('Q3:', np.percentile(los_alive_list, 75))\n",
1665
    "\n",
1666
    "print('[Dead]')\n",
1667
    "print('median:', np.median(los_dead_list))\n",
1668
    "print('Q1:', np.percentile(los_dead_list, 25))\n",
1669
    "print('Q3:', np.percentile(los_dead_list, 75))"
1670
   ]
1671
  },
1672
  {
1673
   "cell_type": "code",
1674
   "execution_count": null,
1675
   "metadata": {},
1676
   "outputs": [],
1677
   "source": [
1678
    "cdsl_los_statistics = {\n",
1679
    "    'overall': los_list,\n",
1680
    "    'alive': los_alive_list,\n",
1681
    "    'dead': los_dead_list\n",
1682
    "}\n",
1683
    "pd.to_pickle(cdsl_los_statistics, 'cdsl_los_statistics.pkl')"
1684
   ]
1685
  },
1686
  {
1687
   "cell_type": "code",
1688
   "execution_count": null,
1689
   "metadata": {},
1690
   "outputs": [],
1691
   "source": [
1692
    "# calculate visits length Median [Q1, Q3]\n",
1693
    "visits_list = np.array(x_lab_length)\n",
1694
    "visits_alive_list = np.array([x_lab_length[i] for i in range(len(x_lab_length)) if outcome_list[i] == 0])\n",
1695
    "visits_dead_list = np.array([x_lab_length[i] for i in range(len(x_lab_length)) if outcome_list[i] == 1])\n",
1696
    "print(len(visits_alive_list))\n",
1697
    "print(len(visits_dead_list))\n",
1698
    "\n",
1699
    "print('[Total]')\n",
1700
    "print('median:', np.median(visits_list))\n",
1701
    "print('Q1:', np.percentile(visits_list, 25))\n",
1702
    "print('Q3:', np.percentile(visits_list, 75))\n",
1703
    "\n",
1704
    "print('[Alive]')\n",
1705
    "print('median:', np.median(visits_alive_list))\n",
1706
    "print('Q1:', np.percentile(visits_alive_list, 25))\n",
1707
    "print('Q3:', np.percentile(visits_alive_list, 75))\n",
1708
    "\n",
1709
    "print('[Dead]')\n",
1710
    "print('median:', np.median(visits_dead_list))\n",
1711
    "print('Q1:', np.percentile(visits_dead_list, 25))\n",
1712
    "print('Q3:', np.percentile(visits_dead_list, 75))"
1713
   ]
1714
  },
1715
  {
1716
   "cell_type": "code",
1717
   "execution_count": null,
1718
   "metadata": {},
1719
   "outputs": [],
1720
   "source": [
1721
    "def check_nan(x):\n",
1722
    "    \"\"\"Print whether tensor `x` contains NaN, judged by whether the sum of all its elements is NaN.\"\"\"\n",
1723
    "    total = np.sum(x.cpu().numpy())\n",
1724
    "    verdict = \"some values from input are nan\" if np.isnan(total) else \"no nan\"\n",
1725
    "    print(verdict)"
1726
   ]
1727
  },
1728
  {
1729
   "cell_type": "code",
1730
   "execution_count": null,
1731
   "metadata": {},
1732
   "outputs": [],
1733
   "source": [
1734
    "check_nan(all_x)"
1735
   ]
1736
  },
1737
  {
1738
   "cell_type": "markdown",
1739
   "metadata": {},
1740
   "source": [
1741
    "# Draw Charts"
1742
   ]
1743
  },
1744
  {
1745
   "cell_type": "markdown",
1746
   "metadata": {},
1747
   "source": [
1748
    "## Import packages"
1749
   ]
1750
  },
1751
  {
1752
   "cell_type": "code",
1753
   "execution_count": null,
1754
   "metadata": {},
1755
   "outputs": [],
1756
   "source": [
1757
    "import matplotlib.pyplot as plt\n",
1758
    "from matplotlib.ticker import PercentFormatter\n",
1759
    "import matplotlib.font_manager as font_manager\n",
1760
    "import pandas as pd\n",
1761
    "import numpy as np\n",
1762
    "\n",
1763
    "plt.style.use('seaborn-whitegrid')\n",
1764
    "color = 'cornflowerblue'\n",
1765
    "ec = 'None'\n",
1766
    "alpha=0.5\n",
1767
    "alive_color = 'olivedrab'\n",
1768
    "dead_color = 'orchid'"
1769
   ]
1770
  },
1771
  {
1772
   "cell_type": "markdown",
1773
   "metadata": {},
1774
   "source": [
1775
    "## Read data"
1776
   ]
1777
  },
1778
  {
1779
   "cell_type": "code",
1780
   "execution_count": null,
1781
   "metadata": {},
1782
   "outputs": [],
1783
   "source": [
1784
    "demographic.head()"
1785
   ]
1786
  },
1787
  {
1788
   "cell_type": "code",
1789
   "execution_count": null,
1790
   "metadata": {},
1791
   "outputs": [],
1792
   "source": [
1793
    "train = pd.read_csv('./train.csv')\n",
1794
    "train['PATIENT_ID']=train['PATIENT_ID'].astype(str)\n",
1795
    "demographic['PATIENT_ID']=demographic['PATIENT_ID'].astype(str)\n",
1796
    "pat = {\n",
1797
    "    'PATIENT_ID': train['PATIENT_ID'].unique()\n",
1798
    "}\n",
1799
    "pat = pd.DataFrame(pat)\n",
1800
    "demo = pd.merge(demographic, pat, on='PATIENT_ID', how='inner')\n",
1801
    "\n",
1802
    "demo_alive = demo.loc[demo['OUTCOME'] == 0]\n",
1803
    "demo_dead = demo.loc[demo['OUTCOME'] == 1]\n",
1804
    "demo_overall = demo"
1805
   ]
1806
  },
1807
  {
1808
   "cell_type": "code",
1809
   "execution_count": null,
1810
   "metadata": {},
1811
   "outputs": [],
1812
   "source": [
1813
    "demo.to_csv('demo_overall.csv', index=False)\n",
1814
    "demo_alive.to_csv('demo_alive.csv', index=False)\n",
1815
    "demo_dead.to_csv('demo_dead.csv', index=False)"
1816
   ]
1817
  },
1818
  {
1819
   "cell_type": "code",
1820
   "execution_count": null,
1821
   "metadata": {},
1822
   "outputs": [],
1823
   "source": [
1824
    "patient = pd.DataFrame({\"PATIENT_ID\": (demo_alive['PATIENT_ID'].unique())})\n",
1825
    "lab_tests_alive = pd.merge(lab_tests, patient, how='inner', on='PATIENT_ID')\n",
1826
    "print(len(lab_tests_alive['PATIENT_ID'].unique()))\n",
1827
    "\n",
1828
    "patient = pd.DataFrame({\"PATIENT_ID\": (demo_dead['PATIENT_ID'].unique())})\n",
1829
    "lab_tests_dead = pd.merge(lab_tests, patient, how='inner', on='PATIENT_ID')\n",
1830
    "print(len(lab_tests_dead['PATIENT_ID'].unique()))\n",
1831
    "\n",
1832
    "patient = pd.DataFrame({\"PATIENT_ID\": (demo_overall['PATIENT_ID'].unique())})\n",
1833
    "lab_tests_overall = pd.merge(lab_tests, patient, how='inner', on='PATIENT_ID')\n",
1834
    "print(len(lab_tests_overall['PATIENT_ID'].unique()))"
1835
   ]
1836
  },
1837
  {
1838
   "cell_type": "code",
1839
   "execution_count": null,
1840
   "metadata": {},
1841
   "outputs": [],
1842
   "source": [
1843
    "# Vital-sign rows restricted to the patients listed in demo_alive.\n",
    "alive_ids = demo_alive['PATIENT_ID'].unique()\n",
    "patient = pd.DataFrame({'PATIENT_ID': alive_ids})\n",
    "vital_signs_alive = vital_signs.merge(patient, on='PATIENT_ID', how='inner')\n",
    "# Cell output: number of distinct patients left after the filter.\n",
    "len(vital_signs_alive['PATIENT_ID'].unique())"
1846
   ]
1847
  },
1848
  {
1849
   "cell_type": "code",
1850
   "execution_count": null,
1851
   "metadata": {},
1852
   "outputs": [],
1853
   "source": [
1854
    "# Vital-sign rows restricted to the patients listed in demo_dead.\n",
    "dead_ids = demo_dead['PATIENT_ID'].unique()\n",
    "patient = pd.DataFrame({'PATIENT_ID': dead_ids})\n",
    "vital_signs_dead = vital_signs.merge(patient, on='PATIENT_ID', how='inner')\n",
    "# Cell output: number of distinct patients left after the filter.\n",
    "len(vital_signs_dead['PATIENT_ID'].unique())"
1857
   ]
1858
  },
1859
  {
1860
   "cell_type": "code",
1861
   "execution_count": null,
1862
   "metadata": {},
1863
   "outputs": [],
1864
   "source": [
1865
    "# Vital-sign rows restricted to the patients listed in demo_overall.\n",
    "overall_ids = demo_overall['PATIENT_ID'].unique()\n",
    "patient = pd.DataFrame({'PATIENT_ID': overall_ids})\n",
    "vital_signs_overall = vital_signs.merge(patient, on='PATIENT_ID', how='inner')\n",
    "# Cell output: number of distinct patients left after the filter.\n",
    "len(vital_signs_overall['PATIENT_ID'].unique())"
1868
   ]
1869
  },
1870
  {
1871
   "cell_type": "code",
1872
   "execution_count": null,
1873
   "metadata": {},
1874
   "outputs": [],
1875
   "source": [
1876
    "# 4x4 grid of histograms comparing the overall cohort with the alive / dead subsets.\n",
    "# NOTE: `color`, `ec`, `alpha`, `alive_color`, `dead_color`, `PercentFormatter` and\n",
    "# `font_manager` are not defined in this cell; they must already exist in the kernel\n",
    "# (defined or imported by an earlier cell).\n",
    "limit = 0.05  # fraction trimmed from each tail of the overall distribution\n",
    "font = 'Times New Roman'\n",
    "csfont = {'fontname': font, 'fontsize': 18}\n",
    "fig = plt.figure(figsize=(16, 12), dpi=100, facecolor='w', edgecolor='k')\n",
    "\n",
    "\n",
    "def plot_outcome_hist(position, frames, key, xlabel, labelled=False):\n",
    "    \"\"\"Draw one panel: a filled overall histogram with alive/dead step outlines.\n",
    "\n",
    "    position -- 1-based slot in the 4x4 subplot grid.\n",
    "    frames   -- (overall, alive, dead) DataFrames that all contain column `key`.\n",
    "    key      -- name of the column to plot.\n",
    "    xlabel   -- text shown under the x axis.\n",
    "    labelled -- attach 'overall' / 'alive' / 'dead' legend labels to the three\n",
    "                histograms (only one panel needs them to feed the figure legend).\n",
    "\n",
    "    Returns the Axes the panel was drawn on.\n",
    "    \"\"\"\n",
    "    overall, alive, dead = frames\n",
    "    # The trim bounds always come from the overall frame, so the three\n",
    "    # histograms of a panel are cut to the same value range.\n",
    "    low = overall[key].quantile(limit)\n",
    "    high = overall[key].quantile(1 - limit)\n",
    "    outline = {'color': 'green', 'alpha': 1, 'histtype': 'step', 'linewidth': 2}\n",
    "    layers = [\n",
    "        ('overall', overall, {'color': color, 'ec': ec, 'alpha': alpha}),\n",
    "        ('alive', alive, dict(outline, ec=alive_color)),\n",
    "        ('dead', dead, dict(outline, ec=dead_color)),\n",
    "    ]\n",
    "    ax = plt.subplot(4, 4, position)\n",
    "    for name, frame, style in layers:\n",
    "        values = frame.loc[frame[key].between(low, high), key]\n",
    "        if labelled:\n",
    "            style['label'] = name\n",
    "        # Equal weights summing to 1 turn bar heights into fractions of the group.\n",
    "        ax.hist(values, bins=20, weights=np.ones(len(values)) / len(values), **style)\n",
    "    ax.set_xlabel(xlabel, **csfont)\n",
    "    ax.set_ylabel('Percentage', **csfont)\n",
    "    ax.yaxis.set_major_formatter(PercentFormatter(1))\n",
    "    # plt.subplot made `ax` the current axes, so these restyle its tick labels.\n",
    "    plt.xticks(**csfont)\n",
    "    plt.yticks(**csfont)\n",
    "    return ax\n",
    "\n",
    "\n",
    "demo_frames = (demo_overall, demo_alive, demo_dead)\n",
    "vital_frames = (vital_signs_overall, vital_signs_alive, vital_signs_dead)\n",
    "lab_frames = (lab_tests_overall, lab_tests_alive, lab_tests_dead)\n",
    "\n",
    "# (column name, x-axis label) for every lab-test panel.\n",
    "# The column names keep the odd characters ('¡', '¢') exactly as before because they\n",
    "# are used to index the DataFrames (presumably mis-decoded accents from the raw CSV\n",
    "# headers -- verify against the loading cell). Only the labels drawn on the figure\n",
    "# are spelled with the proper accented letters.\n",
    "lab_panels = [\n",
    "    ('CREA -- CREATININA', 'CREA -- CREATININA'),\n",
    "    ('HEM -- Hemat¡es', 'HEM -- Hematíes'),\n",
    "    ('LEUC -- Leucocitos', 'LEUC -- Leucocitos'),\n",
    "    ('PLAQ -- Recuento de plaquetas', 'PLAQ -- Recuento de plaquetas'),\n",
    "    ('CHCM -- Conc. Hemoglobina Corpuscular Media', 'CHCM'),\n",
    "    ('HCTO -- Hematocrito', 'HCTO -- Hematocrito'),\n",
    "    ('VCM -- Volumen Corpuscular Medio', 'VCM -- Volumen Corpuscular Medio'),\n",
    "    ('HGB -- Hemoglobina', 'HGB -- Hemoglobina'),\n",
    "    ('HCM -- Hemoglobina Corpuscular Media', 'HCM -- Hemoglobina Corpuscular Media'),\n",
    "    ('NEU -- Neutr¢filos', 'NEU -- Neutrófilos'),\n",
    "    ('NEU% -- Neutr¢filos %', 'NEU% -- Neutrófilos%'),\n",
    "    ('LIN -- Linfocitos', 'LIN -- Linfocitos'),\n",
    "    ('LIN% -- Linfocitos %', 'LIN% -- Linfocitos%'),\n",
    "    ('ADW -- Coeficiente de anisocitosis', 'ADW -- Coeficiente de anisocitosis'),\n",
    "]\n",
    "\n",
    "# Panel order matches the grid order: age, temperature, then the lab tests (16 in total).\n",
    "panels = [(demo_frames, 'AGE', 'Age'), (vital_frames, 'TEMPERATURE', 'Temperature')]\n",
    "panels += [(lab_frames, key, xlabel) for key, xlabel in lab_panels]\n",
    "\n",
    "# Only the first panel carries legend labels; its handles feed the shared legend.\n",
    "axes = [\n",
    "    plot_outcome_hist(idx, frames, key, xlabel, labelled=(idx == 1))\n",
    "    for idx, (frames, key, xlabel) in enumerate(panels, start=1)\n",
    "]\n",
    "\n",
    "handles, labels = axes[0].get_legend_handles_labels()\n",
    "# `prop` fully determines the legend font (family and size).\n",
    "plt.figlegend(handles, labels, loc='upper center', ncol=5, bbox_to_anchor=(0.5, 1.05),\n",
    "              prop=font_manager.FontProperties(family=font, style='normal', size=18))\n",
    "\n",
    "fig.tight_layout()\n",
    "plt.show()"
2192
   ]
2193
  }
2194
 ],
2195
 "metadata": {
2196
  "kernelspec": {
2197
   "display_name": "Python 3.7.11 ('python37')",
2198
   "language": "python",
2199
   "name": "python3"
2200
  },
2201
  "language_info": {
2202
   "codemirror_mode": {
2203
    "name": "ipython",
2204
    "version": 3
2205
   },
2206
   "file_extension": ".py",
2207
   "mimetype": "text/x-python",
2208
   "name": "python",
2209
   "nbconvert_exporter": "python",
2210
   "pygments_lexer": "ipython3",
2211
   "version": "3.7.11"
2212
  },
2213
  "vscode": {
2214
   "interpreter": {
2215
    "hash": "a10b846bdc9fc41ee38835cbc29d70b69dd5fd54e1341ea2c410a7804a50447a"
2216
   }
2217
  }
2218
 },
2219
 "nbformat": 4,
2220
 "nbformat_minor": 2
2221
}