Diff of /covid19_icu_prediction.py [000000] .. [2ea2fa]

# -*- coding: utf-8 -*-
"""COVID19_ICU_Prediction.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/11cMcxeMqpI_dQjuo31iPkSDOf0kTSWHP

#**Machine Learning Project**

***Title: Predicting ICU admission of confirmed COVID-19 cases***

The COVID-19 pandemic has exposed the unpreparedness of our current
healthcare systems and services. We need to optimize the allocation of
medical resources to maximize their utilization. We are building this
machine learning model on the clinical data of confirmed COVID-19 cases
to predict, in advance, whether a patient will need ICU admission. With
this information, hospitals can plan the flow of operations and make
critical decisions, such as transferring a patient to another hospital
or arranging resources in time, so that patients' lives can be saved.

##Libraries and Packages
List of all the packages that are used in the notebook
"""

import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

pd.set_option('display.max_columns', None)

"""Downloading Dataset"""

!wget -O "Kaggle_Sirio_Libanes_ICU_Prediction.xlsx" "https://drive.google.com/uc?export=download&id=1_shaH6SQajy1zrnALzim9jGaRmF3PLIn"
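
# Note (added sketch, not from the original notebook): if this script is run
# outside Colab, where the "!" shell magic is unavailable, the same file could
# be fetched with plain Python, e.g.:
# import urllib.request
# urllib.request.urlretrieve(
#     "https://drive.google.com/uc?export=download&id=1_shaH6SQajy1zrnALzim9jGaRmF3PLIn",
#     "Kaggle_Sirio_Libanes_ICU_Prediction.xlsx")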

"""##Reading Dataset
Reading the dataset from the downloaded Excel file.
"""

data = pd.read_excel("Kaggle_Sirio_Libanes_ICU_Prediction.xlsx")
data

"""##Data Pre-Processing
Converting the data into a usable format.
The following modifications have been made to the data to get the most out of it:
1. One-hot encoding to convert the non-numeric (object) columns.
2. Marking window 0-2 as 1 if the patient was admitted to the ICU in any of the later windows.
3. Removing all the records of the windows in which patients were actually admitted to the ICU (windows with ICU label 1 before step 2).
4. Filling the NaN values of window 0-2 with the mean of the values in all the windows of that patient.
5. Removing all the rows still having NaN values.
"""

print(data.dtypes)
data.select_dtypes(object)

without_ICU_column = data.drop('ICU', axis = 1)       # separating the ICU label column
ICU_column = data['ICU']
columns_to_convert = data.select_dtypes(object).columns   # finding columns that are not of type float or int
columns_to_convert

without_ICU_column = pd.get_dummies(without_ICU_column, columns = columns_to_convert)      # performing one-hot encoding
without_ICU_column.head()
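
# Toy illustration (hypothetical data, added for clarity) of what pd.get_dummies
# does to an object column such as WINDOW: each category becomes its own 0/1 column.
print(pd.get_dummies(pd.DataFrame({'WINDOW': ['0-2', '2-4', '0-2']}), columns=['WINDOW']))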

data_expand = pd.concat([without_ICU_column, ICU_column], axis = 1)         # adding the ICU column back at the last position
data_expand.head(5)

column_names = data_expand.columns
arr = data_expand.to_numpy()
print(arr)
i = 0
ICU_admitted_rows = []
while(i < len(arr)):          # each patient has 5 window rows; record the rows in which the patient
  for j in range(5):          # is already in the ICU and label that patient's earlier rows with 1
    if(arr[i+j][-1] == 1):
      for k in range(j):
        arr[i+k][-1] = 1
      for toremove in range(i+j, i+5):
        ICU_admitted_rows.append(toremove)
      break
  i += 5
print(ICU_admitted_rows)
arr = np.delete(arr, ICU_admitted_rows, axis=0)   # removing all the rows in which the patient was already in the ICU
df = pd.DataFrame(arr, columns = column_names)
df.head(10)

# Filling missing values
pd.options.mode.chained_assignment = None
edited_dfs_list = []
max_patient_id = df['PATIENT_VISIT_IDENTIFIER'].max()
for i in range(int(max_patient_id) + 1):                  # keeping only the first window (0-2) for every patient and filling NaN values with the mean of all windows of that patient
  tempdf = df[df['PATIENT_VISIT_IDENTIFIER']==i]
  if(len(tempdf)!=0):
    tempdf.fillna(tempdf.mean(), inplace=True)
    tempdf = tempdf.iloc[[0]]
    edited_dfs_list.append(tempdf)

final_data = pd.concat(edited_dfs_list)
final_data.head(30)

final_data = final_data.drop(['GENDER','PATIENT_VISIT_IDENTIFIER','WINDOW_0-2', 'WINDOW_2-4', 'WINDOW_4-6', 'WINDOW_6-12', 'WINDOW_ABOVE_12'], axis = 1)
final_data.head()

final_data.describe()

final_data = final_data.dropna(axis = 0)            # drop the rows that still have NaN values, as there is no data in any window of that patient to fill them
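
# Toy sketch (illustrative only) of step 4: NaNs in a patient's windows are
# filled with the mean over that patient's available windows, then only the
# first window is kept.
toy = pd.DataFrame({'PATIENT_VISIT_IDENTIFIER': [0]*3, 'HEART_RATE_MEAN': [np.nan, 80.0, 90.0]})
print(toy.fillna(toy.mean()).iloc[[0]])    # the NaN becomes (80+90)/2 = 85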

"""##Data Analysis
Visualising the pre-processed data and trying to get an intuition about its different characteristics.
"""

final_data.describe()

ICU_admission_distribution = final_data['ICU'].value_counts()
print("Total Patients after pre processing: ", sum(ICU_admission_distribution))
print("Distribution of ICU admissions")
print("Patients who were not admitted to ICU: ",ICU_admission_distribution[0])
print("Patients who were admitted to ICU: ",ICU_admission_distribution[1])
labels= ['Admitted to ICU', 'Not Admitted to ICU']
colors=['tomato', 'deepskyblue']
sizes= [ICU_admission_distribution[1], ICU_admission_distribution[0]]
plt.pie(sizes,labels=labels, colors=colors, startangle=90, autopct='%1.1f%%')
plt.title("ICU Distribution of data")
plt.axis('equal')
plt.show()
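
# Added for clarity: the same class imbalance as a plain ratio.
print("Class imbalance (non-ICU : ICU): %.2f : 1" % (ICU_admission_distribution[0] / ICU_admission_distribution[1]))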

Age_distribution = final_data['AGE_ABOVE65'].value_counts()
print("Age Distribution")
print("Patients below age 65: ",Age_distribution[0])
print("Patients above age 65: ",Age_distribution[1])
labels= ['Below 65', 'Above 65']
colors=['lightgreen', 'violet']
sizes= [Age_distribution[0], Age_distribution[1]]
plt.pie(sizes,labels=labels, colors=colors, startangle=90, autopct='%1.1f%%')
plt.axis('equal')
plt.title("Age Distribution of data")
plt.show()

ICU_Admitted_data = final_data[final_data['ICU']==1]
Age_distribution = ICU_Admitted_data['AGE_ABOVE65'].value_counts()
print("Age Distribution")
print("Patients below age 65: ",Age_distribution[0])
print("Patients above age 65: ",Age_distribution[1])
labels= ['Below 65', 'Above 65']
colors=['orange', 'cyan']
sizes= [Age_distribution[0], Age_distribution[1]]
plt.pie(sizes,labels=labels, colors=colors, startangle=90, autopct='%1.1f%%')
plt.axis('equal')
plt.title("Age Distribution of ICU Admitted patients")
plt.show()

age_percentile_columns = ['AGE_PERCENTIL_10th','AGE_PERCENTIL_20th','AGE_PERCENTIL_30th','AGE_PERCENTIL_40th','AGE_PERCENTIL_50th',
                          'AGE_PERCENTIL_60th','AGE_PERCENTIL_70th','AGE_PERCENTIL_80th','AGE_PERCENTIL_90th','AGE_PERCENTIL_Above 90th']
x = [[], []]
for col in age_percentile_columns:
  x[0].append(final_data[col].value_counts()[1])         # total patients in each age percentile
  x[1].append(ICU_Admitted_data[col].value_counts()[1])  # ICU-admitted patients in each age percentile

a = []
c = 1
for i in x[0]:
  a.extend([c*10]*i)
  c += 1
plt.hist(a, 20, label='Total')
b = []
c = 1
for i in x[1]:
  b.extend([c*10]*i)
  c += 1
print(x)
plt.hist(b, 20, label='ICU Admitted')
plt.xticks([10,20,30,40,50,60,70,80,90,100], age_percentile_columns, rotation = 70)
plt.legend()
plt.ylabel('Frequency')
plt.title('Age Distribution Total and ICU Admitted')
plt.show()

Disease_Grouping_1 = final_data['DISEASE GROUPING 1'].value_counts()
Disease_Grouping_2 = final_data['DISEASE GROUPING 2'].value_counts()
Disease_Grouping_3 = final_data['DISEASE GROUPING 3'].value_counts()
Disease_Grouping_4 = final_data['DISEASE GROUPING 4'].value_counts()
Disease_Grouping_5 = final_data['DISEASE GROUPING 5'].value_counts()
Disease_Grouping_6 = final_data['DISEASE GROUPING 6'].value_counts()
HTN_total = final_data['HTN'].value_counts()
Immunocompromised_total = final_data['IMMUNOCOMPROMISED'].value_counts()
Other_total = final_data['OTHER'].value_counts()

ICU_Disease_Grouping_1 = ICU_Admitted_data['DISEASE GROUPING 1'].value_counts()
ICU_Disease_Grouping_2 = ICU_Admitted_data['DISEASE GROUPING 2'].value_counts()
ICU_Disease_Grouping_3 = ICU_Admitted_data['DISEASE GROUPING 3'].value_counts()
ICU_Disease_Grouping_4 = ICU_Admitted_data['DISEASE GROUPING 4'].value_counts()
ICU_Disease_Grouping_5 = ICU_Admitted_data['DISEASE GROUPING 5'].value_counts()
ICU_Disease_Grouping_6 = ICU_Admitted_data['DISEASE GROUPING 6'].value_counts()
HTN_ICU = ICU_Admitted_data['HTN'].value_counts()
Immunocompromised_ICU = ICU_Admitted_data['IMMUNOCOMPROMISED'].value_counts()
Other_ICU = ICU_Admitted_data['OTHER'].value_counts()

x = np.array([[Disease_Grouping_1[1], Disease_Grouping_2[1], Disease_Grouping_3[1], Disease_Grouping_4[1],
               Disease_Grouping_5[1], Disease_Grouping_6[1], HTN_total[1], Immunocompromised_total[1]],
              [ICU_Disease_Grouping_1[1], ICU_Disease_Grouping_2[1], ICU_Disease_Grouping_3[1], ICU_Disease_Grouping_4[1],
               ICU_Disease_Grouping_5[1], ICU_Disease_Grouping_6[1], HTN_ICU[1], Immunocompromised_ICU[1]]])
a = []
c = 1
for i in x[0]:
  a.extend([c]*i)
  c += 1
plt.hist(a, 15, label='Total')
b = []
c = 1
for i in x[1]:
  b.extend([c]*i)
  c += 1
print(x)
plt.hist(b, 15, label='ICU Admitted')
plt.xticks([1,2,3,4,5,6,7,8],['Disease_Grouping_1','Disease_Grouping_2','Disease_Grouping_3','Disease_Grouping_4',
                              'Disease_Grouping_5','Disease_Grouping_6','Hypertension','Immunocompromised'], rotation = 70)
plt.legend()
plt.ylabel('Frequency')
plt.title('Disease Distribution Total and ICU Admitted')
plt.show()

import seaborn as sns
corr = final_data.corr()
corr.shape
plt.subplots(figsize=(100,100))
ax = sns.heatmap(
    corr,
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True
)
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=90,
    horizontalalignment='right'
);
corr.tail()

corr.shape
ICU_corr = corr.loc['ICU']          # correlation of every feature with the ICU label (the last row of the matrix)
ICU_corr.describe()

ICU_corr = np.array(ICU_corr)
selection = []
for i in ICU_corr:                  # keep only the features whose correlation with ICU is strong enough
  if(i > 0.11 or i < -0.12):
    selection.append(True)
  else:
    selection.append(False)

print(len(selection), selection.count(True))
selection = np.array(selection)
selected_final_data = final_data.loc[:, selection]
selected_final_data.head()
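
# The selection loop above can also be written as a single vectorized mask
# (equivalent sketch using the ICU_corr array computed earlier).
alt_selection = (ICU_corr > 0.11) | (ICU_corr < -0.12)
print((alt_selection == selection).all())   # sanity check: should print True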

selected_final_data = selected_final_data[['AGE_ABOVE65', 'DISEASE GROUPING 2', 'DISEASE GROUPING 3', 'DISEASE GROUPING 4',
                                           'HTN', 'BIC_VENOUS_MEAN', 'CALCIUM_MEAN', 'CREATININ_MEAN', 'GLUCOSE_MEAN', 'INR_MEAN',
                                           'LACTATE_MEAN', 'LEUKOCYTES_MEAN', 'LINFOCITOS_MEAN', 'NEUTROPHILES_MEAN', 'PC02_VENOUS_MEAN',
                                           'PCR_MEAN', 'PLATELETS_MEAN', 'SAT02_VENOUS_MEAN', 'SODIUM_MEAN', 'UREA_MEAN', 'BLOODPRESSURE_DIASTOLIC_MEAN',
                                           'RESPIRATORY_RATE_MEAN', 'TEMPERATURE_MEAN', 'OXYGEN_SATURATION_MEAN', 'BLOODPRESSURE_SISTOLIC_MIN',
                                           'HEART_RATE_MIN', 'RESPIRATORY_RATE_MIN', 'TEMPERATURE_MIN', 'BLOODPRESSURE_DIASTOLIC_MAX', 'BLOODPRESSURE_SISTOLIC_MAX',
                                           'HEART_RATE_MAX', 'OXYGEN_SATURATION_MAX', 'BLOODPRESSURE_DIASTOLIC_DIFF', 'BLOODPRESSURE_SISTOLIC_DIFF',
                                           'HEART_RATE_DIFF', 'RESPIRATORY_RATE_DIFF', 'TEMPERATURE_DIFF', 'OXYGEN_SATURATION_DIFF',
                                           'AGE_PERCENTIL_10th', 'AGE_PERCENTIL_20th', 'AGE_PERCENTIL_80th', 'AGE_PERCENTIL_90th', 'ICU']]

print(selected_final_data.shape)
selected_final_data.head()

corr = selected_final_data.corr()
corr.shape
plt.subplots(figsize=(30,30))
ax = sns.heatmap(
    corr,
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True
)
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=90,
    horizontalalignment='right'
);
corr.tail()

selected_final_data.columns

Non_ICU_Admitted_data = selected_final_data[selected_final_data['ICU']==0]
ICU_Admitted_data = selected_final_data[selected_final_data['ICU']==1]

Vital_Non_ICU_Admitted_data = Non_ICU_Admitted_data[['BLOODPRESSURE_DIASTOLIC_MEAN',
       'RESPIRATORY_RATE_MEAN', 'TEMPERATURE_MEAN', 'OXYGEN_SATURATION_MEAN',
       'BLOODPRESSURE_SISTOLIC_MIN', 'HEART_RATE_MIN', 'RESPIRATORY_RATE_MIN',
       'TEMPERATURE_MIN', 'BLOODPRESSURE_DIASTOLIC_MAX',
       'BLOODPRESSURE_SISTOLIC_MAX', 'HEART_RATE_MAX', 'OXYGEN_SATURATION_MAX',
       'HEART_RATE_DIFF', 'RESPIRATORY_RATE_DIFF', 'TEMPERATURE_DIFF']]

Vital_ICU_Admitted_data = ICU_Admitted_data[['BLOODPRESSURE_DIASTOLIC_MEAN',
       'RESPIRATORY_RATE_MEAN', 'TEMPERATURE_MEAN', 'OXYGEN_SATURATION_MEAN',
       'BLOODPRESSURE_SISTOLIC_MIN', 'HEART_RATE_MIN', 'RESPIRATORY_RATE_MIN',
       'TEMPERATURE_MIN', 'BLOODPRESSURE_DIASTOLIC_MAX',
       'BLOODPRESSURE_SISTOLIC_MAX', 'HEART_RATE_MAX', 'OXYGEN_SATURATION_MAX',
       'HEART_RATE_DIFF', 'RESPIRATORY_RATE_DIFF', 'TEMPERATURE_DIFF']]

Lab_Non_ICU_Admitted_data = Non_ICU_Admitted_data[['HTN', 'BIC_VENOUS_MEAN', 'CALCIUM_MEAN',
       'CREATININ_MEAN', 'GLUCOSE_MEAN', 'INR_MEAN', 'LACTATE_MEAN',
       'LEUKOCYTES_MEAN', 'LINFOCITOS_MEAN', 'NEUTROPHILES_MEAN',
       'PC02_VENOUS_MEAN', 'PCR_MEAN', 'PLATELETS_MEAN', 'SAT02_VENOUS_MEAN',
       'SODIUM_MEAN', 'UREA_MEAN']]
Lab_ICU_Admitted_data = ICU_Admitted_data[['HTN', 'BIC_VENOUS_MEAN', 'CALCIUM_MEAN',
       'CREATININ_MEAN', 'GLUCOSE_MEAN', 'INR_MEAN', 'LACTATE_MEAN',
       'LEUKOCYTES_MEAN', 'LINFOCITOS_MEAN', 'NEUTROPHILES_MEAN',
       'PC02_VENOUS_MEAN', 'PCR_MEAN', 'PLATELETS_MEAN', 'SAT02_VENOUS_MEAN',
       'SODIUM_MEAN', 'UREA_MEAN']]

# set width of bar
barWidth = 0.25
fig = plt.subplots(figsize =(20, 10))

vital_non_ICU = np.array(Vital_Non_ICU_Admitted_data.mean(axis=0))
vital_ICU = np.array(Vital_ICU_Admitted_data.mean(axis=0))

# Set position of bar on X axis
br1 = np.arange(len(vital_ICU)) + (barWidth*0.5)
br2 = [x + barWidth for x in br1]

# Make the plot
plt.bar(br2, vital_ICU, color ='r', width = barWidth, edgecolor ='grey', label ='ICU Admitted')
plt.bar(br1, vital_non_ICU, color ='b', width = barWidth, edgecolor ='grey', label ='NOT Admitted')

plt.xlabel('Features', fontweight ='bold')
plt.ylabel('Normalized Values', fontweight ='bold')
plt.xticks([r + barWidth for r in range(len(vital_ICU))], ['BLOODPRESSURE_DIASTOLIC_MEAN',
       'RESPIRATORY_RATE_MEAN', 'TEMPERATURE_MEAN', 'OXYGEN_SATURATION_MEAN',
       'BLOODPRESSURE_SISTOLIC_MIN', 'HEART_RATE_MIN', 'RESPIRATORY_RATE_MIN',
       'TEMPERATURE_MIN', 'BLOODPRESSURE_DIASTOLIC_MAX',
       'BLOODPRESSURE_SISTOLIC_MAX', 'HEART_RATE_MAX', 'OXYGEN_SATURATION_MAX',
       'HEART_RATE_DIFF', 'RESPIRATORY_RATE_DIFF', 'TEMPERATURE_DIFF'], rotation = 90)

plt.legend()
plt.title("Vital Signs of Covid19 Patients")
plt.show()

# set width of bar
barWidth = 0.25
fig = plt.subplots(figsize =(20, 10))

lab_non_ICU = np.array(Lab_Non_ICU_Admitted_data.mean(axis=0))
lab_ICU = np.array(Lab_ICU_Admitted_data.mean(axis=0))

# Set position of bar on X axis
br1 = np.arange(len(lab_ICU)) + (barWidth*0.5)
br2 = [x + barWidth for x in br1]

# Make the plot
plt.bar(br2, lab_ICU, color ='r', width = barWidth, edgecolor ='grey', label ='ICU Admitted')
plt.bar(br1, lab_non_ICU, color ='b', width = barWidth, edgecolor ='grey', label ='NOT Admitted')

plt.xlabel('Features', fontweight ='bold')
plt.ylabel('Normalized Value', fontweight ='bold')
plt.legend()
plt.xticks([r + barWidth for r in range(len(lab_ICU))], ['HTN', 'BIC_VENOUS_MEAN', 'CALCIUM_MEAN',
       'CREATININ_MEAN', 'GLUCOSE_MEAN', 'INR_MEAN', 'LACTATE_MEAN',
       'LEUKOCYTES_MEAN', 'LINFOCITOS_MEAN', 'NEUTROPHILES_MEAN',
       'PC02_VENOUS_MEAN', 'PCR_MEAN', 'PLATELETS_MEAN', 'SAT02_VENOUS_MEAN',
       'SODIUM_MEAN', 'UREA_MEAN'], rotation = 90)
plt.title("Lab Test Results of Covid19 patients")
plt.show()

X_data = np.array(selected_final_data.drop(['ICU'], axis = 1))
Y_data = np.array(selected_final_data[['ICU']])
print(X_data.shape)
print(Y_data.shape)
from sklearn.decomposition import PCA

labels = []
for i in Y_data:          # flattening the (n, 1) label array into a plain list of 0/1 labels
  if(i[0]==0):
    labels.append(0)
  else:
    labels.append(1)
print(X_data)
Y_data = np.array(labels)

#pca = PCA(0.80)
#X_data = pca.fit_transform(X_data)
print("X_data shape (PCA is left commented out): ", X_data.shape)
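
# Illustrative sketch (not run in the original pipeline): enabling the PCA line
# above would keep the smallest number of components that explains at least 80%
# of the variance. Applying it to a copy here shows the effect without altering X_data.
pca_sketch = PCA(0.80)
print("PCA(0.80) would reduce X_data to shape:", pca_sketch.fit_transform(X_data).shape)
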
model = TSNE(n_components = 2, random_state = 0)

tsne_data = model.fit_transform(X_data)

# creating a new data frame which
# helps us in plotting the result data
tsne_data = np.vstack((tsne_data.T, Y_data)).T
tsne_df = pd.DataFrame(data = tsne_data,
     columns =("Dim_1", "Dim_2","label"))

# Plotting the result of tsne
sns.FacetGrid(tsne_df, hue ="label", height = 6).map(
       plt.scatter, 'Dim_1', 'Dim_2', s = 100).add_legend()

plt.show()

selected_final_data.head()

print(X_data)
print(Y_data)

"""## Training and Testing using various classifiers

Importing Libraries
"""

from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn import svm
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss
import graphviz
from sklearn.neural_network import MLPClassifier

"""Shape of Datasets"""

print(X_data.shape)
print(Y_data.shape)

def assess(y_true, y_pred):       # prints accuracy, sensitivity, specificity and ROC-AUC of the predictions
  tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
  accuracy = (tp+tn)/(tp+fp+fn+tn)
  specificity = tn/(tn+fp)
  sensitivity = tp/(tp+fn)
  print("Accuracy:", accuracy*100)
  print("Sensitivity:", sensitivity*100)
  print("Specificity:", specificity*100)
  print("ROC_AUC_Score:", roc_auc_score(y_true, y_pred)*100)
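
# Quick sanity check of the metric helper (hand-made labels, added for clarity):
# here tn=2, fp=1, fn=1, tp=2, so accuracy, sensitivity, specificity and
# ROC-AUC should all print as ~66.7.
assess(np.array([0, 0, 0, 1, 1, 1]), np.array([0, 0, 1, 0, 1, 1]))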
"""Splitting Data into Training Data and Testing Data"""
487
488
X_train, X_test, Y_train, Y_test = train_test_split(X_data, Y_data, test_size=0.30, random_state=1)
489
490
"""Performing Logistic Regression with Cross Validation Estimator"""
491
492
lgc=make_pipeline(LogisticRegressionCV(cv=5,random_state=1,max_iter=5000))
493
lgc.fit(X_train, Y_train)
494
y_pred=lgc.predict(X_test)
495
ass(Y_test,y_pred)
496

"""Performing Gaussian Naive Bayes"""

gnb = make_pipeline(GaussianNB())
gnb.fit(X_train, Y_train)
y_pred = gnb.predict(X_test)
assess(Y_test, y_pred)

"""Finding the best random_state for the SGD Classifier"""

mx = -1
ri = -1
for i in range(1, 10000):         # searching for the random_state that gives the best test accuracy
  sgd = make_pipeline(SGDClassifier(random_state=i))
  sgd.fit(X_train, Y_train)
  pmx = mx
  mx = max(mx, sgd.score(X_test, Y_test))
  if(pmx != mx):
    ri = i
print(ri)

"""Performing SGD Classifier with the selected random_state"""

sgd = make_pipeline(SGDClassifier(random_state=ri))
sgd.fit(X_train, Y_train)
y_pred = sgd.predict(X_test)
assess(Y_test, y_pred)

"""Performing SVM (Support Vector Machine) classification on the given data"""

SVM_object = make_pipeline(svm.SVC(kernel='linear'))
SVM_object.fit(X_train, Y_train)
y_pred = SVM_object.predict(X_test)
assess(Y_test, y_pred)

"""Performing Decision Tree classification"""

DT_object = tree.DecisionTreeClassifier(criterion='entropy', max_depth=4, max_leaf_nodes=10)
DT_object.fit(X_train, Y_train)
y_pred = DT_object.predict(X_test)
assess(Y_test, y_pred)

text_representation = tree.export_text(DT_object)
print(text_representation)

features = ['AGE_ABOVE65', 'DISEASE GROUPING 2', 'DISEASE GROUPING 3',
       'DISEASE GROUPING 4', 'HTN', 'BIC_VENOUS_MEAN', 'CALCIUM_MEAN',
       'CREATININ_MEAN', 'GLUCOSE_MEAN', 'INR_MEAN', 'LACTATE_MEAN',
       'LEUKOCYTES_MEAN', 'LINFOCITOS_MEAN', 'NEUTROPHILES_MEAN',
       'PC02_VENOUS_MEAN', 'PCR_MEAN', 'PLATELETS_MEAN', 'SAT02_VENOUS_MEAN',
       'SODIUM_MEAN', 'UREA_MEAN', 'BLOODPRESSURE_DIASTOLIC_MEAN',
       'RESPIRATORY_RATE_MEAN', 'TEMPERATURE_MEAN', 'OXYGEN_SATURATION_MEAN',
       'BLOODPRESSURE_SISTOLIC_MIN', 'HEART_RATE_MIN', 'RESPIRATORY_RATE_MIN',
       'TEMPERATURE_MIN', 'BLOODPRESSURE_DIASTOLIC_MAX',
       'BLOODPRESSURE_SISTOLIC_MAX', 'HEART_RATE_MAX', 'OXYGEN_SATURATION_MAX',
       'BLOODPRESSURE_DIASTOLIC_DIFF', 'BLOODPRESSURE_SISTOLIC_DIFF',
       'HEART_RATE_DIFF', 'RESPIRATORY_RATE_DIFF', 'TEMPERATURE_DIFF',
       'OXYGEN_SATURATION_DIFF', 'AGE_PERCENTIL_10th', 'AGE_PERCENTIL_20th',
       'AGE_PERCENTIL_80th', 'AGE_PERCENTIL_90th']
classes = ['Non-ICU', 'ICU']
dot_data = tree.export_graphviz(DT_object, out_file=None,
                                feature_names=features,
                                class_names=classes,
                                filled=True)
graph = graphviz.Source(dot_data, format="png")
graph
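
# Optional (added note, not in the original notebook): the rendered tree can be
# written to disk; with format="png" this would save "decision_tree.png":
# graph.render("decision_tree")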

"""Performing K-Nearest Neighbour Classifier"""

KNN_object = make_pipeline(KNeighborsClassifier(n_neighbors=25, p=1))
KNN_object.fit(X_train, Y_train)
y_pred = KNN_object.predict(X_test)
assess(Y_test, y_pred)

"""Performing Random Forest Classifier"""

RF_object = RandomForestClassifier(criterion='gini', random_state=23, max_depth=6, bootstrap=True)
RF_object.fit(X_train, Y_train)
y_pred = RF_object.predict(X_test)
assess(Y_test, y_pred)

"""##Performing Grid Search on Various ML Algorithms

Grid Search on Decision Tree
"""

param_grid = {'criterion':['entropy','gini'], 'max_depth':np.arange(1,30), 'max_leaf_nodes':np.arange(3,20), 'random_state':[1,2]}
GS_DT = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
GS_DT.fit(X_train, Y_train)
GS_DT.best_params_

GS_DT.score(X_test, Y_test)

dt_train_score = []
dt_test_score = []
for i in np.arange(1, 30):
  param_grid = {'criterion':['entropy','gini'], 'max_depth': [i], 'max_leaf_nodes':np.arange(3,20), 'random_state':[1,2]}
  GS_DT = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
  GS_DT.fit(X_train, Y_train)
  y_train_pred = GS_DT.predict(X_train)
  y_pred = GS_DT.predict(X_test)
  dt_train_score.append(log_loss(Y_train, y_train_pred))
  dt_test_score.append(log_loss(Y_test, y_pred))

plt.title("Decision Tree Classifier : Error vs Depth")
plt.xlabel("Depth")
plt.ylabel("Error")
plt.plot(np.arange(1,30), dt_train_score, label="Training Error")
plt.plot(np.arange(1,30), dt_test_score, label="Testing Error")
plt.legend()
plt.show()

"""Best Kernel Performance using Grid Search"""

param_grid = {'kernel':['linear','poly','sigmoid','rbf'], 'gamma':['scale','auto'], 'random_state':[1,2,3]}
GS_SVM = GridSearchCV(svm.SVC(), param_grid, cv=5)
GS_SVM.fit(X_train, Y_train)
GS_SVM.best_params_

GS_SVM.score(X_test, Y_test)

dt_train_score = []
dt_test_score = []
for i in ['linear','poly','sigmoid','rbf']:
  param_grid = {'kernel':[i], 'gamma':['scale','auto'], 'random_state':[1,2,3]}
  GS_SVM = GridSearchCV(svm.SVC(), param_grid, cv=5)
  GS_SVM.fit(X_train, Y_train)
  y_train_pred = GS_SVM.predict(X_train)
  y_pred = GS_SVM.predict(X_test)
  dt_train_score.append(log_loss(Y_train, y_train_pred))
  dt_test_score.append(log_loss(Y_test, y_pred))

plt.title("SVM: Error vs Kernel")
plt.xlabel("Kernel")
plt.ylabel("Error")
plt.plot(['linear','poly','sigmoid','rbf'], dt_train_score, label="Training Error")
plt.plot(['linear','poly','sigmoid','rbf'], dt_test_score, label="Testing Error")
plt.legend()
plt.show()

"""Grid Search on K-Nearest Neighbour"""

param_grid = {'n_neighbors':[10,15,20,25,30,35,40], 'leaf_size':np.arange(3,20), 'p':[1,2]}
GS_KNN = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
GS_KNN.fit(X_train, Y_train)
GS_KNN.best_params_

GS_KNN.score(X_test, Y_test)

knn_train_score = []
knn_test_score = []
for i in [10,15,20,25,30,35,40]:
  param_grid = {'n_neighbors': [i], 'leaf_size':np.arange(3,20), 'p':[1,2]}
  GS_KNN = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
  GS_KNN.fit(X_train, Y_train)
  y_train_pred = GS_KNN.predict(X_train)
  y_pred = GS_KNN.predict(X_test)
  knn_train_score.append(log_loss(Y_train, y_train_pred))
  knn_test_score.append(log_loss(Y_test, y_pred))

plt.title("K-Neighbours Classifier: Error vs Number of Neighbors")
plt.xlabel("Number of Neighbors")
plt.ylabel("Error")
plt.plot([10,15,20,25,30,35,40], knn_train_score, label="Training Error")
plt.plot([10,15,20,25,30,35,40], knn_test_score, label="Testing Error")
plt.legend()
plt.show()

"""Grid Search on Random Forest Classifier"""

param_grid = {'criterion':['gini','entropy'], 'max_depth': [6], 'random_state':[23]}
GS_RF = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
GS_RF.fit(X_train, Y_train)
GS_RF.best_params_

GS_RF.score(X_test, Y_test)

rf_train_score = []
rf_test_score = []
for i in np.arange(1, 30):
  param_grid = {'criterion':['gini','entropy'], 'max_depth': [i], 'random_state':[23]}
  GS_RF = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
  GS_RF.fit(X_train, Y_train)
  y_train_pred = GS_RF.predict(X_train)
  y_pred = GS_RF.predict(X_test)
  rf_train_score.append(log_loss(Y_train, y_train_pred))
  rf_test_score.append(log_loss(Y_test, y_pred))

plt.title("Random Forest Classifier : Error vs Max Depth")
plt.xlabel("Max Depth")
plt.ylabel("Error")
plt.plot(np.arange(1,30), rf_train_score, label="Training Error")
plt.plot(np.arange(1,30), rf_test_score, label="Testing Error")
plt.legend()
plt.show()

"""Training the MLP model with different activation functions and finding the one with the best accuracy"""

best = None
acc = -1
for a in ["identity", "logistic", "tanh", "relu"]:
  model = MLPClassifier(activation=a, max_iter=10000, batch_size=64, alpha=0.1, random_state=1).fit(X_train, Y_train)
  y_pred = model.predict(X_test)
  print(a)
  assess(Y_test, y_pred)
  score = model.score(X_test, Y_test)
  if score > acc:
    acc = score
    best = a
  #print(a," - ",model.score(X_test,Y_test))
print(best, acc)

"""Performing Grid Search on the best model obtained above"""

mlp_train_score = []
mlp_test_score = []
a = [0.001, 0.01, 0.1]
for i in range(len(a)):
  param_grid = {'activation':[best], 'max_iter': [10000], 'batch_size':[64], 'alpha':[0.1], 'learning_rate_init':[a[i]], 'random_state':[1]}
  GS = GridSearchCV(MLPClassifier(), param_grid)
  GS.fit(X_train, Y_train)
  y_train_pred = GS.predict(X_train)
  y_pred = GS.predict(X_test)
  mlp_train_score.append(log_loss(Y_train, y_train_pred))
  mlp_test_score.append(log_loss(Y_test, y_pred))

plt.title("MLPClassifier: Error vs Learning Rate")
plt.xlabel("Learning rate")
plt.ylabel("Error")
plt.plot([0.001,0.01,0.1], mlp_train_score, label="Training Error")
plt.plot([0.001,0.01,0.1], mlp_test_score, label="Testing Error")
plt.legend()
plt.show()