|
# -*- coding: utf-8 -*-
"""COVID19_ICU_Prediction.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/11cMcxeMqpI_dQjuo31iPkSDOf0kTSWHP

#**Machine Learning Project**

***Title: Predicting ICU admission of confirmed COVID-19 cases***

The COVID-19 pandemic has exposed the unpreparedness of our current
healthcare systems and services, so medical resources must be allocated
in a way that maximizes their utilization. We build this machine
learning model on the clinical data of confirmed COVID-19 cases to
predict, in advance, whether a patient will need the ICU. With this
information, hospitals can plan their flow of operations and take
critical decisions in time, such as shifting a patient to another
hospital or arranging resources, so that patients' lives can be saved.

##Libraries and Packages
List of all the packages that are used in the notebook.
"""
|
|

import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

pd.set_option('display.max_columns', None)
|
|

"""Downloading Dataset

"""

!wget -O "Kaggle_Sirio_Libanes_ICU_Prediction.xlsx" "https://drive.google.com/uc?export=download&id=1_shaH6SQajy1zrnALzim9jGaRmF3PLIn"
|
|
"""##Reading Dataset
Reading the dataset from the downloaded Excel file.
"""

data = pd.read_excel("Kaggle_Sirio_Libanes_ICU_Prediction.xlsx")
data
|
|

"""##Data Pre-Processing
Converting the data into a usable format.
The following modifications have been made to the data to get the most out of it:
1. One-hot encoding to convert the non-numeric (object) columns into binary columns.
2. Marking window 0-2 with label 1 if the patient was admitted to the ICU in any of the later windows.
3. Removing all the records of the windows in which patients were already admitted to the ICU (the windows with ICU label 1 before step 2).
4. Filling the NaN values of window 0-2 with the mean of the values in all the windows of that patient.
5. Removing all the rows still having NaN values.

A toy sketch of the relabelling in steps 2 and 3 follows right after this cell.
"""
|
|

print(data.dtypes)
data.select_dtypes(object)

without_ICU_column = data.drop('ICU', axis = 1) # separating the ICU label column
ICU_column = data['ICU']
columns_to_convert = data.select_dtypes(object).columns # finding columns that are not of type float or int
columns_to_convert

without_ICU_column = pd.get_dummies(without_ICU_column, columns = columns_to_convert) # performing one-hot encoding
without_ICU_column.head()

data_expand = pd.concat([without_ICU_column, ICU_column], axis = 1) # adding the ICU column again at the last position
data_expand.head(5)
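
# A tiny illustration (toy frame, not the project data) of what the
# one-hot encoding in pd.get_dummies produces for an object column:
demo = pd.DataFrame({'WINDOW': ['0-2', '2-4', '0-2']})
print(pd.get_dummies(demo, columns=['WINDOW']))  # -> WINDOW_0-2 / WINDOW_2-4 indicator columns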
|
|

column_names = data_expand.columns
arr = data_expand.to_numpy()
print(arr)
i = 0
ICU_admitted_rows = []
while(i < len(arr)): # each patient has exactly 5 window rows; record the rows in which the patient is already in the ICU and propagate label 1 to the earlier rows
    for j in range(5):
        if(arr[i+j][-1] == 1):
            for k in range(j):
                arr[i+k][-1] = 1
            for toremove in range(i+j, i+5):
                ICU_admitted_rows.append(toremove)
            break
    i += 5
print(ICU_admitted_rows)
deletedcount = 0
for rowToRemove in ICU_admitted_rows: # removing the rows in which the patient was already admitted to the ICU
    arr = np.delete(arr, rowToRemove-deletedcount, axis=0)
    deletedcount += 1
df = pd.DataFrame(arr, columns = column_names)
df.head(10)
|
|

# Filling missing values
pd.options.mode.chained_assignment = None
edited_dfs_list = []
max_patient_id = df['PATIENT_VISIT_IDENTIFIER'].max()
for i in range(int(max_patient_id) + 1): # + 1 so the last patient id is included; keep only the first window (0-2) for every patient and fill NaN values with the mean over all of that patient's windows
    tempdf = df[df['PATIENT_VISIT_IDENTIFIER'] == i]
    if(len(tempdf) != 0):
        tempdf.fillna(tempdf.mean(), inplace=True)
        tempdf = tempdf.iloc[[0]]
        edited_dfs_list.append(tempdf)

final_data = pd.concat(edited_dfs_list)
final_data.head(30)
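
# A vectorized alternative sketch of the same imputation (hypothetical
# names, not the original pipeline): fill each NaN with that patient's
# mean over its windows, then keep the first remaining row per patient.
filled_alt = df.fillna(df.groupby('PATIENT_VISIT_IDENTIFIER').transform('mean'))
final_data_alt = filled_alt.groupby('PATIENT_VISIT_IDENTIFIER').head(1)
final_data_alt.head()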
|
|

final_data = final_data.drop(['GENDER','PATIENT_VISIT_IDENTIFIER','WINDOW_0-2', 'WINDOW_2-4', 'WINDOW_4-6', 'WINDOW_6-12', 'WINDOW_ABOVE_12'], axis = 1)
final_data.head()

final_data.describe()

final_data = final_data.dropna(axis = 0) # now we have to drop the rows that still have NaN values, since those patients have no data in any window to fill them with

|
|
"""##Data Analysis
Visualising the pre-processed data and trying to get an intuition about its different characteristics.
"""
|
|

final_data.describe()

ICU_admission_distribution = final_data['ICU'].value_counts()
print("Total Patients after pre processing: ", sum(ICU_admission_distribution))
print("Distribution of ICU admissions")
print("Patients who were not admitted to ICU: ", ICU_admission_distribution[0])
print("Patients who were admitted to ICU: ", ICU_admission_distribution[1])
labels = ['Admitted to ICU', 'Not Admitted to ICU']
colors = ['tomato', 'deepskyblue']
sizes = [ICU_admission_distribution[1], ICU_admission_distribution[0]]
plt.pie(sizes, labels=labels, colors=colors, startangle=90, autopct='%1.1f%%')
plt.title("ICU Distribution of data")
plt.axis('equal')
plt.show()
|
|

Age_distribution = final_data['AGE_ABOVE65'].value_counts()
print("Age Distribution")
print("Patients below age 65: ", Age_distribution[0])
print("Patients above age 65: ", Age_distribution[1])
labels = ['Below 65', 'Above 65']
colors = ['lightgreen', 'violet']
sizes = [Age_distribution[0], Age_distribution[1]]
plt.pie(sizes, labels=labels, colors=colors, startangle=90, autopct='%1.1f%%')
plt.axis('equal')
plt.title("Age Distribution of data")
plt.show()
|
|

ICU_Admitted_data = final_data[final_data['ICU']==1]
Age_distribution = ICU_Admitted_data['AGE_ABOVE65'].value_counts()
print("Age Distribution of ICU admitted patients")
print("Patients below age 65: ", Age_distribution[0])
print("Patients above age 65: ", Age_distribution[1])
labels = ['Below 65', 'Above 65']
colors = ['orange', 'cyan']
sizes = [Age_distribution[0], Age_distribution[1]]
plt.pie(sizes, labels=labels, colors=colors, startangle=90, autopct='%1.1f%%')
plt.axis('equal')
plt.title("Age Distribution of ICU Admitted patients")
plt.show()
|
|

percentile_columns = ['AGE_PERCENTIL_10th', 'AGE_PERCENTIL_20th', 'AGE_PERCENTIL_30th',
                      'AGE_PERCENTIL_40th', 'AGE_PERCENTIL_50th', 'AGE_PERCENTIL_60th',
                      'AGE_PERCENTIL_70th', 'AGE_PERCENTIL_80th', 'AGE_PERCENTIL_90th',
                      'AGE_PERCENTIL_Above 90th']
# counts of patients in each age percentile, overall and among the ICU admitted
x = [[], []]
for col in percentile_columns:
    x[0].append(final_data[col].value_counts()[1])
    x[1].append(ICU_Admitted_data[col].value_counts()[1])

a = []
c = 1
for i in x[0]:
    a.extend([c*10]*i)
    c += 1
plt.hist(a, 20, label='Total')
b = []
c = 1
for i in x[1]:
    b.extend([c*10]*i)
    c += 1
print(x)
plt.hist(b, 20, label='ICU Admitted')
plt.xticks([10,20,30,40,50,60,70,80,90,100], percentile_columns, rotation = 70)
plt.legend()
plt.ylabel('Frequency')
plt.title('Age Distribution Total and ICU Admitted')
plt.show()
|
|

disease_columns = ['DISEASE GROUPING 1', 'DISEASE GROUPING 2', 'DISEASE GROUPING 3',
                   'DISEASE GROUPING 4', 'DISEASE GROUPING 5', 'DISEASE GROUPING 6',
                   'HTN', 'IMMUNOCOMPROMISED']
Other_total = final_data['OTHER'].value_counts()         # computed but not plotted below
Other_ICU = ICU_Admitted_data['OTHER'].value_counts()    # computed but not plotted below

# counts of patients with each condition, overall and among the ICU admitted
x = np.array([[final_data[col].value_counts()[1] for col in disease_columns],
              [ICU_Admitted_data[col].value_counts()[1] for col in disease_columns]])
a = []
c = 1
for i in x[0]:
    a.extend([c]*i)
    c += 1
plt.hist(a, 15, label='Total')
b = []
c = 1
for i in x[1]:
    b.extend([c]*i)
    c += 1
print(x)
plt.hist(b, 15, label='ICU Admitted')
plt.xticks([1,2,3,4,5,6,7,8], ['Disease_Grouping_1', 'Disease_Grouping_2', 'Disease_Grouping_3',
                               'Disease_Grouping_4', 'Disease_Grouping_5', 'Disease_Grouping_6',
                               'Hypertension', 'Immunocompromised'], rotation = 70)
plt.legend()
plt.ylabel('Frequency')
plt.title('Disease Distribution Total and ICU Admitted')
plt.show()
|
|

import seaborn as sns
corr = final_data.corr()
corr.shape
plt.subplots(figsize=(100,100))
ax = sns.heatmap(
    corr,
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True
)
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=90,
    horizontalalignment='right'
);
corr.tail()

corr.shape
ICU_corr = corr['ICU'] # correlation of every feature with the ICU label (the last row/column of corr)
ICU_corr.describe()
|
|

ICU_corr = np.array(ICU_corr)
# keep the features whose correlation with ICU is above 0.11 or below -0.12;
# NaN correlations compare as False and are therefore dropped
selection = (ICU_corr > 0.11) | (ICU_corr < -0.12)

print(len(selection), selection.sum())
selected_final_data = final_data.loc[:, selection]
selected_final_data.head()
|
|

selected_final_data = selected_final_data[['AGE_ABOVE65', 'DISEASE GROUPING 2', 'DISEASE GROUPING 3', 'DISEASE GROUPING 4',
                                           'HTN', 'BIC_VENOUS_MEAN', 'CALCIUM_MEAN', 'CREATININ_MEAN', 'GLUCOSE_MEAN', 'INR_MEAN',
                                           'LACTATE_MEAN', 'LEUKOCYTES_MEAN', 'LINFOCITOS_MEAN', 'NEUTROPHILES_MEAN', 'PC02_VENOUS_MEAN',
                                           'PCR_MEAN', 'PLATELETS_MEAN', 'SAT02_VENOUS_MEAN', 'SODIUM_MEAN', 'UREA_MEAN', 'BLOODPRESSURE_DIASTOLIC_MEAN',
                                           'RESPIRATORY_RATE_MEAN', 'TEMPERATURE_MEAN', 'OXYGEN_SATURATION_MEAN', 'BLOODPRESSURE_SISTOLIC_MIN',
                                           'HEART_RATE_MIN', 'RESPIRATORY_RATE_MIN', 'TEMPERATURE_MIN', 'BLOODPRESSURE_DIASTOLIC_MAX', 'BLOODPRESSURE_SISTOLIC_MAX',
                                           'HEART_RATE_MAX', 'OXYGEN_SATURATION_MAX', 'BLOODPRESSURE_DIASTOLIC_DIFF', 'BLOODPRESSURE_SISTOLIC_DIFF',
                                           'HEART_RATE_DIFF', 'RESPIRATORY_RATE_DIFF', 'TEMPERATURE_DIFF', 'OXYGEN_SATURATION_DIFF',
                                           'AGE_PERCENTIL_10th', 'AGE_PERCENTIL_20th', 'AGE_PERCENTIL_80th', 'AGE_PERCENTIL_90th', 'ICU']]

print(selected_final_data.shape)
selected_final_data.head()
|
|

corr = selected_final_data.corr()
corr.shape
plt.subplots(figsize=(30,30))
ax = sns.heatmap(
    corr,
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True
)
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=90,
    horizontalalignment='right'
);
corr.tail()

selected_final_data.columns
|
|

Non_ICU_Admitted_data = selected_final_data[selected_final_data['ICU']==0]
ICU_Admitted_data = selected_final_data[selected_final_data['ICU']==1]

vital_columns = ['BLOODPRESSURE_DIASTOLIC_MEAN',
                 'RESPIRATORY_RATE_MEAN', 'TEMPERATURE_MEAN', 'OXYGEN_SATURATION_MEAN',
                 'BLOODPRESSURE_SISTOLIC_MIN', 'HEART_RATE_MIN', 'RESPIRATORY_RATE_MIN',
                 'TEMPERATURE_MIN', 'BLOODPRESSURE_DIASTOLIC_MAX',
                 'BLOODPRESSURE_SISTOLIC_MAX', 'HEART_RATE_MAX', 'OXYGEN_SATURATION_MAX',
                 'HEART_RATE_DIFF', 'RESPIRATORY_RATE_DIFF', 'TEMPERATURE_DIFF']
Vital_Non_ICU_Admitted_data = Non_ICU_Admitted_data[vital_columns]
Vital_ICU_Admitted_data = ICU_Admitted_data[vital_columns]

lab_columns = ['HTN', 'BIC_VENOUS_MEAN', 'CALCIUM_MEAN',
               'CREATININ_MEAN', 'GLUCOSE_MEAN', 'INR_MEAN', 'LACTATE_MEAN',
               'LEUKOCYTES_MEAN', 'LINFOCITOS_MEAN', 'NEUTROPHILES_MEAN',
               'PC02_VENOUS_MEAN', 'PCR_MEAN', 'PLATELETS_MEAN', 'SAT02_VENOUS_MEAN',
               'SODIUM_MEAN', 'UREA_MEAN']
Lab_Non_ICU_Admitted_data = Non_ICU_Admitted_data[lab_columns]
Lab_ICU_Admitted_data = ICU_Admitted_data[lab_columns]
|
|

# set width of bar
barWidth = 0.25
fig = plt.subplots(figsize =(20, 10))

vital_non_ICU = np.array(Vital_Non_ICU_Admitted_data.mean(axis=0))
vital_ICU = np.array(Vital_ICU_Admitted_data.mean(axis=0))

# Set position of bar on X axis
br1 = np.arange(len(vital_ICU)) + (barWidth*0.5)
br2 = [x + barWidth for x in br1]

# Make the plot
plt.bar(br2, vital_ICU, color ='r', width = barWidth, edgecolor ='grey', label ='ICU Admitted')
plt.bar(br1, vital_non_ICU, color ='b', width = barWidth, edgecolor ='grey', label ='NOT Admitted')

plt.xlabel('Features', fontweight ='bold')
plt.ylabel('Normalized Values', fontweight ='bold')
plt.xticks([r + barWidth for r in range(len(vital_ICU))], vital_columns, rotation = 90)

plt.legend()
plt.title("Vital Signs of Covid19 Patients")
plt.show()
|
|

# set width of bar
barWidth = 0.25
fig = plt.subplots(figsize =(20, 10))

lab_non_ICU = np.array(Lab_Non_ICU_Admitted_data.mean(axis=0))
lab_ICU = np.array(Lab_ICU_Admitted_data.mean(axis=0))

# Set position of bar on X axis
br1 = np.arange(len(lab_ICU)) + (barWidth*0.5)
br2 = [x + barWidth for x in br1]

# Make the plot
plt.bar(br2, lab_ICU, color ='r', width = barWidth, edgecolor ='grey', label ='ICU Admitted')
plt.bar(br1, lab_non_ICU, color ='b', width = barWidth, edgecolor ='grey', label ='NOT Admitted')

plt.xlabel('Features', fontweight ='bold')
plt.ylabel('Normalized Value', fontweight ='bold')
plt.legend()
plt.xticks([r + barWidth for r in range(len(lab_ICU))], lab_columns, rotation = 90)
plt.title("Lab Test Results of Covid19 patients")
plt.show()
|
|

X_data = np.array(selected_final_data.drop(['ICU'], axis = 1))
Y_data = np.array(selected_final_data[['ICU']])
print(X_data.shape)
print(Y_data.shape)
from sklearn.decomposition import PCA

labels = []
for i in Y_data:
    if(i[0]==0):
        labels.append(0)
    else:
        labels.append(1)
print(X_data)
Y_data = np.array(labels)

#pca = PCA(0.80)
#X_data = pca.fit_transform(X_data)
print("X_data shape (PCA step disabled): ", X_data.shape)
model = TSNE(n_components = 2, random_state = 0)

tsne_data = model.fit_transform(X_data)

# creating a new data frame which
# helps us in plotting the result data
tsne_data = np.vstack((tsne_data.T, Y_data)).T
tsne_df = pd.DataFrame(data = tsne_data,
                       columns = ("Dim_1", "Dim_2", "label"))

# Plotting the result of t-SNE
sns.FacetGrid(tsne_df, hue ="label", height = 6).map(
    plt.scatter, 'Dim_1', 'Dim_2', s = 100).add_legend()

plt.show()

selected_final_data.head()

print(X_data)
print(Y_data)
|
|

"""## Training and Testing using various classifiers

Importing Libraries
"""

from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn import svm
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss
from sklearn import tree
import graphviz
from sklearn.neural_network import MLPClassifier
|
|

"""Shape of Datasets"""

print(X_data.shape)
print(Y_data.shape)

def ass(y_true, y_pred):
    # helper that prints accuracy, sensitivity, specificity, and ROC-AUC from the confusion matrix
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    accuracy = (tp+tn)/(tp+fp+fn+tn)
    specificity = tn/(tn+fp)
    sensitivity = tp/(tp+fn)
    print("Accuracy:", accuracy*100)
    print("Sensitivity:", sensitivity*100)
    print("Specificity:", specificity*100)
    print("ROC_AUC_Score:", roc_auc_score(y_true, y_pred)*100)
|
|

"""Splitting Data into Training Data and Testing Data"""

X_train, X_test, Y_train, Y_test = train_test_split(X_data, Y_data, test_size=0.30, random_state=1)
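
# An optional variant (not the split used below): with classes this
# imbalanced, stratify=Y_data would keep the ICU/non-ICU ratio equal in
# the train and test sets.
# X_train, X_test, Y_train, Y_test = train_test_split(X_data, Y_data, test_size=0.30, random_state=1, stratify=Y_data)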
|
|

"""Performing Logistic Regression with Cross Validation Estimator"""

lgc = make_pipeline(LogisticRegressionCV(cv=5, random_state=1, max_iter=5000))
lgc.fit(X_train, Y_train)
y_pred = lgc.predict(X_test)
ass(Y_test, y_pred)

"""Performing Gaussian Naive Bayes"""

gnb = make_pipeline(GaussianNB())
gnb.fit(X_train, Y_train)
y_pred = gnb.predict(X_test)
ass(Y_test, y_pred)
|
|

"""Finding the best random seed for the SGD Classifier (note: picking a seed by its test-set score risks overfitting the test split)"""

mx = -1
ri = -1
for i in range(1, 10000):
    sgd = make_pipeline(SGDClassifier(random_state=i))
    sgd.fit(X_train, Y_train)
    pmx = mx
    mx = max(mx, sgd.score(X_test, Y_test))
    if(pmx != mx):
        ri = i
print(ri)
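
# A less leaky alternative sketch (not the original procedure): score each
# candidate seed with cross-validation on the training data only, keeping
# the test set untouched until the final evaluation. best_seed is a
# hypothetical name.
from sklearn.model_selection import cross_val_score
best_seed = max(range(1, 100),
                key=lambda s: cross_val_score(SGDClassifier(random_state=s),
                                              X_train, Y_train, cv=5).mean())
print(best_seed)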
|
|

"""Performing the SGD classifier with the selected random seed"""

sgd = make_pipeline(SGDClassifier(random_state=ri))
sgd.fit(X_train, Y_train)
y_pred = sgd.predict(X_test)
ass(Y_test, y_pred)

"""Performing SVM (Support Vector Machine) classification on the given data"""

SVM_object = make_pipeline(svm.SVC(kernel='linear'))
SVM_object.fit(X_train, Y_train)
y_pred = SVM_object.predict(X_test)
ass(Y_test, y_pred)
|
|

"""Performing Decision Tree classification"""

DT_object = tree.DecisionTreeClassifier(criterion='entropy', max_depth=4, max_leaf_nodes=10)
DT_object.fit(X_train, Y_train)
y_pred = DT_object.predict(X_test)
ass(Y_test, y_pred)

from sklearn import tree
import graphviz
text_representation = tree.export_text(DT_object)
print(text_representation)

features = ['AGE_ABOVE65', 'DISEASE GROUPING 2', 'DISEASE GROUPING 3',
            'DISEASE GROUPING 4', 'HTN', 'BIC_VENOUS_MEAN', 'CALCIUM_MEAN',
            'CREATININ_MEAN', 'GLUCOSE_MEAN', 'INR_MEAN', 'LACTATE_MEAN',
            'LEUKOCYTES_MEAN', 'LINFOCITOS_MEAN', 'NEUTROPHILES_MEAN',
            'PC02_VENOUS_MEAN', 'PCR_MEAN', 'PLATELETS_MEAN', 'SAT02_VENOUS_MEAN',
            'SODIUM_MEAN', 'UREA_MEAN', 'BLOODPRESSURE_DIASTOLIC_MEAN',
            'RESPIRATORY_RATE_MEAN', 'TEMPERATURE_MEAN', 'OXYGEN_SATURATION_MEAN',
            'BLOODPRESSURE_SISTOLIC_MIN', 'HEART_RATE_MIN', 'RESPIRATORY_RATE_MIN',
            'TEMPERATURE_MIN', 'BLOODPRESSURE_DIASTOLIC_MAX',
            'BLOODPRESSURE_SISTOLIC_MAX', 'HEART_RATE_MAX', 'OXYGEN_SATURATION_MAX',
            'BLOODPRESSURE_DIASTOLIC_DIFF', 'BLOODPRESSURE_SISTOLIC_DIFF',
            'HEART_RATE_DIFF', 'RESPIRATORY_RATE_DIFF', 'TEMPERATURE_DIFF',
            'OXYGEN_SATURATION_DIFF', 'AGE_PERCENTIL_10th', 'AGE_PERCENTIL_20th',
            'AGE_PERCENTIL_80th', 'AGE_PERCENTIL_90th']
classes = ['Non-ICU', 'ICU']
dot_data = tree.export_graphviz(DT_object, out_file=None,
                                feature_names=features,
                                class_names=classes,
                                filled=True)
graph = graphviz.Source(dot_data, format="png")
graph
|
|

"""Performing K-Nearest Neighbour Classifier"""

KNN_object = make_pipeline(KNeighborsClassifier(n_neighbors=25, p=1))
KNN_object.fit(X_train, Y_train)
y_pred = KNN_object.predict(X_test)
ass(Y_test, y_pred)

"""Performing Random Forest Classifier"""

RF_object = RandomForestClassifier(criterion='gini', random_state=23, max_depth=6, bootstrap=True)
RF_object.fit(X_train, Y_train)
y_pred = RF_object.predict(X_test)
ass(Y_test, y_pred)
|
|

"""##Performing Grid Search on Various ML Algorithms

Grid Search on Decision Tree
"""

param_grid = {'criterion':['entropy','gini'], 'max_depth':np.arange(1,30), 'max_leaf_nodes':np.arange(3,20), 'random_state':[1,2]}
GS_DT = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
GS_DT.fit(X_train, Y_train)
GS_DT.best_params_

GS_DT.score(X_test, Y_test)

dt_train_score = []
dt_test_score = []
for i in np.arange(1, 30):
    param_grid = {'criterion':['entropy','gini'], 'max_depth': [i], 'max_leaf_nodes':np.arange(3,20), 'random_state':[1,2]}
    GS_DT = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
    GS_DT.fit(X_train, Y_train)
    y_train_pred = GS_DT.predict(X_train)
    y_pred = GS_DT.predict(X_test)
    # note: log_loss here is computed on hard 0/1 predictions, not probabilities
    dt_train_score.append(log_loss(Y_train, y_train_pred))
    dt_test_score.append(log_loss(Y_test, y_pred))

plt.title("Decision Tree Classifier : Error vs Depth")
plt.xlabel("Depth")
plt.ylabel("Error")
plt.plot(np.arange(1,30), dt_train_score, label="Training Error")
plt.plot(np.arange(1,30), dt_test_score, label="Testing Error")
plt.legend()
plt.show()
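
# A smoother alternative sketch (not the original metric): the curves above
# use log_loss on hard 0/1 predictions; computing it from predicted
# probabilities is better behaved. proba_test is a hypothetical name.
proba_test = GS_DT.predict_proba(X_test)
print("log-loss on test probabilities:", log_loss(Y_test, proba_test))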
|
|

"""Best Kernel Performance using Grid Search"""

param_grid = {'kernel':['linear','poly','sigmoid','rbf'], 'gamma':['scale','auto'], 'random_state':[1,2,3]}
GS_SVM = GridSearchCV(svm.SVC(), param_grid, cv=5)
GS_SVM.fit(X_train, Y_train)
GS_SVM.best_params_

GS_SVM.score(X_test, Y_test)

dt_train_score = []
dt_test_score = []
for i in ['linear','poly','sigmoid','rbf']:
    param_grid = {'kernel':[i], 'gamma':['scale','auto'], 'random_state':[1,2,3]}
    GS_SVM = GridSearchCV(svm.SVC(), param_grid, cv=5)
    GS_SVM.fit(X_train, Y_train)
    y_train_pred = GS_SVM.predict(X_train)
    y_pred = GS_SVM.predict(X_test)
    dt_train_score.append(log_loss(Y_train, y_train_pred))
    dt_test_score.append(log_loss(Y_test, y_pred))

plt.title("SVM: Error vs Kernel")
plt.xlabel("Kernel")
plt.ylabel("Error")
plt.plot(['linear','poly','sigmoid','rbf'], dt_train_score, label="Training Error")
plt.plot(['linear','poly','sigmoid','rbf'], dt_test_score, label="Testing Error")
plt.legend()
plt.show()
|
|

"""Grid Search on K-Nearest Neighbour"""

param_grid = {'n_neighbors':[10,15,20,25,30,35,40], 'leaf_size':np.arange(3,20), 'p':[1,2]}
GS_KNN = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
GS_KNN.fit(X_train, Y_train)
GS_KNN.best_params_

GS_KNN.score(X_test, Y_test)

knn_train_score = []
knn_test_score = []
for i in [10,15,20,25,30,35,40]:
    param_grid = {'n_neighbors': [i], 'leaf_size':np.arange(3,20), 'p':[1,2]}
    GS_KNN = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
    GS_KNN.fit(X_train, Y_train)
    y_train_pred = GS_KNN.predict(X_train)
    y_pred = GS_KNN.predict(X_test)
    knn_train_score.append(log_loss(Y_train, y_train_pred))
    knn_test_score.append(log_loss(Y_test, y_pred))

plt.title("K-Neighbours Classifier: Error vs Number of Neighbors")
plt.xlabel("Number of Neighbors")
plt.ylabel("Error")
plt.plot([10,15,20,25,30,35,40], knn_train_score, label="Training Error")
plt.plot([10,15,20,25,30,35,40], knn_test_score, label="Testing Error")
plt.legend()
plt.show()
|
|

"""Grid Search on Random Forest Classifier"""

param_grid = {'criterion':['gini','entropy'], 'max_depth': [6], 'random_state':[23]}
GS_RF = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
GS_RF.fit(X_train, Y_train)
GS_RF.best_params_

GS_RF.score(X_test, Y_test)

rf_train_score = []
rf_test_score = []
for i in np.arange(1, 30):
    param_grid = {'criterion':['gini','entropy'], 'max_depth': [i], 'random_state':[23]}
    GS_RF = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
    GS_RF.fit(X_train, Y_train)
    y_train_pred = GS_RF.predict(X_train)
    y_pred = GS_RF.predict(X_test)
    rf_train_score.append(log_loss(Y_train, y_train_pred))
    rf_test_score.append(log_loss(Y_test, y_pred))

plt.title("Random Forest Classifier : Error vs Max Depth")
plt.xlabel("Max Depth")
plt.ylabel("Error")
plt.plot(np.arange(1,30), rf_train_score, label="Training Error")
plt.plot(np.arange(1,30), rf_test_score, label="Testing Error")
plt.legend()
plt.show()
|
|

"""Training the MLP model with different activation functions and finding the model with the best accuracy"""

best = 1
acc = -1
for a in ["identity", "logistic", "tanh", "relu"]:
    model = MLPClassifier(activation=a, max_iter=10000, batch_size=64, alpha=0.1, random_state=1).fit(X_train, Y_train)
    y_pred = model.predict(X_test)
    print(a)
    ass(Y_test, y_pred)
    score = model.score(X_test, Y_test)
    if score > acc:
        acc = score
        best = a
    #print(a," - ",model.score(X_test,Y_test))
print(best, acc)
|
|

"""Performing Grid Search on the best model we got from the above"""

rf_train_score = []
rf_test_score = []
a = [0.001, 0.01, 0.1]
for i in range(len(a)):
    param_grid = {'activation':[best], 'max_iter': [10000], 'batch_size':[64], 'alpha':[0.1], 'learning_rate_init':[a[i]], 'random_state':[1]}
    GS = GridSearchCV(MLPClassifier(), param_grid)
    GS.fit(X_train, Y_train)
    y_train_pred = GS.predict(X_train)
    y_pred = GS.predict(X_test)
    rf_train_score.append(log_loss(Y_train, y_train_pred))
    rf_test_score.append(log_loss(Y_test, y_pred))

plt.title("MLPClassifier: Error vs Learning Rate")
plt.xlabel("Learning rate")
plt.ylabel("Error")
plt.plot([0.001,0.01,0.1], rf_train_score, label="Training Error")
plt.plot([0.001,0.01,0.1], rf_test_score, label="Testing Error")
plt.legend()
plt.show()