Diff of /covid19_icu_prediction.py [000000] .. [2ea2fa]

Switch to side-by-side view

--- a
+++ b/covid19_icu_prediction.py
@@ -0,0 +1,734 @@
+# -*- coding: utf-8 -*-
+"""COVID19_ICU_Prediction.ipynb
+
+Automatically generated by Colaboratory.
+
+Original file is located at
+    https://colab.research.google.com/drive/11cMcxeMqpI_dQjuo31iPkSDOf0kTSWHP
+
+#**Machine Learning Project**
+
+***Title: Predicting ICU admission of confirmed COVID-19 cases***
+
+The COVID-19 pandemic has shown us the
+unpreparedness of our current healthcare system and
+services. We need to optimize the allocation of medical
+resources to maximize the utilization of resources. We are
+preparing this Machine Learning model based on the
+clinical data of confirmed COVID-19 cases. This will help
+us to predict the need of ICU for a patient in advance. By
+this information hospitals can plan the flow of operations
+and take critical decisions like shifting patient to another
+hospital or arrangement of resources within the time so
+that the lives of patients can be saved.
+
+##Libraries and Packages
+List of all the packages that are used in the notebook
+"""
+
+import tensorflow as tf
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.manifold import TSNE 
+from sklearn.decomposition import PCA 
+
+pd.set_option('display.max_columns', None)
+
+"""Downloading Dataset
+
+"""
+
+!wget -O "Kaggle_Sirio_Libanes_ICU_Prediction.xlsx" "https://drive.google.com/uc?export=download&id=1_shaH6SQajy1zrnALzim9jGaRmF3PLIn"
+
+"""##Reading Dataset
+Reading the dataset from the downloaded Excel workbook.
+"""
+
+data = pd.read_excel("Kaggle_Sirio_Libanes_ICU_Prediction.xlsx")  # load the workbook into a DataFrame
+data  # notebook-style display of the frame; has no effect when run as a script
+
+"""##Data Pre-Processing
+Converting the data into a usable format.
+The following modifications have been made to the data to get the most out of it:
+1. One-hot encoding of the non-numeric columns.
+2. Marking window 0-2 as 1 if the patient was admitted to the ICU in any of the later windows.
+3. Removing all the records of the windows in which patients were actually admitted to the ICU (windows with ICU label 1 before step 2).
+4. Filling the NaN values of window 0-2 with the mean of the values across all the windows of that patient.
+5. Removing all the rows that still have NaN values.
+
+"""
+
+print(data.dtypes)
+data.select_dtypes(object)
+
+without_ICU_column = data.drop('ICU', axis = 1)       #seperating the ICU lable column
+ICU_column = data['ICU']
+colums_to_convert = data.select_dtypes(object).columns   #finding columns that are not of type float or int
+colums_to_convert
+
+without_ICU_column = pd.get_dummies(without_ICU_column, columns = colums_to_convert)      #performing hotcoding
+without_ICU_column.head()
+
+data_expand = pd.concat([without_ICU_column, ICU_column], axis = 1)         #adding the ICU column again at the last position
+data_expand.head(5)
+
+column_names = data_expand.columns
+arr = data_expand.to_numpy()
+print(arr)
+i=0
+ICU_admitted_rows = []
+while(i<len(arr)):            #loop to record the rows in which patient is admitted to the ICU and adding 1 label to the previous rows.
+  for j in range(5):
+    if(arr[i+j][-1]==1):
+      for k in range(j):
+        arr[i+k][-1]=1
+      for toremove in range(i+j,i+5):
+        ICU_admitted_rows.append(toremove)
+      break
+  i+=5
+print(ICU_admitted_rows)
+deletedcount = 0
+for rowToRemove in ICU_admitted_rows:             #removing the rows in which patient was admitted to the ICU
+  arr = np.delete(arr, rowToRemove-deletedcount, axis=0)
+  deletedcount+=1
+df = pd.DataFrame(arr, columns = column_names)
+df.head(10)
+
+#Filling missing values
+pd.options.mode.chained_assignment = None 
+edited_dfs_list = []
+max_patient_id = df['PATIENT_VISIT_IDENTIFIER'].max()
+for i in range(int(max_patient_id)):                      #keeping only the first window that is 0-2 for every patient and filling NaN values with mean of all windows
+  tempdf = df[df['PATIENT_VISIT_IDENTIFIER']==i]
+  if(len(tempdf)!=0):
+    tempdf.fillna(tempdf.mean(), inplace=True)
+    tempdf = tempdf.iloc[[0]]
+    edited_dfs_list.append(tempdf)
+
+  
+final_data = pd.concat(edited_dfs_list)
+final_data.head(30)
+
+final_data = final_data.drop(['GENDER','PATIENT_VISIT_IDENTIFIER','WINDOW_0-2',	'WINDOW_2-4',	'WINDOW_4-6',	'WINDOW_6-12',	'WINDOW_ABOVE_12'],axis = 1)
+final_data.head()
+
+final_data.describe()
+
+final_data = final_data.dropna(axis = 0)            #Now we must have to drop the rows having nan values as there is no data in any window to fill it.
+
+"""##Data Analysis
+Visualising the pre-processed data and trying to build an intuition about its different characteristics.
+"""
+
+final_data.describe()
+
+ICU_admission_distribution = final_data['ICU'].value_counts()
+print("Total Patients after pre processing: ", sum(ICU_admission_distribution))
+print("Distribution of ICU admissions")
+print("Patients who were not admitted to ICU: ",ICU_admission_distribution[0])
+print("Patients who were admitted to ICU: ",ICU_admission_distribution[1])
+labels= ['Admitted to ICU', 'Not Admitted to ICU']
+colors=['tomato', 'deepskyblue']
+sizes= [ICU_admission_distribution[1], ICU_admission_distribution[0]]
+plt.pie(sizes,labels=labels, colors=colors, startangle=90, autopct='%1.1f%%')
+plt.title("ICU Distribution of data")
+plt.axis('equal')
+plt.show()
+
+Age_distribution = final_data['AGE_ABOVE65'].value_counts()
+print("Age Distribution")
+print("Patients below age 65: ",Age_distribution[0])
+print("Patients above age 65: ",Age_distribution[1])
+labels= ['Below 65', 'Above 65']
+colors=['lightgreen', 'violet']
+sizes= [Age_distribution[0], Age_distribution[1]]
+plt.pie(sizes,labels=labels, colors=colors, startangle=90, autopct='%1.1f%%')
+plt.axis('equal')
+plt.title("Age Distribution of data")
+plt.show()
+
+ICU_Admitted_data = final_data[final_data['ICU']==1]
+Age_distribution = ICU_Admitted_data['AGE_ABOVE65'].value_counts()
+print("Age Distribution")
+print("Patients below age 65: ",Age_distribution[0])
+print("Patients above age 65: ",Age_distribution[1])
+labels= ['Below 65', 'Above 65']
+colors=['orange', 'cyan']
+sizes= [Age_distribution[0], Age_distribution[1]]
+plt.pie(sizes,labels=labels, colors=colors, startangle=90, autopct='%1.1f%%')
+plt.axis('equal')
+plt.title("Age Distribution of ICU Admitted patients")
+plt.show()
+
+x = [[],[]]
+x[0].append(final_data['AGE_PERCENTIL_10th'].value_counts()[1])
+x[0].append(final_data['AGE_PERCENTIL_20th'].value_counts()[1])
+x[0].append(final_data['AGE_PERCENTIL_30th'].value_counts()[1])
+x[0].append(final_data['AGE_PERCENTIL_40th'].value_counts()[1])
+x[0].append(final_data['AGE_PERCENTIL_50th'].value_counts()[1])
+x[0].append(final_data['AGE_PERCENTIL_60th'].value_counts()[1])
+x[0].append(final_data['AGE_PERCENTIL_70th'].value_counts()[1])
+x[0].append(final_data['AGE_PERCENTIL_80th'].value_counts()[1])
+x[0].append(final_data['AGE_PERCENTIL_90th'].value_counts()[1])
+x[0].append(final_data['AGE_PERCENTIL_Above 90th'].value_counts()[1])
+
+x[1].append(ICU_Admitted_data['AGE_PERCENTIL_10th'].value_counts()[1])
+x[1].append(ICU_Admitted_data['AGE_PERCENTIL_20th'].value_counts()[1])
+x[1].append(ICU_Admitted_data['AGE_PERCENTIL_30th'].value_counts()[1])
+x[1].append(ICU_Admitted_data['AGE_PERCENTIL_40th'].value_counts()[1])
+x[1].append(ICU_Admitted_data['AGE_PERCENTIL_50th'].value_counts()[1])
+x[1].append(ICU_Admitted_data['AGE_PERCENTIL_60th'].value_counts()[1])
+x[1].append(ICU_Admitted_data['AGE_PERCENTIL_70th'].value_counts()[1])
+x[1].append(ICU_Admitted_data['AGE_PERCENTIL_80th'].value_counts()[1])
+x[1].append(ICU_Admitted_data['AGE_PERCENTIL_90th'].value_counts()[1])
+x[1].append(ICU_Admitted_data['AGE_PERCENTIL_Above 90th'].value_counts()[1])
+
+a = []
+c=1
+for i in x[0]:
+  a.extend([c*10]*i)
+  c+=1
+plt.hist(a, 20, label='Total')
+b = []
+c=1
+for i in x[1]:
+  b.extend([c*10]*i)
+  c+=1
+print(x)
+plt.hist(b, 20, label='ICU Admitted')
+plt.xticks([10,20,30,40,50,60,70,80,90,100],['AGE_PERCENTIL_10th','AGE_PERCENTIL_20th','AGE_PERCENTIL_30th','AGE_PERCENTIL_40th','AGE_PERCENTIL_50th','AGE_PERCENTIL_60th','AGE_PERCENTIL_70th','AGE_PERCENTIL_80th','AGE_PERCENTIL_90th','AGE_PERCENTIL_Above 90'], rotation = 70)
+plt.legend()
+plt.ylabel('Frequency')
+plt.title('Age Distribution Total and ICU Admitted')
+plt.show()
+
+Diesease_Grouping_1 = final_data['DISEASE GROUPING 1'].value_counts()
+Diesease_Grouping_2 = final_data['DISEASE GROUPING 2'].value_counts()
+Diesease_Grouping_3 = final_data['DISEASE GROUPING 3'].value_counts()
+Diesease_Grouping_4 = final_data['DISEASE GROUPING 4'].value_counts()
+Diesease_Grouping_5 = final_data['DISEASE GROUPING 5'].value_counts()
+Diesease_Grouping_6 = final_data['DISEASE GROUPING 6'].value_counts()
+HTN_total = final_data['HTN'].value_counts()
+Immunocompromised_total = final_data['IMMUNOCOMPROMISED'].value_counts()
+Other_total = final_data['OTHER'].value_counts()
+
+ICU_Diesease_Grouping_1 = ICU_Admitted_data['DISEASE GROUPING 1'].value_counts()
+ICU_Diesease_Grouping_2 = ICU_Admitted_data['DISEASE GROUPING 2'].value_counts()
+ICU_Diesease_Grouping_3 = ICU_Admitted_data['DISEASE GROUPING 3'].value_counts()
+ICU_Diesease_Grouping_4 = ICU_Admitted_data['DISEASE GROUPING 4'].value_counts()
+ICU_Diesease_Grouping_5 = ICU_Admitted_data['DISEASE GROUPING 5'].value_counts()
+ICU_Diesease_Grouping_6 = ICU_Admitted_data['DISEASE GROUPING 6'].value_counts()
+HTN_ICU = ICU_Admitted_data['HTN'].value_counts()
+Immunocompromised_ICU = ICU_Admitted_data['IMMUNOCOMPROMISED'].value_counts()
+Other_ICU = ICU_Admitted_data['OTHER'].value_counts()
+
+x = np.array([[Diesease_Grouping_1[1],Diesease_Grouping_2[1],Diesease_Grouping_3[1],Diesease_Grouping_4[1],Diesease_Grouping_5[1],Diesease_Grouping_6[1],HTN_total[1], Immunocompromised_total[1]],[ICU_Diesease_Grouping_1[1],ICU_Diesease_Grouping_2[1],ICU_Diesease_Grouping_3[1],ICU_Diesease_Grouping_4[1],ICU_Diesease_Grouping_5[1],ICU_Diesease_Grouping_6[1],HTN_ICU[1], Immunocompromised_ICU[1]]])
+a = []
+c=1
+for i in x[0]:
+  a.extend([c]*i)
+  c+=1
+plt.hist(a, 15, label='Total')
+b = []
+c=1
+for i in x[1]:
+  b.extend([c]*i)
+  c+=1
+print(x)
+plt.hist(b, 15, label='ICU Admitted')
+plt.xticks([1,2,3,4,5,6,7,8,9],['Diesease_Grouping_1','Diesease_Grouping_2','Diesease_Grouping_3','Diesease_Grouping_4','Diesease_Grouping_5','Diesease_Grouping_6', 'Hypertension', 'Immunocompromised'], rotation = 70)
+plt.legend()
+plt.ylabel('Frequency')
+plt.title('Disease Distribution Total and ICU Admitted')
+plt.show()
+
+import seaborn as sns
+corr = final_data.corr()
+corr.shape
+plt.subplots(figsize=(100,100))
+ax = sns.heatmap(
+    corr, 
+    vmin=-1, vmax=1, center=0,
+    cmap=sns.diverging_palette(20, 220, n=200),
+    square=True
+)
+ax.set_xticklabels(
+    ax.get_xticklabels(),
+    rotation=90,
+    horizontalalignment='right'
+);
+corr.tail()
+
+corr.shape
+ICU_corr = corr.iloc[236]
+ICU_corr.describe()
+
+ICU_corr = np.array(ICU_corr)
+selection = []
+for i in ICU_corr:
+  if(i):
+    if(i>0.11):
+      selection.append(True)
+    elif(i<-0.12):
+      selection.append(True)
+    else:
+      selection.append(False)
+  else:
+    selection.append(False)
+
+print(len(selection), selection.count(True))
+selection = np.array(selection)
+selected_final_data = final_data.loc[:, selection]
+selected_final_data.head()
+
+selected_final_data = selected_final_data[['AGE_ABOVE65', 'DISEASE GROUPING 2', 'DISEASE GROUPING 3', 'DISEASE GROUPING 4',
+                                           'HTN', 'BIC_VENOUS_MEAN', 'CALCIUM_MEAN' , 'CREATININ_MEAN', 'GLUCOSE_MEAN', 'INR_MEAN',
+                                           'LACTATE_MEAN', 'LEUKOCYTES_MEAN', 'LINFOCITOS_MEAN', 'NEUTROPHILES_MEAN', 'PC02_VENOUS_MEAN',
+                                           'PCR_MEAN', 'PLATELETS_MEAN', 'SAT02_VENOUS_MEAN', 'SODIUM_MEAN', 'UREA_MEAN', 'BLOODPRESSURE_DIASTOLIC_MEAN',
+                                           'RESPIRATORY_RATE_MEAN', 'TEMPERATURE_MEAN', 'OXYGEN_SATURATION_MEAN', 'BLOODPRESSURE_SISTOLIC_MIN',
+                                           'HEART_RATE_MIN', 'RESPIRATORY_RATE_MIN', 'TEMPERATURE_MIN', 'BLOODPRESSURE_DIASTOLIC_MAX', 'BLOODPRESSURE_SISTOLIC_MAX',
+                                           'HEART_RATE_MAX', 'OXYGEN_SATURATION_MAX', 'BLOODPRESSURE_DIASTOLIC_DIFF', 'BLOODPRESSURE_SISTOLIC_DIFF', 
+                                           'HEART_RATE_DIFF', 'RESPIRATORY_RATE_DIFF', 'TEMPERATURE_DIFF', 'OXYGEN_SATURATION_DIFF', 
+                                           'AGE_PERCENTIL_10th', 'AGE_PERCENTIL_20th', 'AGE_PERCENTIL_80th', 'AGE_PERCENTIL_90th', 'ICU']]
+
+print(selected_final_data.shape)
+selected_final_data.head()
+
+corr = selected_final_data.corr()
+corr.shape
+plt.subplots(figsize=(30,30))
+ax = sns.heatmap(
+    corr, 
+    vmin=-1, vmax=1, center=0,
+    cmap=sns.diverging_palette(20, 220, n=200),
+    square=True
+)
+ax.set_xticklabels(
+    ax.get_xticklabels(),
+    rotation=90,
+    horizontalalignment='right'
+);
+corr.tail()
+
+selected_final_data.columns
+
+Non_ICU_Admitted_data = selected_final_data[selected_final_data['ICU']==0]
+ICU_Admitted_data = selected_final_data[selected_final_data['ICU']==1]
+
+Vital_Non_ICU_Admitted_data = Non_ICU_Admitted_data[['BLOODPRESSURE_DIASTOLIC_MEAN',
+       'RESPIRATORY_RATE_MEAN', 'TEMPERATURE_MEAN', 'OXYGEN_SATURATION_MEAN',
+       'BLOODPRESSURE_SISTOLIC_MIN', 'HEART_RATE_MIN', 'RESPIRATORY_RATE_MIN',
+       'TEMPERATURE_MIN', 'BLOODPRESSURE_DIASTOLIC_MAX',
+       'BLOODPRESSURE_SISTOLIC_MAX', 'HEART_RATE_MAX', 'OXYGEN_SATURATION_MAX',
+       'HEART_RATE_DIFF', 'RESPIRATORY_RATE_DIFF', 'TEMPERATURE_DIFF']]
+
+Vital_ICU_Admitted_data = ICU_Admitted_data[['BLOODPRESSURE_DIASTOLIC_MEAN',
+       'RESPIRATORY_RATE_MEAN', 'TEMPERATURE_MEAN', 'OXYGEN_SATURATION_MEAN',
+       'BLOODPRESSURE_SISTOLIC_MIN', 'HEART_RATE_MIN', 'RESPIRATORY_RATE_MIN',
+       'TEMPERATURE_MIN', 'BLOODPRESSURE_DIASTOLIC_MAX',
+       'BLOODPRESSURE_SISTOLIC_MAX', 'HEART_RATE_MAX', 'OXYGEN_SATURATION_MAX',
+       'HEART_RATE_DIFF', 'RESPIRATORY_RATE_DIFF', 'TEMPERATURE_DIFF']]
+
+
+Lab_Non_ICU_Admitted_data = Non_ICU_Admitted_data[['HTN', 'BIC_VENOUS_MEAN', 'CALCIUM_MEAN',
+       'CREATININ_MEAN', 'GLUCOSE_MEAN', 'INR_MEAN', 'LACTATE_MEAN',
+       'LEUKOCYTES_MEAN', 'LINFOCITOS_MEAN', 'NEUTROPHILES_MEAN',
+       'PC02_VENOUS_MEAN', 'PCR_MEAN', 'PLATELETS_MEAN', 'SAT02_VENOUS_MEAN',
+       'SODIUM_MEAN', 'UREA_MEAN']]
+Lab_ICU_Admitted_data = ICU_Admitted_data[['HTN', 'BIC_VENOUS_MEAN', 'CALCIUM_MEAN',
+       'CREATININ_MEAN', 'GLUCOSE_MEAN', 'INR_MEAN', 'LACTATE_MEAN',
+       'LEUKOCYTES_MEAN', 'LINFOCITOS_MEAN', 'NEUTROPHILES_MEAN',
+       'PC02_VENOUS_MEAN', 'PCR_MEAN', 'PLATELETS_MEAN', 'SAT02_VENOUS_MEAN',
+       'SODIUM_MEAN', 'UREA_MEAN']]
+
+
+# set width of bar 
+barWidth = 0.25
+fig = plt.subplots(figsize =(20, 10)) 
+   
+vital_non_ICU = np.array(Vital_Non_ICU_Admitted_data.mean(axis=0)) 
+vital_ICU = np.array(Vital_ICU_Admitted_data.mean(axis=0)) 
+   
+# Set position of bar on X axis 
+br1 = np.arange(len(vital_ICU)) + (barWidth*0.5)
+br2 = [x + barWidth for x in br1]  
+   
+# Make the plot 
+plt.bar(br2, vital_ICU, color ='r', width = barWidth, edgecolor ='grey', label ='ICU Admitted') 
+plt.bar(br1, vital_non_ICU, color ='b', width = barWidth, edgecolor ='grey', label ='NOT Admitted') 
+
+   
+plt.xlabel('Features', fontweight ='bold') 
+plt.ylabel('Normalized Values', fontweight ='bold') 
+plt.xticks([r + barWidth for r in range(len(vital_ICU))], ['BLOODPRESSURE_DIASTOLIC_MEAN',
+       'RESPIRATORY_RATE_MEAN', 'TEMPERATURE_MEAN', 'OXYGEN_SATURATION_MEAN',
+       'BLOODPRESSURE_SISTOLIC_MIN', 'HEART_RATE_MIN', 'RESPIRATORY_RATE_MIN',
+       'TEMPERATURE_MIN', 'BLOODPRESSURE_DIASTOLIC_MAX',
+       'BLOODPRESSURE_SISTOLIC_MAX', 'HEART_RATE_MAX', 'OXYGEN_SATURATION_MAX',
+       'HEART_RATE_DIFF', 'RESPIRATORY_RATE_DIFF', 'TEMPERATURE_DIFF'], rotation = 90) 
+
+plt.legend()
+plt.title("Vital Signs of Covid19 Patients")
+plt.show()
+
+
+# set width of bar 
+barWidth = 0.25
+fig = plt.subplots(figsize =(20, 10)) 
+   
+lab_non_ICU = np.array(Lab_Non_ICU_Admitted_data.mean(axis=0)) 
+lab_ICU = np.array(Lab_ICU_Admitted_data.mean(axis=0)) 
+   
+# Set position of bar on X axis 
+br1 = np.arange(len(lab_ICU)) + (barWidth*0.5)
+br2 = [x + barWidth for x in br1]  
+   
+# Make the plot 
+plt.bar(br2, lab_ICU, color ='r', width = barWidth, edgecolor ='grey', label ='ICU Admitted') 
+plt.bar(br1, lab_non_ICU, color ='b', width = barWidth, edgecolor ='grey', label ='NOT Admitted') 
+
+   
+plt.xlabel('Features', fontweight ='bold') 
+plt.ylabel('Normalized Value', fontweight ='bold') 
+plt.legend()
+plt.xticks([r + barWidth for r in range(len(lab_ICU))], ['HTN', 'BIC_VENOUS_MEAN', 'CALCIUM_MEAN',
+       'CREATININ_MEAN', 'GLUCOSE_MEAN', 'INR_MEAN', 'LACTATE_MEAN',
+       'LEUKOCYTES_MEAN', 'LINFOCITOS_MEAN', 'NEUTROPHILES_MEAN',
+       'PC02_VENOUS_MEAN', 'PCR_MEAN', 'PLATELETS_MEAN', 'SAT02_VENOUS_MEAN',
+       'SODIUM_MEAN', 'UREA_MEAN'], rotation = 90) 
+plt.title("Lab Test Results of Covid19 patients")
+plt.show()
+
+X_data = np.array(selected_final_data.drop(['ICU'], axis = 1))
+Y_data = np.array(selected_final_data[['ICU']])
+print(X_data.shape)
+print(Y_data.shape)
+from sklearn.decomposition import PCA 
+
+labels = []
+for i in Y_data:
+  if(i[0]==0):
+    labels.append(0)
+  else:
+    labels.append(1)
+print(X_data)
+Y_data = np.array(labels)
+
+#pca = PCA(0.80)
+#X_data = pca.fit_transform(X_data)
+print("pca ", X_data.shape)
+model = TSNE(n_components = 2, random_state = 0) 
+  
+tsne_data = model.fit_transform(X_data) 
+
+
+# creating a new data frame which 
+# help us in ploting the result data 
+tsne_data = np.vstack((tsne_data.T, Y_data)).T 
+tsne_df = pd.DataFrame(data = tsne_data, 
+     columns =("Dim_1", "Dim_2","label")) 
+  
+# Ploting the result of tsne 
+sns.FacetGrid(tsne_df, hue ="label", size = 6).map( 
+       plt.scatter, 'Dim_1', 'Dim_2', s = 100).add_legend() 
+  
+plt.show()
+
+selected_final_data.head()
+
+print(X_data)
+print(Y_data)
+
+"""## Training and Testing using various classifiers
+
+Importing Libraries
+"""
+
+from sklearn.linear_model import LogisticRegressionCV
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import KFold
+from sklearn.model_selection import train_test_split
+from sklearn.naive_bayes import GaussianNB
+from sklearn.linear_model import SGDClassifier
+from sklearn.preprocessing import StandardScaler
+from sklearn.pipeline import make_pipeline
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.datasets import make_classification
+from sklearn import svm
+from sklearn import tree
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.metrics import confusion_matrix
+from sklearn.metrics import roc_auc_score
+from sklearn.model_selection import GridSearchCV
+from sklearn.tree import DecisionTreeClassifier
+import matplotlib.pyplot as plt 
+from sklearn.metrics import log_loss
+from sklearn import tree
+import graphviz
+from sklearn.neural_network import MLPClassifier
+
+"""Shape of Datasets"""
+
+print(X_data.shape)
+print(Y_data.shape)
+
+def ass(y_true,y_pred):
+  tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
+  accuracy=(tp+tn)/(tp+fp+fn+tn)
+  specificity = tn/(tn+fp)
+  sensitivity=tp/(tp+fn)
+  print("Accuracy:",accuracy*100)
+  print("Sensitivity:",sensitivity*100)
+  print("Specificity:",specificity*100)
+  print("ROC_AUC_Score:",roc_auc_score(y_true, y_pred)*100)
+
+"""Splitting Data into Training Data and Testing Data"""
+
+X_train, X_test, Y_train, Y_test = train_test_split(X_data, Y_data, test_size=0.30, random_state=1)  # hold out 30% of the samples for testing; fixed seed for a repeatable split
+
+"""Performing Logistic Regression with Cross Validation Estimator"""
+
+lgc=make_pipeline(LogisticRegressionCV(cv=5,random_state=1,max_iter=5000))  # single-step pipeline wrapping the cross-validated logistic regression
+lgc.fit(X_train, Y_train)
+y_pred=lgc.predict(X_test)
+ass(Y_test,y_pred)  # print the metrics on the held-out test set
+
+"""Performing Gaussian Naive Bayes """
+
+gnb=make_pipeline(GaussianNB())  # single-step pipeline wrapping Gaussian naive Bayes
+gnb.fit(X_train,Y_train)
+y_pred=gnb.predict(X_test)
+ass(Y_test,y_pred)  # print the metrics on the held-out test set
+
+"""Finding the best random seed (SGD Classifier)"""
+
+mx=-1
+ri=-1
+for i in range(1,10000):
+  sgd= make_pipeline(SGDClassifier(random_state=i))
+  sgd.fit(X_train,Y_train)
+  pmx=mx
+  mx=max(mx,sgd.score(X_test,Y_test))
+  if(pmx!=mx):
+    ri=i
+print(ri)
+
+"""Performing SGD classifier with the best random seed"""
+
+sgd= make_pipeline(SGDClassifier(random_state=ri))  # refit with the seed kept in `ri`
+sgd.fit(X_train,Y_train)
+y_pred=sgd.predict(X_test)
+ass(Y_test,y_pred)  # print the metrics on the held-out test set
+
+"""Performing SVM (Support Vector Machine) classification on the given data"""
+
+SVM_object = make_pipeline(svm.SVC(kernel='linear'))  # single-step pipeline wrapping a linear-kernel SVM
+SVM_object.fit(X_train,Y_train)
+y_pred=SVM_object.predict(X_test)
+ass(Y_test,y_pred)  # print the metrics on the held-out test set
+
+"""Performing Decision tree classification
+
+"""
+
+DT_object=tree.DecisionTreeClassifier(criterion='entropy',max_depth=4,max_leaf_nodes=10)  # entropy splits, depth and leaf count capped
+DT_object.fit(X_train,Y_train)
+y_pred=DT_object.predict(X_test)
+ass(Y_test,y_pred)  # print the metrics on the held-out test set
+
+from sklearn import tree
+import graphviz
+text_representation = tree.export_text(DT_object)  # plain-text dump of the fitted tree
+print(text_representation)
+
+features=['AGE_ABOVE65', 'DISEASE GROUPING 2', 'DISEASE GROUPING 3',  # feature names passed to the graphviz export below
+       'DISEASE GROUPING 4', 'HTN', 'BIC_VENOUS_MEAN', 'CALCIUM_MEAN',
+       'CREATININ_MEAN', 'GLUCOSE_MEAN', 'INR_MEAN', 'LACTATE_MEAN',
+       'LEUKOCYTES_MEAN', 'LINFOCITOS_MEAN', 'NEUTROPHILES_MEAN',
+       'PC02_VENOUS_MEAN', 'PCR_MEAN', 'PLATELETS_MEAN', 'SAT02_VENOUS_MEAN',
+       'SODIUM_MEAN', 'UREA_MEAN', 'BLOODPRESSURE_DIASTOLIC_MEAN',
+       'RESPIRATORY_RATE_MEAN', 'TEMPERATURE_MEAN', 'OXYGEN_SATURATION_MEAN',
+       'BLOODPRESSURE_SISTOLIC_MIN', 'HEART_RATE_MIN', 'RESPIRATORY_RATE_MIN',
+       'TEMPERATURE_MIN', 'BLOODPRESSURE_DIASTOLIC_MAX',
+       'BLOODPRESSURE_SISTOLIC_MAX', 'HEART_RATE_MAX', 'OXYGEN_SATURATION_MAX',
+       'BLOODPRESSURE_DIASTOLIC_DIFF', 'BLOODPRESSURE_SISTOLIC_DIFF',
+       'HEART_RATE_DIFF', 'RESPIRATORY_RATE_DIFF', 'TEMPERATURE_DIFF',
+       'OXYGEN_SATURATION_DIFF', 'AGE_PERCENTIL_10th', 'AGE_PERCENTIL_20th',
+       'AGE_PERCENTIL_80th', 'AGE_PERCENTIL_90th']
+classes=['Non-ICU','ICU']  # class names passed to the graphviz export below
+dot_data = tree.export_graphviz(DT_object, out_file=None,  # out_file=None: the DOT source is returned rather than written to a file
+                                feature_names=features,  
+                                class_names=classes,
+                                filled=True)
+graph = graphviz.Source(dot_data, format="png") 
+graph  # notebook-style display of the rendered tree; has no effect when run as a script
+
+"""Performing K-Nearest Neighbour Classifier 
+
+"""
+
+KNN_object=make_pipeline(KNeighborsClassifier(n_neighbors=25,p=1))  # 25 neighbours, p=1 (Manhattan distance)
+KNN_object.fit(X_train,Y_train)
+y_pred=KNN_object.predict(X_test)
+ass(Y_test,y_pred)  # print the metrics on the held-out test set
+
+"""Performing Random Forest Classifier"""
+
+RF_object = RandomForestClassifier(criterion='gini',random_state=23,max_depth=6,bootstrap=True)  # fixed seed, depth capped at 6
+RF_object.fit(X_train,Y_train)
+y_pred=RF_object.predict(X_test)
+ass(Y_test,y_pred)  # print the metrics on the held-out test set
+
+"""##Performing Grid Search on Various ML Algorithm
+
+Grid Search on Decision Tree
+"""
+
+param_grid = {'criterion':['entropy','gini'],'max_depth':np.arange(1,30),'max_leaf_nodes':np.arange(3,20),'random_state':[1,2]}
+GS_DT=GridSearchCV(DecisionTreeClassifier(), param_grid,cv=5)
+GS_DT.fit(X_train,Y_train)
+GS_DT.best_params_
+
+GS_DT.score(X_test,Y_test)
+
+dt_train_score=[]
+dt_test_score=[]
+for i in np.arange(1, 30):
+  param_grid = {'criterion':['entropy','gini'],'max_depth': [i],'max_leaf_nodes':np.arange(3,20),'random_state':[1,2]}
+  GS_DT=GridSearchCV(DecisionTreeClassifier(), param_grid,cv=5)
+  GS_DT.fit(X_train,Y_train)
+  y_train_pred=GS_DT.predict(X_train)
+  y_pred=GS_DT.predict(X_test)
+  dt_train_score.append(log_loss(Y_train,y_train_pred))
+  dt_test_score.append(log_loss(Y_test,y_pred))
+
+plt.title("Decision Tree Classifier : Error vs Depth")
+plt.xlabel("Depth")
+plt.ylabel("Error")
+plt.plot(np.arange(1,30),dt_train_score,label="Training Error")
+plt.plot(np.arange(1,30),dt_test_score,label="Testing Error")
+plt.legend()
+plt.plot()
+
+""" Best kernel Performance using Grid Search"""
+
+param_grid = {'kernel':['linear','poly','sigmoid','rbf'],'gamma':['scale','auto'],'random_state':[1,2,3]}
+GS_SVM=GridSearchCV(svm.SVC(), param_grid,cv=5)
+GS_SVM.fit(X_train,Y_train)
+GS_SVM.best_params_
+
+GS_SVM.score(X_test,Y_test)
+
+dt_train_score=[]
+dt_test_score=[]
+for i in ['linear','poly','sigmoid','rbf']:
+  param_grid = {'kernel':[i],'gamma':['scale','auto'],'random_state':[1,2,3]}
+  GS_SVM=GridSearchCV(svm.SVC(), param_grid,cv=5)
+  GS_SVM.fit(X_train,Y_train)
+  y_train_pred=GS_SVM.predict(X_train)
+  y_pred=GS_SVM.predict(X_test)
+  dt_train_score.append(log_loss(Y_train,y_train_pred))
+  dt_test_score.append(log_loss(Y_test,y_pred))
+
+plt.title("SVM: Error vs kernel")
+plt.xlabel("Kernel")
+plt.ylabel("Error")
+plt.plot(['linear','poly','sigmoid','rbf'],dt_train_score,label="Training Error")
+plt.plot(['linear','poly','sigmoid','rbf'],dt_test_score,label="Testing Error")
+plt.legend()
+plt.plot()
+
+"""Grid Search on K nearest neighbour"""
+
+param_grid = {'n_neighbors':[10,15,20,25,30,35,40],'leaf_size':np.arange(3,20),'p':[1,2]}
+GS_KNN=GridSearchCV(KNeighborsClassifier(), param_grid,cv=5)
+GS_KNN.fit(X_train,Y_train)
+GS_KNN.best_params_
+
+GS_KNN.score(X_test,Y_test)
+
+knn_train_score=[]
+knn_test_score=[]
+for i in [10,15,20,25,30,35,40]:
+  param_grid = {'n_neighbors': [i],'leaf_size':np.arange(3,20),'p':[1,2]}
+  GS_KNN=GridSearchCV(KNeighborsClassifier(), param_grid,cv=5)
+  GS_KNN.fit(X_train,Y_train)
+  y_train_pred=GS_KNN.predict(X_train)
+  y_pred=GS_KNN.predict(X_test)
+  knn_train_score.append(log_loss(Y_train,y_train_pred))
+  knn_test_score.append(log_loss(Y_test,y_pred))
+
+plt.title("K-Neighbours Classifier: Error vs Number of Neighbors ")
+plt.xlabel("Number of Neighbors")
+plt.ylabel("Error")
+plt.plot([10,15,20,25,30,35,40],knn_train_score,label="Training Error")
+plt.plot([10,15,20,25,30,35,40],knn_test_score,label="Testing Error")
+plt.legend()
+plt.plot()
+
+"""Grid search on Random Forest Classifier"""
+
+param_grid = {'criterion':['gini','entropy'],'max_depth': [6],'random_state':[23]}
+GS_RF=GridSearchCV(RandomForestClassifier(), param_grid,cv=5)
+GS_RF.fit(X_train,Y_train)
+GS_RF.best_params_
+
+GS_RF.score(X_test,Y_test)
+
+rf_train_score=[]
+rf_test_score=[]
+for i in np.arange(1, 30):
+  param_grid = {'criterion':['gini','entropy'],'max_depth': [i],'random_state':[23]}
+  GS_RF=GridSearchCV(RandomForestClassifier(), param_grid,cv=5)
+  GS_RF.fit(X_train,Y_train)
+  y_train_pred=GS_RF.predict(X_train)
+  y_pred=GS_RF.predict(X_test)
+  rf_train_score.append(log_loss(Y_train,y_train_pred))
+  rf_test_score.append(log_loss(Y_test,y_pred))
+
+plt.title("Random Forest Classifier : Error vs Max Depth")
+plt.xlabel("Max Depth")
+plt.ylabel("Error")
+plt.plot(np.arange(1,30),rf_train_score,label="Training Error")
+plt.plot(np.arange(1,30),rf_test_score,label="Testing Error")
+plt.legend()
+plt.plot()
+
+"""Training model with different activation functions and finding model with best accuracy"""
+
+best=1
+acc=-1
+for a in ["identity", "logistic", "tanh", "relu"]:
+    model = MLPClassifier(activation=a,max_iter=10000, batch_size=64,alpha=0.1,random_state=1).fit(X_train,Y_train)
+    y_pred = model.predict(X_test)
+    print(a)
+    ass(Y_test,y_pred)
+    score = model.score(X_test,Y_test)
+    if score>acc:
+      acc=score
+      best = a
+    #print(a," - ",model.score(X_test,Y_test))
+print(best,acc)
+
+"""Performing Grid search on the model we got from the above"""
+
+rf_train_score=[]
+rf_test_score=[]
+a=[0.001,0.01,0.1]
+for i in range(len(a)):
+  param_grid = {'activation':[best],'max_iter': [10000],'batch_size':[64],'alpha':[0.1],'learning_rate_init':[a[i]],'random_state':[1]}
+  GS=GridSearchCV(MLPClassifier(), param_grid)
+  GS.fit(X_train,Y_train)
+  y_train_pred=GS.predict(X_train)
+  y_pred=GS.predict(X_test)
+  rf_train_score.append(log_loss(Y_train,y_train_pred))
+  rf_test_score.append(log_loss(Y_test,y_pred))
+
+plt.title(" MLPClassifier Error vs Learning rate")
+plt.xlabel("Learning rate")
+plt.ylabel("Error")
+plt.plot([0.001,0.01,0.1],rf_train_score,label="Training Error")
+plt.plot([0.001,0.01,0.1],rf_test_score,label="Testing Error")
+plt.legend()
+plt.plot()
\ No newline at end of file