--- /dev/null
+++ b/covid19_icu_prediction.py
@@ -0,0 +1,734 @@
+# -*- coding: utf-8 -*-
+"""COVID19_ICU_Prediction.ipynb
+
+Automatically generated by Colaboratory.
+
+Original file is located at
+    https://colab.research.google.com/drive/11cMcxeMqpI_dQjuo31iPkSDOf0kTSWHP
+
+#**Machine Learning Project**
+
+***Title: Predicting ICU admission of confirmed COVID-19 cases***
+
+The COVID-19 pandemic has exposed the unpreparedness of our current
+healthcare systems and services, so the allocation of medical resources
+must be optimized to maximize their utilization. We build this machine
+learning model on the clinical data of confirmed COVID-19 cases to
+predict, in advance, whether a patient will need ICU admission. With
+this information, hospitals can plan their flow of operations and make
+critical decisions in time, such as transferring a patient to another
+hospital or rearranging internal resources, so that patients' lives can
+be saved.
+
+##Libraries and Packages
+List of all the packages that are used in the notebook.
+"""
+
+import tensorflow as tf
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.manifold import TSNE
+from sklearn.decomposition import PCA
+
+pd.set_option('display.max_columns', None)
+
+"""Downloading the dataset"""
+
+!wget -O "Kaggle_Sirio_Libanes_ICU_Prediction.xlsx" "https://drive.google.com/uc?export=download&id=1_shaH6SQajy1zrnALzim9jGaRmF3PLIn"
+
+"""##Reading Dataset
+Reading the dataset from the downloaded Excel file.
+"""
+
+data = pd.read_excel("Kaggle_Sirio_Libanes_ICU_Prediction.xlsx")
+data
+
+"""##Data Pre-Processing
+Converting the data into a usable format. The following modifications
+are made to the data to get the most out of it (a small sanity check of
+the labeling step appears after the code below):
+1. One-hot encoding to convert the non-numeric (object) columns.
+2. Marking window 0-2 with label 1 if the patient was admitted to the ICU in any of the later windows.
+3. Removing all records of the windows in which the patient was already in the ICU (windows with ICU label 1 before step 2).
+4. Filling the NaN values of window 0-2 with the mean of the values over all windows of that patient.
+5. Removing all rows that still contain NaN values.
+"""
+
+print(data.dtypes)
+data.select_dtypes(object)
+
+without_ICU_column = data.drop('ICU', axis = 1)  # separating the ICU label column
+ICU_column = data['ICU']
+columns_to_convert = data.select_dtypes(object).columns  # columns that are not of type float or int
+columns_to_convert
+
+without_ICU_column = pd.get_dummies(without_ICU_column, columns = columns_to_convert)  # one-hot encoding
+without_ICU_column.head()
+
+data_expand = pd.concat([without_ICU_column, ICU_column], axis = 1)  # re-attaching the ICU column at the last position
+data_expand.head(5)
+
+column_names = data_expand.columns
+arr = data_expand.to_numpy()
+print(arr)
+# Each patient has 5 consecutive rows (windows 0-2, 2-4, 4-6, 6-12, above-12).
+# For every patient, find the first window with ICU == 1, propagate label 1 to
+# the earlier windows, and record the rows from that window onward for removal.
+i = 0
+ICU_admitted_rows = []
+while(i < len(arr)):
+    for j in range(5):
+        if(arr[i+j][-1] == 1):
+            for k in range(j):
+                arr[i+k][-1] = 1
+            for toremove in range(i+j, i+5):
+                ICU_admitted_rows.append(toremove)
+            break
+    i += 5
+print(ICU_admitted_rows)
+deletedcount = 0
+for rowToRemove in ICU_admitted_rows:  # removing the rows in which the patient was already in the ICU
+    arr = np.delete(arr, rowToRemove-deletedcount, axis=0)
+    deletedcount += 1
+df = pd.DataFrame(arr, columns = column_names)
+df.head(10)
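+"""A quick sanity check of the window-labeling step above, run on a tiny
+hand-made array. This is an illustrative sketch only: the 3-column toy
+array and its values are made up and are not part of the dataset.
+"""
+
+toy = np.array([
+    [0, 0.1, 0],  # patient A, window 0-2
+    [0, 0.2, 0],  # patient A, window 2-4
+    [0, 0.3, 1],  # patient A, window 4-6 -> first ICU window
+    [0, 0.4, 1],
+    [0, 0.5, 1],
+    [1, 0.1, 0],  # patient B, never admitted
+    [1, 0.2, 0],
+    [1, 0.3, 0],
+    [1, 0.4, 0],
+    [1, 0.5, 0],
+])
+rows_to_drop = []
+for start in range(0, len(toy), 5):
+    for j in range(5):
+        if toy[start + j][-1] == 1:
+            toy[start:start + j, -1] = 1                       # propagate the label to earlier windows
+            rows_to_drop.extend(range(start + j, start + 5))   # drop the in-ICU windows
+            break
+toy = np.delete(toy, rows_to_drop, axis=0)
+print(toy)  # patient A keeps windows 0-2 and 2-4, both now labeled 1; patient B is untouched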
+# Filling missing values: keep only the first window (0-2) for every patient
+# and fill its NaN values with the mean over all of that patient's windows.
+pd.options.mode.chained_assignment = None  # silence SettingWithCopy warnings for the per-patient edits
+edited_dfs_list = []
+max_patient_id = df['PATIENT_VISIT_IDENTIFIER'].max()
+for i in range(int(max_patient_id) + 1):  # +1 so the last patient is included
+    tempdf = df[df['PATIENT_VISIT_IDENTIFIER'] == i]
+    if(len(tempdf) != 0):
+        tempdf.fillna(tempdf.mean(), inplace=True)
+        tempdf = tempdf.iloc[[0]]
+        edited_dfs_list.append(tempdf)
+
+final_data = pd.concat(edited_dfs_list)
+final_data.head(30)
+
+final_data = final_data.drop(['GENDER','PATIENT_VISIT_IDENTIFIER','WINDOW_0-2', 'WINDOW_2-4', 'WINDOW_4-6', 'WINDOW_6-12', 'WINDOW_ABOVE_12'], axis = 1)
+final_data.head()
+
+final_data.describe()
+
+final_data = final_data.dropna(axis = 0)  # drop the rows that still have NaN values: there is no data in any window to fill them
+
+"""##Data Analysis
+Visualising the pre-processed data and trying to build intuition about its different characteristics.
+"""
+
+final_data.describe()
+
+ICU_admission_distribution = final_data['ICU'].value_counts()
+print("Total patients after pre-processing: ", sum(ICU_admission_distribution))
+print("Distribution of ICU admissions")
+print("Patients who were not admitted to ICU: ", ICU_admission_distribution[0])
+print("Patients who were admitted to ICU: ", ICU_admission_distribution[1])
+labels = ['Admitted to ICU', 'Not Admitted to ICU']
+colors = ['tomato', 'deepskyblue']
+sizes = [ICU_admission_distribution[1], ICU_admission_distribution[0]]
+plt.pie(sizes, labels=labels, colors=colors, startangle=90, autopct='%1.1f%%')
+plt.title("ICU Distribution of data")
+plt.axis('equal')
+plt.show()
+
+Age_distribution = final_data['AGE_ABOVE65'].value_counts()
+print("Age Distribution")
+print("Patients below age 65: ", Age_distribution[0])
+print("Patients above age 65: ", Age_distribution[1])
+labels = ['Below 65', 'Above 65']
+colors = ['lightgreen', 'violet']
+sizes = [Age_distribution[0], Age_distribution[1]]
+plt.pie(sizes, labels=labels, colors=colors, startangle=90, autopct='%1.1f%%')
+plt.axis('equal')
+plt.title("Age Distribution of data")
+plt.show()
+
+ICU_Admitted_data = final_data[final_data['ICU'] == 1]
+Age_distribution = ICU_Admitted_data['AGE_ABOVE65'].value_counts()
+print("Age Distribution")
+print("Patients below age 65: ", Age_distribution[0])
+print("Patients above age 65: ", Age_distribution[1])
+labels = ['Below 65', 'Above 65']
+colors = ['orange', 'cyan']
+sizes = [Age_distribution[0], Age_distribution[1]]
+plt.pie(sizes, labels=labels, colors=colors, startangle=90, autopct='%1.1f%%')
+plt.axis('equal')
+plt.title("Age Distribution of ICU Admitted patients")
+plt.show()
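+"""The same breakdown can be read off a single contingency table. A
+minimal sketch assuming the `final_data` frame produced above; this is a
+convenience view added here, not part of the original analysis.
+"""
+
+# Rows: age group, columns: ICU outcome, values: patient counts.
+print(pd.crosstab(final_data['AGE_ABOVE65'], final_data['ICU'], margins=True))
+
+# Share of ICU admissions within each age group.
+print(pd.crosstab(final_data['AGE_ABOVE65'], final_data['ICU'], normalize='index'))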
+# Age-percentile distribution: count the patients in each decile band, for
+# all patients (x[0]) and for ICU-admitted patients (x[1]).
+x = [[], []]
+x[0].append(final_data['AGE_PERCENTIL_10th'].value_counts()[1])
+x[0].append(final_data['AGE_PERCENTIL_20th'].value_counts()[1])
+x[0].append(final_data['AGE_PERCENTIL_30th'].value_counts()[1])
+x[0].append(final_data['AGE_PERCENTIL_40th'].value_counts()[1])
+x[0].append(final_data['AGE_PERCENTIL_50th'].value_counts()[1])
+x[0].append(final_data['AGE_PERCENTIL_60th'].value_counts()[1])
+x[0].append(final_data['AGE_PERCENTIL_70th'].value_counts()[1])
+x[0].append(final_data['AGE_PERCENTIL_80th'].value_counts()[1])
+x[0].append(final_data['AGE_PERCENTIL_90th'].value_counts()[1])
+x[0].append(final_data['AGE_PERCENTIL_Above 90th'].value_counts()[1])
+
+x[1].append(ICU_Admitted_data['AGE_PERCENTIL_10th'].value_counts()[1])
+x[1].append(ICU_Admitted_data['AGE_PERCENTIL_20th'].value_counts()[1])
+x[1].append(ICU_Admitted_data['AGE_PERCENTIL_30th'].value_counts()[1])
+x[1].append(ICU_Admitted_data['AGE_PERCENTIL_40th'].value_counts()[1])
+x[1].append(ICU_Admitted_data['AGE_PERCENTIL_50th'].value_counts()[1])
+x[1].append(ICU_Admitted_data['AGE_PERCENTIL_60th'].value_counts()[1])
+x[1].append(ICU_Admitted_data['AGE_PERCENTIL_70th'].value_counts()[1])
+x[1].append(ICU_Admitted_data['AGE_PERCENTIL_80th'].value_counts()[1])
+x[1].append(ICU_Admitted_data['AGE_PERCENTIL_90th'].value_counts()[1])
+x[1].append(ICU_Admitted_data['AGE_PERCENTIL_Above 90th'].value_counts()[1])
+
+# Expand the counts into repeated values so that plt.hist draws the frequencies.
+a = []
+c = 1
+for i in x[0]:
+    a.extend([c*10]*i)
+    c += 1
+plt.hist(a, 20, label='Total')
+b = []
+c = 1
+for i in x[1]:
+    b.extend([c*10]*i)
+    c += 1
+print(x)
+plt.hist(b, 20, label='ICU Admitted')
+plt.xticks([10,20,30,40,50,60,70,80,90,100], ['AGE_PERCENTIL_10th','AGE_PERCENTIL_20th','AGE_PERCENTIL_30th','AGE_PERCENTIL_40th','AGE_PERCENTIL_50th','AGE_PERCENTIL_60th','AGE_PERCENTIL_70th','AGE_PERCENTIL_80th','AGE_PERCENTIL_90th','AGE_PERCENTIL_Above 90th'], rotation = 70)
+plt.legend()
+plt.ylabel('Frequency')
+plt.title('Age Distribution Total and ICU Admitted')
+plt.show()
+
+Disease_Grouping_1 = final_data['DISEASE GROUPING 1'].value_counts()
+Disease_Grouping_2 = final_data['DISEASE GROUPING 2'].value_counts()
+Disease_Grouping_3 = final_data['DISEASE GROUPING 3'].value_counts()
+Disease_Grouping_4 = final_data['DISEASE GROUPING 4'].value_counts()
+Disease_Grouping_5 = final_data['DISEASE GROUPING 5'].value_counts()
+Disease_Grouping_6 = final_data['DISEASE GROUPING 6'].value_counts()
+HTN_total = final_data['HTN'].value_counts()
+Immunocompromised_total = final_data['IMMUNOCOMPROMISED'].value_counts()
+Other_total = final_data['OTHER'].value_counts()
+
+ICU_Disease_Grouping_1 = ICU_Admitted_data['DISEASE GROUPING 1'].value_counts()
+ICU_Disease_Grouping_2 = ICU_Admitted_data['DISEASE GROUPING 2'].value_counts()
+ICU_Disease_Grouping_3 = ICU_Admitted_data['DISEASE GROUPING 3'].value_counts()
+ICU_Disease_Grouping_4 = ICU_Admitted_data['DISEASE GROUPING 4'].value_counts()
+ICU_Disease_Grouping_5 = ICU_Admitted_data['DISEASE GROUPING 5'].value_counts()
+ICU_Disease_Grouping_6 = ICU_Admitted_data['DISEASE GROUPING 6'].value_counts()
+HTN_ICU = ICU_Admitted_data['HTN'].value_counts()
+Immunocompromised_ICU = ICU_Admitted_data['IMMUNOCOMPROMISED'].value_counts()
+Other_ICU = ICU_Admitted_data['OTHER'].value_counts()
+
+x = np.array([[Disease_Grouping_1[1], Disease_Grouping_2[1], Disease_Grouping_3[1], Disease_Grouping_4[1], Disease_Grouping_5[1], Disease_Grouping_6[1], HTN_total[1], Immunocompromised_total[1]],
+              [ICU_Disease_Grouping_1[1], ICU_Disease_Grouping_2[1], ICU_Disease_Grouping_3[1], ICU_Disease_Grouping_4[1], ICU_Disease_Grouping_5[1], ICU_Disease_Grouping_6[1], HTN_ICU[1], Immunocompromised_ICU[1]]])
+a = []
+c = 1
+for i in x[0]:
+    a.extend([c]*i)
+    c += 1
+plt.hist(a, 15, label='Total')
+b = []
+c = 1
+for i in x[1]:
+    b.extend([c]*i)
+    c += 1
+print(x)
+plt.hist(b, 15, label='ICU Admitted')
+plt.xticks([1,2,3,4,5,6,7,8], ['Disease_Grouping_1','Disease_Grouping_2','Disease_Grouping_3','Disease_Grouping_4','Disease_Grouping_5','Disease_Grouping_6','Hypertension','Immunocompromised'], rotation = 70)
+plt.legend()
+plt.ylabel('Frequency')
+plt.title('Disease Distribution Total and ICU Admitted')
+plt.show()
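+"""The extend-then-hist pattern above can be expressed more directly as a
+grouped bar chart, since the category counts are already known. A minimal
+sketch; `grouped_freq_plot` is a helper introduced here for illustration
+only, shown on the disease counts computed above.
+"""
+
+def grouped_freq_plot(names, total_counts, icu_counts, title):
+    # Draw side-by-side bars from pre-computed counts instead of expanding
+    # them into repeated values for plt.hist.
+    pos = np.arange(len(names))
+    width = 0.4
+    plt.bar(pos - width/2, total_counts, width, label='Total')
+    plt.bar(pos + width/2, icu_counts, width, label='ICU Admitted')
+    plt.xticks(pos, names, rotation=70)
+    plt.ylabel('Frequency')
+    plt.title(title)
+    plt.legend()
+    plt.show()
+
+grouped_freq_plot(['Disease_Grouping_1','Disease_Grouping_2','Disease_Grouping_3',
+                   'Disease_Grouping_4','Disease_Grouping_5','Disease_Grouping_6',
+                   'Hypertension','Immunocompromised'],
+                  x[0], x[1], 'Disease Distribution Total and ICU Admitted')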
+import seaborn as sns
+corr = final_data.corr()
+corr.shape
+plt.subplots(figsize=(100,100))
+ax = sns.heatmap(
+    corr,
+    vmin=-1, vmax=1, center=0,
+    cmap=sns.diverging_palette(20, 220, n=200),
+    square=True
+)
+ax.set_xticklabels(
+    ax.get_xticklabels(),
+    rotation=90,
+    horizontalalignment='right'
+);
+corr.tail()
+
+corr.shape
+ICU_corr = corr['ICU']  # correlation of every feature with the ICU label (the last column, row 236)
+ICU_corr.describe()
+
+ICU_corr = np.array(ICU_corr)
+selection = []
+for i in ICU_corr:  # keep features whose correlation with ICU crosses the thresholds
+    selection.append(bool(i > 0.11 or i < -0.12))  # NaN comparisons are False, so NaN columns are dropped
+
+print(len(selection), selection.count(True))
+selection = np.array(selection)
+selected_final_data = final_data.loc[:, selection]
+selected_final_data.head()
+
+selected_final_data = selected_final_data[['AGE_ABOVE65', 'DISEASE GROUPING 2', 'DISEASE GROUPING 3', 'DISEASE GROUPING 4',
+    'HTN', 'BIC_VENOUS_MEAN', 'CALCIUM_MEAN', 'CREATININ_MEAN', 'GLUCOSE_MEAN', 'INR_MEAN',
+    'LACTATE_MEAN', 'LEUKOCYTES_MEAN', 'LINFOCITOS_MEAN', 'NEUTROPHILES_MEAN', 'PC02_VENOUS_MEAN',
+    'PCR_MEAN', 'PLATELETS_MEAN', 'SAT02_VENOUS_MEAN', 'SODIUM_MEAN', 'UREA_MEAN', 'BLOODPRESSURE_DIASTOLIC_MEAN',
+    'RESPIRATORY_RATE_MEAN', 'TEMPERATURE_MEAN', 'OXYGEN_SATURATION_MEAN', 'BLOODPRESSURE_SISTOLIC_MIN',
+    'HEART_RATE_MIN', 'RESPIRATORY_RATE_MIN', 'TEMPERATURE_MIN', 'BLOODPRESSURE_DIASTOLIC_MAX', 'BLOODPRESSURE_SISTOLIC_MAX',
+    'HEART_RATE_MAX', 'OXYGEN_SATURATION_MAX', 'BLOODPRESSURE_DIASTOLIC_DIFF', 'BLOODPRESSURE_SISTOLIC_DIFF',
+    'HEART_RATE_DIFF', 'RESPIRATORY_RATE_DIFF', 'TEMPERATURE_DIFF', 'OXYGEN_SATURATION_DIFF',
+    'AGE_PERCENTIL_10th', 'AGE_PERCENTIL_20th', 'AGE_PERCENTIL_80th', 'AGE_PERCENTIL_90th', 'ICU']]
+
+print(selected_final_data.shape)
+selected_final_data.head()
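+"""The same threshold filter can be written directly on the pandas Series,
+without converting to a NumPy array. A minimal sketch assuming the `corr`
+matrix above; the thresholds 0.11 and -0.12 are the ones chosen in this
+notebook.
+"""
+
+icu_corr = corr['ICU']
+selected_cols = icu_corr[(icu_corr > 0.11) | (icu_corr < -0.12)].index  # NaN correlations drop out automatically
+print(len(selected_cols), "columns selected")
+final_data[selected_cols].head()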
+corr = selected_final_data.corr()
+corr.shape
+plt.subplots(figsize=(30,30))
+ax = sns.heatmap(
+    corr,
+    vmin=-1, vmax=1, center=0,
+    cmap=sns.diverging_palette(20, 220, n=200),
+    square=True
+)
+ax.set_xticklabels(
+    ax.get_xticklabels(),
+    rotation=90,
+    horizontalalignment='right'
+);
+corr.tail()
+
+selected_final_data.columns
+
+Non_ICU_Admitted_data = selected_final_data[selected_final_data['ICU']==0]
+ICU_Admitted_data = selected_final_data[selected_final_data['ICU']==1]
+
+Vital_Non_ICU_Admitted_data = Non_ICU_Admitted_data[['BLOODPRESSURE_DIASTOLIC_MEAN',
+    'RESPIRATORY_RATE_MEAN', 'TEMPERATURE_MEAN', 'OXYGEN_SATURATION_MEAN',
+    'BLOODPRESSURE_SISTOLIC_MIN', 'HEART_RATE_MIN', 'RESPIRATORY_RATE_MIN',
+    'TEMPERATURE_MIN', 'BLOODPRESSURE_DIASTOLIC_MAX',
+    'BLOODPRESSURE_SISTOLIC_MAX', 'HEART_RATE_MAX', 'OXYGEN_SATURATION_MAX',
+    'HEART_RATE_DIFF', 'RESPIRATORY_RATE_DIFF', 'TEMPERATURE_DIFF']]
+
+Vital_ICU_Admitted_data = ICU_Admitted_data[['BLOODPRESSURE_DIASTOLIC_MEAN',
+    'RESPIRATORY_RATE_MEAN', 'TEMPERATURE_MEAN', 'OXYGEN_SATURATION_MEAN',
+    'BLOODPRESSURE_SISTOLIC_MIN', 'HEART_RATE_MIN', 'RESPIRATORY_RATE_MIN',
+    'TEMPERATURE_MIN', 'BLOODPRESSURE_DIASTOLIC_MAX',
+    'BLOODPRESSURE_SISTOLIC_MAX', 'HEART_RATE_MAX', 'OXYGEN_SATURATION_MAX',
+    'HEART_RATE_DIFF', 'RESPIRATORY_RATE_DIFF', 'TEMPERATURE_DIFF']]
+
+Lab_Non_ICU_Admitted_data = Non_ICU_Admitted_data[['HTN', 'BIC_VENOUS_MEAN', 'CALCIUM_MEAN',
+    'CREATININ_MEAN', 'GLUCOSE_MEAN', 'INR_MEAN', 'LACTATE_MEAN',
+    'LEUKOCYTES_MEAN', 'LINFOCITOS_MEAN', 'NEUTROPHILES_MEAN',
+    'PC02_VENOUS_MEAN', 'PCR_MEAN', 'PLATELETS_MEAN', 'SAT02_VENOUS_MEAN',
+    'SODIUM_MEAN', 'UREA_MEAN']]
+Lab_ICU_Admitted_data = ICU_Admitted_data[['HTN', 'BIC_VENOUS_MEAN', 'CALCIUM_MEAN',
+    'CREATININ_MEAN', 'GLUCOSE_MEAN', 'INR_MEAN', 'LACTATE_MEAN',
+    'LEUKOCYTES_MEAN', 'LINFOCITOS_MEAN', 'NEUTROPHILES_MEAN',
+    'PC02_VENOUS_MEAN', 'PCR_MEAN', 'PLATELETS_MEAN', 'SAT02_VENOUS_MEAN',
+    'SODIUM_MEAN', 'UREA_MEAN']]
+
+# set width of bar
+barWidth = 0.25
+fig, ax = plt.subplots(figsize=(20, 10))
+
+vital_non_ICU = np.array(Vital_Non_ICU_Admitted_data.mean(axis=0))
+vital_ICU = np.array(Vital_ICU_Admitted_data.mean(axis=0))
+
+# set position of bars on the x axis
+br1 = np.arange(len(vital_ICU)) + (barWidth*0.5)
+br2 = [x + barWidth for x in br1]
+
+# make the plot
+plt.bar(br2, vital_ICU, color='r', width=barWidth, edgecolor='grey', label='ICU Admitted')
+plt.bar(br1, vital_non_ICU, color='b', width=barWidth, edgecolor='grey', label='NOT Admitted')
+
+plt.xlabel('Features', fontweight='bold')
+plt.ylabel('Normalized Values', fontweight='bold')
+plt.xticks([r + barWidth for r in range(len(vital_ICU))], ['BLOODPRESSURE_DIASTOLIC_MEAN',
+    'RESPIRATORY_RATE_MEAN', 'TEMPERATURE_MEAN', 'OXYGEN_SATURATION_MEAN',
+    'BLOODPRESSURE_SISTOLIC_MIN', 'HEART_RATE_MIN', 'RESPIRATORY_RATE_MIN',
+    'TEMPERATURE_MIN', 'BLOODPRESSURE_DIASTOLIC_MAX',
+    'BLOODPRESSURE_SISTOLIC_MAX', 'HEART_RATE_MAX', 'OXYGEN_SATURATION_MAX',
+    'HEART_RATE_DIFF', 'RESPIRATORY_RATE_DIFF', 'TEMPERATURE_DIFF'], rotation = 90)
+
+plt.legend()
+plt.title("Vital Signs of COVID-19 Patients")
+plt.show()
+
+# set width of bar
+barWidth = 0.25
+fig, ax = plt.subplots(figsize=(20, 10))
+
+lab_non_ICU = np.array(Lab_Non_ICU_Admitted_data.mean(axis=0))
+lab_ICU = np.array(Lab_ICU_Admitted_data.mean(axis=0))
+
+# set position of bars on the x axis
+br1 = np.arange(len(lab_ICU)) + (barWidth*0.5)
+br2 = [x + barWidth for x in br1]
+
+# make the plot
+plt.bar(br2, lab_ICU, color='r', width=barWidth, edgecolor='grey', label='ICU Admitted')
+plt.bar(br1, lab_non_ICU, color='b', width=barWidth, edgecolor='grey', label='NOT Admitted')
+
+plt.xlabel('Features', fontweight='bold')
+plt.ylabel('Normalized Values', fontweight='bold')
+plt.legend()
+plt.xticks([r + barWidth for r in range(len(lab_ICU))], ['HTN', 'BIC_VENOUS_MEAN', 'CALCIUM_MEAN',
+    'CREATININ_MEAN', 'GLUCOSE_MEAN', 'INR_MEAN', 'LACTATE_MEAN',
+    'LEUKOCYTES_MEAN', 'LINFOCITOS_MEAN', 'NEUTROPHILES_MEAN',
+    'PC02_VENOUS_MEAN', 'PCR_MEAN', 'PLATELETS_MEAN', 'SAT02_VENOUS_MEAN',
+    'SODIUM_MEAN', 'UREA_MEAN'], rotation = 90)
+plt.title("Lab Test Results of COVID-19 Patients")
+plt.show()
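+"""Instead of eyeballing the grouped bars, the gap between the two cohorts
+can be ranked directly. A minimal sketch assuming the `Vital_*` frames
+above; the ranking itself is an addition, not part of the original
+analysis.
+"""
+
+# Absolute difference of the group means per vital sign, largest first.
+mean_gap = (Vital_ICU_Admitted_data.mean() - Vital_Non_ICU_Admitted_data.mean()).abs()
+print(mean_gap.sort_values(ascending=False))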
+X_data = np.array(selected_final_data.drop(['ICU'], axis = 1))
+Y_data = np.array(selected_final_data[['ICU']])
+print(X_data.shape)
+print(Y_data.shape)
+
+labels = []
+for i in Y_data:  # flatten the label column into a 1-D list of 0/1
+    if(i[0]==0):
+        labels.append(0)
+    else:
+        labels.append(1)
+print(X_data)
+Y_data = np.array(labels)
+
+# PCA can optionally be applied before t-SNE to reduce the dimensionality:
+#pca = PCA(0.80)
+#X_data = pca.fit_transform(X_data)
+print("t-SNE input shape: ", X_data.shape)
+model = TSNE(n_components = 2, random_state = 0)
+
+tsne_data = model.fit_transform(X_data)
+
+# creating a new data frame which helps in plotting the result
+tsne_data = np.vstack((tsne_data.T, Y_data)).T
+tsne_df = pd.DataFrame(data = tsne_data,
+                       columns = ("Dim_1", "Dim_2", "label"))
+
+# plotting the result of t-SNE
+sns.FacetGrid(tsne_df, hue = "label",
+              height = 6).map(  # the parameter is named 'size' in seaborn < 0.9
+    plt.scatter, 'Dim_1', 'Dim_2', s = 100).add_legend()
+
+plt.show()
+
+selected_final_data.head()
+
+print(X_data)
+print(Y_data)
+
+"""## Training and Testing using various classifiers
+
+Importing libraries
+"""
+
+from sklearn.linear_model import LogisticRegressionCV
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import KFold
+from sklearn.model_selection import train_test_split
+from sklearn.naive_bayes import GaussianNB
+from sklearn.linear_model import SGDClassifier
+from sklearn.preprocessing import StandardScaler
+from sklearn.pipeline import make_pipeline
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.datasets import make_classification
+from sklearn import svm
+from sklearn import tree
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.metrics import confusion_matrix
+from sklearn.metrics import roc_auc_score
+from sklearn.metrics import log_loss
+from sklearn.model_selection import GridSearchCV
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.neural_network import MLPClassifier
+import graphviz
+
+"""Shape of the datasets"""
+
+print(X_data.shape)
+print(Y_data.shape)
+
+def assess(y_true, y_pred):
+    # Report accuracy, sensitivity, specificity and ROC-AUC from the confusion matrix.
+    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
+    accuracy = (tp+tn)/(tp+fp+fn+tn)
+    specificity = tn/(tn+fp)
+    sensitivity = tp/(tp+fn)
+    print("Accuracy:", accuracy*100)
+    print("Sensitivity:", sensitivity*100)
+    print("Specificity:", specificity*100)
+    print("ROC_AUC_Score:", roc_auc_score(y_true, y_pred)*100)
+
+"""Splitting the data into training data and testing data"""
+
+X_train, X_test, Y_train, Y_test = train_test_split(X_data, Y_data, test_size=0.30, random_state=1)
+
+"""Performing Logistic Regression with the cross-validation estimator"""
+
+lgc = make_pipeline(LogisticRegressionCV(cv=5, random_state=1, max_iter=5000))
+lgc.fit(X_train, Y_train)
+y_pred = lgc.predict(X_test)
+assess(Y_test, y_pred)
+
+"""Performing Gaussian Naive Bayes"""
+
+gnb = make_pipeline(GaussianNB())
+gnb.fit(X_train, Y_train)
+y_pred = gnb.predict(X_test)
+assess(Y_test, y_pred)
+
+"""Searching for the random_state that gives the best test score for the SGD classifier"""
+
+# Note: picking the seed by test-set score overfits the test split; kept as
+# in the original experiment.
+mx = -1
+ri = -1
+for i in range(1, 10000):
+    sgd = make_pipeline(SGDClassifier(random_state=i))
+    sgd.fit(X_train, Y_train)
+    pmx = mx
+    mx = max(mx, sgd.score(X_test, Y_test))
+    if(pmx != mx):
+        ri = i
+print(ri)
+
+"""Performing SGD classification with the chosen random_state"""
+
+sgd = make_pipeline(SGDClassifier(random_state=ri))
+sgd.fit(X_train, Y_train)
+y_pred = sgd.predict(X_test)
+assess(Y_test, y_pred)
+
+"""Performing SVM (Support Vector Machine) classification on the given data"""
+
+SVM_object = make_pipeline(svm.SVC(kernel='linear'))
+SVM_object.fit(X_train, Y_train)
+y_pred = SVM_object.predict(X_test)
+assess(Y_test, y_pred)
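+"""A single 70/30 split can be noisy on a dataset this small, so it is
+worth cross-checking the holdout numbers with k-fold cross-validation. A
+minimal sketch using the classifiers fitted above; this check is an
+addition to the original notebook.
+"""
+
+from sklearn.model_selection import cross_val_score
+
+for name, clf in [("LogisticRegressionCV", lgc),
+                  ("GaussianNB", gnb),
+                  ("SVM (linear)", SVM_object)]:
+    scores = cross_val_score(clf, X_data, Y_data, cv=5)  # each estimator is cloned and refit per fold
+    print(f"{name}: mean accuracy {scores.mean():.3f} (+/- {scores.std():.3f})")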
+"""Performing Decision Tree classification"""
+
+DT_object = tree.DecisionTreeClassifier(criterion='entropy', max_depth=4, max_leaf_nodes=10)
+DT_object.fit(X_train, Y_train)
+y_pred = DT_object.predict(X_test)
+assess(Y_test, y_pred)
+
+text_representation = tree.export_text(DT_object)
+print(text_representation)
+
+features = ['AGE_ABOVE65', 'DISEASE GROUPING 2', 'DISEASE GROUPING 3',
+    'DISEASE GROUPING 4', 'HTN', 'BIC_VENOUS_MEAN', 'CALCIUM_MEAN',
+    'CREATININ_MEAN', 'GLUCOSE_MEAN', 'INR_MEAN', 'LACTATE_MEAN',
+    'LEUKOCYTES_MEAN', 'LINFOCITOS_MEAN', 'NEUTROPHILES_MEAN',
+    'PC02_VENOUS_MEAN', 'PCR_MEAN', 'PLATELETS_MEAN', 'SAT02_VENOUS_MEAN',
+    'SODIUM_MEAN', 'UREA_MEAN', 'BLOODPRESSURE_DIASTOLIC_MEAN',
+    'RESPIRATORY_RATE_MEAN', 'TEMPERATURE_MEAN', 'OXYGEN_SATURATION_MEAN',
+    'BLOODPRESSURE_SISTOLIC_MIN', 'HEART_RATE_MIN', 'RESPIRATORY_RATE_MIN',
+    'TEMPERATURE_MIN', 'BLOODPRESSURE_DIASTOLIC_MAX',
+    'BLOODPRESSURE_SISTOLIC_MAX', 'HEART_RATE_MAX', 'OXYGEN_SATURATION_MAX',
+    'BLOODPRESSURE_DIASTOLIC_DIFF', 'BLOODPRESSURE_SISTOLIC_DIFF',
+    'HEART_RATE_DIFF', 'RESPIRATORY_RATE_DIFF', 'TEMPERATURE_DIFF',
+    'OXYGEN_SATURATION_DIFF', 'AGE_PERCENTIL_10th', 'AGE_PERCENTIL_20th',
+    'AGE_PERCENTIL_80th', 'AGE_PERCENTIL_90th']
+classes = ['Non-ICU', 'ICU']
+dot_data = tree.export_graphviz(DT_object, out_file=None,
+                                feature_names=features,
+                                class_names=classes,
+                                filled=True)
+graph = graphviz.Source(dot_data, format="png")
+graph
+
+"""Performing K-Nearest Neighbours classification"""
+
+KNN_object = make_pipeline(KNeighborsClassifier(n_neighbors=25, p=1))
+KNN_object.fit(X_train, Y_train)
+y_pred = KNN_object.predict(X_test)
+assess(Y_test, y_pred)
+
+"""Performing Random Forest classification"""
+
+RF_object = RandomForestClassifier(criterion='gini', random_state=23, max_depth=6, bootstrap=True)
+RF_object.fit(X_train, Y_train)
+y_pred = RF_object.predict(X_test)
+assess(Y_test, y_pred)
+
+"""##Performing Grid Search on Various ML Algorithms
+
+Grid Search on Decision Tree
+"""
+
+param_grid = {'criterion': ['entropy','gini'], 'max_depth': np.arange(1,30), 'max_leaf_nodes': np.arange(3,20), 'random_state': [1,2]}
+GS_DT = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
+GS_DT.fit(X_train, Y_train)
+GS_DT.best_params_
+
+GS_DT.score(X_test, Y_test)
+
+# Log loss is computed on the hard 0/1 predictions here, so it acts as a
+# rescaled misclassification error rather than a probabilistic score.
+dt_train_score = []
+dt_test_score = []
+for i in np.arange(1, 30):
+    param_grid = {'criterion': ['entropy','gini'], 'max_depth': [i], 'max_leaf_nodes': np.arange(3,20), 'random_state': [1,2]}
+    GS_DT = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
+    GS_DT.fit(X_train, Y_train)
+    y_train_pred = GS_DT.predict(X_train)
+    y_pred = GS_DT.predict(X_test)
+    dt_train_score.append(log_loss(Y_train, y_train_pred))
+    dt_test_score.append(log_loss(Y_test, y_pred))
+
+plt.title("Decision Tree Classifier: Error vs Depth")
+plt.xlabel("Depth")
+plt.ylabel("Error")
+plt.plot(np.arange(1,30), dt_train_score, label="Training Error")
+plt.plot(np.arange(1,30), dt_test_score, label="Testing Error")
+plt.legend()
+plt.show()
+
+"""Finding the best-performing kernel using Grid Search"""
+
+param_grid = {'kernel': ['linear','poly','sigmoid','rbf'], 'gamma': ['scale','auto'], 'random_state': [1,2,3]}
+GS_SVM = GridSearchCV(svm.SVC(), param_grid, cv=5)
+GS_SVM.fit(X_train, Y_train)
+GS_SVM.best_params_
+
+GS_SVM.score(X_test, Y_test)
+
+svm_train_score = []
+svm_test_score = []
+for i in ['linear','poly','sigmoid','rbf']:
+    param_grid = {'kernel': [i], 'gamma': ['scale','auto'], 'random_state': [1,2,3]}
+    GS_SVM = GridSearchCV(svm.SVC(), param_grid, cv=5)
+    GS_SVM.fit(X_train, Y_train)
+    y_train_pred = GS_SVM.predict(X_train)
+    y_pred = GS_SVM.predict(X_test)
+    svm_train_score.append(log_loss(Y_train, y_train_pred))
+    svm_test_score.append(log_loss(Y_test, y_pred))
+
+plt.title("SVM: Error vs Kernel")
+plt.xlabel("Kernel")
+plt.ylabel("Error")
+plt.plot(['linear','poly','sigmoid','rbf'], svm_train_score, label="Training Error")
+plt.plot(['linear','poly','sigmoid','rbf'], svm_test_score, label="Testing Error")
+plt.legend()
+plt.show()
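+"""The per-kernel loop above refits a separate grid search for each
+kernel, but the same comparison can be read from a single search's
+`cv_results_`. A minimal sketch over the same grid as above; shown as an
+alternative, not a replacement.
+"""
+
+full_search = GridSearchCV(svm.SVC(),
+                           {'kernel': ['linear','poly','sigmoid','rbf'],
+                            'gamma': ['scale','auto'],
+                            'random_state': [1,2,3]}, cv=5)
+full_search.fit(X_train, Y_train)
+cv_table = pd.DataFrame(full_search.cv_results_)
+print(cv_table[['param_kernel', 'param_gamma', 'mean_test_score']]
+      .sort_values('mean_test_score', ascending=False))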
+"""Grid Search on K-Nearest Neighbours"""
+
+param_grid = {'n_neighbors': [10,15,20,25,30,35,40], 'leaf_size': np.arange(3,20), 'p': [1,2]}
+GS_KNN = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
+GS_KNN.fit(X_train, Y_train)
+GS_KNN.best_params_
+
+GS_KNN.score(X_test, Y_test)
+
+knn_train_score = []
+knn_test_score = []
+for i in [10,15,20,25,30,35,40]:
+    param_grid = {'n_neighbors': [i], 'leaf_size': np.arange(3,20), 'p': [1,2]}
+    GS_KNN = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
+    GS_KNN.fit(X_train, Y_train)
+    y_train_pred = GS_KNN.predict(X_train)
+    y_pred = GS_KNN.predict(X_test)
+    knn_train_score.append(log_loss(Y_train, y_train_pred))
+    knn_test_score.append(log_loss(Y_test, y_pred))
+
+plt.title("K-Neighbours Classifier: Error vs Number of Neighbours")
+plt.xlabel("Number of Neighbours")
+plt.ylabel("Error")
+plt.plot([10,15,20,25,30,35,40], knn_train_score, label="Training Error")
+plt.plot([10,15,20,25,30,35,40], knn_test_score, label="Testing Error")
+plt.legend()
+plt.show()
+
+"""Grid Search on Random Forest"""
+
+param_grid = {'criterion': ['gini','entropy'], 'max_depth': [6], 'random_state': [23]}
+GS_RF = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
+GS_RF.fit(X_train, Y_train)
+GS_RF.best_params_
+
+GS_RF.score(X_test, Y_test)
+
+rf_train_score = []
+rf_test_score = []
+for i in np.arange(1, 30):
+    param_grid = {'criterion': ['gini','entropy'], 'max_depth': [i], 'random_state': [23]}
+    GS_RF = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
+    GS_RF.fit(X_train, Y_train)
+    y_train_pred = GS_RF.predict(X_train)
+    y_pred = GS_RF.predict(X_test)
+    rf_train_score.append(log_loss(Y_train, y_train_pred))
+    rf_test_score.append(log_loss(Y_test, y_pred))
+
+plt.title("Random Forest Classifier: Error vs Max Depth")
+plt.xlabel("Max Depth")
+plt.ylabel("Error")
+plt.plot(np.arange(1,30), rf_train_score, label="Training Error")
+plt.plot(np.arange(1,30), rf_test_score, label="Testing Error")
+plt.legend()
+plt.show()
+
+"""Training the MLP with different activation functions and keeping the one with the best test accuracy"""
+
+best = None
+acc = -1
+for a in ["identity", "logistic", "tanh", "relu"]:
+    model = MLPClassifier(activation=a, max_iter=10000, batch_size=64, alpha=0.1, random_state=1).fit(X_train, Y_train)
+    y_pred = model.predict(X_test)
+    print(a)
+    assess(Y_test, y_pred)
+    score = model.score(X_test, Y_test)
+    if score > acc:
+        acc = score
+        best = a
+print(best, acc)
+
+"""Performing Grid Search over the learning rate for the best model found above"""
+
+mlp_train_score = []
+mlp_test_score = []
+rates = [0.001, 0.01, 0.1]
+for rate in rates:
+    param_grid = {'activation': [best], 'max_iter': [10000], 'batch_size': [64], 'alpha': [0.1], 'learning_rate_init': [rate], 'random_state': [1]}
+    GS = GridSearchCV(MLPClassifier(), param_grid)
+    GS.fit(X_train, Y_train)
+    y_train_pred = GS.predict(X_train)
+    y_pred = GS.predict(X_test)
+    mlp_train_score.append(log_loss(Y_train, y_train_pred))
+    mlp_test_score.append(log_loss(Y_test, y_pred))
+
+plt.title("MLPClassifier: Error vs Learning Rate")
+plt.xlabel("Learning rate")
+plt.ylabel("Error")
+plt.plot(rates, mlp_train_score, label="Training Error")
+plt.plot(rates, mlp_test_score, label="Testing Error")
+plt.legend()
+plt.show()
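+"""Wrapping up: refit the MLP with the best activation found above and
+report the same metrics used for the other classifiers. A minimal sketch;
+the learning rate 0.01 is picked here only as an example value from the
+grid searched above.
+"""
+
+final_model = MLPClassifier(activation=best, max_iter=10000, batch_size=64,
+                            alpha=0.1, learning_rate_init=0.01,  # example rate from the grid above
+                            random_state=1).fit(X_train, Y_train)
+assess(Y_test, final_model.predict(X_test))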