# -*- coding: utf-8 -*-
"""ML_to_Predicting_ICU_Mortality.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1kCmZcWOIuTCLSRpW5mn6JtLkE-Vl-k8V
#Predicting ICU Mortality and Identifying Key Clinical Risk Factors Using Machine Learning
#Background:
In intensive care units (ICUs), timely and accurate identification of patients at high risk of mortality is crucial for guiding treatment decisions and optimizing care. While ICU clinicians rely on their experience and standard clinical scoring systems (like APACHE) to predict outcomes, there is potential for machine learning (ML) to enhance predictive accuracy by analyzing patterns in large datasets with multiple clinical variables.
#Objective:
The goal of this project is to develop a machine learning model that predicts ICU mortality based on clinical parameters recorded during a patient’s stay. Additionally, the project aims to uncover the key clinical features most strongly associated with patient outcomes, which can provide valuable insights into ICU patient management.
"""
###### Importing libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
###### Importing the data and taking a first look
health_data = pd.read_csv("/content/dataset.csv")
pd.set_option('display.max_columns', None)
health_data.head(10)
health_data.columns
health_data.describe()
health_data.info()
#### Data Cleaning
new_data = health_data.drop("Unnamed: 83", axis=1)
new_data.info()
###### Checking the percentage of missing Data
pd.set_option("display.max_rows", None)
missing_data_percentage = new_data.isna().mean() * 100
missing_data_percentage
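###### A hedged helper (not in the original workflow): flag columns whose
###### missingness exceeds a threshold; the 40% cutoff is purely illustrative
high_missing = missing_data_percentage[missing_data_percentage > 40].sort_values(ascending=False)
print(high_missing)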
sns.heatmap(new_data.isna(), cbar=False, cmap='viridis', yticklabels=False)
plt.title('Missing Data Heatmap')
plt.show()
clean_data = new_data.dropna()
clean_data.info()
##### Calculating the percentage of data lost during cleaning
data_difference = new_data["patient_id"].count() - clean_data["patient_id"].count()
percentage_loss = data_difference / new_data["patient_id"].count() * 100
print(f"Percentage of data lost after cleaning is {percentage_loss:.2f}%")
###### Exporting the clean data to upload to a SQL database
clean_data.to_csv("C:\\Users\\iyand\\Downloads\\CleanICUdata.csv", index=False)
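###### A minimal, hedged sketch of the SQL round trip. The real project used an
###### external SQL database; the in-memory SQLite engine and the table name
###### "clean_icu_data" below are illustrative stand-ins, not the actual setup.
from sqlalchemy import create_engine
engine = create_engine("sqlite://")  # placeholder for the real database connection
clean_data.to_sql("clean_icu_data", engine, if_exists="replace", index=False)
clean_data = pd.read_sql("SELECT * FROM clean_icu_data", engine)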
#### Pulling the data back from SQL and doing exploratory data analysis
sns.countplot(x="hospital_death", data=clean_data)
sns.countplot(x="hospital_death", data=health_data)
"""Before and after data cleaning, there is an imbalance destribution in target variable, i.e the hospital death distribution"""
####### Checking the distribution of the object-typed (categorical) columns against the classification target
object_columns = clean_data.select_dtypes(include='object').columns
print(object_columns)
plt.figure(figsize=(10, 5))
sns.countplot(x="ethnicity", data=health_data, hue="hospital_death")
plt.show()
plt.figure(figsize=(10, 5))
sns.countplot(x="gender", data=health_data, hue="hospital_death")
plt.show()
plt.figure(figsize=(10, 5))
sns.countplot(x="icu_stay_type", data=health_data, hue="hospital_death")
plt.show()
plt.figure(figsize=(14, 5))
sns.countplot(x="icu_admit_source", data=health_data, hue="hospital_death")
plt.show()
plt.figure(figsize=(18, 5))
sns.countplot(x="apache_3j_bodysystem", data=health_data, hue="hospital_death")
plt.show()
death_counts = health_data.groupby(['apache_3j_bodysystem', 'hospital_death']).size().unstack()
death_percentage = death_counts.div(death_counts.sum(axis=1), axis=0) * 100
death_percentage = death_percentage.reset_index()
death_percentage_melted = death_percentage.melt(id_vars='apache_3j_bodysystem', var_name='hospital_death', value_name='percentage')
plt.figure(figsize=(18, 5))
barplot = sns.barplot(data=death_percentage_melted, x='apache_3j_bodysystem', y='percentage', hue='hospital_death', palette='viridis')
plt.title('Percentage of Hospital Deaths by APACHE 3J Body System')
plt.xlabel('APACHE 3J Body System')
plt.ylabel('Percentage')
plt.legend(title='Hospital Death', labels=['Alive (0)', 'Deceased (1)'])
plt.xticks(rotation=45)
for p in barplot.patches:
    percentage = f"{p.get_height():.1f}%"
    barplot.annotate(percentage, (p.get_x() + p.get_width() / 2., p.get_height()),
                     ha='center', va='bottom')
plt.show()
plt.figure(figsize=(18, 5))
sns.countplot(x="apache_2_bodysystem", data=health_data , hue="hospital_death")
plt.show()
####### Histograms of all numeric columns
clean_data.hist(figsize=(52, 50))
plt.show()
#### Determining the feature importance of each column using scikit-learn
plt.figure(figsize=(10, 12))
from sklearn.ensemble import RandomForestClassifier
Feature_model = RandomForestClassifier()
X1 = clean_data.drop(columns=["hospital_death", "ethnicity", "gender", "icu_admit_source",
"icu_stay_type", "icu_type", "apache_3j_bodysystem",
"apache_2_bodysystem"])
y1 = clean_data["hospital_death"]
Feature_model.fit(X1, y1)
Feature_importance = pd.Series(Feature_model.feature_importances_, index=X1.columns)
Feature_importance.nlargest(50).plot(kind='barh')
plt.title("Top 50 Feature Importances")
plt.xlabel("Importance Score")
plt.ylabel("Features")
plt.show()
"""The Acute Physiology and Chronic Health Evaluation (APACHE) IV model can predict the intensive care unit (ICU) length of stay (LOS) in critically ill patients. That this happen to be the most important features in the data set"""
sns.kdeplot(data=clean_data, x='apache_4a_icu_death_prob', hue='hospital_death')
plt.title("KDE distribution of apache_4a_icu_death_prob")
plt.show()
sns.kdeplot(data=clean_data, x='apache_4a_hospital_death_prob', hue='hospital_death')
plt.title("KDE distribution of apache_4a_hospital_death_prob")
plt.show()
correlation = X1.corr()
correlation
plt.figure(figsize=(20, 20))
sns.heatmap(correlation)
plt.show()
"""#Data Processing For ML training"""
clean_data.head(15)
#### One-hot encoding the string categorical columns
### ['ethnicity', 'gender', 'icu_admit_source', 'icu_stay_type', 'icu_type', 'apache_3j_bodysystem', 'apache_2_bodysystem']
clean_data1 = clean_data.copy()
one_hot_encode_data = pd.get_dummies(clean_data1, columns=['ethnicity', 'gender', 'icu_admit_source', 'icu_stay_type', 'icu_type', 'apache_3j_bodysystem', 'apache_2_bodysystem'], drop_first=False)
one_hot_encode_data.head(15)
# Cast only the boolean dummy columns to int; a blanket astype(int) would
# truncate continuous floats such as the APACHE death probabilities to 0
bool_cols = one_hot_encode_data.select_dtypes(include="bool").columns
encoded_data = one_hot_encode_data.astype({c: int for c in bool_cols})
encoded_data.head(10)
from scipy import stats
numeric_data = encoded_data.select_dtypes(include=["float", "int64"])
# Wrap the z-scores back into a DataFrame so the column labels survive
z_score = pd.DataFrame(stats.zscore(numeric_data), index=numeric_data.index,
                       columns=numeric_data.columns)
outlier = (z_score > 3) | (z_score < -3)
df_outliers = encoded_data[outlier.any(axis=1)]
df_outliers
####### Listing features with outliers
outlier_counts = outlier.sum(axis=0)
sorted_outlier_counts = outlier_counts.sort_values(ascending=False)
print(sorted_outlier_counts)
plt.figure(figsize=(25, 8))
sns.barplot(x=sorted_outlier_counts.index, y=sorted_outlier_counts.values)
plt.xticks(rotation=90)
plt.xlabel('Columns')
plt.ylabel('Number of Outliers')
plt.title('Number of Outliers per Column in the 56k-patient dataset')
plt.show()
"""In the context of this medical dataset, I have chosen not to remove the outliers due to their potential clinical significance. Medical data is inherently variable, reflecting the complex and diverse conditions of individual patients. Outliers in this dataset may represent critically ill patients or rare, yet important, clinical conditions. Removing these values could lead to a loss of meaningful information, particularly in a healthcare setting where extreme values can indicate vital health states or key medical emergencies that are essential for accurate diagnosis and treatment."""
print(sorted_outlier_counts.head(82).index)
outlier_columns = encoded_data[['apache_2_bodysystem_Metabolic', 'apache_3j_bodysystem_Metabolic',
'icu_type_Neuro ICU', 'hospital_death', 'icu_type_CCU-CTICU',
'icu_stay_type_admit', 'icu_type_SICU', 'icu_stay_type_transfer',
'gcs_motor_apache', 'icu_type_Cardiac ICU', 'ethnicity_Other/Unknown',
'icu_type_CSICU', 'apache_3j_bodysystem_Trauma',
'apache_2_bodysystem_Trauma', 'ethnicity_Hispanic', 'temp_apache',
'apache_2_bodysystem_Undefined diagnoses', 'icu_type_CTICU',
'icu_admit_source_Other Hospital', 'arf_apache', 'immunosuppression',
'apache_2_bodysystem_Renal/Genitourinary',
'apache_3j_bodysystem_Genitourinary', 'solid_tumor_with_metastasis',
'd1_glucose_max', 'd1_temp_min', 'd1_resprate_max', 'pre_icu_los_days',
'd1_spo2_min', 'bmi', 'cirrhosis', 'd1_glucose_min', 'h1_resprate_max',
'weight', 'hepatic_failure', 'apache_4a_hospital_death_prob',
'apache_4a_icu_death_prob', 'h1_spo2_min', 'd1_diasbp_max',
'd1_diasbp_noninvasive_max', 'apache_3j_bodysystem_Musculoskeletal/Skin',
'd1_mbp_max', 'ethnicity_Asian', 'd1_mbp_noninvasive_max', 'h1_diasbp_max',
'h1_diasbp_noninvasive_max', 'h1_resprate_min', 'apache_3j_diagnosis',
'ethnicity_Native American', 'd1_heartrate_min', 'h1_mbp_noninvasive_max',
'h1_mbp_max', 'd1_spo2_max', 'h1_spo2_max', 'd1_heartrate_max', 'leukemia',
'h1_sysbp_max', 'h1_sysbp_noninvasive_max', 'apache_3j_bodysystem_Hematological',
'apache_2_bodysystem_Haematologic', 'd1_sysbp_max',
'd1_sysbp_noninvasive_max', 'h1_heartrate_max',
'icu_admit_source_Other ICU', 'd1_resprate_min', 'h1_mbp_min',
'h1_mbp_noninvasive_min', 'h1_diasbp_min', 'h1_diasbp_noninvasive_min',
'd1_mbp_min', 'd1_mbp_noninvasive_min', 'd1_sysbp_min',
'd1_sysbp_noninvasive_min', 'height', 'd1_potassium_max', 'lymphoma',
'icu_stay_type_readmit', 'd1_diasbp_noninvasive_min', 'd1_diasbp_min',
'apache_2_bodysystem_Undefined Diagnoses',
'apache_3j_bodysystem_Gynecological', 'aids']]
plt.figure(figsize=(25, 8))
sns.violinplot(data=outlier_columns, orient="v")
plt.xticks(rotation=90)
plt.xlabel('Columns')
plt.ylabel('Value Distribution')
plt.title('Violin Plot of Outlier Columns')
plt.show()
plt.figure(figsize=(25, 10))
sns.boxplot(data=outlier_columns, orient="v")
plt.xticks(rotation=90)
plt.xlabel('Columns')
plt.ylabel('Value Distribution')
plt.title('Box Plot of Outlier Columns')
plt.show()
##### Train-test split for the ML process
encoded_data.head(5)
encoded_data1 = encoded_data.copy()
X = encoded_data1.drop("hospital_death", axis=1)
y = encoded_data1["hospital_death"]
#############
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
X.shape
y.shape
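###### A hedged alternative (not used in the runs below): a stratified split
###### keeps the minority-class rate identical across the train and test sets
X_tr_s, X_te_s, y_tr_s, y_te_s = train_test_split(X, y, test_size=0.33,
                                                  random_state=42, stratify=y)
print(y_tr_s.mean(), y_te_s.mean())  # death rates should now match closely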
##### Training the ML models
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
LG_model = LogisticRegression(max_iter=1000)
LG_model.fit(X_train, y_train)
y_pred = LG_model.predict(X_test)
##### Evaluation
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))
#### Trying a Random Forest model
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
####### Evaluation
print("Confusion Matrix:")
print(confusion_matrix(y_test, rf_pred))
print("\nClassification Report:")
print(classification_report(y_test, rf_pred))
print("Accuracy Score:", accuracy_score(y_test, rf_pred))
"""During the Analysis, we find out there was imbalance distribution in the target variable, So i will be doing SMOTE"""
plt.figure(figsize=(6, 4))
sns.countplot(x=y)
plt.title('Class Distribution')
plt.xlabel('Hospital Death')
plt.ylabel('Count')
plt.show()
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
print("Original dataset shape:", y_train.value_counts())
print("Resampled dataset shape:", y_train_resampled.value_counts())
from sklearn.ensemble import GradientBoostingClassifier
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_model.fit(X_train_resampled, y_train_resampled)
#### Evaluating the gradient boosting model on the held-out test set
gb_pred = gb_model.predict(X_test)
print("Confusion Matrix:")
print(confusion_matrix(y_test, gb_pred))
print("\nClassification Report:")
print(classification_report(y_test, gb_pred))
print("Accuracy Score:", accuracy_score(y_test, gb_pred))
importances = gb_model.feature_importances_
indices = np.argsort(importances)[::-1]
print("\nFeature importances:")
for f in range(X.shape[1]):
    print(f"{X.columns[indices[f]]}: {importances[indices[f]]:.4f}")
from sklearn.ensemble import RandomForestClassifier
r_model = RandomForestClassifier(n_estimators=100, random_state=42)
r_model.fit(X_train_resampled, y_train_resampled)
r_pred = r_model.predict(X_test)
print("Confusion Matrix:")
print(confusion_matrix(y_test, r_pred))
print("\nClassification Report:")
print(classification_report(y_test, r_pred))
print("Accuracy Score:", accuracy_score(y_test, r_pred))
"""Conclusion
This project successfully developed machine learning models to predict mortality in Intensive Care Unit (ICU) patients using various clinical parameters. Both Logistic Regression and Random Forest classifiers demonstrated strong predictive performance, achieving accuracy scores of approximately 91.5% and 92.1%, respectively. However, the models exhibited challenges in correctly identifying patients at high risk of mortality (class 1), as indicated by lower precision and recall values for this class.
Following the application of SMOTE (Synthetic Minority Over-sampling Technique) to address class imbalance, the models maintained competitive accuracy, but the challenge of low recall for the minority class persisted, suggesting that while more balanced data can enhance overall model performance, targeted efforts are still needed to improve predictions for high-risk patients.
Feature importance analysis revealed that the Glasgow Coma Scale (GCS) scores, particularly the motor component, were significant predictors of mortality, followed by other clinical parameters such as temperature and gender. These insights highlight the importance of specific clinical indicators in predicting ICU outcomes, providing valuable information for clinicians in patient management.
"""