# **Import dependencies and Load the dataset**

In [19]:
# Refernce [1]: https://digitalhumanities.hkust.edu.hk/tutorials/learn-python-from-zero-for-absolute-beginner-1-data-cleaning/
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, roc_auc_score, precision_recall_fscore_support
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

data = pd.read_csv("/content/drive/MyDrive/SPH 6004/Assignment1_data.csv")

# **Data Cleaning**

In [20]:
# Refernce [2]: https://miamioh.edu/centers-institutes/center-for-analytics-data-science/students/coding-tutorials/python/data-cleaning.html
missing_values = data.isnull().sum()

# Refernce [3]: https://digitalhumanities.hkust.edu.hk/tutorials/learn-python-from-zero-for-absolute-beginner-1-data-cleaning/
# Refernce [4]: https://note.nkmk.me/en/python-pandas-nan-judge-count/
missing_values_summary = pd.DataFrame(missing_values, columns=['Missing Values'])
missing_values_summary = missing_values_summary[missing_values_summary['Missing Values'] > 0]
missing_values_summary.sort_values(by='Missing Values', ascending=False, inplace=True)

missing_values_summary.head(20)

Unnamed: 0,Missing Values
thrombin_max,50829
thrombin_min,50829
d_dimer_min,50811
d_dimer_max,50811
ggt_max,50448
ggt_min,50448
globulin_min,50235
globulin_max,50235
bicarbonate_min,50071
bicarbonate_max,50071


In [21]:
# Refernce [5]: https://www.sciencedirect.com/science/article/pii/S0895435618308710
# Refernce [6]: https://stackoverflow.com/questions/65775141/remove-rows-with-more-than-percentage-of-missing-data-for-majority-class-samples
threshold = 0.5 * len(data)

# Refernce [7]: https://www.statology.org/pandas-exclude-column/
# Refernce [8]: https://www.datacamp.com/tutorial/pandas-drop-column?utm_source=google&utm_medium=paid_search&utm_campaignid=19589720821&utm_adgroupid=157156375191&utm_device=c&utm_keyword=&utm_matchtype=&utm_network=g&utm_adpostion=&utm_creative=691747307431&utm_targetid=dsa-2218886984100&utm_loc_interest_ms=&utm_loc_physical_ms=9062543&utm_content=&utm_campaign=230119_1-sea~dsa~tofu_2-b2c_3-row-p1_4-prc_5-na_6-na_7-le_8-pdsh-go_9-na_10-na_11-na-feb24&gad_source=1&gclid=CjwKCAiAxaCvBhBaEiwAvsLmWGh-6fy-rTQ96ZwE3t9Fisrs4seiXS8GQvJkeQYp5J_Dj4IoGtEA8BoCTucQAvD_BwE
columns_to_exclude = missing_values_summary[missing_values_summary['Missing Values'] > threshold].index
data_cleaned = data.drop(columns=columns_to_exclude)

original_shape = data.shape
cleaned_shape = data_cleaned.shape
original_shape, cleaned_shape, columns_to_exclude.tolist()

((50920, 162),
 (50920, 87),
 ['thrombin_max',
  'thrombin_min',
  'd_dimer_min',
  'd_dimer_max',
  'ggt_max',
  'ggt_min',
  'globulin_min',
  'globulin_max',
  'bicarbonate_min',
  'bicarbonate_max',
  'methemoglobin_min',
  'methemoglobin_max',
  'total_protein_max',
  'total_protein_min',
  'carboxyhemoglobin_min',
  'carboxyhemoglobin_max',
  'bilirubin_indirect_min',
  'bilirubin_indirect_max',
  'nrbc_min',
  'nrbc_max',
  'bilirubin_direct_min',
  'bilirubin_direct_max',
  'amylase_min',
  'amylase_max',
  'aado2_max',
  'aado2_min',
  'atyps_min',
  'atyps_max',
  'metas_max',
  'metas_min',
  'bands_min',
  'bands_max',
  'temperature_min.1',
  'temperature_max.1',
  'imm_granulocytes_min',
  'imm_granulocytes_max',
  'chloride_max',
  'chloride_min',
  'hemoglobin_max',
  'hemoglobin_min',
  'hematocrit_max',
  'hematocrit_min',
  'ck_mb_min',
  'ck_mb_max',
  'ld_ldh_min',
  'ld_ldh_max',
  'sodium_min',
  'sodium_max',
  'fibrinogen_min',
  'fibrinogen_max',
  'so2_max',


In [22]:
# Refernce [9]: https://insightsoftware.com/blog/how-to-handle-missing-data-values-while-data-cleaning/
missing_values_cleaned = data_cleaned.isnull().sum()
missing_values_cleaned_summary = pd.DataFrame(missing_values_cleaned[missing_values_cleaned > 0], columns=['Missing Values'])
missing_values_cleaned_summary.sort_values(by='Missing Values', ascending=False, inplace=True)

missing_values_cleaned_summary.head(20)

Unnamed: 0,Missing Values
lactate_max,23759
lactate_min,23759
abs_monocytes_min,20095
abs_basophils_min,20095
abs_basophils_max,20095
abs_monocytes_max,20095
abs_eosinophils_min,20094
abs_eosinophils_max,20094
abs_neutrophils_max,20094
abs_neutrophils_min,20094


In [23]:
# Refernce [10]: https://scikit-learn.org/stable/modules/impute.html
# Refernce [11]: https://www.geeksforgeeks.org/how-to-fill-nan-values-with-mean-in-pandas/

for column in missing_values_cleaned_summary.index:
    if data_cleaned[column].dtype in ['float64', 'int64']:  # Only impute the column that is numerical!
        median_value = data_cleaned[column].median()
        data_cleaned[column].fillna(median_value, inplace=True)

# https://www.linkedin.com/advice/0/what-some-best-practices-dealing-missing-values-imputation
recheck_missing_values = data_cleaned.isnull().sum()
missing_values_after_imputation = recheck_missing_values[recheck_missing_values > 0]

missing_values_after_imputation

Series([], dtype: int64)

# **Feature Selection and Model Training**

In [24]:
# Refernce [12]: https://www.analyticsvidhya.com/blog/2020/06/feature-selection-techniques-machine-learning/
# Refernce [13]: https://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_selection
features = ['gender', 'admission_age', 'race', 'heart_rate_mean', 'sbp_mean', 'dbp_mean', 'lactate_min', 'lactate_max']
target = 'aki'

X = data_cleaned[features]
y = data_cleaned[target]

# Refernce [14]: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html
categorical_features = ['gender', 'race']
numerical_features = ['admission_age', 'heart_rate_mean', 'sbp_mean', 'dbp_mean', 'lactate_min', 'lactate_max']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])

X_prepared = preprocessor.fit_transform(X)

# Refernce [15]: https://towardsdatascience.com/stratified-sampling-you-may-have-been-splitting-your-dataset-all-wrong-8cfdd0d32502
X_train, X_test, y_train, y_test = train_test_split(X_prepared, y, test_size=0.2, random_state=42, stratify=y)

X_train.shape, X_test.shape

((40736, 41), (10184, 41))

# **Model Prediction and Evaluation**

## **Logistic Regression**



In [25]:
# Refernce [16]: https://scikit-learn.org/stable/modules/grid_search.html
# Refernce [17]: https://drbeane.github.io/python_dsci/pages/grid_search.html
C_values = [0.001, 0.01, 0.1, 1, 10, 100]

param_grid = {'C': C_values}
lr_model = LogisticRegression(penalty='l1', solver='liblinear')
grid_search = GridSearchCV(lr_model, param_grid, cv=5)

grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')

accuracy, precision, recall, f1

(0.40298507462686567,
 0.3320444093843265,
 0.40298507462686567,
 0.3342230247225089)

## **Decision Tree**

In [26]:
# Refernce [18]: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
decision_tree = DecisionTreeClassifier(random_state=42)

decision_tree.fit(X_train, y_train)

y_pred_dt = decision_tree.predict(X_test)

accuracy_dt = accuracy_score(y_test, y_pred_dt)
precision_dt, recall_dt, f1_dt, _ = precision_recall_fscore_support(y_test, y_pred_dt, average='weighted')

accuracy_dt, precision_dt, recall_dt, f1_dt

(0.31922623723487825,
 0.31932298016417965,
 0.31922623723487825,
 0.3192673787031557)

## **Random Forest**

In [28]:
# Refernce [19]: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
random_forest = RandomForestClassifier(random_state=42)

random_forest.fit(X_train, y_train)

y_pred_rf = random_forest.predict(X_test)

accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf, recall_rf, f1_rf, _ = precision_recall_fscore_support(y_test, y_pred_rf, average='weighted')

accuracy_rf, precision_rf, recall_rf, f1_rf

(0.3881578947368421, 0.358609063596094, 0.3881578947368421, 0.3568783712222866)

## **SVM**

In [30]:
# Refernce [20]: https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html
# Refernce [21]: https://www.datatechnotes.com/2020/07/classification-example-with-linearsvm-in-python.html
# Refernce [22]: https://medium.com/@mrconnor/understanding-support-vector-machines-through-code-a-detailed-guide-692d0061d78b
svm_model = LinearSVC(random_state=42, max_iter=100000)

svm_model.fit(X_train, y_train)

y_pred_svm = svm_model.predict(X_test)


accuracy_svm = accuracy_score(y_test, y_pred_svm)
precision_svm, recall_svm, f1_svm, _ = precision_recall_fscore_support(y_test, y_pred_svm, average='weighted')

accuracy_svm, precision_svm, recall_svm, f1_svm

(0.40327965435978, 0.33123320296781494, 0.40327965435978, 0.3331245901794984)