# %% [markdown]
# # **Import dependencies and Load the dataset**

# %%
# Reference [1]: https://digitalhumanities.hkust.edu.hk/tutorials/learn-python-from-zero-for-absolute-beginner-1-data-cleaning/
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, roc_auc_score, precision_recall_fscore_support
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Single place to change when the CSV lives somewhere else.
DATA_PATH = "/content/drive/MyDrive/SPH 6004/Assignment1_data.csv"

data = pd.read_csv(DATA_PATH)

# %% [markdown]
# # **Data Cleaning**

# %%
# Reference [2]: https://miamioh.edu/centers-institutes/center-for-analytics-data-science/students/coding-tutorials/python/data-cleaning.html
# Number of null entries in each column of the raw frame.
missing_values = data.isnull().sum()

# Reference [3]: https://digitalhumanities.hkust.edu.hk/tutorials/learn-python-from-zero-for-absolute-beginner-1-data-cleaning/
# Reference [4]: https://note.nkmk.me/en/python-pandas-nan-judge-count/
# Keep only the columns that actually have gaps, worst offenders first.
# The sort returns a new frame instead of mutating a filtered one in place.
missing_values_summary = pd.DataFrame(missing_values, columns=['Missing Values'])
has_missing = missing_values_summary['Missing Values'] > 0
missing_values_summary = (
    missing_values_summary
    .loc[has_missing]
    .sort_values(by='Missing Values', ascending=False)
)

missing_values_summary.head(20)
\n","
\n","\n","
\n"," \n"," \n"," | \n"," Missing Values | \n","
\n"," \n"," \n"," \n"," thrombin_max | \n"," 50829 | \n","
\n"," \n"," thrombin_min | \n"," 50829 | \n","
\n"," \n"," d_dimer_min | \n"," 50811 | \n","
\n"," \n"," d_dimer_max | \n"," 50811 | \n","
\n"," \n"," ggt_max | \n"," 50448 | \n","
\n"," \n"," ggt_min | \n"," 50448 | \n","
\n"," \n"," globulin_min | \n"," 50235 | \n","
\n"," \n"," globulin_max | \n"," 50235 | \n","
\n"," \n"," bicarbonate_min | \n"," 50071 | \n","
\n"," \n"," bicarbonate_max | \n"," 50071 | \n","
\n"," \n"," methemoglobin_min | \n"," 49820 | \n","
\n"," \n"," methemoglobin_max | \n"," 49820 | \n","
\n"," \n"," total_protein_max | \n"," 49761 | \n","
\n"," \n"," total_protein_min | \n"," 49761 | \n","
\n"," \n"," carboxyhemoglobin_min | \n"," 49724 | \n","
\n"," \n"," carboxyhemoglobin_max | \n"," 49724 | \n","
\n"," \n"," bilirubin_indirect_min | \n"," 48823 | \n","
\n"," \n"," bilirubin_indirect_max | \n"," 48823 | \n","
\n"," \n"," nrbc_min | \n"," 48815 | \n","
\n"," \n"," nrbc_max | \n"," 48815 | \n","
\n"," \n","
\n","
\n","
\n","
# %%
# Reference [5]: https://www.sciencedirect.com/science/article/pii/S0895435618308710
# Reference [6]: https://stackoverflow.com/questions/65775141/remove-rows-with-more-than-percentage-of-missing-data-for-majority-class-samples
# A column is discarded when its missing count exceeds half the number of rows.
threshold = len(data) * 0.5

# Reference [7]: https://www.statology.org/pandas-exclude-column/
# Reference [8]: https://www.datacamp.com/tutorial/pandas-drop-column
too_sparse = missing_values_summary['Missing Values'].gt(threshold)
columns_to_exclude = missing_values_summary.loc[too_sparse].index
data_cleaned = data.drop(columns=columns_to_exclude)

# Show the shape before/after the drop together with the removed column names.
original_shape, cleaned_shape = data.shape, data_cleaned.shape
original_shape, cleaned_shape, columns_to_exclude.tolist()

# %%
# Reference [9]: https://insightsoftware.com/blog/how-to-handle-missing-data-values-while-data-cleaning/
# Re-count the gaps that remain in the columns that survived the drop.
missing_values_cleaned = data_cleaned.isnull().sum()
still_missing = missing_values_cleaned[missing_values_cleaned > 0]
missing_values_cleaned_summary = (
    pd.DataFrame(still_missing, columns=['Missing Values'])
    .sort_values(by='Missing Values', ascending=False)
)

missing_values_cleaned_summary.head(20)
\n","\n","
\n"," \n"," \n"," | \n"," Missing Values | \n","
\n"," \n"," \n"," \n"," lactate_max | \n"," 23759 | \n","
\n"," \n"," lactate_min | \n"," 23759 | \n","
\n"," \n"," abs_monocytes_min | \n"," 20095 | \n","
\n"," \n"," abs_basophils_min | \n"," 20095 | \n","
\n"," \n"," abs_basophils_max | \n"," 20095 | \n","
\n"," \n"," abs_monocytes_max | \n"," 20095 | \n","
\n"," \n"," abs_eosinophils_min | \n"," 20094 | \n","
\n"," \n"," abs_eosinophils_max | \n"," 20094 | \n","
\n"," \n"," abs_neutrophils_max | \n"," 20094 | \n","
\n"," \n"," abs_neutrophils_min | \n"," 20094 | \n","
\n"," \n"," abs_lymphocytes_min | \n"," 20082 | \n","
\n"," \n"," abs_lymphocytes_max | \n"," 20082 | \n","
\n"," \n"," ph_min | \n"," 19748 | \n","
\n"," \n"," ph_max | \n"," 19748 | \n","
\n"," \n"," totalco2_max | \n"," 19744 | \n","
\n"," \n"," totalco2_min | \n"," 19744 | \n","
\n"," \n"," baseexcess_max | \n"," 19744 | \n","
\n"," \n"," baseexcess_min | \n"," 19744 | \n","
\n"," \n"," pco2_max | \n"," 19744 | \n","
\n"," \n"," pco2_min | \n"," 19744 | \n","
\n"," \n","
\n","
\n","
\n","
# %%
# Reference [10]: https://scikit-learn.org/stable/modules/impute.html
# Reference [11]: https://www.geeksforgeeks.org/how-to-fill-nan-values-with-mean-in-pandas/

# Median-impute every numeric column that still has gaps; non-numeric columns
# are left untouched.
#
# The fill is done with one frame-level `fillna` instead of
# `data_cleaned[column].fillna(..., inplace=True)`: the latter is chained
# assignment through an in-place method, which pandas 2.x warns about and which
# has no effect under Copy-on-Write, silently leaving the NaNs in place.
#
# NOTE(review): the medians are computed over all rows, including rows that are
# later held out as the test set. Consider imputing after the train/test split
# with medians taken from the training rows only.
columns_with_gaps = list(missing_values_cleaned_summary.index)
numeric_columns = (
    data_cleaned[columns_with_gaps]
    .select_dtypes(include='number')  # any numeric dtype, not only float64/int64
    .columns
)

if len(numeric_columns) > 0:
    column_medians = data_cleaned[numeric_columns].median()
    # A Series passed to fillna fills each column named in its index with the
    # matching value; columns absent from the Series are not filled.
    data_cleaned = data_cleaned.fillna(column_medians)

# https://www.linkedin.com/advice/0/what-some-best-practices-dealing-missing-values-imputation
# Sanity check: list whatever columns still contain nulls after imputation.
recheck_missing_values = data_cleaned.isnull().sum()
missing_values_after_imputation = recheck_missing_values[recheck_missing_values > 0]

missing_values_after_imputation

# %% [markdown]
# # **Feature Selection and Model Training**

# %%
# Reference [12]: https://www.analyticsvidhya.com/blog/2020/06/feature-selection-techniques-machine-learning/
# Reference [13]: https://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_selection
# Hand-picked predictor columns and the label column.
features = ['gender', 'admission_age', 'race', 'heart_rate_mean', 'sbp_mean', 'dbp_mean', 'lactate_min', 'lactate_max']
target = 'aki'

X = data_cleaned[features]
y = data_cleaned[target]

# Reference [14]: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html
categorical_features = ['gender', 'race']
numerical_features = ['admission_age', 'heart_rate_mean', 'sbp_mean', 'dbp_mean', 'lactate_min', 'lactate_max']

# Standardise the numeric columns and one-hot encode the categorical ones.
# handle_unknown='ignore' makes categories unseen during fit encode as all zeros.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])

# Reference [15]: https://towardsdatascience.com/stratified-sampling-you-may-have-been-splitting-your-dataset-all-wrong-8cfdd0d32502
# Split BEFORE fitting the preprocessor. Fitting the scaler/encoder on the full
# frame would let test rows influence the scaling statistics and the category
# vocabulary (test-set leakage). The split itself is unchanged: same
# random_state, same stratification, same test fraction.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

X_train = preprocessor.fit_transform(X_train_raw)  # learn statistics from training rows only
X_test = preprocessor.transform(X_test_raw)        # apply them unchanged to the held-out rows

X_train.shape, X_test.shape

# %% [markdown]
# # **Model Prediction and Evaluation**

# %% [markdown]
# ## **Logistic Regression**

# %%
# Reference [16]: https://scikit-learn.org/stable/modules/grid_search.html
# Reference [17]: https://drbeane.github.io/python_dsci/pages/grid_search.html
# Candidate inverse regularisation strengths for the L1-penalised model.
C_values = [0.001, 0.01, 0.1, 1, 10, 100]

param_grid = {'C': C_values}
# random_state is set because the liblinear solver shuffles the data; this
# matches the fixed seed used by the other models in this notebook.
lr_model = LogisticRegression(penalty='l1', solver='liblinear', random_state=42)
# 5-fold cross-validated search over C; n_jobs=-1 runs the candidate fits in parallel.
grid_search = GridSearchCV(lr_model, param_grid, cv=5, n_jobs=-1)

grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)

# Support-weighted precision/recall/F1 across the classes, plus accuracy.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')

accuracy, precision, recall, f1

# %% [markdown]
# ## **Decision Tree**

# %%
# Reference [18]: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
# Decision tree with default hyper-parameters and a fixed seed.
decision_tree = DecisionTreeClassifier(random_state=42)

decision_tree.fit(X_train, y_train)

y_pred_dt = decision_tree.predict(X_test)

accuracy_dt = accuracy_score(y_test, y_pred_dt)
precision_dt, recall_dt, f1_dt, _ = precision_recall_fscore_support(y_test, y_pred_dt, average='weighted')

accuracy_dt, precision_dt, recall_dt, f1_dt
# %% [markdown]
# ## **Random Forest**

# %%
# Reference [19]: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
# Random forest with default hyper-parameters and a fixed seed.
# n_jobs=-1 builds the trees on all available cores.
random_forest = RandomForestClassifier(random_state=42, n_jobs=-1)

random_forest.fit(X_train, y_train)

y_pred_rf = random_forest.predict(X_test)

# Support-weighted precision/recall/F1 across the classes, plus accuracy.
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf, recall_rf, f1_rf, _ = precision_recall_fscore_support(y_test, y_pred_rf, average='weighted')

accuracy_rf, precision_rf, recall_rf, f1_rf

# %% [markdown]
# ## **SVM**

# %%
# Reference [20]: https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html
# Reference [21]: https://www.datatechnotes.com/2020/07/classification-example-with-linearsvm-in-python.html
# Reference [22]: https://medium.com/@mrconnor/understanding-support-vector-machines-through-code-a-detailed-guide-692d0061d78b
# dual=False solves the primal problem, which the scikit-learn documentation
# recommends when n_samples > n_features (here roughly 40k rows x 41 columns).
# The default penalty='l2' / loss='squared_hinge' combination supports it, and
# the objective being optimised is the same.
svm_model = LinearSVC(random_state=42, max_iter=100000, dual=False)

svm_model.fit(X_train, y_train)

y_pred_svm = svm_model.predict(X_test)

accuracy_svm = accuracy_score(y_test, y_pred_svm)
precision_svm, recall_svm, f1_svm, _ = precision_recall_fscore_support(y_test, y_pred_svm, average='weighted')

accuracy_svm, precision_svm, recall_svm, f1_svm