1 lines (1 with data), 37.0 kB
{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"mount_file_id":"1HsRV9AY76wqGbLLY1XaC3tMPAIUq6GgO","authorship_tag":"ABX9TyMiXZY3Q7CeR932hy15h6zW"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"markdown","source":["# **Import dependencies and Load the dataset**"],"metadata":{"id":"gRWEL5oFUXLp"}},{"cell_type":"code","execution_count":19,"metadata":{"id":"97l_e_AHtEqO","executionInfo":{"status":"ok","timestamp":1709825818620,"user_tz":-480,"elapsed":2419,"user":{"displayName":"Kunshi Lin","userId":"01385305786663149743"}}},"outputs":[],"source":["# Reference [1]: https://digitalhumanities.hkust.edu.hk/tutorials/learn-python-from-zero-for-absolute-beginner-1-data-cleaning/\n","import pandas as pd\n","from sklearn.model_selection import train_test_split, GridSearchCV\n","from sklearn.linear_model import LogisticRegression\n","from sklearn.tree import DecisionTreeClassifier\n","from sklearn.ensemble import RandomForestClassifier\n","from sklearn.svm import LinearSVC\n","from sklearn.metrics import accuracy_score, roc_auc_score, precision_recall_fscore_support\n","from sklearn.preprocessing import StandardScaler, OneHotEncoder\n","from sklearn.compose import ColumnTransformer\n","from sklearn.pipeline import Pipeline\n","\n","data = pd.read_csv(\"/content/drive/MyDrive/SPH 6004/Assignment1_data.csv\")"]},{"cell_type":"markdown","source":["# **Data Cleaning**"],"metadata":{"id":"VPCXMBzDUii4"}},{"cell_type":"code","source":["# Reference [2]: https://miamioh.edu/centers-institutes/center-for-analytics-data-science/students/coding-tutorials/python/data-cleaning.html\n","missing_values = data.isnull().sum()\n","\n","# Reference [3]: https://digitalhumanities.hkust.edu.hk/tutorials/learn-python-from-zero-for-absolute-beginner-1-data-cleaning/\n","# Reference [4]: https://note.nkmk.me/en/python-pandas-nan-judge-count/\n","missing_values_summary = pd.DataFrame(missing_values, 
columns=['Missing Values'])\n","missing_values_summary = missing_values_summary[missing_values_summary['Missing Values'] > 0]\n","missing_values_summary.sort_values(by='Missing Values', ascending=False, inplace=True)\n","\n","missing_values_summary.head(20)"],"metadata":{"id":"HAvMsNS1Uh8X","colab":{"base_uri":"https://localhost:8080/","height":669},"executionInfo":{"status":"ok","timestamp":1709825824655,"user_tz":-480,"elapsed":812,"user":{"displayName":"Kunshi Lin","userId":"01385305786663149743"}},"outputId":"16e30a52-5d6e-4dc2-eebd-8295bb7fd066"},"execution_count":20,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" Missing Values\n","thrombin_max 50829\n","thrombin_min 50829\n","d_dimer_min 50811\n","d_dimer_max 50811\n","ggt_max 50448\n","ggt_min 50448\n","globulin_min 50235\n","globulin_max 50235\n","bicarbonate_min 50071\n","bicarbonate_max 50071\n","methemoglobin_min 49820\n","methemoglobin_max 49820\n","total_protein_max 49761\n","total_protein_min 49761\n","carboxyhemoglobin_min 49724\n","carboxyhemoglobin_max 49724\n","bilirubin_indirect_min 48823\n","bilirubin_indirect_max 48823\n","nrbc_min 48815\n","nrbc_max 48815"],"text/html":["\n"," <div id=\"df-06ee9473-c953-4a07-a699-2d780750fa36\" class=\"colab-df-container\">\n"," <div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>Missing Values</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>thrombin_max</th>\n"," <td>50829</td>\n"," </tr>\n"," <tr>\n"," <th>thrombin_min</th>\n"," <td>50829</td>\n"," </tr>\n"," <tr>\n"," <th>d_dimer_min</th>\n"," <td>50811</td>\n"," </tr>\n"," <tr>\n"," <th>d_dimer_max</th>\n"," <td>50811</td>\n"," </tr>\n"," <tr>\n"," 
<th>ggt_max</th>\n"," <td>50448</td>\n"," </tr>\n"," <tr>\n"," <th>ggt_min</th>\n"," <td>50448</td>\n"," </tr>\n"," <tr>\n"," <th>globulin_min</th>\n"," <td>50235</td>\n"," </tr>\n"," <tr>\n"," <th>globulin_max</th>\n"," <td>50235</td>\n"," </tr>\n"," <tr>\n"," <th>bicarbonate_min</th>\n"," <td>50071</td>\n"," </tr>\n"," <tr>\n"," <th>bicarbonate_max</th>\n"," <td>50071</td>\n"," </tr>\n"," <tr>\n"," <th>methemoglobin_min</th>\n"," <td>49820</td>\n"," </tr>\n"," <tr>\n"," <th>methemoglobin_max</th>\n"," <td>49820</td>\n"," </tr>\n"," <tr>\n"," <th>total_protein_max</th>\n"," <td>49761</td>\n"," </tr>\n"," <tr>\n"," <th>total_protein_min</th>\n"," <td>49761</td>\n"," </tr>\n"," <tr>\n"," <th>carboxyhemoglobin_min</th>\n"," <td>49724</td>\n"," </tr>\n"," <tr>\n"," <th>carboxyhemoglobin_max</th>\n"," <td>49724</td>\n"," </tr>\n"," <tr>\n"," <th>bilirubin_indirect_min</th>\n"," <td>48823</td>\n"," </tr>\n"," <tr>\n"," <th>bilirubin_indirect_max</th>\n"," <td>48823</td>\n"," </tr>\n"," <tr>\n"," <th>nrbc_min</th>\n"," <td>48815</td>\n"," </tr>\n"," <tr>\n"," <th>nrbc_max</th>\n"," <td>48815</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>\n"," <div class=\"colab-df-buttons\">\n","\n"," <div class=\"colab-df-container\">\n"," <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-06ee9473-c953-4a07-a699-2d780750fa36')\"\n"," title=\"Convert this dataframe to an interactive table.\"\n"," style=\"display:none;\">\n","\n"," <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n"," <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n"," </svg>\n"," </button>\n","\n"," <style>\n"," .colab-df-container {\n"," display:flex;\n"," gap: 12px;\n"," }\n","\n"," .colab-df-convert {\n"," background-color: #E8F0FE;\n"," border: none;\n"," border-radius: 50%;\n"," cursor: 
pointer;\n"," display: none;\n"," fill: #1967D2;\n"," height: 32px;\n"," padding: 0 0 0 0;\n"," width: 32px;\n"," }\n","\n"," .colab-df-convert:hover {\n"," background-color: #E2EBFA;\n"," box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n"," fill: #174EA6;\n"," }\n","\n"," .colab-df-buttons div {\n"," margin-bottom: 4px;\n"," }\n","\n"," [theme=dark] .colab-df-convert {\n"," background-color: #3B4455;\n"," fill: #D2E3FC;\n"," }\n","\n"," [theme=dark] .colab-df-convert:hover {\n"," background-color: #434B5C;\n"," box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n"," filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n"," fill: #FFFFFF;\n"," }\n"," </style>\n","\n"," <script>\n"," const buttonEl =\n"," document.querySelector('#df-06ee9473-c953-4a07-a699-2d780750fa36 button.colab-df-convert');\n"," buttonEl.style.display =\n"," google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n"," async function convertToInteractive(key) {\n"," const element = document.querySelector('#df-06ee9473-c953-4a07-a699-2d780750fa36');\n"," const dataTable =\n"," await google.colab.kernel.invokeFunction('convertToInteractive',\n"," [key], {});\n"," if (!dataTable) return;\n","\n"," const docLinkHtml = 'Like what you see? 
Visit the ' +\n"," '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n"," + ' to learn more about interactive tables.';\n"," element.innerHTML = '';\n"," dataTable['output_type'] = 'display_data';\n"," await google.colab.output.renderOutput(dataTable, element);\n"," const docLink = document.createElement('div');\n"," docLink.innerHTML = docLinkHtml;\n"," element.appendChild(docLink);\n"," }\n"," </script>\n"," </div>\n","\n","\n","<div id=\"df-b5f38fab-c1e9-4a34-a6b0-bd26d2669705\">\n"," <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-b5f38fab-c1e9-4a34-a6b0-bd26d2669705')\"\n"," title=\"Suggest charts\"\n"," style=\"display:none;\">\n","\n","<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n"," width=\"24px\">\n"," <g>\n"," <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n"," </g>\n","</svg>\n"," </button>\n","\n","<style>\n"," .colab-df-quickchart {\n"," --bg-color: #E8F0FE;\n"," --fill-color: #1967D2;\n"," --hover-bg-color: #E2EBFA;\n"," --hover-fill-color: #174EA6;\n"," --disabled-fill-color: #AAA;\n"," --disabled-bg-color: #DDD;\n"," }\n","\n"," [theme=dark] .colab-df-quickchart {\n"," --bg-color: #3B4455;\n"," --fill-color: #D2E3FC;\n"," --hover-bg-color: #434B5C;\n"," --hover-fill-color: #FFFFFF;\n"," --disabled-bg-color: #3B4455;\n"," --disabled-fill-color: #666;\n"," }\n","\n"," .colab-df-quickchart {\n"," background-color: var(--bg-color);\n"," border: none;\n"," border-radius: 50%;\n"," cursor: pointer;\n"," display: none;\n"," fill: var(--fill-color);\n"," height: 32px;\n"," padding: 0;\n"," width: 32px;\n"," }\n","\n"," .colab-df-quickchart:hover {\n"," background-color: var(--hover-bg-color);\n"," box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n"," fill: var(--button-hover-fill-color);\n"," }\n","\n"," 
.colab-df-quickchart-complete:disabled,\n"," .colab-df-quickchart-complete:disabled:hover {\n"," background-color: var(--disabled-bg-color);\n"," fill: var(--disabled-fill-color);\n"," box-shadow: none;\n"," }\n","\n"," .colab-df-spinner {\n"," border: 2px solid var(--fill-color);\n"," border-color: transparent;\n"," border-bottom-color: var(--fill-color);\n"," animation:\n"," spin 1s steps(1) infinite;\n"," }\n","\n"," @keyframes spin {\n"," 0% {\n"," border-color: transparent;\n"," border-bottom-color: var(--fill-color);\n"," border-left-color: var(--fill-color);\n"," }\n"," 20% {\n"," border-color: transparent;\n"," border-left-color: var(--fill-color);\n"," border-top-color: var(--fill-color);\n"," }\n"," 30% {\n"," border-color: transparent;\n"," border-left-color: var(--fill-color);\n"," border-top-color: var(--fill-color);\n"," border-right-color: var(--fill-color);\n"," }\n"," 40% {\n"," border-color: transparent;\n"," border-right-color: var(--fill-color);\n"," border-top-color: var(--fill-color);\n"," }\n"," 60% {\n"," border-color: transparent;\n"," border-right-color: var(--fill-color);\n"," }\n"," 80% {\n"," border-color: transparent;\n"," border-right-color: var(--fill-color);\n"," border-bottom-color: var(--fill-color);\n"," }\n"," 90% {\n"," border-color: transparent;\n"," border-bottom-color: var(--fill-color);\n"," }\n"," }\n","</style>\n","\n"," <script>\n"," async function quickchart(key) {\n"," const quickchartButtonEl =\n"," document.querySelector('#' + key + ' button');\n"," quickchartButtonEl.disabled = true; // To prevent multiple clicks.\n"," quickchartButtonEl.classList.add('colab-df-spinner');\n"," try {\n"," const charts = await google.colab.kernel.invokeFunction(\n"," 'suggestCharts', [key], {});\n"," } catch (error) {\n"," console.error('Error during call to suggestCharts:', error);\n"," }\n"," quickchartButtonEl.classList.remove('colab-df-spinner');\n"," quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n"," }\n"," 
(() => {\n"," let quickchartButtonEl =\n"," document.querySelector('#df-b5f38fab-c1e9-4a34-a6b0-bd26d2669705 button');\n"," quickchartButtonEl.style.display =\n"," google.colab.kernel.accessAllowed ? 'block' : 'none';\n"," })();\n"," </script>\n","</div>\n","\n"," </div>\n"," </div>\n"],"application/vnd.google.colaboratory.intrinsic+json":{"type":"dataframe","variable_name":"missing_values_summary","summary":"{\n \"name\": \"missing_values_summary\",\n \"rows\": 157,\n \"fields\": [\n {\n \"column\": \"Missing Values\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 18864,\n \"min\": 79,\n \"max\": 50829,\n \"num_unique_values\": 70,\n \"samples\": [\n 36178,\n 50829,\n 1020\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"}},"metadata":{},"execution_count":20}]},{"cell_type":"code","source":["# Reference [5]: https://www.sciencedirect.com/science/article/pii/S0895435618308710\n","# Reference [6]: https://stackoverflow.com/questions/65775141/remove-rows-with-more-than-percentage-of-missing-data-for-majority-class-samples\n","threshold = 0.5 * len(data)\n","\n","# Reference [7]: https://www.statology.org/pandas-exclude-column/\n","# Reference [8]: https://www.datacamp.com/tutorial/pandas-drop-column\n","columns_to_exclude = missing_values_summary[missing_values_summary['Missing Values'] > threshold].index\n","data_cleaned = data.drop(columns=columns_to_exclude)\n","\n","original_shape = data.shape\n","cleaned_shape = 
data_cleaned.shape\n","original_shape, cleaned_shape, columns_to_exclude.tolist()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"5GM28kJYy1Hq","executionInfo":{"status":"ok","timestamp":1709825831008,"user_tz":-480,"elapsed":520,"user":{"displayName":"Kunshi Lin","userId":"01385305786663149743"}},"outputId":"8d3ece2c-b957-4ee9-fe85-46e9a3ab91ff"},"execution_count":21,"outputs":[{"output_type":"execute_result","data":{"text/plain":["((50920, 162),\n"," (50920, 87),\n"," ['thrombin_max',\n"," 'thrombin_min',\n"," 'd_dimer_min',\n"," 'd_dimer_max',\n"," 'ggt_max',\n"," 'ggt_min',\n"," 'globulin_min',\n"," 'globulin_max',\n"," 'bicarbonate_min',\n"," 'bicarbonate_max',\n"," 'methemoglobin_min',\n"," 'methemoglobin_max',\n"," 'total_protein_max',\n"," 'total_protein_min',\n"," 'carboxyhemoglobin_min',\n"," 'carboxyhemoglobin_max',\n"," 'bilirubin_indirect_min',\n"," 'bilirubin_indirect_max',\n"," 'nrbc_min',\n"," 'nrbc_max',\n"," 'bilirubin_direct_min',\n"," 'bilirubin_direct_max',\n"," 'amylase_min',\n"," 'amylase_max',\n"," 'aado2_max',\n"," 'aado2_min',\n"," 'atyps_min',\n"," 'atyps_max',\n"," 'metas_max',\n"," 'metas_min',\n"," 'bands_min',\n"," 'bands_max',\n"," 'temperature_min.1',\n"," 'temperature_max.1',\n"," 'imm_granulocytes_min',\n"," 'imm_granulocytes_max',\n"," 'chloride_max',\n"," 'chloride_min',\n"," 'hemoglobin_max',\n"," 'hemoglobin_min',\n"," 'hematocrit_max',\n"," 'hematocrit_min',\n"," 'ck_mb_min',\n"," 'ck_mb_max',\n"," 'ld_ldh_min',\n"," 'ld_ldh_max',\n"," 'sodium_min',\n"," 'sodium_max',\n"," 'fibrinogen_min',\n"," 'fibrinogen_max',\n"," 'so2_max',\n"," 'so2_min',\n"," 'ck_cpk_min',\n"," 'ck_cpk_max',\n"," 'glucose_min.1',\n"," 'glucose_max.1',\n"," 'potassium_min',\n"," 'potassium_max',\n"," 'albumin_max',\n"," 'albumin_min',\n"," 'calcium_max',\n"," 'calcium_min',\n"," 'pao2fio2ratio_min',\n"," 'pao2fio2ratio_max',\n"," 'aado2_calc_max',\n"," 'aado2_calc_min',\n"," 'bilirubin_total_max',\n"," 'bilirubin_total_min',\n"," 
'alp_max',\n"," 'alp_min',\n"," 'height',\n"," 'alt_min',\n"," 'alt_max',\n"," 'ast_min',\n"," 'ast_max'])"]},"metadata":{},"execution_count":21}]},{"cell_type":"code","source":["# Reference [9]: https://insightsoftware.com/blog/how-to-handle-missing-data-values-while-data-cleaning/\n","missing_values_cleaned = data_cleaned.isnull().sum()\n","missing_values_cleaned_summary = pd.DataFrame(missing_values_cleaned[missing_values_cleaned > 0], columns=['Missing Values'])\n","missing_values_cleaned_summary.sort_values(by='Missing Values', ascending=False, inplace=True)\n","\n","missing_values_cleaned_summary.head(20)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":669},"id":"ZH1-ffBFy558","executionInfo":{"status":"ok","timestamp":1709825836714,"user_tz":-480,"elapsed":692,"user":{"displayName":"Kunshi Lin","userId":"01385305786663149743"}},"outputId":"fe1d46f7-dacf-4c4d-a107-de986dee8e03"},"execution_count":22,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" Missing Values\n","lactate_max 23759\n","lactate_min 23759\n","abs_monocytes_min 20095\n","abs_basophils_min 20095\n","abs_basophils_max 20095\n","abs_monocytes_max 20095\n","abs_eosinophils_min 20094\n","abs_eosinophils_max 20094\n","abs_neutrophils_max 20094\n","abs_neutrophils_min 20094\n","abs_lymphocytes_min 20082\n","abs_lymphocytes_max 20082\n","ph_min 19748\n","ph_max 19748\n","totalco2_max 19744\n","totalco2_min 19744\n","baseexcess_max 19744\n","baseexcess_min 19744\n","pco2_max 19744\n","pco2_min 19744"],"text/html":["\n"," <div id=\"df-4d2ad525-73a0-49e4-aadc-79a7e8bb75b5\" class=\"colab-df-container\">\n"," <div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," 
<th>Missing Values</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>lactate_max</th>\n"," <td>23759</td>\n"," </tr>\n"," <tr>\n"," <th>lactate_min</th>\n"," <td>23759</td>\n"," </tr>\n"," <tr>\n"," <th>abs_monocytes_min</th>\n"," <td>20095</td>\n"," </tr>\n"," <tr>\n"," <th>abs_basophils_min</th>\n"," <td>20095</td>\n"," </tr>\n"," <tr>\n"," <th>abs_basophils_max</th>\n"," <td>20095</td>\n"," </tr>\n"," <tr>\n"," <th>abs_monocytes_max</th>\n"," <td>20095</td>\n"," </tr>\n"," <tr>\n"," <th>abs_eosinophils_min</th>\n"," <td>20094</td>\n"," </tr>\n"," <tr>\n"," <th>abs_eosinophils_max</th>\n"," <td>20094</td>\n"," </tr>\n"," <tr>\n"," <th>abs_neutrophils_max</th>\n"," <td>20094</td>\n"," </tr>\n"," <tr>\n"," <th>abs_neutrophils_min</th>\n"," <td>20094</td>\n"," </tr>\n"," <tr>\n"," <th>abs_lymphocytes_min</th>\n"," <td>20082</td>\n"," </tr>\n"," <tr>\n"," <th>abs_lymphocytes_max</th>\n"," <td>20082</td>\n"," </tr>\n"," <tr>\n"," <th>ph_min</th>\n"," <td>19748</td>\n"," </tr>\n"," <tr>\n"," <th>ph_max</th>\n"," <td>19748</td>\n"," </tr>\n"," <tr>\n"," <th>totalco2_max</th>\n"," <td>19744</td>\n"," </tr>\n"," <tr>\n"," <th>totalco2_min</th>\n"," <td>19744</td>\n"," </tr>\n"," <tr>\n"," <th>baseexcess_max</th>\n"," <td>19744</td>\n"," </tr>\n"," <tr>\n"," <th>baseexcess_min</th>\n"," <td>19744</td>\n"," </tr>\n"," <tr>\n"," <th>pco2_max</th>\n"," <td>19744</td>\n"," </tr>\n"," <tr>\n"," <th>pco2_min</th>\n"," <td>19744</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>\n"," <div class=\"colab-df-buttons\">\n","\n"," <div class=\"colab-df-container\">\n"," <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-4d2ad525-73a0-49e4-aadc-79a7e8bb75b5')\"\n"," title=\"Convert this dataframe to an interactive table.\"\n"," style=\"display:none;\">\n","\n"," <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n"," <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 
220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n"," </svg>\n"," </button>\n","\n"," <style>\n"," .colab-df-container {\n"," display:flex;\n"," gap: 12px;\n"," }\n","\n"," .colab-df-convert {\n"," background-color: #E8F0FE;\n"," border: none;\n"," border-radius: 50%;\n"," cursor: pointer;\n"," display: none;\n"," fill: #1967D2;\n"," height: 32px;\n"," padding: 0 0 0 0;\n"," width: 32px;\n"," }\n","\n"," .colab-df-convert:hover {\n"," background-color: #E2EBFA;\n"," box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n"," fill: #174EA6;\n"," }\n","\n"," .colab-df-buttons div {\n"," margin-bottom: 4px;\n"," }\n","\n"," [theme=dark] .colab-df-convert {\n"," background-color: #3B4455;\n"," fill: #D2E3FC;\n"," }\n","\n"," [theme=dark] .colab-df-convert:hover {\n"," background-color: #434B5C;\n"," box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n"," filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n"," fill: #FFFFFF;\n"," }\n"," </style>\n","\n"," <script>\n"," const buttonEl =\n"," document.querySelector('#df-4d2ad525-73a0-49e4-aadc-79a7e8bb75b5 button.colab-df-convert');\n"," buttonEl.style.display =\n"," google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n"," async function convertToInteractive(key) {\n"," const element = document.querySelector('#df-4d2ad525-73a0-49e4-aadc-79a7e8bb75b5');\n"," const dataTable =\n"," await google.colab.kernel.invokeFunction('convertToInteractive',\n"," [key], {});\n"," if (!dataTable) return;\n","\n"," const docLinkHtml = 'Like what you see? 
Visit the ' +\n"," '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n"," + ' to learn more about interactive tables.';\n"," element.innerHTML = '';\n"," dataTable['output_type'] = 'display_data';\n"," await google.colab.output.renderOutput(dataTable, element);\n"," const docLink = document.createElement('div');\n"," docLink.innerHTML = docLinkHtml;\n"," element.appendChild(docLink);\n"," }\n"," </script>\n"," </div>\n","\n","\n","<div id=\"df-43f9ba52-de94-422a-b5b6-72b71c5bbb8c\">\n"," <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-43f9ba52-de94-422a-b5b6-72b71c5bbb8c')\"\n"," title=\"Suggest charts\"\n"," style=\"display:none;\">\n","\n","<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n"," width=\"24px\">\n"," <g>\n"," <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n"," </g>\n","</svg>\n"," </button>\n","\n","<style>\n"," .colab-df-quickchart {\n"," --bg-color: #E8F0FE;\n"," --fill-color: #1967D2;\n"," --hover-bg-color: #E2EBFA;\n"," --hover-fill-color: #174EA6;\n"," --disabled-fill-color: #AAA;\n"," --disabled-bg-color: #DDD;\n"," }\n","\n"," [theme=dark] .colab-df-quickchart {\n"," --bg-color: #3B4455;\n"," --fill-color: #D2E3FC;\n"," --hover-bg-color: #434B5C;\n"," --hover-fill-color: #FFFFFF;\n"," --disabled-bg-color: #3B4455;\n"," --disabled-fill-color: #666;\n"," }\n","\n"," .colab-df-quickchart {\n"," background-color: var(--bg-color);\n"," border: none;\n"," border-radius: 50%;\n"," cursor: pointer;\n"," display: none;\n"," fill: var(--fill-color);\n"," height: 32px;\n"," padding: 0;\n"," width: 32px;\n"," }\n","\n"," .colab-df-quickchart:hover {\n"," background-color: var(--hover-bg-color);\n"," box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n"," fill: var(--button-hover-fill-color);\n"," }\n","\n"," 
.colab-df-quickchart-complete:disabled,\n"," .colab-df-quickchart-complete:disabled:hover {\n"," background-color: var(--disabled-bg-color);\n"," fill: var(--disabled-fill-color);\n"," box-shadow: none;\n"," }\n","\n"," .colab-df-spinner {\n"," border: 2px solid var(--fill-color);\n"," border-color: transparent;\n"," border-bottom-color: var(--fill-color);\n"," animation:\n"," spin 1s steps(1) infinite;\n"," }\n","\n"," @keyframes spin {\n"," 0% {\n"," border-color: transparent;\n"," border-bottom-color: var(--fill-color);\n"," border-left-color: var(--fill-color);\n"," }\n"," 20% {\n"," border-color: transparent;\n"," border-left-color: var(--fill-color);\n"," border-top-color: var(--fill-color);\n"," }\n"," 30% {\n"," border-color: transparent;\n"," border-left-color: var(--fill-color);\n"," border-top-color: var(--fill-color);\n"," border-right-color: var(--fill-color);\n"," }\n"," 40% {\n"," border-color: transparent;\n"," border-right-color: var(--fill-color);\n"," border-top-color: var(--fill-color);\n"," }\n"," 60% {\n"," border-color: transparent;\n"," border-right-color: var(--fill-color);\n"," }\n"," 80% {\n"," border-color: transparent;\n"," border-right-color: var(--fill-color);\n"," border-bottom-color: var(--fill-color);\n"," }\n"," 90% {\n"," border-color: transparent;\n"," border-bottom-color: var(--fill-color);\n"," }\n"," }\n","</style>\n","\n"," <script>\n"," async function quickchart(key) {\n"," const quickchartButtonEl =\n"," document.querySelector('#' + key + ' button');\n"," quickchartButtonEl.disabled = true; // To prevent multiple clicks.\n"," quickchartButtonEl.classList.add('colab-df-spinner');\n"," try {\n"," const charts = await google.colab.kernel.invokeFunction(\n"," 'suggestCharts', [key], {});\n"," } catch (error) {\n"," console.error('Error during call to suggestCharts:', error);\n"," }\n"," quickchartButtonEl.classList.remove('colab-df-spinner');\n"," quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n"," }\n"," 
(() => {\n"," let quickchartButtonEl =\n"," document.querySelector('#df-43f9ba52-de94-422a-b5b6-72b71c5bbb8c button');\n"," quickchartButtonEl.style.display =\n"," google.colab.kernel.accessAllowed ? 'block' : 'none';\n"," })();\n"," </script>\n","</div>\n","\n"," </div>\n"," </div>\n"],"application/vnd.google.colaboratory.intrinsic+json":{"type":"dataframe","variable_name":"missing_values_cleaned_summary","summary":"{\n \"name\": \"missing_values_cleaned_summary\",\n \"rows\": 82,\n \"fields\": [\n {\n \"column\": \"Missing Values\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 8672,\n \"min\": 79,\n \"max\": 23759,\n \"num_unique_values\": 34,\n \"samples\": [\n 662,\n 602,\n 378\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"}},"metadata":{},"execution_count":22}]},{"cell_type":"code","source":["# Reference [10]: https://scikit-learn.org/stable/modules/impute.html\n","# Reference [11]: https://www.geeksforgeeks.org/how-to-fill-nan-values-with-mean-in-pandas/\n","\n","for column in missing_values_cleaned_summary.index:\n"," if data_cleaned[column].dtype in ['float64', 'int64']: # Only impute numerical columns\n"," median_value = data_cleaned[column].median()\n"," data_cleaned[column].fillna(median_value, inplace=True)\n","\n","# https://www.linkedin.com/advice/0/what-some-best-practices-dealing-missing-values-imputation\n","recheck_missing_values = data_cleaned.isnull().sum()\n","missing_values_after_imputation = recheck_missing_values[recheck_missing_values > 0]\n","\n","missing_values_after_imputation"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"LT38Zi6kzMWb","executionInfo":{"status":"ok","timestamp":1709825840077,"user_tz":-480,"elapsed":3,"user":{"displayName":"Kunshi Lin","userId":"01385305786663149743"}},"outputId":"00bccd9c-b356-4578-c447-728caf5a5d3e"},"execution_count":23,"outputs":[{"output_type":"execute_result","data":{"text/plain":["Series([], dtype: 
int64)"]},"metadata":{},"execution_count":23}]},{"cell_type":"markdown","source":["# **Feature Selection and Model Training**"],"metadata":{"id":"Sp9k3u_o4tPT"}},{"cell_type":"code","source":["# Reference [12]: https://www.analyticsvidhya.com/blog/2020/06/feature-selection-techniques-machine-learning/\n","# Reference [13]: https://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_selection\n","features = ['gender', 'admission_age', 'race', 'heart_rate_mean', 'sbp_mean', 'dbp_mean', 'lactate_min', 'lactate_max']\n","target = 'aki'\n","\n","X = data_cleaned[features]\n","y = data_cleaned[target]\n","\n","# Reference [14]: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html\n","categorical_features = ['gender', 'race']\n","numerical_features = ['admission_age', 'heart_rate_mean', 'sbp_mean', 'dbp_mean', 'lactate_min', 'lactate_max']\n","\n","preprocessor = ColumnTransformer(\n"," transformers=[\n"," ('num', StandardScaler(), numerical_features),\n"," ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])\n","\n","X_prepared = preprocessor.fit_transform(X)\n","\n","# Reference [15]: https://towardsdatascience.com/stratified-sampling-you-may-have-been-splitting-your-dataset-all-wrong-8cfdd0d32502\n","X_train, X_test, y_train, y_test = train_test_split(X_prepared, y, test_size=0.2, random_state=42, stratify=y)\n","\n","X_train.shape, X_test.shape"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"M33yYmKQzSqT","executionInfo":{"status":"ok","timestamp":1709825843858,"user_tz":-480,"elapsed":593,"user":{"displayName":"Kunshi Lin","userId":"01385305786663149743"}},"outputId":"856f5704-ce79-4e15-d744-3e14086b0eeb"},"execution_count":24,"outputs":[{"output_type":"execute_result","data":{"text/plain":["((40736, 41), (10184, 41))"]},"metadata":{},"execution_count":24}]},{"cell_type":"markdown","source":["# **Model Prediction and 
Evaluation**"],"metadata":{"id":"4ZV1qCg9zkb-"}},{"cell_type":"markdown","source":["## **Logistic Regression**\n","\n"],"metadata":{"id":"FDdmNMyrkK4E"}},{"cell_type":"code","source":["# Reference [16]: https://scikit-learn.org/stable/modules/grid_search.html\n","# Reference [17]: https://drbeane.github.io/python_dsci/pages/grid_search.html\n","C_values = [0.001, 0.01, 0.1, 1, 10, 100]\n","\n","param_grid = {'C': C_values}\n","lr_model = LogisticRegression(penalty='l1', solver='liblinear')\n","grid_search = GridSearchCV(lr_model, param_grid, cv=5)\n","\n","grid_search.fit(X_train, y_train)\n","\n","best_model = grid_search.best_estimator_\n","\n","y_pred = best_model.predict(X_test)\n","\n","accuracy = accuracy_score(y_test, y_pred)\n","precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')\n","\n","accuracy, precision, recall, f1"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"2-D6mNOyzama","executionInfo":{"status":"ok","timestamp":1709825986657,"user_tz":-480,"elapsed":138445,"user":{"displayName":"Kunshi Lin","userId":"01385305786663149743"}},"outputId":"3f5257cc-766f-4eec-b42f-c6ed2c73bba7"},"execution_count":25,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(0.40298507462686567,\n"," 0.3320444093843265,\n"," 0.40298507462686567,\n"," 0.3342230247225089)"]},"metadata":{},"execution_count":25}]},{"cell_type":"markdown","source":["## **Decision Tree**"],"metadata":{"id":"zkgcrMqXlQiZ"}},{"cell_type":"code","source":["# Reference [18]: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html\n","decision_tree = DecisionTreeClassifier(random_state=42)\n","\n","decision_tree.fit(X_train, y_train)\n","\n","y_pred_dt = decision_tree.predict(X_test)\n","\n","accuracy_dt = accuracy_score(y_test, y_pred_dt)\n","precision_dt, recall_dt, f1_dt, _ = precision_recall_fscore_support(y_test, y_pred_dt, average='weighted')\n","\n","accuracy_dt, precision_dt, recall_dt, 
f1_dt"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"sg-aD14elS_6","executionInfo":{"status":"ok","timestamp":1709826029626,"user_tz":-480,"elapsed":3960,"user":{"displayName":"Kunshi Lin","userId":"01385305786663149743"}},"outputId":"62a1b1b5-633f-43dc-82ba-5de8379d51ee"},"execution_count":26,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(0.31922623723487825,\n"," 0.31932298016417965,\n"," 0.31922623723487825,\n"," 0.3192673787031557)"]},"metadata":{},"execution_count":26}]},{"cell_type":"markdown","source":["## **Random Forest**"],"metadata":{"id":"Uev9GBLtWzVA"}},{"cell_type":"code","source":["# Reference [19]: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html\n","random_forest = RandomForestClassifier(random_state=42)\n","\n","random_forest.fit(X_train, y_train)\n","\n","y_pred_rf = random_forest.predict(X_test)\n","\n","accuracy_rf = accuracy_score(y_test, y_pred_rf)\n","precision_rf, recall_rf, f1_rf, _ = precision_recall_fscore_support(y_test, y_pred_rf, average='weighted')\n","\n","accuracy_rf, precision_rf, recall_rf, f1_rf"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"y4wB0NCnW64T","executionInfo":{"status":"ok","timestamp":1709826158783,"user_tz":-480,"elapsed":109714,"user":{"displayName":"Kunshi Lin","userId":"01385305786663149743"}},"outputId":"29a5f0e9-ae1c-45f8-af5a-63b1b194ea70"},"execution_count":28,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(0.3881578947368421, 0.358609063596094, 0.3881578947368421, 0.3568783712222866)"]},"metadata":{},"execution_count":28}]},{"cell_type":"markdown","source":["## **SVM**"],"metadata":{"id":"s5aosjeOke50"}},{"cell_type":"code","source":["# Reference [20]: https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html\n","# Reference [21]: https://www.datatechnotes.com/2020/07/classification-example-with-linearsvm-in-python.html\n","# Reference [22]: 
https://medium.com/@mrconnor/understanding-support-vector-machines-through-code-a-detailed-guide-692d0061d78b\n","svm_model = LinearSVC(random_state=42, max_iter=100000)\n","\n","svm_model.fit(X_train, y_train)\n","\n","y_pred_svm = svm_model.predict(X_test)\n","\n","\n","accuracy_svm = accuracy_score(y_test, y_pred_svm)\n","precision_svm, recall_svm, f1_svm, _ = precision_recall_fscore_support(y_test, y_pred_svm, average='weighted')\n","\n","accuracy_svm, precision_svm, recall_svm, f1_svm"],"metadata":{"id":"hJujwmMHkjCT","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1709827788707,"user_tz":-480,"elapsed":1363556,"user":{"displayName":"Kunshi Lin","userId":"01385305786663149743"}},"outputId":"5019fc46-3ee0-4b35-e60b-04dd7ee534a0"},"execution_count":30,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(0.40327965435978, 0.33123320296781494, 0.40327965435978, 0.3331245901794984)"]},"metadata":{},"execution_count":30}]}]}