Diff of /Assignment1_code.ipynb [000000] .. [396da9]

Switch to unified view

a b/Assignment1_code.ipynb
1
{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"mount_file_id":"1HsRV9AY76wqGbLLY1XaC3tMPAIUq6GgO","authorship_tag":"ABX9TyMiXZY3Q7CeR932hy15h6zW"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"markdown","source":["# **Import dependencies and Load the dataset**"],"metadata":{"id":"gRWEL5oFUXLp"}},{"cell_type":"code","execution_count":19,"metadata":{"id":"97l_e_AHtEqO","executionInfo":{"status":"ok","timestamp":1709825818620,"user_tz":-480,"elapsed":2419,"user":{"displayName":"Kunshi Lin","userId":"01385305786663149743"}}},"outputs":[],"source":["# Reference [1]: https://digitalhumanities.hkust.edu.hk/tutorials/learn-python-from-zero-for-absolute-beginner-1-data-cleaning/\n","import pandas as pd\n","from sklearn.model_selection import train_test_split, GridSearchCV\n","from sklearn.linear_model import LogisticRegression\n","from sklearn.tree import DecisionTreeClassifier\n","from sklearn.ensemble import RandomForestClassifier\n","from sklearn.svm import LinearSVC\n","from sklearn.metrics import accuracy_score, roc_auc_score, precision_recall_fscore_support\n","from sklearn.preprocessing import StandardScaler, OneHotEncoder\n","from sklearn.compose import ColumnTransformer\n","from sklearn.pipeline import Pipeline\n","\n","data = pd.read_csv(\"/content/drive/MyDrive/SPH 6004/Assignment1_data.csv\")"]},{"cell_type":"markdown","source":["# **Data Cleaning**"],"metadata":{"id":"VPCXMBzDUii4"}},{"cell_type":"code","source":["# Reference [2]: https://miamioh.edu/centers-institutes/center-for-analytics-data-science/students/coding-tutorials/python/data-cleaning.html\n","missing_values = data.isnull().sum()\n","\n","# Reference [3]: https://digitalhumanities.hkust.edu.hk/tutorials/learn-python-from-zero-for-absolute-beginner-1-data-cleaning/\n","# Reference [4]: https://note.nkmk.me/en/python-pandas-nan-judge-count/\n","missing_values_summary = pd.DataFrame(missing_values, 
columns=['Missing Values'])\n","missing_values_summary = missing_values_summary[missing_values_summary['Missing Values'] > 0]\n","missing_values_summary.sort_values(by='Missing Values', ascending=False, inplace=True)\n","\n","missing_values_summary.head(20)"],"metadata":{"id":"HAvMsNS1Uh8X","colab":{"base_uri":"https://localhost:8080/","height":669},"executionInfo":{"status":"ok","timestamp":1709825824655,"user_tz":-480,"elapsed":812,"user":{"displayName":"Kunshi Lin","userId":"01385305786663149743"}},"outputId":"16e30a52-5d6e-4dc2-eebd-8295bb7fd066"},"execution_count":20,"outputs":[{"output_type":"execute_result","data":{"text/plain":["                        Missing Values\n","thrombin_max                     50829\n","thrombin_min                     50829\n","d_dimer_min                      50811\n","d_dimer_max                      50811\n","ggt_max                          50448\n","ggt_min                          50448\n","globulin_min                     50235\n","globulin_max                     50235\n","bicarbonate_min                  50071\n","bicarbonate_max                  50071\n","methemoglobin_min                49820\n","methemoglobin_max                49820\n","total_protein_max                49761\n","total_protein_min                49761\n","carboxyhemoglobin_min            49724\n","carboxyhemoglobin_max            49724\n","bilirubin_indirect_min           48823\n","bilirubin_indirect_max           48823\n","nrbc_min                         48815\n","nrbc_max                         48815"],"text/html":["\n","  <div id=\"df-06ee9473-c953-4a07-a699-2d780750fa36\" class=\"colab-df-container\">\n","    <div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  
<thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>Missing Values</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>thrombin_max</th>\n","      <td>50829</td>\n","    </tr>\n","    <tr>\n","      <th>thrombin_min</th>\n","      <td>50829</td>\n","    </tr>\n","    <tr>\n","      <th>d_dimer_min</th>\n","      <td>50811</td>\n","    </tr>\n","    <tr>\n","      <th>d_dimer_max</th>\n","      <td>50811</td>\n","    </tr>\n","    <tr>\n","      <th>ggt_max</th>\n","      <td>50448</td>\n","    </tr>\n","    <tr>\n","      <th>ggt_min</th>\n","      <td>50448</td>\n","    </tr>\n","    <tr>\n","      <th>globulin_min</th>\n","      <td>50235</td>\n","    </tr>\n","    <tr>\n","      <th>globulin_max</th>\n","      <td>50235</td>\n","    </tr>\n","    <tr>\n","      <th>bicarbonate_min</th>\n","      <td>50071</td>\n","    </tr>\n","    <tr>\n","      <th>bicarbonate_max</th>\n","      <td>50071</td>\n","    </tr>\n","    <tr>\n","      <th>methemoglobin_min</th>\n","      <td>49820</td>\n","    </tr>\n","    <tr>\n","      <th>methemoglobin_max</th>\n","      <td>49820</td>\n","    </tr>\n","    <tr>\n","      <th>total_protein_max</th>\n","      <td>49761</td>\n","    </tr>\n","    <tr>\n","      <th>total_protein_min</th>\n","      <td>49761</td>\n","    </tr>\n","    <tr>\n","      <th>carboxyhemoglobin_min</th>\n","      <td>49724</td>\n","    </tr>\n","    <tr>\n","      <th>carboxyhemoglobin_max</th>\n","      <td>49724</td>\n","    </tr>\n","    <tr>\n","      <th>bilirubin_indirect_min</th>\n","      <td>48823</td>\n","    </tr>\n","    <tr>\n","      <th>bilirubin_indirect_max</th>\n","      <td>48823</td>\n","    </tr>\n","    <tr>\n","      <th>nrbc_min</th>\n","      <td>48815</td>\n","    </tr>\n","    <tr>\n","      <th>nrbc_max</th>\n","      <td>48815</td>\n","    </tr>\n","  </tbody>\n","</table>\n","</div>\n","    <div class=\"colab-df-buttons\">\n","\n","  <div 
class=\"colab-df-container\">\n","    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-06ee9473-c953-4a07-a699-2d780750fa36')\"\n","            title=\"Convert this dataframe to an interactive table.\"\n","            style=\"display:none;\">\n","\n","  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n","    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n","  </svg>\n","    </button>\n","\n","  <style>\n","    .colab-df-container {\n","      display:flex;\n","      gap: 12px;\n","    }\n","\n","    .colab-df-convert {\n","      background-color: #E8F0FE;\n","      border: none;\n","      border-radius: 50%;\n","      cursor: pointer;\n","      display: none;\n","      fill: #1967D2;\n","      height: 32px;\n","      padding: 0 0 0 0;\n","      width: 32px;\n","    }\n","\n","    .colab-df-convert:hover {\n","      background-color: #E2EBFA;\n","      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n","      fill: #174EA6;\n","    }\n","\n","    .colab-df-buttons div {\n","      margin-bottom: 4px;\n","    }\n","\n","    [theme=dark] .colab-df-convert {\n","      background-color: #3B4455;\n","      fill: #D2E3FC;\n","    }\n","\n","    [theme=dark] .colab-df-convert:hover {\n","      background-color: #434B5C;\n","      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n","      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n","      fill: #FFFFFF;\n","    }\n","  </style>\n","\n","    <script>\n","      const buttonEl =\n","        document.querySelector('#df-06ee9473-c953-4a07-a699-2d780750fa36 button.colab-df-convert');\n","      buttonEl.style.display =\n","        google.colab.kernel.accessAllowed ? 
'block' : 'none';\n","\n","      async function convertToInteractive(key) {\n","        const element = document.querySelector('#df-06ee9473-c953-4a07-a699-2d780750fa36');\n","        const dataTable =\n","          await google.colab.kernel.invokeFunction('convertToInteractive',\n","                                                    [key], {});\n","        if (!dataTable) return;\n","\n","        const docLinkHtml = 'Like what you see? Visit the ' +\n","          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n","          + ' to learn more about interactive tables.';\n","        element.innerHTML = '';\n","        dataTable['output_type'] = 'display_data';\n","        await google.colab.output.renderOutput(dataTable, element);\n","        const docLink = document.createElement('div');\n","        docLink.innerHTML = docLinkHtml;\n","        element.appendChild(docLink);\n","      }\n","    </script>\n","  </div>\n","\n","\n","<div id=\"df-b5f38fab-c1e9-4a34-a6b0-bd26d2669705\">\n","  <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-b5f38fab-c1e9-4a34-a6b0-bd26d2669705')\"\n","            title=\"Suggest charts\"\n","            style=\"display:none;\">\n","\n","<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n","     width=\"24px\">\n","    <g>\n","        <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n","    </g>\n","</svg>\n","  </button>\n","\n","<style>\n","  .colab-df-quickchart {\n","      --bg-color: #E8F0FE;\n","      --fill-color: #1967D2;\n","      --hover-bg-color: #E2EBFA;\n","      --hover-fill-color: #174EA6;\n","      --disabled-fill-color: #AAA;\n","      --disabled-bg-color: #DDD;\n","  }\n","\n","  [theme=dark] .colab-df-quickchart {\n","      --bg-color: #3B4455;\n","      --fill-color: #D2E3FC;\n","      --hover-bg-color: #434B5C;\n","      
--hover-fill-color: #FFFFFF;\n","      --disabled-bg-color: #3B4455;\n","      --disabled-fill-color: #666;\n","  }\n","\n","  .colab-df-quickchart {\n","    background-color: var(--bg-color);\n","    border: none;\n","    border-radius: 50%;\n","    cursor: pointer;\n","    display: none;\n","    fill: var(--fill-color);\n","    height: 32px;\n","    padding: 0;\n","    width: 32px;\n","  }\n","\n","  .colab-df-quickchart:hover {\n","    background-color: var(--hover-bg-color);\n","    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n","    fill: var(--button-hover-fill-color);\n","  }\n","\n","  .colab-df-quickchart-complete:disabled,\n","  .colab-df-quickchart-complete:disabled:hover {\n","    background-color: var(--disabled-bg-color);\n","    fill: var(--disabled-fill-color);\n","    box-shadow: none;\n","  }\n","\n","  .colab-df-spinner {\n","    border: 2px solid var(--fill-color);\n","    border-color: transparent;\n","    border-bottom-color: var(--fill-color);\n","    animation:\n","      spin 1s steps(1) infinite;\n","  }\n","\n","  @keyframes spin {\n","    0% {\n","      border-color: transparent;\n","      border-bottom-color: var(--fill-color);\n","      border-left-color: var(--fill-color);\n","    }\n","    20% {\n","      border-color: transparent;\n","      border-left-color: var(--fill-color);\n","      border-top-color: var(--fill-color);\n","    }\n","    30% {\n","      border-color: transparent;\n","      border-left-color: var(--fill-color);\n","      border-top-color: var(--fill-color);\n","      border-right-color: var(--fill-color);\n","    }\n","    40% {\n","      border-color: transparent;\n","      border-right-color: var(--fill-color);\n","      border-top-color: var(--fill-color);\n","    }\n","    60% {\n","      border-color: transparent;\n","      border-right-color: var(--fill-color);\n","    }\n","    80% {\n","      border-color: transparent;\n","      border-right-color: 
var(--fill-color);\n","      border-bottom-color: var(--fill-color);\n","    }\n","    90% {\n","      border-color: transparent;\n","      border-bottom-color: var(--fill-color);\n","    }\n","  }\n","</style>\n","\n","  <script>\n","    async function quickchart(key) {\n","      const quickchartButtonEl =\n","        document.querySelector('#' + key + ' button');\n","      quickchartButtonEl.disabled = true;  // To prevent multiple clicks.\n","      quickchartButtonEl.classList.add('colab-df-spinner');\n","      try {\n","        const charts = await google.colab.kernel.invokeFunction(\n","            'suggestCharts', [key], {});\n","      } catch (error) {\n","        console.error('Error during call to suggestCharts:', error);\n","      }\n","      quickchartButtonEl.classList.remove('colab-df-spinner');\n","      quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n","    }\n","    (() => {\n","      let quickchartButtonEl =\n","        document.querySelector('#df-b5f38fab-c1e9-4a34-a6b0-bd26d2669705 button');\n","      quickchartButtonEl.style.display =\n","        google.colab.kernel.accessAllowed ? 
'block' : 'none';\n","    })();\n","  </script>\n","</div>\n","\n","    </div>\n","  </div>\n"],"application/vnd.google.colaboratory.intrinsic+json":{"type":"dataframe","variable_name":"missing_values_summary","summary":"{\n  \"name\": \"missing_values_summary\",\n  \"rows\": 157,\n  \"fields\": [\n    {\n      \"column\": \"Missing Values\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 18864,\n        \"min\": 79,\n        \"max\": 50829,\n        \"num_unique_values\": 70,\n        \"samples\": [\n          36178,\n          50829,\n          1020\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}"}},"metadata":{},"execution_count":20}]},{"cell_type":"code","source":["# Reference [5]: https://www.sciencedirect.com/science/article/pii/S0895435618308710\n","# Reference [6]: https://stackoverflow.com/questions/65775141/remove-rows-with-more-than-percentage-of-missing-data-for-majority-class-samples\n","threshold = 0.5 * len(data)\n","\n","# Reference [7]: https://www.statology.org/pandas-exclude-column/\n","# Reference [8]: https://www.datacamp.com/tutorial/pandas-drop-column?utm_source=google&utm_medium=paid_search&utm_campaignid=19589720821&utm_adgroupid=157156375191&utm_device=c&utm_keyword=&utm_matchtype=&utm_network=g&utm_adpostion=&utm_creative=691747307431&utm_targetid=dsa-2218886984100&utm_loc_interest_ms=&utm_loc_physical_ms=9062543&utm_content=&utm_campaign=230119_1-sea~dsa~tofu_2-b2c_3-row-p1_4-prc_5-na_6-na_7-le_8-pdsh-go_9-na_10-na_11-na-feb24&gad_source=1&gclid=CjwKCAiAxaCvBhBaEiwAvsLmWGh-6fy-rTQ96ZwE3t9Fisrs4seiXS8GQvJkeQYp5J_Dj4IoGtEA8BoCTucQAvD_BwE\n","columns_to_exclude = missing_values_summary[missing_values_summary['Missing Values'] > threshold].index\n","data_cleaned = data.drop(columns=columns_to_exclude)\n","\n","original_shape = data.shape\n","cleaned_shape = data_cleaned.shape\n","original_shape, cleaned_shape, 
columns_to_exclude.tolist()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"5GM28kJYy1Hq","executionInfo":{"status":"ok","timestamp":1709825831008,"user_tz":-480,"elapsed":520,"user":{"displayName":"Kunshi Lin","userId":"01385305786663149743"}},"outputId":"8d3ece2c-b957-4ee9-fe85-46e9a3ab91ff"},"execution_count":21,"outputs":[{"output_type":"execute_result","data":{"text/plain":["((50920, 162),\n"," (50920, 87),\n"," ['thrombin_max',\n","  'thrombin_min',\n","  'd_dimer_min',\n","  'd_dimer_max',\n","  'ggt_max',\n","  'ggt_min',\n","  'globulin_min',\n","  'globulin_max',\n","  'bicarbonate_min',\n","  'bicarbonate_max',\n","  'methemoglobin_min',\n","  'methemoglobin_max',\n","  'total_protein_max',\n","  'total_protein_min',\n","  'carboxyhemoglobin_min',\n","  'carboxyhemoglobin_max',\n","  'bilirubin_indirect_min',\n","  'bilirubin_indirect_max',\n","  'nrbc_min',\n","  'nrbc_max',\n","  'bilirubin_direct_min',\n","  'bilirubin_direct_max',\n","  'amylase_min',\n","  'amylase_max',\n","  'aado2_max',\n","  'aado2_min',\n","  'atyps_min',\n","  'atyps_max',\n","  'metas_max',\n","  'metas_min',\n","  'bands_min',\n","  'bands_max',\n","  'temperature_min.1',\n","  'temperature_max.1',\n","  'imm_granulocytes_min',\n","  'imm_granulocytes_max',\n","  'chloride_max',\n","  'chloride_min',\n","  'hemoglobin_max',\n","  'hemoglobin_min',\n","  'hematocrit_max',\n","  'hematocrit_min',\n","  'ck_mb_min',\n","  'ck_mb_max',\n","  'ld_ldh_min',\n","  'ld_ldh_max',\n","  'sodium_min',\n","  'sodium_max',\n","  'fibrinogen_min',\n","  'fibrinogen_max',\n","  'so2_max',\n","  'so2_min',\n","  'ck_cpk_min',\n","  'ck_cpk_max',\n","  'glucose_min.1',\n","  'glucose_max.1',\n","  'potassium_min',\n","  'potassium_max',\n","  'albumin_max',\n","  'albumin_min',\n","  'calcium_max',\n","  'calcium_min',\n","  'pao2fio2ratio_min',\n","  'pao2fio2ratio_max',\n","  'aado2_calc_max',\n","  'aado2_calc_min',\n","  'bilirubin_total_max',\n","  
'bilirubin_total_min',\n","  'alp_max',\n","  'alp_min',\n","  'height',\n","  'alt_min',\n","  'alt_max',\n","  'ast_min',\n","  'ast_max'])"]},"metadata":{},"execution_count":21}]},{"cell_type":"code","source":["# Refernce [9]: https://insightsoftware.com/blog/how-to-handle-missing-data-values-while-data-cleaning/\n","missing_values_cleaned = data_cleaned.isnull().sum()\n","missing_values_cleaned_summary = pd.DataFrame(missing_values_cleaned[missing_values_cleaned > 0], columns=['Missing Values'])\n","missing_values_cleaned_summary.sort_values(by='Missing Values', ascending=False, inplace=True)\n","\n","missing_values_cleaned_summary.head(20)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":669},"id":"ZH1-ffBFy558","executionInfo":{"status":"ok","timestamp":1709825836714,"user_tz":-480,"elapsed":692,"user":{"displayName":"Kunshi Lin","userId":"01385305786663149743"}},"outputId":"fe1d46f7-dacf-4c4d-a107-de986dee8e03"},"execution_count":22,"outputs":[{"output_type":"execute_result","data":{"text/plain":["                     Missing Values\n","lactate_max                   23759\n","lactate_min                   23759\n","abs_monocytes_min             20095\n","abs_basophils_min             20095\n","abs_basophils_max             20095\n","abs_monocytes_max             20095\n","abs_eosinophils_min           20094\n","abs_eosinophils_max           20094\n","abs_neutrophils_max           20094\n","abs_neutrophils_min           20094\n","abs_lymphocytes_min           20082\n","abs_lymphocytes_max           20082\n","ph_min                        19748\n","ph_max                        19748\n","totalco2_max                  19744\n","totalco2_min                  19744\n","baseexcess_max                19744\n","baseexcess_min                19744\n","pco2_max                      19744\n","pco2_min                      19744"],"text/html":["\n","  <div id=\"df-4d2ad525-73a0-49e4-aadc-79a7e8bb75b5\" class=\"colab-df-container\">\n","    
<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>Missing Values</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>lactate_max</th>\n","      <td>23759</td>\n","    </tr>\n","    <tr>\n","      <th>lactate_min</th>\n","      <td>23759</td>\n","    </tr>\n","    <tr>\n","      <th>abs_monocytes_min</th>\n","      <td>20095</td>\n","    </tr>\n","    <tr>\n","      <th>abs_basophils_min</th>\n","      <td>20095</td>\n","    </tr>\n","    <tr>\n","      <th>abs_basophils_max</th>\n","      <td>20095</td>\n","    </tr>\n","    <tr>\n","      <th>abs_monocytes_max</th>\n","      <td>20095</td>\n","    </tr>\n","    <tr>\n","      <th>abs_eosinophils_min</th>\n","      <td>20094</td>\n","    </tr>\n","    <tr>\n","      <th>abs_eosinophils_max</th>\n","      <td>20094</td>\n","    </tr>\n","    <tr>\n","      <th>abs_neutrophils_max</th>\n","      <td>20094</td>\n","    </tr>\n","    <tr>\n","      <th>abs_neutrophils_min</th>\n","      <td>20094</td>\n","    </tr>\n","    <tr>\n","      <th>abs_lymphocytes_min</th>\n","      <td>20082</td>\n","    </tr>\n","    <tr>\n","      <th>abs_lymphocytes_max</th>\n","      <td>20082</td>\n","    </tr>\n","    <tr>\n","      <th>ph_min</th>\n","      <td>19748</td>\n","    </tr>\n","    <tr>\n","      <th>ph_max</th>\n","      <td>19748</td>\n","    </tr>\n","    <tr>\n","      <th>totalco2_max</th>\n","      <td>19744</td>\n","    </tr>\n","    <tr>\n","      <th>totalco2_min</th>\n","      <td>19744</td>\n","    </tr>\n","    <tr>\n","      <th>baseexcess_max</th>\n","      <td>19744</td>\n","    </tr>\n","    <tr>\n","      
<th>baseexcess_min</th>\n","      <td>19744</td>\n","    </tr>\n","    <tr>\n","      <th>pco2_max</th>\n","      <td>19744</td>\n","    </tr>\n","    <tr>\n","      <th>pco2_min</th>\n","      <td>19744</td>\n","    </tr>\n","  </tbody>\n","</table>\n","</div>\n","    <div class=\"colab-df-buttons\">\n","\n","  <div class=\"colab-df-container\">\n","    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-4d2ad525-73a0-49e4-aadc-79a7e8bb75b5')\"\n","            title=\"Convert this dataframe to an interactive table.\"\n","            style=\"display:none;\">\n","\n","  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n","    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n","  </svg>\n","    </button>\n","\n","  <style>\n","    .colab-df-container {\n","      display:flex;\n","      gap: 12px;\n","    }\n","\n","    .colab-df-convert {\n","      background-color: #E8F0FE;\n","      border: none;\n","      border-radius: 50%;\n","      cursor: pointer;\n","      display: none;\n","      fill: #1967D2;\n","      height: 32px;\n","      padding: 0 0 0 0;\n","      width: 32px;\n","    }\n","\n","    .colab-df-convert:hover {\n","      background-color: #E2EBFA;\n","      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n","      fill: #174EA6;\n","    }\n","\n","    .colab-df-buttons div {\n","      margin-bottom: 4px;\n","    }\n","\n","    [theme=dark] .colab-df-convert {\n","      background-color: #3B4455;\n","      fill: #D2E3FC;\n","    }\n","\n","    [theme=dark] .colab-df-convert:hover {\n","      background-color: #434B5C;\n","      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n","      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n","      fill: #FFFFFF;\n","    }\n","  </style>\n","\n","  
  <script>\n","      const buttonEl =\n","        document.querySelector('#df-4d2ad525-73a0-49e4-aadc-79a7e8bb75b5 button.colab-df-convert');\n","      buttonEl.style.display =\n","        google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n","      async function convertToInteractive(key) {\n","        const element = document.querySelector('#df-4d2ad525-73a0-49e4-aadc-79a7e8bb75b5');\n","        const dataTable =\n","          await google.colab.kernel.invokeFunction('convertToInteractive',\n","                                                    [key], {});\n","        if (!dataTable) return;\n","\n","        const docLinkHtml = 'Like what you see? Visit the ' +\n","          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n","          + ' to learn more about interactive tables.';\n","        element.innerHTML = '';\n","        dataTable['output_type'] = 'display_data';\n","        await google.colab.output.renderOutput(dataTable, element);\n","        const docLink = document.createElement('div');\n","        docLink.innerHTML = docLinkHtml;\n","        element.appendChild(docLink);\n","      }\n","    </script>\n","  </div>\n","\n","\n","<div id=\"df-43f9ba52-de94-422a-b5b6-72b71c5bbb8c\">\n","  <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-43f9ba52-de94-422a-b5b6-72b71c5bbb8c')\"\n","            title=\"Suggest charts\"\n","            style=\"display:none;\">\n","\n","<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n","     width=\"24px\">\n","    <g>\n","        <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n","    </g>\n","</svg>\n","  </button>\n","\n","<style>\n","  .colab-df-quickchart {\n","      --bg-color: #E8F0FE;\n","      --fill-color: #1967D2;\n","      --hover-bg-color: #E2EBFA;\n","      --hover-fill-color: #174EA6;\n","      
--disabled-fill-color: #AAA;\n","      --disabled-bg-color: #DDD;\n","  }\n","\n","  [theme=dark] .colab-df-quickchart {\n","      --bg-color: #3B4455;\n","      --fill-color: #D2E3FC;\n","      --hover-bg-color: #434B5C;\n","      --hover-fill-color: #FFFFFF;\n","      --disabled-bg-color: #3B4455;\n","      --disabled-fill-color: #666;\n","  }\n","\n","  .colab-df-quickchart {\n","    background-color: var(--bg-color);\n","    border: none;\n","    border-radius: 50%;\n","    cursor: pointer;\n","    display: none;\n","    fill: var(--fill-color);\n","    height: 32px;\n","    padding: 0;\n","    width: 32px;\n","  }\n","\n","  .colab-df-quickchart:hover {\n","    background-color: var(--hover-bg-color);\n","    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n","    fill: var(--button-hover-fill-color);\n","  }\n","\n","  .colab-df-quickchart-complete:disabled,\n","  .colab-df-quickchart-complete:disabled:hover {\n","    background-color: var(--disabled-bg-color);\n","    fill: var(--disabled-fill-color);\n","    box-shadow: none;\n","  }\n","\n","  .colab-df-spinner {\n","    border: 2px solid var(--fill-color);\n","    border-color: transparent;\n","    border-bottom-color: var(--fill-color);\n","    animation:\n","      spin 1s steps(1) infinite;\n","  }\n","\n","  @keyframes spin {\n","    0% {\n","      border-color: transparent;\n","      border-bottom-color: var(--fill-color);\n","      border-left-color: var(--fill-color);\n","    }\n","    20% {\n","      border-color: transparent;\n","      border-left-color: var(--fill-color);\n","      border-top-color: var(--fill-color);\n","    }\n","    30% {\n","      border-color: transparent;\n","      border-left-color: var(--fill-color);\n","      border-top-color: var(--fill-color);\n","      border-right-color: var(--fill-color);\n","    }\n","    40% {\n","      border-color: transparent;\n","      border-right-color: var(--fill-color);\n","      border-top-color: 
var(--fill-color);\n","    }\n","    60% {\n","      border-color: transparent;\n","      border-right-color: var(--fill-color);\n","    }\n","    80% {\n","      border-color: transparent;\n","      border-right-color: var(--fill-color);\n","      border-bottom-color: var(--fill-color);\n","    }\n","    90% {\n","      border-color: transparent;\n","      border-bottom-color: var(--fill-color);\n","    }\n","  }\n","</style>\n","\n","  <script>\n","    async function quickchart(key) {\n","      const quickchartButtonEl =\n","        document.querySelector('#' + key + ' button');\n","      quickchartButtonEl.disabled = true;  // To prevent multiple clicks.\n","      quickchartButtonEl.classList.add('colab-df-spinner');\n","      try {\n","        const charts = await google.colab.kernel.invokeFunction(\n","            'suggestCharts', [key], {});\n","      } catch (error) {\n","        console.error('Error during call to suggestCharts:', error);\n","      }\n","      quickchartButtonEl.classList.remove('colab-df-spinner');\n","      quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n","    }\n","    (() => {\n","      let quickchartButtonEl =\n","        document.querySelector('#df-43f9ba52-de94-422a-b5b6-72b71c5bbb8c button');\n","      quickchartButtonEl.style.display =\n","        google.colab.kernel.accessAllowed ? 
'block' : 'none';\n","    })();\n","  </script>\n","</div>\n","\n","    </div>\n","  </div>\n"],"application/vnd.google.colaboratory.intrinsic+json":{"type":"dataframe","variable_name":"missing_values_cleaned_summary","summary":"{\n  \"name\": \"missing_values_cleaned_summary\",\n  \"rows\": 82,\n  \"fields\": [\n    {\n      \"column\": \"Missing Values\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 8672,\n        \"min\": 79,\n        \"max\": 23759,\n        \"num_unique_values\": 34,\n        \"samples\": [\n          662,\n          602,\n          378\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}"}},"metadata":{},"execution_count":22}]},{"cell_type":"code","source":["# Reference [10]: https://scikit-learn.org/stable/modules/impute.html\n","# Reference [11]: https://www.geeksforgeeks.org/how-to-fill-nan-values-with-mean-in-pandas/\n","\n","# Median-impute each numeric column that still has missing values; non-numeric columns are left as they are.\n","for column in missing_values_cleaned_summary.index:\n","    if data_cleaned[column].dtype in ['float64', 'int64']:  # Only impute the column that is numerical!\n","        median_value = data_cleaned[column].median()\n","        # Assign the filled Series back to the frame. A chained `fillna(..., inplace=True)` acts on a temporary\n","        # Series under pandas Copy-on-Write, so the frame itself would stay unfilled.\n","        data_cleaned[column] = data_cleaned[column].fillna(median_value)\n","\n","# https://www.linkedin.com/advice/0/what-some-best-practices-dealing-missing-values-imputation\n","# Re-count missing values per column; an empty Series means nothing is left to impute.\n","recheck_missing_values = data_cleaned.isnull().sum()\n","missing_values_after_imputation = recheck_missing_values[recheck_missing_values > 0]\n","\n","missing_values_after_imputation"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"LT38Zi6kzMWb","executionInfo":{"status":"ok","timestamp":1709825840077,"user_tz":-480,"elapsed":3,"user":{"displayName":"Kunshi Lin","userId":"01385305786663149743"}},"outputId":"00bccd9c-b356-4578-c447-728caf5a5d3e"},"execution_count":23,"outputs":[{"output_type":"execute_result","data":{"text/plain":["Series([], dtype: int64)"]},"metadata":{},"execution_count":23}]},{"cell_type":"markdown","source":["# 
**Feature Selection and Model Training**"],"metadata":{"id":"Sp9k3u_o4tPT"}},{"cell_type":"code","source":["# Reference [12]: https://www.analyticsvidhya.com/blog/2020/06/feature-selection-techniques-machine-learning/\n","# Reference [13]: https://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_selection\n","features = ['gender', 'admission_age', 'race', 'heart_rate_mean', 'sbp_mean', 'dbp_mean', 'lactate_min', 'lactate_max']\n","target = 'aki'\n","\n","X = data_cleaned[features]\n","y = data_cleaned[target]\n","\n","# Reference [14]: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html\n","categorical_features = ['gender', 'race']\n","numerical_features = ['admission_age', 'heart_rate_mean', 'sbp_mean', 'dbp_mean', 'lactate_min', 'lactate_max']\n","\n","# Standardise the numeric columns and one-hot encode the categorical ones;\n","# categories not seen during fit are encoded as all zeros (handle_unknown='ignore').\n","preprocessor = ColumnTransformer(\n","    transformers=[\n","        ('num', StandardScaler(), numerical_features),\n","        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])\n","\n","# Reference [15]: https://towardsdatascience.com/stratified-sampling-you-may-have-been-splitting-your-dataset-all-wrong-8cfdd0d32502\n","# Split the raw rows first (stratified on the target) so the held-out rows never influence\n","# the scaler statistics or the one-hot categories.\n","X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)\n","\n","# Fit the preprocessor on the training rows only, then reuse the fitted transform for the test rows.\n","X_train = preprocessor.fit_transform(X_train_raw)\n","X_test = preprocessor.transform(X_test_raw)\n","\n","# Full transformed matrix, kept under its original name in case a later cell still uses it.\n","X_prepared = preprocessor.transform(X)\n","\n","X_train.shape, X_test.shape"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"M33yYmKQzSqT","executionInfo":{"status":"ok","timestamp":1709825843858,"user_tz":-480,"elapsed":593,"user":{"displayName":"Kunshi Lin","userId":"01385305786663149743"}},"outputId":"856f5704-ce79-4e15-d744-3e14086b0eeb"},"execution_count":24,"outputs":[{"output_type":"execute_result","data":{"text/plain":["((40736, 41), (10184, 41))"]},"metadata":{},"execution_count":24}]},{"cell_type":"markdown","source":["# **Model Prediction and Evaluation**"],"metadata":{"id":"4ZV1qCg9zkb-"}},{"cell_type":"markdown","source":["## **Logistic 
Regression**\n","\n"],"metadata":{"id":"FDdmNMyrkK4E"}},{"cell_type":"code","source":["# Reference [16]: https://scikit-learn.org/stable/modules/grid_search.html\n","# Reference [17]: https://drbeane.github.io/python_dsci/pages/grid_search.html\n","# Candidate values for C (inverse regularization strength) tried by the 5-fold grid search.\n","C_values = [0.001, 0.01, 0.1, 1, 10, 100]\n","\n","param_grid = {'C': C_values}\n","# liblinear shuffles the data internally, so seed it like the other models to keep results reproducible.\n","lr_model = LogisticRegression(penalty='l1', solver='liblinear', random_state=42)\n","grid_search = GridSearchCV(lr_model, param_grid, cv=5)\n","\n","grid_search.fit(X_train, y_train)\n","\n","# best_estimator_ is the model refit on the whole training set with the best C.\n","best_model = grid_search.best_estimator_\n","\n","y_pred = best_model.predict(X_test)\n","\n","accuracy = accuracy_score(y_test, y_pred)\n","precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')\n","\n","accuracy, precision, recall, f1"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"2-D6mNOyzama","executionInfo":{"status":"ok","timestamp":1709825986657,"user_tz":-480,"elapsed":138445,"user":{"displayName":"Kunshi Lin","userId":"01385305786663149743"}},"outputId":"3f5257cc-766f-4eec-b42f-c6ed2c73bba7"},"execution_count":25,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(0.40298507462686567,\n"," 0.3320444093843265,\n"," 0.40298507462686567,\n"," 0.3342230247225089)"]},"metadata":{},"execution_count":25}]},{"cell_type":"markdown","source":["## **Decision Tree**"],"metadata":{"id":"zkgcrMqXlQiZ"}},{"cell_type":"code","source":["# Reference [18]: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html\n","decision_tree = DecisionTreeClassifier(random_state=42)\n","\n","decision_tree.fit(X_train, y_train)\n","\n","y_pred_dt = decision_tree.predict(X_test)\n","\n","accuracy_dt = accuracy_score(y_test, y_pred_dt)\n","precision_dt, recall_dt, f1_dt, _ = precision_recall_fscore_support(y_test, y_pred_dt, average='weighted')\n","\n","accuracy_dt, precision_dt, recall_dt, 
f1_dt"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"sg-aD14elS_6","executionInfo":{"status":"ok","timestamp":1709826029626,"user_tz":-480,"elapsed":3960,"user":{"displayName":"Kunshi Lin","userId":"01385305786663149743"}},"outputId":"62a1b1b5-633f-43dc-82ba-5de8379d51ee"},"execution_count":26,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(0.31922623723487825,\n"," 0.31932298016417965,\n"," 0.31922623723487825,\n"," 0.3192673787031557)"]},"metadata":{},"execution_count":26}]},{"cell_type":"markdown","source":["## **Random Forest**"],"metadata":{"id":"Uev9GBLtWzVA"}},{"cell_type":"code","source":["# Reference [19]: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html\n","# n_jobs=-1 builds the trees on all available cores; random_state still fixes the resulting forest.\n","random_forest = RandomForestClassifier(random_state=42, n_jobs=-1)\n","\n","random_forest.fit(X_train, y_train)\n","\n","y_pred_rf = random_forest.predict(X_test)\n","\n","accuracy_rf = accuracy_score(y_test, y_pred_rf)\n","precision_rf, recall_rf, f1_rf, _ = precision_recall_fscore_support(y_test, y_pred_rf, average='weighted')\n","\n","accuracy_rf, precision_rf, recall_rf, f1_rf"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"y4wB0NCnW64T","executionInfo":{"status":"ok","timestamp":1709826158783,"user_tz":-480,"elapsed":109714,"user":{"displayName":"Kunshi Lin","userId":"01385305786663149743"}},"outputId":"29a5f0e9-ae1c-45f8-af5a-63b1b194ea70"},"execution_count":28,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(0.3881578947368421, 0.358609063596094, 0.3881578947368421, 0.3568783712222866)"]},"metadata":{},"execution_count":28}]},{"cell_type":"markdown","source":["## **SVM**"],"metadata":{"id":"s5aosjeOke50"}},{"cell_type":"code","source":["# Reference [20]: https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html\n","# Reference [21]: https://www.datatechnotes.com/2020/07/classification-example-with-linearsvm-in-python.html\n","# Reference [22]: 
https://medium.com/@mrconnor/understanding-support-vector-machines-through-code-a-detailed-guide-692d0061d78b\n","# dual=False solves the primal problem, which scikit-learn recommends when there are more samples\n","# than features (the case here); the default dual solver needed a very large max_iter and a long fit.\n","svm_model = LinearSVC(random_state=42, max_iter=100000, dual=False)\n","\n","svm_model.fit(X_train, y_train)\n","\n","y_pred_svm = svm_model.predict(X_test)\n","\n","\n","accuracy_svm = accuracy_score(y_test, y_pred_svm)\n","precision_svm, recall_svm, f1_svm, _ = precision_recall_fscore_support(y_test, y_pred_svm, average='weighted')\n","\n","accuracy_svm, precision_svm, recall_svm, f1_svm"],"metadata":{"id":"hJujwmMHkjCT","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1709827788707,"user_tz":-480,"elapsed":1363556,"user":{"displayName":"Kunshi Lin","userId":"01385305786663149743"}},"outputId":"5019fc46-3ee0-4b35-e60b-04dd7ee534a0"},"execution_count":30,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(0.40327965435978, 0.33123320296781494, 0.40327965435978, 0.3331245901794984)"]},"metadata":{},"execution_count":30}]}]}