[b5ec00]: / Data_processing / imputation.ipynb

Download this file

1 lines (1 with data), 41.5 kB

{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"imputation.ipynb","provenance":[],"collapsed_sections":[]},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"code","source":["! pip install icd10-cm"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"Q1Hoyh6oaYqd","executionInfo":{"status":"ok","timestamp":1651483216220,"user_tz":240,"elapsed":5316,"user":{"displayName":"Benedikt Geiger","userId":"17925887631246406508"}},"outputId":"7b78fbe2-07f6-4121-b883-5cc4026c724b"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Collecting icd10-cm\n","  Downloading icd10_cm-0.0.4-py2.py3-none-any.whl (675 kB)\n","\u001b[?25l\r\u001b[K     |▌                               | 10 kB 26.9 MB/s eta 0:00:01\r\u001b[K     |█                               | 20 kB 12.0 MB/s eta 0:00:01\r\u001b[K     |█▌                              | 30 kB 9.7 MB/s eta 0:00:01\r\u001b[K     |██                              | 40 kB 8.5 MB/s eta 0:00:01\r\u001b[K     |██▍                             | 51 kB 4.4 MB/s eta 0:00:01\r\u001b[K     |███                             | 61 kB 5.2 MB/s eta 0:00:01\r\u001b[K     |███▍                            | 71 kB 5.6 MB/s eta 0:00:01\r\u001b[K     |███▉                            | 81 kB 5.7 MB/s eta 0:00:01\r\u001b[K     |████▍                           | 92 kB 6.4 MB/s eta 0:00:01\r\u001b[K     |████▉                           | 102 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |█████▍                          | 112 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |█████▉                          | 122 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |██████▎                         | 133 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |██████▉                         | 143 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |███████▎                        | 153 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |███████▊                        | 163 kB 5.1 MB/s eta 0:00:01\r\u001b[K     
|████████▎                       | 174 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |████████▊                       | 184 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |█████████▏                      | 194 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |█████████▊                      | 204 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |██████████▏                     | 215 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |██████████▊                     | 225 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |███████████▏                    | 235 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |███████████▋                    | 245 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |████████████▏                   | 256 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |████████████▋                   | 266 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |█████████████                   | 276 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |█████████████▋                  | 286 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |██████████████                  | 296 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |██████████████▋                 | 307 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |███████████████                 | 317 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |███████████████▌                | 327 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |████████████████                | 337 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |████████████████▌               | 348 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |█████████████████               | 358 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |█████████████████▌              | 368 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |██████████████████              | 378 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |██████████████████▍             | 389 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |███████████████████             | 399 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |███████████████████▍            | 409 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |████████████████████            | 419 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |████████████████████▍           | 430 kB 5.1 MB/s eta 0:00:01\r\u001b[K   
  |████████████████████▉           | 440 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |█████████████████████▍          | 450 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |█████████████████████▉          | 460 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |██████████████████████▎         | 471 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |██████████████████████▉         | 481 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |███████████████████████▎        | 491 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |███████████████████████▊        | 501 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |████████████████████████▎       | 512 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |████████████████████████▊       | 522 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |█████████████████████████▎      | 532 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |█████████████████████████▊      | 542 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |██████████████████████████▏     | 552 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |██████████████████████████▊     | 563 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |███████████████████████████▏    | 573 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |███████████████████████████▋    | 583 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |████████████████████████████▏   | 593 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |████████████████████████████▋   | 604 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |█████████████████████████████▏  | 614 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |█████████████████████████████▋  | 624 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |██████████████████████████████  | 634 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |██████████████████████████████▋ | 645 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |███████████████████████████████ | 655 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |███████████████████████████████▌| 665 kB 5.1 MB/s eta 0:00:01\r\u001b[K     |████████████████████████████████| 675 kB 5.1 MB/s \n","\u001b[?25hInstalling collected packages: icd10-cm\n","Successfully installed icd10-cm-0.0.4\n"]}]},{"cell_type":"code","source":["import pandas as pd\n","import 
os\n","import numpy as np\n","import icd10\n","import pickle\n","import matplotlib.pyplot as plt"],"metadata":{"id":"uRWSjYFJFkOI"},"execution_count":null,"outputs":[]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"gfPk_m_WEFFr","executionInfo":{"status":"ok","timestamp":1651487996613,"user_tz":240,"elapsed":1864,"user":{"displayName":"Benedikt Geiger","userId":"17925887631246406508"}},"outputId":"0f23885f-a5c2-49b9-f150-aa1e9dcc27bd"},"outputs":[{"output_type":"stream","name":"stdout","text":["Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount(\"/content/gdrive\", force_remount=True).\n"]}],"source":["from google.colab import drive\n","drive.mount('/content/gdrive')"]},{"cell_type":"code","source":["# Demo of icd10\n","code = icd10.find(\"R11\")\n","print(code.description)         # Nausea and vomiting\n","if code.billable:\n","    print(code, \"is billable\")  # not printed: R11 is not flagged billable\n","\n","print(code.chapter)             # XVIII\n","print(code.block)               # R00-R99\n","print(code.block_description)   # Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"mTV5gVdexknC","executionInfo":{"status":"ok","timestamp":1651487996615,"user_tz":240,"elapsed":13,"user":{"displayName":"Benedikt Geiger","userId":"17925887631246406508"}},"outputId":"69a70e18-dec9-450b-f6b6-b0b5f5d7b29a"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Nausea and vomiting\n","XVIII\n","R00-R99\n","Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified\n"]}]},{"cell_type":"code","source":["deep_learning_dir = '/content/gdrive/My Drive/BMI 707 Project' "],"metadata":{"id":"Fh4HoiPnFtOr"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["df_training = pd.read_pickle(deep_learning_dir + '/data_formatting/training_data.pickle')\n","df_val 
= pd.read_pickle(deep_learning_dir + '/data_formatting/validation_data.pickle')\n","df_testing = pd.read_pickle(deep_learning_dir + '/data_formatting/testing_data.pickle')\n","\n","df_total = df_training.append(df_val)\n","df_total = df_total.append(df_testing)\n","\n","print(str(df_training.shape), str(df_val.shape), str(df_testing.shape), str(df_total.shape))"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"nnkr38avie6w","executionInfo":{"status":"ok","timestamp":1651487996831,"user_tz":240,"elapsed":223,"user":{"displayName":"Benedikt Geiger","userId":"17925887631246406508"}},"outputId":"5718fe15-8213-4ee7-f3e0-883f483598a8"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["(3094, 7) (344, 7) (1146, 7) (4584, 7)\n"]}]},{"cell_type":"markdown","source":["# Impute participant data"],"metadata":{"id":"_7TE11jp_M_o"}},{"cell_type":"code","source":["df = df_total.explode('icdcodes')\n","# Keep first ICD code\n","df['icdcodes'] = df['icdcodes'].apply(lambda x: x.split(\".\")[0])\n","df = df.drop_duplicates('nctid', keep='first')"],"metadata":{"id":"Lg82ya5tAsbu"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["def get_chapter(x): \n","  code = icd10.find(x)\n","  desc = 'Other'\n","\n","  try: \n","    desc =  code.chapter\n","  except Exception: \n","    pass \n","  return desc"],"metadata":{"id":"ms7oAGFNgWhV"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["df['chapter'] = df['icdcodes'].apply(get_chapter)"],"metadata":{"id":"3t5UJi6Uaqdu"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["df.chapter.unique()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"9nHEwJWSqJG3","executionInfo":{"status":"ok","timestamp":1651487997039,"user_tz":240,"elapsed":24,"user":{"displayName":"Benedikt 
Geiger","userId":"17925887631246406508"}},"outputId":"db5cb1cb-ab98-4032-ef4e-418d5f6dd320"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array(['XVIII', 'V', 'VI', 'IV', 'XI', None, 'III', 'VII', 'XIII', 'II',\n","       'X', 'I', 'IX', 'XV', 'XXI', 'XII', 'XIV', 'XX', 'XIX', 'XVI',\n","       'XVII', 'Other', 'VIII'], dtype=object)"]},"metadata":{},"execution_count":299}]},{"cell_type":"code","source":["# Mapped as Other\n","df[df['chapter'] == 'Other']['icdcodes'].value_counts()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"nY7GaFct2DoE","executionInfo":{"status":"ok","timestamp":1651487997041,"user_tz":240,"elapsed":21,"user":{"displayName":"Benedikt Geiger","userId":"17925887631246406508"}},"outputId":"42bf6514-6896-4850-b1ca-e7813e1fa7c2"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["B00    44\n","O9A     6\n","J00     2\n","C7A     1\n","O00     1\n","Name: icdcodes, dtype: int64"]},"metadata":{},"execution_count":300}]},{"cell_type":"code","source":["# Mapped as na: all are neoplasm codes (C..) except K94, which is re-mapped to chapter XI below\n","df[df['chapter'].isna()]['icdcodes'].value_counts()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"O3F8LCP_3PSi","executionInfo":{"status":"ok","timestamp":1651487997042,"user_tz":240,"elapsed":18,"user":{"displayName":"Benedikt Geiger","userId":"17925887631246406508"}},"outputId":"b39f2d13-5d41-4523-c64b-d59523b2f8e7"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["C79    158\n","C78    102\n","C61     54\n","C90     40\n","C95     31\n","C91     20\n","C76     16\n","C96     16\n","C92     15\n","C57     11\n","K94     10\n","C49     10\n","C71      6\n","C73      5\n","C83      4\n","C67      3\n","C88      2\n","C84      1\n","C81      1\n","Name: icdcodes, dtype: int64"]},"metadata":{},"execution_count":301}]},{"cell_type":"code","source":["# hand curation\n","df.loc[df['icdcodes'] == 
'B00', 'chapter'] = 'I'\n","df.loc[df['chapter'].isna(), 'chapter'] = 'II'\n","df.loc[df['icdcodes'] == 'C7A', 'chapter'] = 'II'\n","df.loc[df['icdcodes'] == 'J00', 'chapter'] = 'X'\n","df.loc[df['icdcodes'] == 'K94', 'chapter'] = 'XI'\n","df.loc[df['icdcodes'] == 'O00', 'chapter'] = 'XV'\n","df.loc[df['icdcodes'] == 'O9A', 'chapter'] = 'XV'"],"metadata":{"id":"agqv9fFn5eGM"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# fraction of trials with missing n_participants info\n","sum(df['n_participants'].isna()) / len(df)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"oWKL8XeTH6c4","executionInfo":{"status":"ok","timestamp":1651487997044,"user_tz":240,"elapsed":15,"user":{"displayName":"Benedikt Geiger","userId":"17925887631246406508"}},"outputId":"40ad15bf-0f94-49f7-de71-47da61ad9dad"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["0.3706369982547993"]},"metadata":{},"execution_count":303}]},{"cell_type":"code","source":["#unique_chapter = df.chapter.unique()\n","#\n","#for x in unique_chapter: \n","#  df[df.chapter == x]['n_participants'].hist(bins=30)\n","#  plt.title('Chapter %s' %x)\n","#  plt.show()"],"metadata":{"id":"9kmL7tMWp7Dk"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# Impute missing participant counts with the per-chapter median (chosen because of skewness)\n","df['n_participants'] = df['n_participants'].fillna(df.groupby('chapter')['n_participants'].transform('median'))"],"metadata":{"id":"85nQ1i9fp61l"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["df.head()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":206},"id":"i52GG15PIS4K","executionInfo":{"status":"ok","timestamp":1651487998166,"user_tz":240,"elapsed":12,"user":{"displayName":"Benedikt 
Geiger","userId":"17925887631246406508"}},"outputId":"6ebfb561-439a-4b08-c0db-63d3cddbd879"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["         nctid  n_participants  \\\n","0  NCT00475085           944.0   \n","1  NCT01626859           152.0   \n","2  NCT00203957          2605.0   \n","3  NCT00169832          3204.0   \n","4  NCT01249352          1958.0   \n","\n","                                               drugs  \\\n","0  [aprepitant, dexamethasone, granisetron hydroc...   \n","1  [mp-214 low dose, mp-214 middle dose, mp-214 h...   \n","2                   [istradefylline, istradefylline]   \n","3                         [rosiglitazone or placebo]   \n","4             [nimotuzumab, cisplatin, fluorouracil]   \n","\n","                                      diseases icdcodes  \\\n","0                                     [nausea]      R11   \n","1                              [schizophrenia]      F20   \n","2                         [parkinsons disease]      G20   \n","3  [diabetes, coronary artery bypass grafting]      E23   \n","4          [esophageal cancer, adenocarcinoma]      K22   \n","\n","                                            criteria  label chapter  \n","0  \\n        Inclusion criteria:\\n\\n          -  ...      1   XVIII  \n","1  \\n        Inclusion Criteria:\\n\\n          -  ...      1       V  \n","2  \\n        Inclusion Criteria:\\n\\n          -  ...      1      VI  \n","3  \\n        Inclusion Criteria:\\n\\n        AT SC...      0      IV  \n","4  \\n        Inclusion Criteria:\\n\\n          1. ...      
1      XI  "],"text/html":["\n","  <div id=\"df-42645b82-0bfa-40fa-b385-69b89788f026\">\n","    <div class=\"colab-df-container\">\n","      <div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>nctid</th>\n","      <th>n_participants</th>\n","      <th>drugs</th>\n","      <th>diseases</th>\n","      <th>icdcodes</th>\n","      <th>criteria</th>\n","      <th>label</th>\n","      <th>chapter</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>NCT00475085</td>\n","      <td>944.0</td>\n","      <td>[aprepitant, dexamethasone, granisetron hydroc...</td>\n","      <td>[nausea]</td>\n","      <td>R11</td>\n","      <td>\\n        Inclusion criteria:\\n\\n          -  ...</td>\n","      <td>1</td>\n","      <td>XVIII</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>NCT01626859</td>\n","      <td>152.0</td>\n","      <td>[mp-214 low dose, mp-214 middle dose, mp-214 h...</td>\n","      <td>[schizophrenia]</td>\n","      <td>F20</td>\n","      <td>\\n        Inclusion Criteria:\\n\\n          -  ...</td>\n","      <td>1</td>\n","      <td>V</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>NCT00203957</td>\n","      <td>2605.0</td>\n","      <td>[istradefylline, istradefylline]</td>\n","      <td>[parkinsons disease]</td>\n","      <td>G20</td>\n","      <td>\\n        Inclusion Criteria:\\n\\n          -  ...</td>\n","      <td>1</td>\n","      <td>VI</td>\n","    </tr>\n","    <tr>\n","      <th>3</th>\n","      <td>NCT00169832</td>\n","      <td>3204.0</td>\n","      <td>[rosiglitazone or placebo]</td>\n","      <td>[diabetes, 
coronary artery bypass grafting]</td>\n","      <td>E23</td>\n","      <td>\\n        Inclusion Criteria:\\n\\n        AT SC...</td>\n","      <td>0</td>\n","      <td>IV</td>\n","    </tr>\n","    <tr>\n","      <th>4</th>\n","      <td>NCT01249352</td>\n","      <td>1958.0</td>\n","      <td>[nimotuzumab, cisplatin, fluorouracil]</td>\n","      <td>[esophageal cancer, adenocarcinoma]</td>\n","      <td>K22</td>\n","      <td>\\n        Inclusion Criteria:\\n\\n          1. ...</td>\n","      <td>1</td>\n","      <td>XI</td>\n","    </tr>\n","  </tbody>\n","</table>\n","</div>\n","      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-42645b82-0bfa-40fa-b385-69b89788f026')\"\n","              title=\"Convert this dataframe to an interactive table.\"\n","              style=\"display:none;\">\n","        \n","  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n","       width=\"24px\">\n","    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n","    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n","  </svg>\n","      </button>\n","      \n","  <style>\n","    .colab-df-container {\n","      display:flex;\n","      flex-wrap:wrap;\n","      gap: 12px;\n","    }\n","\n","    .colab-df-convert {\n","      background-color: #E8F0FE;\n","      border: none;\n","      border-radius: 50%;\n","      cursor: pointer;\n","      display: none;\n","      fill: #1967D2;\n","      height: 32px;\n","      padding: 0 0 0 0;\n","      width: 32px;\n","    }\n","\n","    .colab-df-convert:hover {\n","     
 background-color: #E2EBFA;\n","      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n","      fill: #174EA6;\n","    }\n","\n","    [theme=dark] .colab-df-convert {\n","      background-color: #3B4455;\n","      fill: #D2E3FC;\n","    }\n","\n","    [theme=dark] .colab-df-convert:hover {\n","      background-color: #434B5C;\n","      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n","      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n","      fill: #FFFFFF;\n","    }\n","  </style>\n","\n","      <script>\n","        const buttonEl =\n","          document.querySelector('#df-42645b82-0bfa-40fa-b385-69b89788f026 button.colab-df-convert');\n","        buttonEl.style.display =\n","          google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n","        async function convertToInteractive(key) {\n","          const element = document.querySelector('#df-42645b82-0bfa-40fa-b385-69b89788f026');\n","          const dataTable =\n","            await google.colab.kernel.invokeFunction('convertToInteractive',\n","                                                     [key], {});\n","          if (!dataTable) return;\n","\n","          const docLinkHtml = 'Like what you see? 
Visit the ' +\n","            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n","            + ' to learn more about interactive tables.';\n","          element.innerHTML = '';\n","          dataTable['output_type'] = 'display_data';\n","          await google.colab.output.renderOutput(dataTable, element);\n","          const docLink = document.createElement('div');\n","          docLink.innerHTML = docLinkHtml;\n","          element.appendChild(docLink);\n","        }\n","      </script>\n","    </div>\n","  </div>\n","  "]},"metadata":{},"execution_count":306}]},{"cell_type":"code","source":["for chapter in df.chapter.unique():\n","  n_part_chapter = df[df[\"chapter\"] == chapter][\"n_participants\"]\n","  df.loc[df[\"chapter\"] == chapter, \"norm_n_participants\"] = (n_part_chapter - np.median(n_part_chapter)) /  np.median(n_part_chapter)"],"metadata":{"id":"7lE-x23ATcGy"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["df.head()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":206},"id":"GbjtthzuUQpi","executionInfo":{"status":"ok","timestamp":1651488000283,"user_tz":240,"elapsed":200,"user":{"displayName":"Benedikt Geiger","userId":"17925887631246406508"}},"outputId":"f96b8bf5-0fa0-48d3-8d14-8b4d5e65f2d2"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["         nctid  n_participants  \\\n","0  NCT00475085           944.0   \n","1  NCT01626859           152.0   \n","2  NCT00203957          2605.0   \n","3  NCT00169832          3204.0   \n","4  NCT01249352          1958.0   \n","\n","                                               drugs  \\\n","0  [aprepitant, dexamethasone, granisetron hydroc...   \n","1  [mp-214 low dose, mp-214 middle dose, mp-214 h...   
\n","2                   [istradefylline, istradefylline]   \n","3                         [rosiglitazone or placebo]   \n","4             [nimotuzumab, cisplatin, fluorouracil]   \n","\n","                                      diseases icdcodes  \\\n","0                                     [nausea]      R11   \n","1                              [schizophrenia]      F20   \n","2                         [parkinsons disease]      G20   \n","3  [diabetes, coronary artery bypass grafting]      E23   \n","4          [esophageal cancer, adenocarcinoma]      K22   \n","\n","                                            criteria  label chapter  \\\n","0  \\n        Inclusion criteria:\\n\\n          -  ...      1   XVIII   \n","1  \\n        Inclusion Criteria:\\n\\n          -  ...      1       V   \n","2  \\n        Inclusion Criteria:\\n\\n          -  ...      1      VI   \n","3  \\n        Inclusion Criteria:\\n\\n        AT SC...      0      IV   \n","4  \\n        Inclusion Criteria:\\n\\n          1. ...      
1      XI   \n","\n","   norm_n_participants  \n","0            -0.477298  \n","1            -0.930211  \n","2             0.000000  \n","3             0.000000  \n","4             0.000000  "],"text/html":["\n","  <div id=\"df-b21216ba-20c5-47f5-9c80-cae1ac9fbf73\">\n","    <div class=\"colab-df-container\">\n","      <div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>nctid</th>\n","      <th>n_participants</th>\n","      <th>drugs</th>\n","      <th>diseases</th>\n","      <th>icdcodes</th>\n","      <th>criteria</th>\n","      <th>label</th>\n","      <th>chapter</th>\n","      <th>norm_n_participants</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>NCT00475085</td>\n","      <td>944.0</td>\n","      <td>[aprepitant, dexamethasone, granisetron hydroc...</td>\n","      <td>[nausea]</td>\n","      <td>R11</td>\n","      <td>\\n        Inclusion criteria:\\n\\n          -  ...</td>\n","      <td>1</td>\n","      <td>XVIII</td>\n","      <td>-0.477298</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>NCT01626859</td>\n","      <td>152.0</td>\n","      <td>[mp-214 low dose, mp-214 middle dose, mp-214 h...</td>\n","      <td>[schizophrenia]</td>\n","      <td>F20</td>\n","      <td>\\n        Inclusion Criteria:\\n\\n          -  ...</td>\n","      <td>1</td>\n","      <td>V</td>\n","      <td>-0.930211</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>NCT00203957</td>\n","      <td>2605.0</td>\n","      <td>[istradefylline, istradefylline]</td>\n","      <td>[parkinsons disease]</td>\n","      <td>G20</td>\n","      <td>\\n 
       Inclusion Criteria:\\n\\n          -  ...</td>\n","      <td>1</td>\n","      <td>VI</td>\n","      <td>0.000000</td>\n","    </tr>\n","    <tr>\n","      <th>3</th>\n","      <td>NCT00169832</td>\n","      <td>3204.0</td>\n","      <td>[rosiglitazone or placebo]</td>\n","      <td>[diabetes, coronary artery bypass grafting]</td>\n","      <td>E23</td>\n","      <td>\\n        Inclusion Criteria:\\n\\n        AT SC...</td>\n","      <td>0</td>\n","      <td>IV</td>\n","      <td>0.000000</td>\n","    </tr>\n","    <tr>\n","      <th>4</th>\n","      <td>NCT01249352</td>\n","      <td>1958.0</td>\n","      <td>[nimotuzumab, cisplatin, fluorouracil]</td>\n","      <td>[esophageal cancer, adenocarcinoma]</td>\n","      <td>K22</td>\n","      <td>\\n        Inclusion Criteria:\\n\\n          1. ...</td>\n","      <td>1</td>\n","      <td>XI</td>\n","      <td>0.000000</td>\n","    </tr>\n","  </tbody>\n","</table>\n","</div>\n","      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-b21216ba-20c5-47f5-9c80-cae1ac9fbf73')\"\n","              title=\"Convert this dataframe to an interactive table.\"\n","              style=\"display:none;\">\n","        \n","  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n","       width=\"24px\">\n","    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n","    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n","  </svg>\n","      </button>\n","      \n","  <style>\n","    .colab-df-container {\n","      display:flex;\n","      flex-wrap:wrap;\n","    
  gap: 12px;\n","    }\n","\n","    .colab-df-convert {\n","      background-color: #E8F0FE;\n","      border: none;\n","      border-radius: 50%;\n","      cursor: pointer;\n","      display: none;\n","      fill: #1967D2;\n","      height: 32px;\n","      padding: 0 0 0 0;\n","      width: 32px;\n","    }\n","\n","    .colab-df-convert:hover {\n","      background-color: #E2EBFA;\n","      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n","      fill: #174EA6;\n","    }\n","\n","    [theme=dark] .colab-df-convert {\n","      background-color: #3B4455;\n","      fill: #D2E3FC;\n","    }\n","\n","    [theme=dark] .colab-df-convert:hover {\n","      background-color: #434B5C;\n","      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n","      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n","      fill: #FFFFFF;\n","    }\n","  </style>\n","\n","      <script>\n","        const buttonEl =\n","          document.querySelector('#df-b21216ba-20c5-47f5-9c80-cae1ac9fbf73 button.colab-df-convert');\n","        buttonEl.style.display =\n","          google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n","        async function convertToInteractive(key) {\n","          const element = document.querySelector('#df-b21216ba-20c5-47f5-9c80-cae1ac9fbf73');\n","          const dataTable =\n","            await google.colab.kernel.invokeFunction('convertToInteractive',\n","                                                     [key], {});\n","          if (!dataTable) return;\n","\n","          const docLinkHtml = 'Like what you see? 
Visit the ' +\n","            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n","            + ' to learn more about interactive tables.';\n","          element.innerHTML = '';\n","          dataTable['output_type'] = 'display_data';\n","          await google.colab.output.renderOutput(dataTable, element);\n","          const docLink = document.createElement('div');\n","          docLink.innerHTML = docLinkHtml;\n","          element.appendChild(docLink);\n","        }\n","      </script>\n","    </div>\n","  </div>\n","  "]},"metadata":{},"execution_count":308}]},{"cell_type":"markdown","source":["# Trial success data"],"metadata":{"id":"XoM90nmr-LsX"}},{"cell_type":"code","source":["# compute the success probability per ICD chapter\n","trial_success = df.groupby(['chapter']).agg(total_trials=('nctid', np.size),\n","                                            successful_trial=('label', np.sum))\n","trial_success['probability_success'] = trial_success['successful_trial'] / trial_success['total_trials']\n","trial_success = trial_success['probability_success']"],"metadata":{"id":"RM_KFuYN1ZIK"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["df = df.merge(trial_success, on='chapter', how='left')\n","df.head()"],"metadata":{"id":"1YfwjeX7KX_o","executionInfo":{"status":"ok","timestamp":1651488013123,"user_tz":240,"elapsed":245,"user":{"displayName":"Benedikt Geiger","userId":"17925887631246406508"}},"colab":{"base_uri":"https://localhost:8080/","height":302},"outputId":"9c6107b4-81f0-4544-b8e6-b3b6181edd9e"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["         nctid  n_participants  \\\n","0  NCT00475085           944.0   \n","1  NCT01626859           152.0   \n","2  NCT00203957          2605.0   \n","3  NCT00169832          3204.0   \n","4  NCT01249352          1958.0   \n","\n","                                               drugs  
\\\n","0  [aprepitant, dexamethasone, granisetron hydroc...   \n","1  [mp-214 low dose, mp-214 middle dose, mp-214 h...   \n","2                   [istradefylline, istradefylline]   \n","3                         [rosiglitazone or placebo]   \n","4             [nimotuzumab, cisplatin, fluorouracil]   \n","\n","                                      diseases icdcodes  \\\n","0                                     [nausea]      R11   \n","1                              [schizophrenia]      F20   \n","2                         [parkinsons disease]      G20   \n","3  [diabetes, coronary artery bypass grafting]      E23   \n","4          [esophageal cancer, adenocarcinoma]      K22   \n","\n","                                            criteria  label chapter  \\\n","0  \\n        Inclusion criteria:\\n\\n          -  ...      1   XVIII   \n","1  \\n        Inclusion Criteria:\\n\\n          -  ...      1       V   \n","2  \\n        Inclusion Criteria:\\n\\n          -  ...      1      VI   \n","3  \\n        Inclusion Criteria:\\n\\n        AT SC...      0      IV   \n","4  \\n        Inclusion Criteria:\\n\\n          1. ...      
1      XI   \n","\n","   norm_n_participants  probability_success  \n","0            -0.477298             0.538462  \n","1            -0.930211             0.716814  \n","2             0.000000             0.660465  \n","3             0.000000             0.854633  \n","4             0.000000             0.611765  "],"text/html":["\n","  <div id=\"df-040c4060-2ce0-4340-9d5a-871a575caadb\">\n","    <div class=\"colab-df-container\">\n","      <div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>nctid</th>\n","      <th>n_participants</th>\n","      <th>drugs</th>\n","      <th>diseases</th>\n","      <th>icdcodes</th>\n","      <th>criteria</th>\n","      <th>label</th>\n","      <th>chapter</th>\n","      <th>norm_n_participants</th>\n","      <th>probability_success</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>NCT00475085</td>\n","      <td>944.0</td>\n","      <td>[aprepitant, dexamethasone, granisetron hydroc...</td>\n","      <td>[nausea]</td>\n","      <td>R11</td>\n","      <td>\\n        Inclusion criteria:\\n\\n          -  ...</td>\n","      <td>1</td>\n","      <td>XVIII</td>\n","      <td>-0.477298</td>\n","      <td>0.538462</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>NCT01626859</td>\n","      <td>152.0</td>\n","      <td>[mp-214 low dose, mp-214 middle dose, mp-214 h...</td>\n","      <td>[schizophrenia]</td>\n","      <td>F20</td>\n","      <td>\\n        Inclusion Criteria:\\n\\n          -  ...</td>\n","      <td>1</td>\n","      <td>V</td>\n","      <td>-0.930211</td>\n","      <td>0.716814</td>\n","    
</tr>\n","    <tr>\n","      <th>2</th>\n","      <td>NCT00203957</td>\n","      <td>2605.0</td>\n","      <td>[istradefylline, istradefylline]</td>\n","      <td>[parkinsons disease]</td>\n","      <td>G20</td>\n","      <td>\\n        Inclusion Criteria:\\n\\n          -  ...</td>\n","      <td>1</td>\n","      <td>VI</td>\n","      <td>0.000000</td>\n","      <td>0.660465</td>\n","    </tr>\n","    <tr>\n","      <th>3</th>\n","      <td>NCT00169832</td>\n","      <td>3204.0</td>\n","      <td>[rosiglitazone or placebo]</td>\n","      <td>[diabetes, coronary artery bypass grafting]</td>\n","      <td>E23</td>\n","      <td>\\n        Inclusion Criteria:\\n\\n        AT SC...</td>\n","      <td>0</td>\n","      <td>IV</td>\n","      <td>0.000000</td>\n","      <td>0.854633</td>\n","    </tr>\n","    <tr>\n","      <th>4</th>\n","      <td>NCT01249352</td>\n","      <td>1958.0</td>\n","      <td>[nimotuzumab, cisplatin, fluorouracil]</td>\n","      <td>[esophageal cancer, adenocarcinoma]</td>\n","      <td>K22</td>\n","      <td>\\n        Inclusion Criteria:\\n\\n          1. 
...</td>\n","      <td>1</td>\n","      <td>XI</td>\n","      <td>0.000000</td>\n","      <td>0.611765</td>\n","    </tr>\n","  </tbody>\n","</table>\n","</div>\n","      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-040c4060-2ce0-4340-9d5a-871a575caadb')\"\n","              title=\"Convert this dataframe to an interactive table.\"\n","              style=\"display:none;\">\n","        \n","  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n","       width=\"24px\">\n","    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n","    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n","  </svg>\n","      </button>\n","      \n","  <style>\n","    .colab-df-container {\n","      display:flex;\n","      flex-wrap:wrap;\n","      gap: 12px;\n","    }\n","\n","    .colab-df-convert {\n","      background-color: #E8F0FE;\n","      border: none;\n","      border-radius: 50%;\n","      cursor: pointer;\n","      display: none;\n","      fill: #1967D2;\n","      height: 32px;\n","      padding: 0 0 0 0;\n","      width: 32px;\n","    }\n","\n","    .colab-df-convert:hover {\n","      background-color: #E2EBFA;\n","      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n","      fill: #174EA6;\n","    }\n","\n","    [theme=dark] .colab-df-convert {\n","      background-color: #3B4455;\n","      fill: #D2E3FC;\n","    }\n","\n","    [theme=dark] .colab-df-convert:hover {\n","      background-color: #434B5C;\n","      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 
0.15);\n","      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n","      fill: #FFFFFF;\n","    }\n","  </style>\n","\n","      <script>\n","        const buttonEl =\n","          document.querySelector('#df-040c4060-2ce0-4340-9d5a-871a575caadb button.colab-df-convert');\n","        buttonEl.style.display =\n","          google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n","        async function convertToInteractive(key) {\n","          const element = document.querySelector('#df-040c4060-2ce0-4340-9d5a-871a575caadb');\n","          const dataTable =\n","            await google.colab.kernel.invokeFunction('convertToInteractive',\n","                                                     [key], {});\n","          if (!dataTable) return;\n","\n","          const docLinkHtml = 'Like what you see? Visit the ' +\n","            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n","            + ' to learn more about interactive tables.';\n","          element.innerHTML = '';\n","          dataTable['output_type'] = 'display_data';\n","          await google.colab.output.renderOutput(dataTable, element);\n","          const docLink = document.createElement('div');\n","          docLink.innerHTML = docLinkHtml;\n","          element.appendChild(docLink);\n","        }\n","      </script>\n","    </div>\n","  </div>\n","  "]},"metadata":{},"execution_count":311}]},{"cell_type":"code","source":["final_data = {nctid: np.array([df[\"norm_n_participants\"][i], df[\"probability_success\"][i]]) for i,nctid in enumerate(df[\"nctid\"])}"],"metadata":{"id":"xj_FACXuTE3j"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["with open(\"nctid2npart_success.pkl\", 'wb') as handle:\n","    pickle.dump(final_data, handle, protocol=pickle.HIGHEST_PROTOCOL)"],"metadata":{"id":"RVYjsLkyTmHJ"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["!mv nctid2npart_success.pkl 
\"/content/gdrive/My Drive/BMI 707 Project/embeddings/\""],"metadata":{"id":"mZ9hk1ZeT4u3"},"execution_count":null,"outputs":[]}]}