--- a +++ b/Data_processing/imputation.ipynb @@ -0,0 +1 @@ +{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"imputation.ipynb","provenance":[],"collapsed_sections":[]},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"code","source":["! pip install icd10-cm"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"Q1Hoyh6oaYqd","executionInfo":{"status":"ok","timestamp":1651483216220,"user_tz":240,"elapsed":5316,"user":{"displayName":"Benedikt Geiger","userId":"17925887631246406508"}},"outputId":"7b78fbe2-07f6-4121-b883-5cc4026c724b"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Collecting icd10-cm\n"," Downloading icd10_cm-0.0.4-py2.py3-none-any.whl (675 kB)\n","\u001b[?25l\r\u001b[K |▌ | 10 kB 26.9 MB/s eta 0:00:01\r\u001b[K |█ | 20 kB 12.0 MB/s eta 0:00:01\r\u001b[K |█▌ | 30 kB 9.7 MB/s eta 0:00:01\r\u001b[K |██ | 40 kB 8.5 MB/s eta 0:00:01\r\u001b[K |██▍ | 51 kB 4.4 MB/s eta 0:00:01\r\u001b[K |███ | 61 kB 5.2 MB/s eta 0:00:01\r\u001b[K |███▍ | 71 kB 5.6 MB/s eta 0:00:01\r\u001b[K |███▉ | 81 kB 5.7 MB/s eta 0:00:01\r\u001b[K |████▍ | 92 kB 6.4 MB/s eta 0:00:01\r\u001b[K |████▉ | 102 kB 5.1 MB/s eta 0:00:01\r\u001b[K |█████▍ | 112 kB 5.1 MB/s eta 0:00:01\r\u001b[K |█████▉ | 122 kB 5.1 MB/s eta 0:00:01\r\u001b[K |██████▎ | 133 kB 5.1 MB/s eta 0:00:01\r\u001b[K |██████▉ | 143 kB 5.1 MB/s eta 0:00:01\r\u001b[K |███████▎ | 153 kB 5.1 MB/s eta 0:00:01\r\u001b[K |███████▊ | 163 kB 5.1 MB/s eta 0:00:01\r\u001b[K |████████▎ | 174 kB 5.1 MB/s eta 0:00:01\r\u001b[K |████████▊ | 184 kB 5.1 MB/s eta 0:00:01\r\u001b[K |█████████▏ | 194 kB 5.1 MB/s eta 0:00:01\r\u001b[K |█████████▊ | 204 kB 5.1 MB/s eta 0:00:01\r\u001b[K |██████████▏ | 215 kB 5.1 MB/s eta 0:00:01\r\u001b[K |██████████▊ | 225 kB 5.1 MB/s eta 0:00:01\r\u001b[K |███████████▏ | 235 kB 5.1 MB/s eta 0:00:01\r\u001b[K |███████████▋ | 245 kB 5.1 MB/s eta 0:00:01\r\u001b[K |████████████▏ | 256 kB 5.1 MB/s eta 0:00:01\r\u001b[K |████████████▋ | 266 kB 5.1 MB/s eta 0:00:01\r\u001b[K |█████████████ | 276 kB 5.1 MB/s eta 0:00:01\r\u001b[K |█████████████▋ | 286 kB 5.1 MB/s eta 0:00:01\r\u001b[K |██████████████ | 296 kB 5.1 MB/s eta 0:00:01\r\u001b[K |██████████████▋ | 307 kB 5.1 MB/s eta 0:00:01\r\u001b[K |███████████████ | 317 kB 5.1 MB/s eta 0:00:01\r\u001b[K |███████████████▌ | 327 kB 5.1 MB/s eta 0:00:01\r\u001b[K |████████████████ | 337 kB 5.1 MB/s eta 0:00:01\r\u001b[K |████████████████▌ | 348 kB 5.1 MB/s eta 0:00:01\r\u001b[K |█████████████████ | 358 kB 5.1 MB/s eta 0:00:01\r\u001b[K |█████████████████▌ | 368 kB 5.1 MB/s eta 0:00:01\r\u001b[K |██████████████████ | 378 kB 5.1 MB/s eta 0:00:01\r\u001b[K |██████████████████▍ | 389 kB 5.1 MB/s eta 0:00:01\r\u001b[K |███████████████████ | 399 kB 5.1 MB/s eta 0:00:01\r\u001b[K |███████████████████▍ | 409 kB 5.1 MB/s eta 0:00:01\r\u001b[K |████████████████████ | 419 kB 5.1 MB/s eta 0:00:01\r\u001b[K |████████████████████▍ | 430 kB 5.1 MB/s eta 0:00:01\r\u001b[K |████████████████████▉ | 440 kB 5.1 MB/s eta 0:00:01\r\u001b[K |█████████████████████▍ | 450 kB 5.1 MB/s eta 0:00:01\r\u001b[K |█████████████████████▉ | 460 kB 5.1 MB/s eta 0:00:01\r\u001b[K |██████████████████████▎ | 471 kB 5.1 MB/s eta 0:00:01\r\u001b[K |██████████████████████▉ | 481 kB 5.1 MB/s eta 0:00:01\r\u001b[K |███████████████████████▎ | 491 kB 5.1 MB/s eta 0:00:01\r\u001b[K |███████████████████████▊ | 501 kB 5.1 MB/s eta 0:00:01\r\u001b[K |████████████████████████▎ | 512 kB 5.1 MB/s eta 0:00:01\r\u001b[K |████████████████████████▊ | 522 kB 5.1 MB/s eta 0:00:01\r\u001b[K |█████████████████████████▎ | 532 kB 5.1 MB/s eta 0:00:01\r\u001b[K |█████████████████████████▊ | 542 kB 5.1 MB/s eta 0:00:01\r\u001b[K |██████████████████████████▏ | 552 kB 5.1 MB/s eta 0:00:01\r\u001b[K |██████████████████████████▊ | 563 kB 5.1 MB/s eta 0:00:01\r\u001b[K |███████████████████████████▏ | 573 kB 5.1 MB/s eta 0:00:01\r\u001b[K |███████████████████████████▋ | 583 kB 5.1 MB/s eta 0:00:01\r\u001b[K |████████████████████████████▏ | 593 kB 5.1 MB/s eta 0:00:01\r\u001b[K |████████████████████████████▋ | 604 kB 5.1 MB/s eta 0:00:01\r\u001b[K |█████████████████████████████▏ | 614 kB 5.1 MB/s eta 0:00:01\r\u001b[K |█████████████████████████████▋ | 624 kB 5.1 MB/s eta 0:00:01\r\u001b[K |██████████████████████████████ | 634 kB 5.1 MB/s eta 0:00:01\r\u001b[K |██████████████████████████████▋ | 645 kB 5.1 MB/s eta 0:00:01\r\u001b[K |███████████████████████████████ | 655 kB 5.1 MB/s eta 0:00:01\r\u001b[K |███████████████████████████████▌| 665 kB 5.1 MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 675 kB 5.1 MB/s \n","\u001b[?25hInstalling collected packages: icd10-cm\n","Successfully installed icd10-cm-0.0.4\n"]}]},{"cell_type":"code","source":["import pandas as pd\n","import os\n","import numpy as np\n","import icd10\n","import pickle\n","import matplotlib.pyplot as plt"],"metadata":{"id":"uRWSjYFJFkOI"},"execution_count":null,"outputs":[]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"gfPk_m_WEFFr","executionInfo":{"status":"ok","timestamp":1651487996613,"user_tz":240,"elapsed":1864,"user":{"displayName":"Benedikt Geiger","userId":"17925887631246406508"}},"outputId":"0f23885f-a5c2-49b9-f150-aa1e9dcc27bd"},"outputs":[{"output_type":"stream","name":"stdout","text":["Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount(\"/content/gdrive\", force_remount=True).\n"]}],"source":["from google.colab import drive\n","drive.mount('/content/gdrive')"]},{"cell_type":"code","source":["# Demo of icd10\n","code = icd10.find(\"R11\")\n","print(code.description) # Acute bronchitis due to Mycoplasma pneumoniae\n","if code.billable:\n"," print(code, \"is billable\") # J20.0 is billable\n","\n","print(code.chapter) # X\n","print(code.block) # J00-J99\n","print(code.block_description) # Diseases of the respiratory system"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"mTV5gVdexknC","executionInfo":{"status":"ok","timestamp":1651487996615,"user_tz":240,"elapsed":13,"user":{"displayName":"Benedikt Geiger","userId":"17925887631246406508"}},"outputId":"69a70e18-dec9-450b-f6b6-b0b5f5d7b29a"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Nausea and vomiting\n","XVIII\n","R00-R99\n","Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified\n"]}]},{"cell_type":"code","source":["deep_learning_dir = '/content/gdrive/My Drive/BMI 707 Project' "],"metadata":{"id":"Fh4HoiPnFtOr"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["df_training = pd.read_pickle(deep_learning_dir + '/data_formatting/training_data.pickle')\n","df_val = pd.read_pickle(deep_learning_dir + '/data_formatting/validation_data.pickle')\n","df_testing = pd.read_pickle(deep_learning_dir + '/data_formatting/testing_data.pickle')\n","\n","df_total = df_training.append(df_val)\n","df_total = df_total.append(df_testing)\n","\n","print(str(df_training.shape), str(df_val.shape), str(df_testing.shape), str(df_total.shape))"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"nnkr38avie6w","executionInfo":{"status":"ok","timestamp":1651487996831,"user_tz":240,"elapsed":223,"user":{"displayName":"Benedikt Geiger","userId":"17925887631246406508"}},"outputId":"5718fe15-8213-4ee7-f3e0-883f483598a8"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["(3094, 7) (344, 7) (1146, 7) (4584, 7)\n"]}]},{"cell_type":"markdown","source":["# Inpute participant data"],"metadata":{"id":"_7TE11jp_M_o"}},{"cell_type":"code","source":["df = df_total.explode('icdcodes')\n","# Keep first ICD code\n","df['icdcodes'] = df['icdcodes'].apply(lambda x: x.split(\".\")[0])\n","df = df.drop_duplicates('nctid', keep='first')"],"metadata":{"id":"Lg82ya5tAsbu"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["def get_chapter(x): \n"," code = icd10.find(x)\n"," desc = 'Other'\n","\n"," try: \n"," desc = code.chapter\n"," except Exception: \n"," pass \n"," return desc"],"metadata":{"id":"ms7oAGFNgWhV"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["df['chapter'] = df['icdcodes'].apply(get_chapter)"],"metadata":{"id":"3t5UJi6Uaqdu"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["df.chapter.unique()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"9nHEwJWSqJG3","executionInfo":{"status":"ok","timestamp":1651487997039,"user_tz":240,"elapsed":24,"user":{"displayName":"Benedikt Geiger","userId":"17925887631246406508"}},"outputId":"db5cb1cb-ab98-4032-ef4e-418d5f6dd320"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array(['XVIII', 'V', 'VI', 'IV', 'XI', None, 'III', 'VII', 'XIII', 'II',\n"," 'X', 'I', 'IX', 'XV', 'XXI', 'XII', 'XIV', 'XX', 'XIX', 'XVI',\n"," 'XVII', 'Other', 'VIII'], dtype=object)"]},"metadata":{},"execution_count":299}]},{"cell_type":"code","source":["# Mapped as Other\n","df[df['chapter'] == 'Other']['icdcodes'].value_counts()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"nY7GaFct2DoE","executionInfo":{"status":"ok","timestamp":1651487997041,"user_tz":240,"elapsed":21,"user":{"displayName":"Benedikt Geiger","userId":"17925887631246406508"}},"outputId":"42bf6514-6896-4850-b1ca-e7813e1fa7c2"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["B00 44\n","O9A 6\n","J00 2\n","C7A 1\n","O00 1\n","Name: icdcodes, dtype: int64"]},"metadata":{},"execution_count":300}]},{"cell_type":"code","source":["# Mapped as na are all in the neoplasm chapter\n","df[df['chapter'].isna()]['icdcodes'].value_counts()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"O3F8LCP_3PSi","executionInfo":{"status":"ok","timestamp":1651487997042,"user_tz":240,"elapsed":18,"user":{"displayName":"Benedikt Geiger","userId":"17925887631246406508"}},"outputId":"b39f2d13-5d41-4523-c64b-d59523b2f8e7"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["C79 158\n","C78 102\n","C61 54\n","C90 40\n","C95 31\n","C91 20\n","C76 16\n","C96 16\n","C92 15\n","C57 11\n","K94 10\n","C49 10\n","C71 6\n","C73 5\n","C83 4\n","C67 3\n","C88 2\n","C84 1\n","C81 1\n","Name: icdcodes, dtype: int64"]},"metadata":{},"execution_count":301}]},{"cell_type":"code","source":["# hand curation\n","df.loc[df['icdcodes'] == 'B00', 'chapter'] = 'I'\n","df.loc[df['chapter'].isna(), 'chapter'] = 'II'\n","df.loc[df['icdcodes'] == 'C7A', 'chapter'] = 'II'\n","df.loc[df['icdcodes'] == 'J00', 'chapter'] = 'X'\n","df.loc[df['icdcodes'] == 'K94', 'chapter'] = 'XI'\n","df.loc[df['icdcodes'] == 'O00', 'chapter'] = 'XV'\n","df.loc[df['icdcodes'] == 'O9A', 'chapter'] = 'XV'"],"metadata":{"id":"agqv9fFn5eGM"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# number of trials with missing n_participants info\n","sum(df['n_participants'].isna()) / len(df)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"oWKL8XeTH6c4","executionInfo":{"status":"ok","timestamp":1651487997044,"user_tz":240,"elapsed":15,"user":{"displayName":"Benedikt Geiger","userId":"17925887631246406508"}},"outputId":"40ad15bf-0f94-49f7-de71-47da61ad9dad"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["0.3706369982547993"]},"metadata":{},"execution_count":303}]},{"cell_type":"code","source":["#unique_chapter = df.chapter.unique()\n","#\n","#for x in unique_chapter: \n","# df[df.chapter == x]['n_participants'].hist(bins=30)\n","# plt.title('Chapter %s' %x)\n","# plt.show()"],"metadata":{"id":"9kmL7tMWp7Dk"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# Impute the number of participants per chapter with the median due to skewedness\n","df['n_participants'] = df['n_participants'].fillna(df.groupby('chapter')['n_participants'].transform('median'))"],"metadata":{"id":"85nQ1i9fp61l"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["df.head()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":206},"id":"i52GG15PIS4K","executionInfo":{"status":"ok","timestamp":1651487998166,"user_tz":240,"elapsed":12,"user":{"displayName":"Benedikt Geiger","userId":"17925887631246406508"}},"outputId":"6ebfb561-439a-4b08-c0db-63d3cddbd879"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" nctid n_participants \\\n","0 NCT00475085 944.0 \n","1 NCT01626859 152.0 \n","2 NCT00203957 2605.0 \n","3 NCT00169832 3204.0 \n","4 NCT01249352 1958.0 \n","\n"," drugs \\\n","0 [aprepitant, dexamethasone, granisetron hydroc... \n","1 [mp-214 low dose, mp-214 middle dose, mp-214 h... \n","2 [istradefylline, istradefylline] \n","3 [rosiglitazone or placebo] \n","4 [nimotuzumab, cisplatin, fluorouracil] \n","\n"," diseases icdcodes \\\n","0 [nausea] R11 \n","1 [schizophrenia] F20 \n","2 [parkinsons disease] G20 \n","3 [diabetes, coronary artery bypass grafting] E23 \n","4 [esophageal cancer, adenocarcinoma] K22 \n","\n"," criteria label chapter \n","0 \\n Inclusion criteria:\\n\\n - ... 1 XVIII \n","1 \\n Inclusion Criteria:\\n\\n - ... 1 V \n","2 \\n Inclusion Criteria:\\n\\n - ... 1 VI \n","3 \\n Inclusion Criteria:\\n\\n AT SC... 0 IV \n","4 \\n Inclusion Criteria:\\n\\n 1. ... 1 XI "],"text/html":["\n"," <div id=\"df-42645b82-0bfa-40fa-b385-69b89788f026\">\n"," <div class=\"colab-df-container\">\n"," <div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>nctid</th>\n"," <th>n_participants</th>\n"," <th>drugs</th>\n"," <th>diseases</th>\n"," <th>icdcodes</th>\n"," <th>criteria</th>\n"," <th>label</th>\n"," <th>chapter</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>NCT00475085</td>\n"," <td>944.0</td>\n"," <td>[aprepitant, dexamethasone, granisetron hydroc...</td>\n"," <td>[nausea]</td>\n"," <td>R11</td>\n"," <td>\\n Inclusion criteria:\\n\\n - ...</td>\n"," <td>1</td>\n"," <td>XVIII</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>NCT01626859</td>\n"," <td>152.0</td>\n"," <td>[mp-214 low dose, mp-214 middle dose, mp-214 h...</td>\n"," <td>[schizophrenia]</td>\n"," <td>F20</td>\n"," <td>\\n Inclusion Criteria:\\n\\n - ...</td>\n"," <td>1</td>\n"," <td>V</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>NCT00203957</td>\n"," <td>2605.0</td>\n"," <td>[istradefylline, istradefylline]</td>\n"," <td>[parkinsons disease]</td>\n"," <td>G20</td>\n"," <td>\\n Inclusion Criteria:\\n\\n - ...</td>\n"," <td>1</td>\n"," <td>VI</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>NCT00169832</td>\n"," <td>3204.0</td>\n"," <td>[rosiglitazone or placebo]</td>\n"," <td>[diabetes, coronary artery bypass grafting]</td>\n"," <td>E23</td>\n"," <td>\\n Inclusion Criteria:\\n\\n AT SC...</td>\n"," <td>0</td>\n"," <td>IV</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>NCT01249352</td>\n"," <td>1958.0</td>\n"," <td>[nimotuzumab, cisplatin, fluorouracil]</td>\n"," <td>[esophageal cancer, adenocarcinoma]</td>\n"," <td>K22</td>\n"," <td>\\n Inclusion Criteria:\\n\\n 1. ...</td>\n"," <td>1</td>\n"," <td>XI</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>\n"," <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-42645b82-0bfa-40fa-b385-69b89788f026')\"\n"," title=\"Convert this dataframe to an interactive table.\"\n"," style=\"display:none;\">\n"," \n"," <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n"," width=\"24px\">\n"," <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n"," <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n"," </svg>\n"," </button>\n"," \n"," <style>\n"," .colab-df-container {\n"," display:flex;\n"," flex-wrap:wrap;\n"," gap: 12px;\n"," }\n","\n"," .colab-df-convert {\n"," background-color: #E8F0FE;\n"," border: none;\n"," border-radius: 50%;\n"," cursor: pointer;\n"," display: none;\n"," fill: #1967D2;\n"," height: 32px;\n"," padding: 0 0 0 0;\n"," width: 32px;\n"," }\n","\n"," .colab-df-convert:hover {\n"," background-color: #E2EBFA;\n"," box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n"," fill: #174EA6;\n"," }\n","\n"," [theme=dark] .colab-df-convert {\n"," background-color: #3B4455;\n"," fill: #D2E3FC;\n"," }\n","\n"," [theme=dark] .colab-df-convert:hover {\n"," background-color: #434B5C;\n"," box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n"," filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n"," fill: #FFFFFF;\n"," }\n"," </style>\n","\n"," <script>\n"," const buttonEl =\n"," document.querySelector('#df-42645b82-0bfa-40fa-b385-69b89788f026 button.colab-df-convert');\n"," buttonEl.style.display =\n"," google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n"," async function convertToInteractive(key) {\n"," const element = document.querySelector('#df-42645b82-0bfa-40fa-b385-69b89788f026');\n"," const dataTable =\n"," await google.colab.kernel.invokeFunction('convertToInteractive',\n"," [key], {});\n"," if (!dataTable) return;\n","\n"," const docLinkHtml = 'Like what you see? Visit the ' +\n"," '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n"," + ' to learn more about interactive tables.';\n"," element.innerHTML = '';\n"," dataTable['output_type'] = 'display_data';\n"," await google.colab.output.renderOutput(dataTable, element);\n"," const docLink = document.createElement('div');\n"," docLink.innerHTML = docLinkHtml;\n"," element.appendChild(docLink);\n"," }\n"," </script>\n"," </div>\n"," </div>\n"," "]},"metadata":{},"execution_count":306}]},{"cell_type":"code","source":["for chapter in df.chapter.unique():\n"," n_part_chapter = df[df[\"chapter\"] == chapter][\"n_participants\"]\n"," df.loc[df[\"chapter\"] == chapter, \"norm_n_participants\"] = (n_part_chapter - np.median(n_part_chapter)) / np.median(n_part_chapter)"],"metadata":{"id":"7lE-x23ATcGy"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["df.head()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":206},"id":"GbjtthzuUQpi","executionInfo":{"status":"ok","timestamp":1651488000283,"user_tz":240,"elapsed":200,"user":{"displayName":"Benedikt Geiger","userId":"17925887631246406508"}},"outputId":"f96b8bf5-0fa0-48d3-8d14-8b4d5e65f2d2"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" nctid n_participants \\\n","0 NCT00475085 944.0 \n","1 NCT01626859 152.0 \n","2 NCT00203957 2605.0 \n","3 NCT00169832 3204.0 \n","4 NCT01249352 1958.0 \n","\n"," drugs \\\n","0 [aprepitant, dexamethasone, granisetron hydroc... \n","1 [mp-214 low dose, mp-214 middle dose, mp-214 h... \n","2 [istradefylline, istradefylline] \n","3 [rosiglitazone or placebo] \n","4 [nimotuzumab, cisplatin, fluorouracil] \n","\n"," diseases icdcodes \\\n","0 [nausea] R11 \n","1 [schizophrenia] F20 \n","2 [parkinsons disease] G20 \n","3 [diabetes, coronary artery bypass grafting] E23 \n","4 [esophageal cancer, adenocarcinoma] K22 \n","\n"," criteria label chapter \\\n","0 \\n Inclusion criteria:\\n\\n - ... 1 XVIII \n","1 \\n Inclusion Criteria:\\n\\n - ... 1 V \n","2 \\n Inclusion Criteria:\\n\\n - ... 1 VI \n","3 \\n Inclusion Criteria:\\n\\n AT SC... 0 IV \n","4 \\n Inclusion Criteria:\\n\\n 1. ... 1 XI \n","\n"," norm_n_participants \n","0 -0.477298 \n","1 -0.930211 \n","2 0.000000 \n","3 0.000000 \n","4 0.000000 "],"text/html":["\n"," <div id=\"df-b21216ba-20c5-47f5-9c80-cae1ac9fbf73\">\n"," <div class=\"colab-df-container\">\n"," <div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>nctid</th>\n"," <th>n_participants</th>\n"," <th>drugs</th>\n"," <th>diseases</th>\n"," <th>icdcodes</th>\n"," <th>criteria</th>\n"," <th>label</th>\n"," <th>chapter</th>\n"," <th>norm_n_participants</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>NCT00475085</td>\n"," <td>944.0</td>\n"," <td>[aprepitant, dexamethasone, granisetron hydroc...</td>\n"," <td>[nausea]</td>\n"," <td>R11</td>\n"," <td>\\n Inclusion criteria:\\n\\n - ...</td>\n"," <td>1</td>\n"," <td>XVIII</td>\n"," <td>-0.477298</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>NCT01626859</td>\n"," <td>152.0</td>\n"," <td>[mp-214 low dose, mp-214 middle dose, mp-214 h...</td>\n"," <td>[schizophrenia]</td>\n"," <td>F20</td>\n"," <td>\\n Inclusion Criteria:\\n\\n - ...</td>\n"," <td>1</td>\n"," <td>V</td>\n"," <td>-0.930211</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>NCT00203957</td>\n"," <td>2605.0</td>\n"," <td>[istradefylline, istradefylline]</td>\n"," <td>[parkinsons disease]</td>\n"," <td>G20</td>\n"," <td>\\n Inclusion Criteria:\\n\\n - ...</td>\n"," <td>1</td>\n"," <td>VI</td>\n"," <td>0.000000</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>NCT00169832</td>\n"," <td>3204.0</td>\n"," <td>[rosiglitazone or placebo]</td>\n"," <td>[diabetes, coronary artery bypass grafting]</td>\n"," <td>E23</td>\n"," <td>\\n Inclusion Criteria:\\n\\n AT SC...</td>\n"," <td>0</td>\n"," <td>IV</td>\n"," <td>0.000000</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>NCT01249352</td>\n"," <td>1958.0</td>\n"," <td>[nimotuzumab, cisplatin, fluorouracil]</td>\n"," <td>[esophageal cancer, adenocarcinoma]</td>\n"," <td>K22</td>\n"," <td>\\n Inclusion Criteria:\\n\\n 1. ...</td>\n"," <td>1</td>\n"," <td>XI</td>\n"," <td>0.000000</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>\n"," <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-b21216ba-20c5-47f5-9c80-cae1ac9fbf73')\"\n"," title=\"Convert this dataframe to an interactive table.\"\n"," style=\"display:none;\">\n"," \n"," <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n"," width=\"24px\">\n"," <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n"," <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n"," </svg>\n"," </button>\n"," \n"," <style>\n"," .colab-df-container {\n"," display:flex;\n"," flex-wrap:wrap;\n"," gap: 12px;\n"," }\n","\n"," .colab-df-convert {\n"," background-color: #E8F0FE;\n"," border: none;\n"," border-radius: 50%;\n"," cursor: pointer;\n"," display: none;\n"," fill: #1967D2;\n"," height: 32px;\n"," padding: 0 0 0 0;\n"," width: 32px;\n"," }\n","\n"," .colab-df-convert:hover {\n"," background-color: #E2EBFA;\n"," box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n"," fill: #174EA6;\n"," }\n","\n"," [theme=dark] .colab-df-convert {\n"," background-color: #3B4455;\n"," fill: #D2E3FC;\n"," }\n","\n"," [theme=dark] .colab-df-convert:hover {\n"," background-color: #434B5C;\n"," box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n"," filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n"," fill: #FFFFFF;\n"," }\n"," </style>\n","\n"," <script>\n"," const buttonEl =\n"," document.querySelector('#df-b21216ba-20c5-47f5-9c80-cae1ac9fbf73 button.colab-df-convert');\n"," buttonEl.style.display =\n"," google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n"," async function convertToInteractive(key) {\n"," const element = document.querySelector('#df-b21216ba-20c5-47f5-9c80-cae1ac9fbf73');\n"," const dataTable =\n"," await google.colab.kernel.invokeFunction('convertToInteractive',\n"," [key], {});\n"," if (!dataTable) return;\n","\n"," const docLinkHtml = 'Like what you see? Visit the ' +\n"," '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n"," + ' to learn more about interactive tables.';\n"," element.innerHTML = '';\n"," dataTable['output_type'] = 'display_data';\n"," await google.colab.output.renderOutput(dataTable, element);\n"," const docLink = document.createElement('div');\n"," docLink.innerHTML = docLinkHtml;\n"," element.appendChild(docLink);\n"," }\n"," </script>\n"," </div>\n"," </div>\n"," "]},"metadata":{},"execution_count":308}]},{"cell_type":"markdown","source":["# Trial success data"],"metadata":{"id":"XoM90nmr-LsX"}},{"cell_type":"code","source":["# compute the success probability per ICD chapter\n","trial_success = df.groupby(['chapter']).agg(total_trials=('nctid', np.size),\n"," successful_trial=('label', np.sum))\n","trial_success['probability_success'] = trial_success['successful_trial'] / trial_success['total_trials']\n","trial_success = trial_success['probability_success']"],"metadata":{"id":"RM_KFuYN1ZIK"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["df = df.merge(trial_success, on='chapter', how='left')\n","df.head()"],"metadata":{"id":"1YfwjeX7KX_o","executionInfo":{"status":"ok","timestamp":1651488013123,"user_tz":240,"elapsed":245,"user":{"displayName":"Benedikt Geiger","userId":"17925887631246406508"}},"colab":{"base_uri":"https://localhost:8080/","height":302},"outputId":"9c6107b4-81f0-4544-b8e6-b3b6181edd9e"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" nctid n_participants \\\n","0 NCT00475085 944.0 \n","1 NCT01626859 152.0 \n","2 NCT00203957 2605.0 \n","3 NCT00169832 3204.0 \n","4 NCT01249352 1958.0 \n","\n"," drugs \\\n","0 [aprepitant, dexamethasone, granisetron hydroc... \n","1 [mp-214 low dose, mp-214 middle dose, mp-214 h... \n","2 [istradefylline, istradefylline] \n","3 [rosiglitazone or placebo] \n","4 [nimotuzumab, cisplatin, fluorouracil] \n","\n"," diseases icdcodes \\\n","0 [nausea] R11 \n","1 [schizophrenia] F20 \n","2 [parkinsons disease] G20 \n","3 [diabetes, coronary artery bypass grafting] E23 \n","4 [esophageal cancer, adenocarcinoma] K22 \n","\n"," criteria label chapter \\\n","0 \\n Inclusion criteria:\\n\\n - ... 1 XVIII \n","1 \\n Inclusion Criteria:\\n\\n - ... 1 V \n","2 \\n Inclusion Criteria:\\n\\n - ... 1 VI \n","3 \\n Inclusion Criteria:\\n\\n AT SC... 0 IV \n","4 \\n Inclusion Criteria:\\n\\n 1. ... 1 XI \n","\n"," norm_n_participants probability_success \n","0 -0.477298 0.538462 \n","1 -0.930211 0.716814 \n","2 0.000000 0.660465 \n","3 0.000000 0.854633 \n","4 0.000000 0.611765 "],"text/html":["\n"," <div id=\"df-040c4060-2ce0-4340-9d5a-871a575caadb\">\n"," <div class=\"colab-df-container\">\n"," <div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>nctid</th>\n"," <th>n_participants</th>\n"," <th>drugs</th>\n"," <th>diseases</th>\n"," <th>icdcodes</th>\n"," <th>criteria</th>\n"," <th>label</th>\n"," <th>chapter</th>\n"," <th>norm_n_participants</th>\n"," <th>probability_success</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>NCT00475085</td>\n"," <td>944.0</td>\n"," <td>[aprepitant, dexamethasone, granisetron hydroc...</td>\n"," <td>[nausea]</td>\n"," <td>R11</td>\n"," <td>\\n Inclusion criteria:\\n\\n - ...</td>\n"," <td>1</td>\n"," <td>XVIII</td>\n"," <td>-0.477298</td>\n"," <td>0.538462</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>NCT01626859</td>\n"," <td>152.0</td>\n"," <td>[mp-214 low dose, mp-214 middle dose, mp-214 h...</td>\n"," <td>[schizophrenia]</td>\n"," <td>F20</td>\n"," <td>\\n Inclusion Criteria:\\n\\n - ...</td>\n"," <td>1</td>\n"," <td>V</td>\n"," <td>-0.930211</td>\n"," <td>0.716814</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>NCT00203957</td>\n"," <td>2605.0</td>\n"," <td>[istradefylline, istradefylline]</td>\n"," <td>[parkinsons disease]</td>\n"," <td>G20</td>\n"," <td>\\n Inclusion Criteria:\\n\\n - ...</td>\n"," <td>1</td>\n"," <td>VI</td>\n"," <td>0.000000</td>\n"," <td>0.660465</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>NCT00169832</td>\n"," <td>3204.0</td>\n"," <td>[rosiglitazone or placebo]</td>\n"," <td>[diabetes, coronary artery bypass grafting]</td>\n"," <td>E23</td>\n"," <td>\\n Inclusion Criteria:\\n\\n AT SC...</td>\n"," <td>0</td>\n"," <td>IV</td>\n"," <td>0.000000</td>\n"," <td>0.854633</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>NCT01249352</td>\n"," <td>1958.0</td>\n"," <td>[nimotuzumab, cisplatin, fluorouracil]</td>\n"," <td>[esophageal cancer, adenocarcinoma]</td>\n"," <td>K22</td>\n"," <td>\\n Inclusion Criteria:\\n\\n 1. ...</td>\n"," <td>1</td>\n"," <td>XI</td>\n"," <td>0.000000</td>\n"," <td>0.611765</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>\n"," <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-040c4060-2ce0-4340-9d5a-871a575caadb')\"\n"," title=\"Convert this dataframe to an interactive table.\"\n"," style=\"display:none;\">\n"," \n"," <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n"," width=\"24px\">\n"," <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n"," <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n"," </svg>\n"," </button>\n"," \n"," <style>\n"," .colab-df-container {\n"," display:flex;\n"," flex-wrap:wrap;\n"," gap: 12px;\n"," }\n","\n"," .colab-df-convert {\n"," background-color: #E8F0FE;\n"," border: none;\n"," border-radius: 50%;\n"," cursor: pointer;\n"," display: none;\n"," fill: #1967D2;\n"," height: 32px;\n"," padding: 0 0 0 0;\n"," width: 32px;\n"," }\n","\n"," .colab-df-convert:hover {\n"," background-color: #E2EBFA;\n"," box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n"," fill: #174EA6;\n"," }\n","\n"," [theme=dark] .colab-df-convert {\n"," background-color: #3B4455;\n"," fill: #D2E3FC;\n"," }\n","\n"," [theme=dark] .colab-df-convert:hover {\n"," background-color: #434B5C;\n"," box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n"," filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n"," fill: #FFFFFF;\n"," }\n"," </style>\n","\n"," <script>\n"," const buttonEl =\n"," document.querySelector('#df-040c4060-2ce0-4340-9d5a-871a575caadb button.colab-df-convert');\n"," buttonEl.style.display =\n"," google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n"," async function convertToInteractive(key) {\n"," const element = document.querySelector('#df-040c4060-2ce0-4340-9d5a-871a575caadb');\n"," const dataTable =\n"," await google.colab.kernel.invokeFunction('convertToInteractive',\n"," [key], {});\n"," if (!dataTable) return;\n","\n"," const docLinkHtml = 'Like what you see? Visit the ' +\n"," '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n"," + ' to learn more about interactive tables.';\n"," element.innerHTML = '';\n"," dataTable['output_type'] = 'display_data';\n"," await google.colab.output.renderOutput(dataTable, element);\n"," const docLink = document.createElement('div');\n"," docLink.innerHTML = docLinkHtml;\n"," element.appendChild(docLink);\n"," }\n"," </script>\n"," </div>\n"," </div>\n"," "]},"metadata":{},"execution_count":311}]},{"cell_type":"code","source":["final_data = {nctid: np.array([df[\"norm_n_participants\"][i], df[\"probability_success\"][i]]) for i,nctid in enumerate(df[\"nctid\"])}"],"metadata":{"id":"xj_FACXuTE3j"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["with open(\"nctid2npart_success.pkl\", 'wb') as handle:\n"," pickle.dump(final_data, handle, protocol=pickle.HIGHEST_PROTOCOL)"],"metadata":{"id":"RVYjsLkyTmHJ"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["!mv nctid2npart_success.pkl \"/content/gdrive/My Drive/BMI 707 Project/embeddings/\""],"metadata":{"id":"mZ9hk1ZeT4u3"},"execution_count":null,"outputs":[]}]} \ No newline at end of file