BMI707_Project-Clinical-T / Git / [b5ec00] /Data_processing/disease

Models:
joseph-gordon/
BMI707_Project-Clinical-T
Downloads: 1
[b5ec00]: / Data_processing / disease_embeddings.ipynb
History
Download this file
1 lines (1 with data), 17.8 kB

# disease_embeddings: the notebook's code cells as formatted Python, one "# %%"
# section per cell.
#
# Pipeline: map every disease string of the train/validation/test splits to one
# or more CUIs, average the pretrained vectors of those CUIs into one vector per
# disease, average the disease vectors of each clinical trial, and pickle the
# result keyed by the trial's nctid.

# %% Dependencies
# Colab shell step, kept as a comment so this file stays valid Python. Pinned to
# the release that the recorded run of the notebook installed.
# %pip install -q jellyfish==0.9.0

# %% Imports
import os
import pickle
from itertools import chain

import numpy as np
import pandas as pd

# %% Configuration
# Length of the zero vector returned for a trial without any embeddable disease.
EMBEDDING_DIM = 500

# Hand-curated fragments that are removed from the disease vocabulary.
EXCLUDED_TERMS = frozenset([
    "type 2", "high-risk", "first-line", "oral", "major", "breast", "immediate", "nontuberculous",
    "partial", "waldenstroms", "painful", "inborn", "local", "chronic", "specific", "left", "depression",
    "high", "human", "sudden", "mixed", "perennial", "postherpetic", "functional", "prevention", "safety",
])

# Deletes the characters of a stringified Python list: brackets and single quotes.
_LIST_REPR_CHARS = str.maketrans("", "", "[]'")


# %% Helpers
def clean_column(col):
    """
    Turn a column of stringified lists back into lists of disease names.

    After reading in the files, the disease cells are strings instead of lists,
    e.g. "['asthma', 'copd']". Brackets and single quotes are stripped and the
    remainder is split on ", ".

    An empty list cell ("[]") or a non-string cell (e.g. NaN) yields [], and
    empty names are dropped, so that no '' entry is counted as a disease.

    Known limitation (kept on purpose so the names stay identical to the ones
    EXCLUDED_TERMS was curated against): a name that itself contains ", " is
    split, and apostrophes inside a name are removed.

    col: iterable of cells.
    Returns a list with one list of names per cell.
    """
    cleaned = []

    for cell in col:
        if not isinstance(cell, str):
            cleaned.append([])
            continue

        names = cell.translate(_LIST_REPR_CHARS).split(", ")
        cleaned.append([name for name in names if name])

    return cleaned


def flatten_lol(l):
    """
    Flatten a list of lists (any iterable of iterables) into one list.
    """
    return list(chain.from_iterable(l))


def d_to_cuis(disease, mapping=None):
    """
    Find the CUI(s) of a given disease.

    1. If the disease string is a key of the mapping, return its CUI.
    2. Otherwise split the disease into words (on spaces and hyphens) and return
       the CUIs of all words that are keys of the mapping.
    3. If no word matches either, return the CUI of the closest concept string,
       where distance is the Damerau-Levenshtein string distance. This step
       scans the whole mapping and prints what it substituted.

    disease: disease name.
    mapping: dict concept string -> CUI. Defaults to the module-level
             `str_to_cui`, which keeps the original one-argument call working.
    Returns a list of CUIs; empty only when the mapping itself is empty.
    """
    if mapping is None:
        mapping = str_to_cui

    if disease in mapping:
        return [mapping[disease]]

    # when the disease is not found in the mapping
    words = flatten_lol(word.split("-") for word in disease.split(" "))
    cuis = [mapping[word] for word in words if word in mapping]

    if cuis:
        return cuis

    print(f"There was no CUI match for the words in disease {disease}.\n")

    # Materialise the keys once; they are needed for the scan and for the lookup.
    concepts = list(mapping)

    if not concepts:
        return []

    # Imported here so that only the fuzzy fallback needs the extra package.
    import jellyfish

    # str(): non-string keys (e.g. NaN read from the CSV) must not break the scan
    distances = [jellyfish.damerau_levenshtein_distance(disease, str(concept))
                 for concept in concepts]
    closest = concepts[int(np.argmin(distances))]

    print(f"The closest string we could find was {closest}. \n")

    return [mapping[closest]]


def embedding_row(diseases, disease_embeddings=None, dim=EMBEDDING_DIM):
    """
    Embed a list of diseases. Will correspond to the diseases associated to a
    single clinical trial.

    diseases: list of disease names.
    disease_embeddings: dict disease name -> vector. Defaults to the
                        module-level `d_emb`.
    dim: length of the zero vector returned when none of the diseases has an
         embedding.
    Returns the element-wise mean of the available disease vectors, or zeros.
    """
    if disease_embeddings is None:
        disease_embeddings = d_emb

    vectors = [disease_embeddings[d] for d in diseases if d in disease_embeddings]

    if not vectors:
        return np.zeros(dim, dtype=float)

    return np.mean(vectors, axis=0)


def embed_split(frame, disease_lists, disease_embeddings=None):
    """
    Embed every trial of one data split.

    frame: table with an "nctid" column, row-aligned with `disease_lists`.
    disease_lists: one list of disease names per trial.
    disease_embeddings: forwarded to `embedding_row`.
    Returns two dicts keyed by nctid: trial embedding, and number of diseases
    listed for the trial.
    """
    vectors = [embedding_row(names, disease_embeddings) for names in disease_lists]
    counts = [len(names) for names in disease_lists]

    return dict(zip(frame["nctid"], vectors)), dict(zip(frame["nctid"], counts))


def save_pickle(obj, path):
    """
    Pickle `obj` to `path` with the highest available protocol.
    """
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)


# %% Pipeline
# The guard is true inside a notebook kernel and when run as a script; it only
# keeps the helpers above importable without mounting Drive or reading data.
if __name__ == "__main__":
    from google.colab import drive
    from tqdm import tqdm

    drive.mount('/content/gdrive')

    deep_learning_dir = "/content/gdrive/My Drive/BMI 707 Project"
    cui_path = "/data/mapping"
    data_path = "/data_formatting"
    embeddings_dir = deep_learning_dir + "/embeddings"

    # cui_str is a dataframe with two columns: CUI | concept as string
    cui_str = pd.read_csv(deep_learning_dir + cui_path + "/dictionary.csv", sep="|")
    str_to_cui = dict(zip(cui_str["STR"], cui_str["CUI"]))

    train = pd.read_csv(deep_learning_dir + data_path + "/training_data.tsv", sep="\t")
    val = pd.read_csv(deep_learning_dir + data_path + "/validation_data.tsv", sep="\t")
    test = pd.read_csv(deep_learning_dir + data_path + "/testing_data.tsv", sep="\t")

    diseases_train = clean_column(train["diseases"])
    diseases_val = clean_column(val["diseases"])
    diseases_test = clean_column(test["diseases"])

    # all diseases in the dataset, minus the hand-curated fragments;
    # sorted so that the processing order is the same on every run
    diseases = sorted(
        set(flatten_lol(diseases_train + diseases_val + diseases_test)) - EXCLUDED_TERMS
    )

    # match all diseases to a CUI
    diseases_to_cuis = {d: d_to_cuis(d) for d in tqdm(diseases)}

    # cui to embedding: the first CSV column holds the CUI, the rest the vector
    cui_emb = pd.read_csv(deep_learning_dir + cui_path + "/cui2vec_pretrained.csv", index_col=0)
    embeddings = dict(zip(cui_emb.index, cui_emb.to_numpy()))

    # compute embedding of every disease with at least one existing corresponding CUI
    # Note: a disease can have multiple corresponding CUI's
    d_emb = {}

    for d, cuis in diseases_to_cuis.items():
        embs = [embeddings[cui] for cui in cuis if cui in embeddings]

        if embs:
            d_emb[d] = np.mean(embs, axis=0)

    # key: nctid of clinical trial, value: disease embedding / number of diseases
    train_d_embeddings, train_n_diseases = embed_split(train, diseases_train)
    val_d_embeddings, val_n_diseases = embed_split(val, diseases_val)
    test_d_embeddings, test_n_diseases = embed_split(test, diseases_test)

    # combined dictionary
    final_d_embeddings = {**train_d_embeddings, **val_d_embeddings, **test_d_embeddings}
    final_n_diseases = {**train_n_diseases, **val_n_diseases, **test_n_diseases}

    # write straight into the shared folder instead of writing locally and moving
    os.makedirs(embeddings_dir, exist_ok=True)
    save_pickle(final_d_embeddings, os.path.join(embeddings_dir, "nctid2diseases.pkl"))
    save_pickle(final_n_diseases, os.path.join(embeddings_dir, "nctid2n_diseases.pkl"))