[dee452]: / Data_processing / target_embeddings_ELMo.ipynb

Download this file

1 lines (1 with data), 31.8 kB

{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"target_embeddings_ELMo.ipynb","provenance":[{"file_id":"1Ubkqn46rUex1huZFXQrlIIQ3UjFPJIFr","timestamp":1651450774467}],"machine_shape":"hm","collapsed_sections":[]},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"},"accelerator":"GPU"},"cells":[{"cell_type":"code","execution_count":1,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"ApYPcWiSvA_7","executionInfo":{"status":"ok","timestamp":1651477269214,"user_tz":240,"elapsed":15158,"user":{"displayName":"Aishwarya Chander","userId":"08767132026600463397"}},"outputId":"37d774dc-5d1f-4bf5-aeba-cadb8bcdd0c7"},"outputs":[{"output_type":"stream","name":"stdout","text":["Mounted at /content/gdrive\n"]}],"source":["from google.colab import drive\n","drive.mount('/content/gdrive')"]},{"cell_type":"code","source":["# Directory to the project folder\n","\n","deep_learning_dir = '/content/gdrive/MyDrive/BMI 707 Project/target_embeddings' "],"metadata":{"id":"5yyI3tpw4FJC","executionInfo":{"status":"ok","timestamp":1651477269215,"user_tz":240,"elapsed":3,"user":{"displayName":"Aishwarya Chander","userId":"08767132026600463397"}}},"execution_count":2,"outputs":[]},{"cell_type":"code","source":["# Import packages\n","\n","import os\n","import torch\n","import pickle\n","import numpy as np\n","import pandas as pd\n","from tqdm import tqdm"],"metadata":{"id":"1rbFpMRUvCoP","executionInfo":{"status":"ok","timestamp":1651477272790,"user_tz":240,"elapsed":3578,"user":{"displayName":"Aishwarya Chander","userId":"08767132026600463397"}}},"execution_count":3,"outputs":[]},{"cell_type":"code","source":["# Install ELMo embedder\n","# Reference: https://pypi.org/project/simple-elmo/\n","\n","!pip install --upgrade simple_elmo"],"metadata":{"id":"yjBkQv40vIN6"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["from simple_elmo import ElmoModel\n","\n","# Load model weights to ELMo\n","model_dir = deep_learning_dir + '/uniref50_v2'\n","\n","model = ElmoModel()\n","model.load(model_dir)"],"metadata":{"id":"w5qXyI8ovKqN"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["def seq2emb(sequence, model):\n","  '''\n","  Retrieving an embedding from a sequence using ELMO model.\n","\n","  Input: \n","    sequence (str): string with sequence\n","    model (ElmoModel): elmo model with preloaded weights\n","\n","  Returns:\n","    emb (tensor): tensor with shape ([1024]) with embedding of protein.\n","  '''\n","  emb = torch.tensor(model.get_elmo_vectors(sequence))\n","  emb = torch.tensor(emb).sum(dim=0).mean(dim=0) \n","  return emb"],"metadata":{"id":"O4vnlDSrWhHZ"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["## Target Dataset Processing:\n"],"metadata":{"id":"QeU_qW9WsV8P"}},{"cell_type":"code","source":["targets_df"],"metadata":{"id":"GabQeu8SuCiy","executionInfo":{"status":"ok","timestamp":1651477328704,"user_tz":240,"elapsed":145,"user":{"displayName":"Aishwarya Chander","userId":"08767132026600463397"}},"outputId":"4bfa5304-d561-45a9-fe05-93f9726703d6","colab":{"base_uri":"https://localhost:8080/","height":424}},"execution_count":6,"outputs":[{"output_type":"execute_result","data":{"text/plain":["       Unnamed: 0 drugbank_id  target_id  \\\n","0               0     DB00001  BE0000048   \n","1               1     DB00002  BE0000767   \n","2               1     DB00002  BE0000901   \n","3               1     DB00002  BE0002093   \n","4               1     DB00002  BE0002094   \n","...           ...         ...        ...   \n","19201       13445     DB15569  BE0004071   \n","19202       13446     DB15570  BE0009787   \n","19203       13468     DB15593  BE0009794   \n","19204       13469     DB15594  BE0009797   \n","19205       13474     DB15599  BE0003417   \n","\n","                                         target_sequence  \n","0      MAHVRGLQLPGCLALAALCSLVHSQHVFLAPQQARSLLQRVRRANT...  \n","1      MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...  \n","2      MWQLLLPTALLLLVSAGMRTEDLPKAVVFLEPQWYSVLEKDSVTLK...  \n","3      MWLLYLLVPALFCRAGGSIPIPQKLFGEVTSPLFPKPYPNNFETTT...  \n","4      MEGPRGWLVLCVLAISLASMVTEDLCRAPDGKKGEAGRPGRRGRPG...  \n","...                                                  ...  \n","19201  MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVI...  \n","19202  MNPTDIADTTLDESIYSNYYLYESIPKPCTKEGIKAFGELFLPPLY...  \n","19203  MLWWEEVEDCYEREDVQKKTFTKWVNAQFSKFGKQHIENLFSDLQD...  \n","19204                                               none  \n","19205  MWGLKVLLLPVVSFALYPEEILDTHWELWKKTHRKQYNNKVDEISR...  \n","\n","[19206 rows x 4 columns]"],"text/html":["\n","  <div id=\"df-e78cd4af-b218-4afc-901e-cbda4b1ffebb\">\n","    <div class=\"colab-df-container\">\n","      <div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>Unnamed: 0</th>\n","      <th>drugbank_id</th>\n","      <th>target_id</th>\n","      <th>target_sequence</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>0</td>\n","      <td>DB00001</td>\n","      <td>BE0000048</td>\n","      <td>MAHVRGLQLPGCLALAALCSLVHSQHVFLAPQQARSLLQRVRRANT...</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>1</td>\n","      <td>DB00002</td>\n","      <td>BE0000767</td>\n","      <td>MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>1</td>\n","      <td>DB00002</td>\n","      <td>BE0000901</td>\n","      <td>MWQLLLPTALLLLVSAGMRTEDLPKAVVFLEPQWYSVLEKDSVTLK...</td>\n","    </tr>\n","    <tr>\n","      <th>3</th>\n","      <td>1</td>\n","      <td>DB00002</td>\n","      <td>BE0002093</td>\n","      <td>MWLLYLLVPALFCRAGGSIPIPQKLFGEVTSPLFPKPYPNNFETTT...</td>\n","    </tr>\n","    <tr>\n","      <th>4</th>\n","      <td>1</td>\n","      <td>DB00002</td>\n","      <td>BE0002094</td>\n","      <td>MEGPRGWLVLCVLAISLASMVTEDLCRAPDGKKGEAGRPGRRGRPG...</td>\n","    </tr>\n","    <tr>\n","      <th>...</th>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","    </tr>\n","    <tr>\n","      <th>19201</th>\n","      <td>13445</td>\n","      <td>DB15569</td>\n","      <td>BE0004071</td>\n","      <td>MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVI...</td>\n","    </tr>\n","    <tr>\n","      <th>19202</th>\n","      <td>13446</td>\n","      <td>DB15570</td>\n","      <td>BE0009787</td>\n","      <td>MNPTDIADTTLDESIYSNYYLYESIPKPCTKEGIKAFGELFLPPLY...</td>\n","    </tr>\n","    <tr>\n","      <th>19203</th>\n","      <td>13468</td>\n","      <td>DB15593</td>\n","      <td>BE0009794</td>\n","      <td>MLWWEEVEDCYEREDVQKKTFTKWVNAQFSKFGKQHIENLFSDLQD...</td>\n","    </tr>\n","    <tr>\n","      <th>19204</th>\n","      <td>13469</td>\n","      <td>DB15594</td>\n","      <td>BE0009797</td>\n","      <td>none</td>\n","    </tr>\n","    <tr>\n","      <th>19205</th>\n","      <td>13474</td>\n","      <td>DB15599</td>\n","      <td>BE0003417</td>\n","      <td>MWGLKVLLLPVVSFALYPEEILDTHWELWKKTHRKQYNNKVDEISR...</td>\n","    </tr>\n","  </tbody>\n","</table>\n","<p>19206 rows × 4 columns</p>\n","</div>\n","      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-e78cd4af-b218-4afc-901e-cbda4b1ffebb')\"\n","              title=\"Convert this dataframe to an interactive table.\"\n","              style=\"display:none;\">\n","        \n","  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n","       width=\"24px\">\n","    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n","    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n","  </svg>\n","      </button>\n","      \n","  <style>\n","    .colab-df-container {\n","      display:flex;\n","      flex-wrap:wrap;\n","      gap: 12px;\n","    }\n","\n","    .colab-df-convert {\n","      background-color: #E8F0FE;\n","      border: none;\n","      border-radius: 50%;\n","      cursor: pointer;\n","      display: none;\n","      fill: #1967D2;\n","      height: 32px;\n","      padding: 0 0 0 0;\n","      width: 32px;\n","    }\n","\n","    .colab-df-convert:hover {\n","      background-color: #E2EBFA;\n","      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n","      fill: #174EA6;\n","    }\n","\n","    [theme=dark] .colab-df-convert {\n","      background-color: #3B4455;\n","      fill: #D2E3FC;\n","    }\n","\n","    [theme=dark] .colab-df-convert:hover {\n","      background-color: #434B5C;\n","      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n","      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n","      fill: #FFFFFF;\n","    }\n","  </style>\n","\n","      <script>\n","        const buttonEl =\n","          document.querySelector('#df-e78cd4af-b218-4afc-901e-cbda4b1ffebb button.colab-df-convert');\n","        buttonEl.style.display =\n","          google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n","        async function convertToInteractive(key) {\n","          const element = document.querySelector('#df-e78cd4af-b218-4afc-901e-cbda4b1ffebb');\n","          const dataTable =\n","            await google.colab.kernel.invokeFunction('convertToInteractive',\n","                                                     [key], {});\n","          if (!dataTable) return;\n","\n","          const docLinkHtml = 'Like what you see? Visit the ' +\n","            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n","            + ' to learn more about interactive tables.';\n","          element.innerHTML = '';\n","          dataTable['output_type'] = 'display_data';\n","          await google.colab.output.renderOutput(dataTable, element);\n","          const docLink = document.createElement('div');\n","          docLink.innerHTML = docLinkHtml;\n","          element.appendChild(docLink);\n","        }\n","      </script>\n","    </div>\n","  </div>\n","  "]},"metadata":{},"execution_count":6}]},{"cell_type":"code","source":["targets_df = pd.read_csv(\"/content/gdrive/MyDrive/BMI 707 Project/embeddings/target_embeddings/target_mappings.csv\")\n","\n","# Clean and sort data\n","clean_targets_df = targets_df[targets_df.target_sequence != 'none'] # remove entries with values 'none'\n","clean_targets_df = clean_targets_df.drop_duplicates(subset = [\"target_id\"]) # drop duplicates \n","clean_targets_df = clean_targets_df.sort_values(by=\"target_sequence\", key=lambda x: x.str.len()) # sort by sequence length\n","clean_targets_df"],"metadata":{"id":"vl9Qa2o03HUJ","executionInfo":{"status":"ok","timestamp":1651477316143,"user_tz":240,"elapsed":687,"user":{"displayName":"Aishwarya Chander","userId":"08767132026600463397"}},"colab":{"base_uri":"https://localhost:8080/","height":424},"outputId":"5f040371-c42c-4770-ef83-a90b61812d99"},"execution_count":5,"outputs":[{"output_type":"execute_result","data":{"text/plain":["       Unnamed: 0 drugbank_id  target_id  \\\n","2023          358     DB00370  BE0005582   \n","7754         2251     DB02379  BE0002017   \n","3041          604     DB00619  BE0001104   \n","12279        4870     DB05194  BE0002482   \n","11351        4195     DB04464  BE0004086   \n","...           ...         ...        ...   \n","1356          190     DB00201  BE0000739   \n","5850         1349     DB01411  BE0001034   \n","12064        4665     DB04959  BE0002460   \n","13987        6395     DB07293  BE0003801   \n","12067        4670     DB04964  BE0002439   \n","\n","                                         target_sequence  \n","2023                                         MLLWVQQALLA  \n","7754                                      AGVPFNTKTPYGPT  \n","3041                             GEGDVRCRGAASAVAAAAAAARQ  \n","12279                          MIWEEFTPEEGKGYREEVLTVKEIT  \n","11351                         MAQDIISTIGDLVKWIIDTVNKFTKK  \n","...                                                  ...  \n","1356   MGDAEGEDEVQFLRTDDEVVLQCSATVLKEQLKLCLAAEGFGNRLC...  \n","5850   MGLPLARLAAVCLALSLAGGSELQTEGRTRYHGRNVCSTWGNFHYK...  \n","12064  MATSGGEEAAAAAPAPGTPATGADTTPGWEVAVRPLLSASYSAFEM...  \n","13987  MESLVLGVNEKTHVQLSLPVLQVRDVLVRGFGDSVEEALSEAREHL...  \n","12067  MLKPSGLPGSSSPTRSLMTGSRSTKATPEMDSGLTGATLSPKTSTG...  \n","\n","[4809 rows x 4 columns]"],"text/html":["\n","  <div id=\"df-76a6f3ec-7371-4789-897f-0434af3e1bfa\">\n","    <div class=\"colab-df-container\">\n","      <div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>Unnamed: 0</th>\n","      <th>drugbank_id</th>\n","      <th>target_id</th>\n","      <th>target_sequence</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>2023</th>\n","      <td>358</td>\n","      <td>DB00370</td>\n","      <td>BE0005582</td>\n","      <td>MLLWVQQALLA</td>\n","    </tr>\n","    <tr>\n","      <th>7754</th>\n","      <td>2251</td>\n","      <td>DB02379</td>\n","      <td>BE0002017</td>\n","      <td>AGVPFNTKTPYGPT</td>\n","    </tr>\n","    <tr>\n","      <th>3041</th>\n","      <td>604</td>\n","      <td>DB00619</td>\n","      <td>BE0001104</td>\n","      <td>GEGDVRCRGAASAVAAAAAAARQ</td>\n","    </tr>\n","    <tr>\n","      <th>12279</th>\n","      <td>4870</td>\n","      <td>DB05194</td>\n","      <td>BE0002482</td>\n","      <td>MIWEEFTPEEGKGYREEVLTVKEIT</td>\n","    </tr>\n","    <tr>\n","      <th>11351</th>\n","      <td>4195</td>\n","      <td>DB04464</td>\n","      <td>BE0004086</td>\n","      <td>MAQDIISTIGDLVKWIIDTVNKFTKK</td>\n","    </tr>\n","    <tr>\n","      <th>...</th>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","    </tr>\n","    <tr>\n","      <th>1356</th>\n","      <td>190</td>\n","      <td>DB00201</td>\n","      <td>BE0000739</td>\n","      <td>MGDAEGEDEVQFLRTDDEVVLQCSATVLKEQLKLCLAAEGFGNRLC...</td>\n","    </tr>\n","    <tr>\n","      <th>5850</th>\n","      <td>1349</td>\n","      <td>DB01411</td>\n","      <td>BE0001034</td>\n","      <td>MGLPLARLAAVCLALSLAGGSELQTEGRTRYHGRNVCSTWGNFHYK...</td>\n","    </tr>\n","    <tr>\n","      <th>12064</th>\n","      <td>4665</td>\n","      <td>DB04959</td>\n","      <td>BE0002460</td>\n","      <td>MATSGGEEAAAAAPAPGTPATGADTTPGWEVAVRPLLSASYSAFEM...</td>\n","    </tr>\n","    <tr>\n","      <th>13987</th>\n","      <td>6395</td>\n","      <td>DB07293</td>\n","      <td>BE0003801</td>\n","      <td>MESLVLGVNEKTHVQLSLPVLQVRDVLVRGFGDSVEEALSEAREHL...</td>\n","    </tr>\n","    <tr>\n","      <th>12067</th>\n","      <td>4670</td>\n","      <td>DB04964</td>\n","      <td>BE0002439</td>\n","      <td>MLKPSGLPGSSSPTRSLMTGSRSTKATPEMDSGLTGATLSPKTSTG...</td>\n","    </tr>\n","  </tbody>\n","</table>\n","<p>4809 rows × 4 columns</p>\n","</div>\n","      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-76a6f3ec-7371-4789-897f-0434af3e1bfa')\"\n","              title=\"Convert this dataframe to an interactive table.\"\n","              style=\"display:none;\">\n","        \n","  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n","       width=\"24px\">\n","    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n","    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n","  </svg>\n","      </button>\n","      \n","  <style>\n","    .colab-df-container {\n","      display:flex;\n","      flex-wrap:wrap;\n","      gap: 12px;\n","    }\n","\n","    .colab-df-convert {\n","      background-color: #E8F0FE;\n","      border: none;\n","      border-radius: 50%;\n","      cursor: pointer;\n","      display: none;\n","      fill: #1967D2;\n","      height: 32px;\n","      padding: 0 0 0 0;\n","      width: 32px;\n","    }\n","\n","    .colab-df-convert:hover {\n","      background-color: #E2EBFA;\n","      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n","      fill: #174EA6;\n","    }\n","\n","    [theme=dark] .colab-df-convert {\n","      background-color: #3B4455;\n","      fill: #D2E3FC;\n","    }\n","\n","    [theme=dark] .colab-df-convert:hover {\n","      background-color: #434B5C;\n","      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n","      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n","      fill: #FFFFFF;\n","    }\n","  </style>\n","\n","      <script>\n","        const buttonEl =\n","          document.querySelector('#df-76a6f3ec-7371-4789-897f-0434af3e1bfa button.colab-df-convert');\n","        buttonEl.style.display =\n","          google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n","        async function convertToInteractive(key) {\n","          const element = document.querySelector('#df-76a6f3ec-7371-4789-897f-0434af3e1bfa');\n","          const dataTable =\n","            await google.colab.kernel.invokeFunction('convertToInteractive',\n","                                                     [key], {});\n","          if (!dataTable) return;\n","\n","          const docLinkHtml = 'Like what you see? Visit the ' +\n","            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n","            + ' to learn more about interactive tables.';\n","          element.innerHTML = '';\n","          dataTable['output_type'] = 'display_data';\n","          await google.colab.output.renderOutput(dataTable, element);\n","          const docLink = document.createElement('div');\n","          docLink.innerHTML = docLinkHtml;\n","          element.appendChild(docLink);\n","        }\n","      </script>\n","    </div>\n","  </div>\n","  "]},"metadata":{},"execution_count":5}]},{"cell_type":"code","source":["# Generate target embeddings as a list\n","\n","target_embeddings = []\n","\n","for emb in tqdm(clean_targets_df['target_sequence']):\n","  one_emb = seq2emb(emb, model) # use seq2emb function defined above\n","  target_embeddings.append(one_emb)\n","\n","print('Done!')"],"metadata":{"id":"Oy2YR-Pj3igD"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# Convert list to array for the dataframe (readability) \n","\n","embs_array = []\n","for i in target_embeddings:\n","  a = i.numpy()\n","  embs_array.append(a)\n","\n","# Add embeddings to dataframe\n","clean_targets_df['embeddings'] = embs_array"],"metadata":{"id":"yzzRmNWCmXr8"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# Export to pickle\n","clean_targets_df.to_pickle('drugbank_target_embedddings.pkl')\n","\n","# Copy to another directory to save file\n","# !cp -r \"/content/drugbank_target_embedddings.pkl\" \"/content/gdrive/MyDrive/BMI 707 Project/embeddings\""],"metadata":{"id":"Ga3ktZ8rmZty"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["## Load and view the final file: "],"metadata":{"id":"hKLsxK-YueKp"}},{"cell_type":"code","source":["# Load and view\n","final_df = pd.read_pickle(deep_learning_dir + \"/drugbank_target_embedddings.pkl\")\n","final_df"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":659},"id":"H7KxaS-3udVg","executionInfo":{"status":"ok","timestamp":1651461559791,"user_tz":240,"elapsed":328,"user":{"displayName":"Aishwarya Chander","userId":"08767132026600463397"}},"outputId":"8e60fda3-1025-4556-c448-dbac88071b63"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["       Unnamed: 0 drugbank_id  target_id  \\\n","2023          358     DB00370  BE0005582   \n","7754         2251     DB02379  BE0002017   \n","3041          604     DB00619  BE0001104   \n","12279        4870     DB05194  BE0002482   \n","11351        4195     DB04464  BE0004086   \n","...           ...         ...        ...   \n","1356          190     DB00201  BE0000739   \n","5850         1349     DB01411  BE0001034   \n","12064        4665     DB04959  BE0002460   \n","13987        6395     DB07293  BE0003801   \n","12067        4670     DB04964  BE0002439   \n","\n","                                         target_sequence  \\\n","2023                                         MLLWVQQALLA   \n","7754                                      AGVPFNTKTPYGPT   \n","3041                             GEGDVRCRGAASAVAAAAAAARQ   \n","12279                          MIWEEFTPEEGKGYREEVLTVKEIT   \n","11351                         MAQDIISTIGDLVKWIIDTVNKFTKK   \n","...                                                  ...   \n","1356   MGDAEGEDEVQFLRTDDEVVLQCSATVLKEQLKLCLAAEGFGNRLC...   \n","5850   MGLPLARLAAVCLALSLAGGSELQTEGRTRYHGRNVCSTWGNFHYK...   \n","12064  MATSGGEEAAAAAPAPGTPATGADTTPGWEVAVRPLLSASYSAFEM...   \n","13987  MESLVLGVNEKTHVQLSLPVLQVRDVLVRGFGDSVEEALSEAREHL...   \n","12067  MLKPSGLPGSSSPTRSLMTGSRSTKATPEMDSGLTGATLSPKTSTG...   \n","\n","                                              embeddings  \n","2023   [7.6123374360613525, 7.972399162128568, 1.5261...  \n","7754   [7.6123374360613525, 7.972399162128568, 1.5261...  \n","3041   [7.6123374360613525, 7.972399162128568, 1.5261...  \n","12279  [7.6123374360613525, 7.972399162128568, 1.5261...  \n","11351  [7.6123374360613525, 7.972399162128568, 1.5261...  \n","...                                                  ...  \n","1356   [387.6335365786217, 532.5416541863233, 167.123...  \n","5850   [444.92687717976514, 738.1280948370695, -60.24...  \n","12064  [392.5545535045676, 573.7628478072584, 160.352...  \n","13987  [530.1488111329963, 766.3823193386197, 209.408...  \n","12067  [1266.8234119648114, 1881.3945757914335, -7.99...  \n","\n","[4809 rows x 5 columns]"],"text/html":["\n","  <div id=\"df-e3c328fd-cb0d-47ec-9bc7-6fa4ec08ba5a\">\n","    <div class=\"colab-df-container\">\n","      <div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>Unnamed: 0</th>\n","      <th>drugbank_id</th>\n","      <th>target_id</th>\n","      <th>target_sequence</th>\n","      <th>embeddings</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>2023</th>\n","      <td>358</td>\n","      <td>DB00370</td>\n","      <td>BE0005582</td>\n","      <td>MLLWVQQALLA</td>\n","      <td>[7.6123374360613525, 7.972399162128568, 1.5261...</td>\n","    </tr>\n","    <tr>\n","      <th>7754</th>\n","      <td>2251</td>\n","      <td>DB02379</td>\n","      <td>BE0002017</td>\n","      <td>AGVPFNTKTPYGPT</td>\n","      <td>[7.6123374360613525, 7.972399162128568, 1.5261...</td>\n","    </tr>\n","    <tr>\n","      <th>3041</th>\n","      <td>604</td>\n","      <td>DB00619</td>\n","      <td>BE0001104</td>\n","      <td>GEGDVRCRGAASAVAAAAAAARQ</td>\n","      <td>[7.6123374360613525, 7.972399162128568, 1.5261...</td>\n","    </tr>\n","    <tr>\n","      <th>12279</th>\n","      <td>4870</td>\n","      <td>DB05194</td>\n","      <td>BE0002482</td>\n","      <td>MIWEEFTPEEGKGYREEVLTVKEIT</td>\n","      <td>[7.6123374360613525, 7.972399162128568, 1.5261...</td>\n","    </tr>\n","    <tr>\n","      <th>11351</th>\n","      <td>4195</td>\n","      <td>DB04464</td>\n","      <td>BE0004086</td>\n","      <td>MAQDIISTIGDLVKWIIDTVNKFTKK</td>\n","      <td>[7.6123374360613525, 7.972399162128568, 1.5261...</td>\n","    </tr>\n","    <tr>\n","      <th>...</th>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","    </tr>\n","    <tr>\n","      <th>1356</th>\n","      <td>190</td>\n","      <td>DB00201</td>\n","      <td>BE0000739</td>\n","      <td>MGDAEGEDEVQFLRTDDEVVLQCSATVLKEQLKLCLAAEGFGNRLC...</td>\n","      <td>[387.6335365786217, 532.5416541863233, 167.123...</td>\n","    </tr>\n","    <tr>\n","      <th>5850</th>\n","      <td>1349</td>\n","      <td>DB01411</td>\n","      <td>BE0001034</td>\n","      <td>MGLPLARLAAVCLALSLAGGSELQTEGRTRYHGRNVCSTWGNFHYK...</td>\n","      <td>[444.92687717976514, 738.1280948370695, -60.24...</td>\n","    </tr>\n","    <tr>\n","      <th>12064</th>\n","      <td>4665</td>\n","      <td>DB04959</td>\n","      <td>BE0002460</td>\n","      <td>MATSGGEEAAAAAPAPGTPATGADTTPGWEVAVRPLLSASYSAFEM...</td>\n","      <td>[392.5545535045676, 573.7628478072584, 160.352...</td>\n","    </tr>\n","    <tr>\n","      <th>13987</th>\n","      <td>6395</td>\n","      <td>DB07293</td>\n","      <td>BE0003801</td>\n","      <td>MESLVLGVNEKTHVQLSLPVLQVRDVLVRGFGDSVEEALSEAREHL...</td>\n","      <td>[530.1488111329963, 766.3823193386197, 209.408...</td>\n","    </tr>\n","    <tr>\n","      <th>12067</th>\n","      <td>4670</td>\n","      <td>DB04964</td>\n","      <td>BE0002439</td>\n","      <td>MLKPSGLPGSSSPTRSLMTGSRSTKATPEMDSGLTGATLSPKTSTG...</td>\n","      <td>[1266.8234119648114, 1881.3945757914335, -7.99...</td>\n","    </tr>\n","  </tbody>\n","</table>\n","<p>4809 rows × 5 columns</p>\n","</div>\n","      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-e3c328fd-cb0d-47ec-9bc7-6fa4ec08ba5a')\"\n","              title=\"Convert this dataframe to an interactive table.\"\n","              style=\"display:none;\">\n","        \n","  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n","       width=\"24px\">\n","    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n","    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n","  </svg>\n","      </button>\n","      \n","  <style>\n","    .colab-df-container {\n","      display:flex;\n","      flex-wrap:wrap;\n","      gap: 12px;\n","    }\n","\n","    .colab-df-convert {\n","      background-color: #E8F0FE;\n","      border: none;\n","      border-radius: 50%;\n","      cursor: pointer;\n","      display: none;\n","      fill: #1967D2;\n","      height: 32px;\n","      padding: 0 0 0 0;\n","      width: 32px;\n","    }\n","\n","    .colab-df-convert:hover {\n","      background-color: #E2EBFA;\n","      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n","      fill: #174EA6;\n","    }\n","\n","    [theme=dark] .colab-df-convert {\n","      background-color: #3B4455;\n","      fill: #D2E3FC;\n","    }\n","\n","    [theme=dark] .colab-df-convert:hover {\n","      background-color: #434B5C;\n","      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n","      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n","      fill: #FFFFFF;\n","    }\n","  </style>\n","\n","      <script>\n","        const buttonEl =\n","          document.querySelector('#df-e3c328fd-cb0d-47ec-9bc7-6fa4ec08ba5a button.colab-df-convert');\n","        buttonEl.style.display =\n","          google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n","        async function convertToInteractive(key) {\n","          const element = document.querySelector('#df-e3c328fd-cb0d-47ec-9bc7-6fa4ec08ba5a');\n","          const dataTable =\n","            await google.colab.kernel.invokeFunction('convertToInteractive',\n","                                                     [key], {});\n","          if (!dataTable) return;\n","\n","          const docLinkHtml = 'Like what you see? Visit the ' +\n","            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n","            + ' to learn more about interactive tables.';\n","          element.innerHTML = '';\n","          dataTable['output_type'] = 'display_data';\n","          await google.colab.output.renderOutput(dataTable, element);\n","          const docLink = document.createElement('div');\n","          docLink.innerHTML = docLinkHtml;\n","          element.appendChild(docLink);\n","        }\n","      </script>\n","    </div>\n","  </div>\n","  "]},"metadata":{},"execution_count":49}]},{"cell_type":"code","source":["# View one example\n","print(\"Sample Target Embedding\\nTarget ID:\", final_df['target_id'][1], \n","      \"\\nTarget Embedding:\", final_df['embeddings'][1], \n","      \"\\nTarget Sequence:\", final_df['target_sequence'][1])"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"oG2cdxVZxt_Z","executionInfo":{"status":"ok","timestamp":1651461769641,"user_tz":240,"elapsed":4,"user":{"displayName":"Aishwarya Chander","userId":"08767132026600463397"}},"outputId":"cf5a3377-ba79-4ece-ed2f-6017b0bc27d3"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Sample Target Embedding\n","Target ID: BE0000767 \n","Target Embedding: [ 91.97858468 125.20936934  41.84956151 ...  45.29450765 -88.95176846\n"," -41.53821779] \n","Target Sequence: MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFEDHFLSLQRMFNNCEVVLGNLEITYVQRNYDLSFLKTIQEVAGYVLIALNTVERIPLENLQIIRGNMYYENSYALAVLSNYDANKTGLKELPMRNLQEILHGAVRFSNNPALCNVESIQWRDIVSSDFLSNMSMDFQNHLGSCQKCDPSCPNGSCWGAGEENCQKLTKIICAQQCSGRCRGKSPSDCCHNQCAAGCTGPRESDCLVCRKFRDEATCKDTCPPLMLYNPTTYQMDVNPEGKYSFGATCVKKCPRNYVVTDHGSCVRACGADSYEMEEDGVRKCKKCEGPCRKVCNGIGIGEFKDSLSINATNIKHFKNCTSISGDLHILPVAFRGDSFTHTPPLDPQELDILKTVKEITGFLLIQAWPENRTDLHAFENLEIIRGRTKQHGQFSLAVVSLNITSLGLRSLKEISDGDVIISGNKNLCYANTINWKKLFGTSGQKTKIISNRGENSCKATGQVCHALCSPEGCWGPEPRDCVSCRNVSRGRECVDKCNLLEGEPREFVENSECIQCHPECLPQAMNITCTGRGPDNCIQCAHYIDGPHCVKTCPAGVMGENNTLVWKYADAGHVCHLCHPNCTYGCTGPGLEGCPTNGPKIPSIATGMVGALLLLLVVALGIGLFMRRRHIVRKRTLRRLLQERELVEPLTPSGEAPNQALLRILKETEFKKIKVLGSGAFGTVYKGLWIPEGEKVKIPVAIKELREATSPKANKEILDEAYVMASVDNPHVCRLLGICLTSTVQLITQLMPFGCLLDYVREHKDNIGSQYLLNWCVQIAKGMNYLEDRRLVHRDLAARNVLVKTPQHVKITDFGLAKLLGAEEKEYHAEGGKVPIKWMALESILHRIYTHQSDVWSYGVTVWELMTFGSKPYDGIPASEISSILEKGERLPQPPICTIDVYMIMVKCWMIDADSRPKFRELIIEFSKMARDPQRYLVIQGDERMHLPSPTDSNFYRALMDEEDMDDVVDADEYLIPQQGFFSSPSTSRTPLLSSLSATSNNSTVACIDRNGLQSCPIKEDSFLQRYSSDPTGALTEDSIDDTFLPVPEYINQSVPKRPAGSVQNPVYHNQPLNPAPSRDPHYQDPHSTAVGNPEYLNTVQPTCVNSTFDSPAHWAQKGSHQISLDNPDYQQDFFPKEAKPNGIFKGSTAENAEYLRVAPQSSEFIGA\n"]}]}]}