1 lines (1 with data), 31.8 kB
{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"target_embeddings_ELMo.ipynb","provenance":[{"file_id":"1Ubkqn46rUex1huZFXQrlIIQ3UjFPJIFr","timestamp":1651450774467}],"machine_shape":"hm","collapsed_sections":[]},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"},"accelerator":"GPU"},"cells":[{"cell_type":"code","execution_count":1,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"ApYPcWiSvA_7","executionInfo":{"status":"ok","timestamp":1651477269214,"user_tz":240,"elapsed":15158,"user":{"displayName":"Aishwarya Chander","userId":"08767132026600463397"}},"outputId":"37d774dc-5d1f-4bf5-aeba-cadb8bcdd0c7"},"outputs":[{"output_type":"stream","name":"stdout","text":["Mounted at /content/gdrive\n"]}],"source":["from google.colab import drive\n","drive.mount('/content/gdrive')"]},{"cell_type":"code","source":["# Directory to the project folder\n","\n","deep_learning_dir = '/content/gdrive/MyDrive/BMI 707 Project/target_embeddings' "],"metadata":{"id":"5yyI3tpw4FJC","executionInfo":{"status":"ok","timestamp":1651477269215,"user_tz":240,"elapsed":3,"user":{"displayName":"Aishwarya Chander","userId":"08767132026600463397"}}},"execution_count":2,"outputs":[]},{"cell_type":"code","source":["# Import packages\n","\n","import os\n","import torch\n","import pickle\n","import numpy as np\n","import pandas as pd\n","from tqdm import tqdm"],"metadata":{"id":"1rbFpMRUvCoP","executionInfo":{"status":"ok","timestamp":1651477272790,"user_tz":240,"elapsed":3578,"user":{"displayName":"Aishwarya Chander","userId":"08767132026600463397"}}},"execution_count":3,"outputs":[]},{"cell_type":"code","source":["# Install ELMo embedder\n","# Reference: https://pypi.org/project/simple-elmo/\n","\n","!pip install --upgrade simple_elmo"],"metadata":{"id":"yjBkQv40vIN6"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["from simple_elmo import ElmoModel\n","\n","# Load model weights to ELMo\n","model_dir = deep_learning_dir + '/uniref50_v2'\n","\n","model = ElmoModel()\n","model.load(model_dir)"],"metadata":{"id":"w5qXyI8ovKqN"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["def seq2emb(sequence, model):\n"," '''\n"," Retrieving an embedding from a sequence using ELMO model.\n","\n"," Input: \n"," sequence (str): string with sequence\n"," model (ElmoModel): elmo model with preloaded weights\n","\n"," Returns:\n"," emb (tensor): tensor with shape ([1024]) with embedding of protein.\n"," '''\n"," emb = torch.tensor(model.get_elmo_vectors(sequence))\n"," emb = torch.tensor(emb).sum(dim=0).mean(dim=0) \n"," return emb"],"metadata":{"id":"O4vnlDSrWhHZ"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["## Target Dataset Processing:\n"],"metadata":{"id":"QeU_qW9WsV8P"}},{"cell_type":"code","source":["targets_df"],"metadata":{"id":"GabQeu8SuCiy","executionInfo":{"status":"ok","timestamp":1651477328704,"user_tz":240,"elapsed":145,"user":{"displayName":"Aishwarya Chander","userId":"08767132026600463397"}},"outputId":"4bfa5304-d561-45a9-fe05-93f9726703d6","colab":{"base_uri":"https://localhost:8080/","height":424}},"execution_count":6,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" Unnamed: 0 drugbank_id target_id \\\n","0 0 DB00001 BE0000048 \n","1 1 DB00002 BE0000767 \n","2 1 DB00002 BE0000901 \n","3 1 DB00002 BE0002093 \n","4 1 DB00002 BE0002094 \n","... ... ... ... \n","19201 13445 DB15569 BE0004071 \n","19202 13446 DB15570 BE0009787 \n","19203 13468 DB15593 BE0009794 \n","19204 13469 DB15594 BE0009797 \n","19205 13474 DB15599 BE0003417 \n","\n"," target_sequence \n","0 MAHVRGLQLPGCLALAALCSLVHSQHVFLAPQQARSLLQRVRRANT... \n","1 MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED... \n","2 MWQLLLPTALLLLVSAGMRTEDLPKAVVFLEPQWYSVLEKDSVTLK... \n","3 MWLLYLLVPALFCRAGGSIPIPQKLFGEVTSPLFPKPYPNNFETTT... \n","4 MEGPRGWLVLCVLAISLASMVTEDLCRAPDGKKGEAGRPGRRGRPG... \n","... ... \n","19201 MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVI... \n","19202 MNPTDIADTTLDESIYSNYYLYESIPKPCTKEGIKAFGELFLPPLY... \n","19203 MLWWEEVEDCYEREDVQKKTFTKWVNAQFSKFGKQHIENLFSDLQD... \n","19204 none \n","19205 MWGLKVLLLPVVSFALYPEEILDTHWELWKKTHRKQYNNKVDEISR... \n","\n","[19206 rows x 4 columns]"],"text/html":["\n"," <div id=\"df-e78cd4af-b218-4afc-901e-cbda4b1ffebb\">\n"," <div class=\"colab-df-container\">\n"," <div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>Unnamed: 0</th>\n"," <th>drugbank_id</th>\n"," <th>target_id</th>\n"," <th>target_sequence</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>0</td>\n"," <td>DB00001</td>\n"," <td>BE0000048</td>\n"," <td>MAHVRGLQLPGCLALAALCSLVHSQHVFLAPQQARSLLQRVRRANT...</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>1</td>\n"," <td>DB00002</td>\n"," <td>BE0000767</td>\n"," <td>MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>1</td>\n"," <td>DB00002</td>\n"," <td>BE0000901</td>\n"," <td>MWQLLLPTALLLLVSAGMRTEDLPKAVVFLEPQWYSVLEKDSVTLK...</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>1</td>\n"," <td>DB00002</td>\n"," <td>BE0002093</td>\n"," <td>MWLLYLLVPALFCRAGGSIPIPQKLFGEVTSPLFPKPYPNNFETTT...</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>1</td>\n"," <td>DB00002</td>\n"," <td>BE0002094</td>\n"," <td>MEGPRGWLVLCVLAISLASMVTEDLCRAPDGKKGEAGRPGRRGRPG...</td>\n"," </tr>\n"," <tr>\n"," <th>...</th>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," </tr>\n"," <tr>\n"," <th>19201</th>\n"," <td>13445</td>\n"," <td>DB15569</td>\n"," <td>BE0004071</td>\n"," <td>MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVI...</td>\n"," </tr>\n"," <tr>\n"," <th>19202</th>\n"," <td>13446</td>\n"," <td>DB15570</td>\n"," <td>BE0009787</td>\n"," <td>MNPTDIADTTLDESIYSNYYLYESIPKPCTKEGIKAFGELFLPPLY...</td>\n"," </tr>\n"," <tr>\n"," <th>19203</th>\n"," <td>13468</td>\n"," <td>DB15593</td>\n"," <td>BE0009794</td>\n"," <td>MLWWEEVEDCYEREDVQKKTFTKWVNAQFSKFGKQHIENLFSDLQD...</td>\n"," </tr>\n"," <tr>\n"," <th>19204</th>\n"," <td>13469</td>\n"," <td>DB15594</td>\n"," <td>BE0009797</td>\n"," <td>none</td>\n"," </tr>\n"," <tr>\n"," <th>19205</th>\n"," <td>13474</td>\n"," <td>DB15599</td>\n"," <td>BE0003417</td>\n"," <td>MWGLKVLLLPVVSFALYPEEILDTHWELWKKTHRKQYNNKVDEISR...</td>\n"," </tr>\n"," </tbody>\n","</table>\n","<p>19206 rows × 4 columns</p>\n","</div>\n"," <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-e78cd4af-b218-4afc-901e-cbda4b1ffebb')\"\n"," title=\"Convert this dataframe to an interactive table.\"\n"," style=\"display:none;\">\n"," \n"," <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n"," width=\"24px\">\n"," <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n"," <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n"," </svg>\n"," </button>\n"," \n"," <style>\n"," .colab-df-container {\n"," display:flex;\n"," flex-wrap:wrap;\n"," gap: 12px;\n"," }\n","\n"," .colab-df-convert {\n"," background-color: #E8F0FE;\n"," border: none;\n"," border-radius: 50%;\n"," cursor: pointer;\n"," display: none;\n"," fill: #1967D2;\n"," height: 32px;\n"," padding: 0 0 0 0;\n"," width: 32px;\n"," }\n","\n"," .colab-df-convert:hover {\n"," background-color: #E2EBFA;\n"," box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n"," fill: #174EA6;\n"," }\n","\n"," [theme=dark] .colab-df-convert {\n"," background-color: #3B4455;\n"," fill: #D2E3FC;\n"," }\n","\n"," [theme=dark] .colab-df-convert:hover {\n"," background-color: #434B5C;\n"," box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n"," filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n"," fill: #FFFFFF;\n"," }\n"," </style>\n","\n"," <script>\n"," const buttonEl =\n"," document.querySelector('#df-e78cd4af-b218-4afc-901e-cbda4b1ffebb button.colab-df-convert');\n"," buttonEl.style.display =\n"," google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n"," async function convertToInteractive(key) {\n"," const element = document.querySelector('#df-e78cd4af-b218-4afc-901e-cbda4b1ffebb');\n"," const dataTable =\n"," await google.colab.kernel.invokeFunction('convertToInteractive',\n"," [key], {});\n"," if (!dataTable) return;\n","\n"," const docLinkHtml = 'Like what you see? Visit the ' +\n"," '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n"," + ' to learn more about interactive tables.';\n"," element.innerHTML = '';\n"," dataTable['output_type'] = 'display_data';\n"," await google.colab.output.renderOutput(dataTable, element);\n"," const docLink = document.createElement('div');\n"," docLink.innerHTML = docLinkHtml;\n"," element.appendChild(docLink);\n"," }\n"," </script>\n"," </div>\n"," </div>\n"," "]},"metadata":{},"execution_count":6}]},{"cell_type":"code","source":["targets_df = pd.read_csv(\"/content/gdrive/MyDrive/BMI 707 Project/embeddings/target_embeddings/target_mappings.csv\")\n","\n","# Clean and sort data\n","clean_targets_df = targets_df[targets_df.target_sequence != 'none'] # remove entries with values 'none'\n","clean_targets_df = clean_targets_df.drop_duplicates(subset = [\"target_id\"]) # drop duplicates \n","clean_targets_df = clean_targets_df.sort_values(by=\"target_sequence\", key=lambda x: x.str.len()) # sort by sequence length\n","clean_targets_df"],"metadata":{"id":"vl9Qa2o03HUJ","executionInfo":{"status":"ok","timestamp":1651477316143,"user_tz":240,"elapsed":687,"user":{"displayName":"Aishwarya Chander","userId":"08767132026600463397"}},"colab":{"base_uri":"https://localhost:8080/","height":424},"outputId":"5f040371-c42c-4770-ef83-a90b61812d99"},"execution_count":5,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" Unnamed: 0 drugbank_id target_id \\\n","2023 358 DB00370 BE0005582 \n","7754 2251 DB02379 BE0002017 \n","3041 604 DB00619 BE0001104 \n","12279 4870 DB05194 BE0002482 \n","11351 4195 DB04464 BE0004086 \n","... ... ... ... \n","1356 190 DB00201 BE0000739 \n","5850 1349 DB01411 BE0001034 \n","12064 4665 DB04959 BE0002460 \n","13987 6395 DB07293 BE0003801 \n","12067 4670 DB04964 BE0002439 \n","\n"," target_sequence \n","2023 MLLWVQQALLA \n","7754 AGVPFNTKTPYGPT \n","3041 GEGDVRCRGAASAVAAAAAAARQ \n","12279 MIWEEFTPEEGKGYREEVLTVKEIT \n","11351 MAQDIISTIGDLVKWIIDTVNKFTKK \n","... ... \n","1356 MGDAEGEDEVQFLRTDDEVVLQCSATVLKEQLKLCLAAEGFGNRLC... \n","5850 MGLPLARLAAVCLALSLAGGSELQTEGRTRYHGRNVCSTWGNFHYK... \n","12064 MATSGGEEAAAAAPAPGTPATGADTTPGWEVAVRPLLSASYSAFEM... \n","13987 MESLVLGVNEKTHVQLSLPVLQVRDVLVRGFGDSVEEALSEAREHL... \n","12067 MLKPSGLPGSSSPTRSLMTGSRSTKATPEMDSGLTGATLSPKTSTG... \n","\n","[4809 rows x 4 columns]"],"text/html":["\n"," <div id=\"df-76a6f3ec-7371-4789-897f-0434af3e1bfa\">\n"," <div class=\"colab-df-container\">\n"," <div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>Unnamed: 0</th>\n"," <th>drugbank_id</th>\n"," <th>target_id</th>\n"," <th>target_sequence</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>2023</th>\n"," <td>358</td>\n"," <td>DB00370</td>\n"," <td>BE0005582</td>\n"," <td>MLLWVQQALLA</td>\n"," </tr>\n"," <tr>\n"," <th>7754</th>\n"," <td>2251</td>\n"," <td>DB02379</td>\n"," <td>BE0002017</td>\n"," <td>AGVPFNTKTPYGPT</td>\n"," </tr>\n"," <tr>\n"," <th>3041</th>\n"," <td>604</td>\n"," <td>DB00619</td>\n"," <td>BE0001104</td>\n"," <td>GEGDVRCRGAASAVAAAAAAARQ</td>\n"," </tr>\n"," <tr>\n"," <th>12279</th>\n"," <td>4870</td>\n"," <td>DB05194</td>\n"," <td>BE0002482</td>\n"," <td>MIWEEFTPEEGKGYREEVLTVKEIT</td>\n"," </tr>\n"," <tr>\n"," <th>11351</th>\n"," <td>4195</td>\n"," <td>DB04464</td>\n"," <td>BE0004086</td>\n"," <td>MAQDIISTIGDLVKWIIDTVNKFTKK</td>\n"," </tr>\n"," <tr>\n"," <th>...</th>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," </tr>\n"," <tr>\n"," <th>1356</th>\n"," <td>190</td>\n"," <td>DB00201</td>\n"," <td>BE0000739</td>\n"," <td>MGDAEGEDEVQFLRTDDEVVLQCSATVLKEQLKLCLAAEGFGNRLC...</td>\n"," </tr>\n"," <tr>\n"," <th>5850</th>\n"," <td>1349</td>\n"," <td>DB01411</td>\n"," <td>BE0001034</td>\n"," <td>MGLPLARLAAVCLALSLAGGSELQTEGRTRYHGRNVCSTWGNFHYK...</td>\n"," </tr>\n"," <tr>\n"," <th>12064</th>\n"," <td>4665</td>\n"," <td>DB04959</td>\n"," <td>BE0002460</td>\n"," <td>MATSGGEEAAAAAPAPGTPATGADTTPGWEVAVRPLLSASYSAFEM...</td>\n"," </tr>\n"," <tr>\n"," <th>13987</th>\n"," <td>6395</td>\n"," <td>DB07293</td>\n"," <td>BE0003801</td>\n"," <td>MESLVLGVNEKTHVQLSLPVLQVRDVLVRGFGDSVEEALSEAREHL...</td>\n"," </tr>\n"," <tr>\n"," <th>12067</th>\n"," <td>4670</td>\n"," <td>DB04964</td>\n"," <td>BE0002439</td>\n"," <td>MLKPSGLPGSSSPTRSLMTGSRSTKATPEMDSGLTGATLSPKTSTG...</td>\n"," </tr>\n"," </tbody>\n","</table>\n","<p>4809 rows × 4 columns</p>\n","</div>\n"," <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-76a6f3ec-7371-4789-897f-0434af3e1bfa')\"\n"," title=\"Convert this dataframe to an interactive table.\"\n"," style=\"display:none;\">\n"," \n"," <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n"," width=\"24px\">\n"," <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n"," <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n"," </svg>\n"," </button>\n"," \n"," <style>\n"," .colab-df-container {\n"," display:flex;\n"," flex-wrap:wrap;\n"," gap: 12px;\n"," }\n","\n"," .colab-df-convert {\n"," background-color: #E8F0FE;\n"," border: none;\n"," border-radius: 50%;\n"," cursor: pointer;\n"," display: none;\n"," fill: #1967D2;\n"," height: 32px;\n"," padding: 0 0 0 0;\n"," width: 32px;\n"," }\n","\n"," .colab-df-convert:hover {\n"," background-color: #E2EBFA;\n"," box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n"," fill: #174EA6;\n"," }\n","\n"," [theme=dark] .colab-df-convert {\n"," background-color: #3B4455;\n"," fill: #D2E3FC;\n"," }\n","\n"," [theme=dark] .colab-df-convert:hover {\n"," background-color: #434B5C;\n"," box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n"," filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n"," fill: #FFFFFF;\n"," }\n"," </style>\n","\n"," <script>\n"," const buttonEl =\n"," document.querySelector('#df-76a6f3ec-7371-4789-897f-0434af3e1bfa button.colab-df-convert');\n"," buttonEl.style.display =\n"," google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n"," async function convertToInteractive(key) {\n"," const element = document.querySelector('#df-76a6f3ec-7371-4789-897f-0434af3e1bfa');\n"," const dataTable =\n"," await google.colab.kernel.invokeFunction('convertToInteractive',\n"," [key], {});\n"," if (!dataTable) return;\n","\n"," const docLinkHtml = 'Like what you see? Visit the ' +\n"," '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n"," + ' to learn more about interactive tables.';\n"," element.innerHTML = '';\n"," dataTable['output_type'] = 'display_data';\n"," await google.colab.output.renderOutput(dataTable, element);\n"," const docLink = document.createElement('div');\n"," docLink.innerHTML = docLinkHtml;\n"," element.appendChild(docLink);\n"," }\n"," </script>\n"," </div>\n"," </div>\n"," "]},"metadata":{},"execution_count":5}]},{"cell_type":"code","source":["# Generate target embeddings as a list\n","\n","target_embeddings = []\n","\n","for emb in tqdm(clean_targets_df['target_sequence']):\n"," one_emb = seq2emb(emb, model) # use seq2emb function defined above\n"," target_embeddings.append(one_emb)\n","\n","print('Done!')"],"metadata":{"id":"Oy2YR-Pj3igD"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# Convert list to array for the dataframe (readability) \n","\n","embs_array = []\n","for i in target_embeddings:\n"," a = i.numpy()\n"," embs_array.append(a)\n","\n","# Add embeddings to dataframe\n","clean_targets_df['embeddings'] = embs_array"],"metadata":{"id":"yzzRmNWCmXr8"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# Export to pickle\n","clean_targets_df.to_pickle('drugbank_target_embedddings.pkl')\n","\n","# Copy to another directory to save file\n","# !cp -r \"/content/drugbank_target_embedddings.pkl\" \"/content/gdrive/MyDrive/BMI 707 Project/embeddings\""],"metadata":{"id":"Ga3ktZ8rmZty"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["## Load and view the final file: "],"metadata":{"id":"hKLsxK-YueKp"}},{"cell_type":"code","source":["# Load and view\n","final_df = pd.read_pickle(deep_learning_dir + \"/drugbank_target_embedddings.pkl\")\n","final_df"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":659},"id":"H7KxaS-3udVg","executionInfo":{"status":"ok","timestamp":1651461559791,"user_tz":240,"elapsed":328,"user":{"displayName":"Aishwarya Chander","userId":"08767132026600463397"}},"outputId":"8e60fda3-1025-4556-c448-dbac88071b63"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" Unnamed: 0 drugbank_id target_id \\\n","2023 358 DB00370 BE0005582 \n","7754 2251 DB02379 BE0002017 \n","3041 604 DB00619 BE0001104 \n","12279 4870 DB05194 BE0002482 \n","11351 4195 DB04464 BE0004086 \n","... ... ... ... \n","1356 190 DB00201 BE0000739 \n","5850 1349 DB01411 BE0001034 \n","12064 4665 DB04959 BE0002460 \n","13987 6395 DB07293 BE0003801 \n","12067 4670 DB04964 BE0002439 \n","\n"," target_sequence \\\n","2023 MLLWVQQALLA \n","7754 AGVPFNTKTPYGPT \n","3041 GEGDVRCRGAASAVAAAAAAARQ \n","12279 MIWEEFTPEEGKGYREEVLTVKEIT \n","11351 MAQDIISTIGDLVKWIIDTVNKFTKK \n","... ... \n","1356 MGDAEGEDEVQFLRTDDEVVLQCSATVLKEQLKLCLAAEGFGNRLC... \n","5850 MGLPLARLAAVCLALSLAGGSELQTEGRTRYHGRNVCSTWGNFHYK... \n","12064 MATSGGEEAAAAAPAPGTPATGADTTPGWEVAVRPLLSASYSAFEM... \n","13987 MESLVLGVNEKTHVQLSLPVLQVRDVLVRGFGDSVEEALSEAREHL... \n","12067 MLKPSGLPGSSSPTRSLMTGSRSTKATPEMDSGLTGATLSPKTSTG... \n","\n"," embeddings \n","2023 [7.6123374360613525, 7.972399162128568, 1.5261... \n","7754 [7.6123374360613525, 7.972399162128568, 1.5261... \n","3041 [7.6123374360613525, 7.972399162128568, 1.5261... \n","12279 [7.6123374360613525, 7.972399162128568, 1.5261... \n","11351 [7.6123374360613525, 7.972399162128568, 1.5261... \n","... ... \n","1356 [387.6335365786217, 532.5416541863233, 167.123... \n","5850 [444.92687717976514, 738.1280948370695, -60.24... \n","12064 [392.5545535045676, 573.7628478072584, 160.352... \n","13987 [530.1488111329963, 766.3823193386197, 209.408... \n","12067 [1266.8234119648114, 1881.3945757914335, -7.99... \n","\n","[4809 rows x 5 columns]"],"text/html":["\n"," <div id=\"df-e3c328fd-cb0d-47ec-9bc7-6fa4ec08ba5a\">\n"," <div class=\"colab-df-container\">\n"," <div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>Unnamed: 0</th>\n"," <th>drugbank_id</th>\n"," <th>target_id</th>\n"," <th>target_sequence</th>\n"," <th>embeddings</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>2023</th>\n"," <td>358</td>\n"," <td>DB00370</td>\n"," <td>BE0005582</td>\n"," <td>MLLWVQQALLA</td>\n"," <td>[7.6123374360613525, 7.972399162128568, 1.5261...</td>\n"," </tr>\n"," <tr>\n"," <th>7754</th>\n"," <td>2251</td>\n"," <td>DB02379</td>\n"," <td>BE0002017</td>\n"," <td>AGVPFNTKTPYGPT</td>\n"," <td>[7.6123374360613525, 7.972399162128568, 1.5261...</td>\n"," </tr>\n"," <tr>\n"," <th>3041</th>\n"," <td>604</td>\n"," <td>DB00619</td>\n"," <td>BE0001104</td>\n"," <td>GEGDVRCRGAASAVAAAAAAARQ</td>\n"," <td>[7.6123374360613525, 7.972399162128568, 1.5261...</td>\n"," </tr>\n"," <tr>\n"," <th>12279</th>\n"," <td>4870</td>\n"," <td>DB05194</td>\n"," <td>BE0002482</td>\n"," <td>MIWEEFTPEEGKGYREEVLTVKEIT</td>\n"," <td>[7.6123374360613525, 7.972399162128568, 1.5261...</td>\n"," </tr>\n"," <tr>\n"," <th>11351</th>\n"," <td>4195</td>\n"," <td>DB04464</td>\n"," <td>BE0004086</td>\n"," <td>MAQDIISTIGDLVKWIIDTVNKFTKK</td>\n"," <td>[7.6123374360613525, 7.972399162128568, 1.5261...</td>\n"," </tr>\n"," <tr>\n"," <th>...</th>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," </tr>\n"," <tr>\n"," <th>1356</th>\n"," <td>190</td>\n"," <td>DB00201</td>\n"," <td>BE0000739</td>\n"," <td>MGDAEGEDEVQFLRTDDEVVLQCSATVLKEQLKLCLAAEGFGNRLC...</td>\n"," <td>[387.6335365786217, 532.5416541863233, 167.123...</td>\n"," </tr>\n"," <tr>\n"," <th>5850</th>\n"," <td>1349</td>\n"," <td>DB01411</td>\n"," <td>BE0001034</td>\n"," <td>MGLPLARLAAVCLALSLAGGSELQTEGRTRYHGRNVCSTWGNFHYK...</td>\n"," <td>[444.92687717976514, 738.1280948370695, -60.24...</td>\n"," </tr>\n"," <tr>\n"," <th>12064</th>\n"," <td>4665</td>\n"," <td>DB04959</td>\n"," <td>BE0002460</td>\n"," <td>MATSGGEEAAAAAPAPGTPATGADTTPGWEVAVRPLLSASYSAFEM...</td>\n"," <td>[392.5545535045676, 573.7628478072584, 160.352...</td>\n"," </tr>\n"," <tr>\n"," <th>13987</th>\n"," <td>6395</td>\n"," <td>DB07293</td>\n"," <td>BE0003801</td>\n"," <td>MESLVLGVNEKTHVQLSLPVLQVRDVLVRGFGDSVEEALSEAREHL...</td>\n"," <td>[530.1488111329963, 766.3823193386197, 209.408...</td>\n"," </tr>\n"," <tr>\n"," <th>12067</th>\n"," <td>4670</td>\n"," <td>DB04964</td>\n"," <td>BE0002439</td>\n"," <td>MLKPSGLPGSSSPTRSLMTGSRSTKATPEMDSGLTGATLSPKTSTG...</td>\n"," <td>[1266.8234119648114, 1881.3945757914335, -7.99...</td>\n"," </tr>\n"," </tbody>\n","</table>\n","<p>4809 rows × 5 columns</p>\n","</div>\n"," <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-e3c328fd-cb0d-47ec-9bc7-6fa4ec08ba5a')\"\n"," title=\"Convert this dataframe to an interactive table.\"\n"," style=\"display:none;\">\n"," \n"," <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n"," width=\"24px\">\n"," <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n"," <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n"," </svg>\n"," </button>\n"," \n"," <style>\n"," .colab-df-container {\n"," display:flex;\n"," flex-wrap:wrap;\n"," gap: 12px;\n"," }\n","\n"," .colab-df-convert {\n"," background-color: #E8F0FE;\n"," border: none;\n"," border-radius: 50%;\n"," cursor: pointer;\n"," display: none;\n"," fill: #1967D2;\n"," height: 32px;\n"," padding: 0 0 0 0;\n"," width: 32px;\n"," }\n","\n"," .colab-df-convert:hover {\n"," background-color: #E2EBFA;\n"," box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n"," fill: #174EA6;\n"," }\n","\n"," [theme=dark] .colab-df-convert {\n"," background-color: #3B4455;\n"," fill: #D2E3FC;\n"," }\n","\n"," [theme=dark] .colab-df-convert:hover {\n"," background-color: #434B5C;\n"," box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n"," filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n"," fill: #FFFFFF;\n"," }\n"," </style>\n","\n"," <script>\n"," const buttonEl =\n"," document.querySelector('#df-e3c328fd-cb0d-47ec-9bc7-6fa4ec08ba5a button.colab-df-convert');\n"," buttonEl.style.display =\n"," google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n"," async function convertToInteractive(key) {\n"," const element = document.querySelector('#df-e3c328fd-cb0d-47ec-9bc7-6fa4ec08ba5a');\n"," const dataTable =\n"," await google.colab.kernel.invokeFunction('convertToInteractive',\n"," [key], {});\n"," if (!dataTable) return;\n","\n"," const docLinkHtml = 'Like what you see? Visit the ' +\n"," '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n"," + ' to learn more about interactive tables.';\n"," element.innerHTML = '';\n"," dataTable['output_type'] = 'display_data';\n"," await google.colab.output.renderOutput(dataTable, element);\n"," const docLink = document.createElement('div');\n"," docLink.innerHTML = docLinkHtml;\n"," element.appendChild(docLink);\n"," }\n"," </script>\n"," </div>\n"," </div>\n"," "]},"metadata":{},"execution_count":49}]},{"cell_type":"code","source":["# View one example\n","print(\"Sample Target Embedding\\nTarget ID:\", final_df['target_id'][1], \n"," \"\\nTarget Embedding:\", final_df['embeddings'][1], \n"," \"\\nTarget Sequence:\", final_df['target_sequence'][1])"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"oG2cdxVZxt_Z","executionInfo":{"status":"ok","timestamp":1651461769641,"user_tz":240,"elapsed":4,"user":{"displayName":"Aishwarya Chander","userId":"08767132026600463397"}},"outputId":"cf5a3377-ba79-4ece-ed2f-6017b0bc27d3"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Sample Target Embedding\n","Target ID: BE0000767 \n","Target Embedding: [ 91.97858468 125.20936934 41.84956151 ... 45.29450765 -88.95176846\n"," -41.53821779] \n","Target Sequence: MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFEDHFLSLQRMFNNCEVVLGNLEITYVQRNYDLSFLKTIQEVAGYVLIALNTVERIPLENLQIIRGNMYYENSYALAVLSNYDANKTGLKELPMRNLQEILHGAVRFSNNPALCNVESIQWRDIVSSDFLSNMSMDFQNHLGSCQKCDPSCPNGSCWGAGEENCQKLTKIICAQQCSGRCRGKSPSDCCHNQCAAGCTGPRESDCLVCRKFRDEATCKDTCPPLMLYNPTTYQMDVNPEGKYSFGATCVKKCPRNYVVTDHGSCVRACGADSYEMEEDGVRKCKKCEGPCRKVCNGIGIGEFKDSLSINATNIKHFKNCTSISGDLHILPVAFRGDSFTHTPPLDPQELDILKTVKEITGFLLIQAWPENRTDLHAFENLEIIRGRTKQHGQFSLAVVSLNITSLGLRSLKEISDGDVIISGNKNLCYANTINWKKLFGTSGQKTKIISNRGENSCKATGQVCHALCSPEGCWGPEPRDCVSCRNVSRGRECVDKCNLLEGEPREFVENSECIQCHPECLPQAMNITCTGRGPDNCIQCAHYIDGPHCVKTCPAGVMGENNTLVWKYADAGHVCHLCHPNCTYGCTGPGLEGCPTNGPKIPSIATGMVGALLLLLVVALGIGLFMRRRHIVRKRTLRRLLQERELVEPLTPSGEAPNQALLRILKETEFKKIKVLGSGAFGTVYKGLWIPEGEKVKIPVAIKELREATSPKANKEILDEAYVMASVDNPHVCRLLGICLTSTVQLITQLMPFGCLLDYVREHKDNIGSQYLLNWCVQIAKGMNYLEDRRLVHRDLAARNVLVKTPQHVKITDFGLAKLLGAEEKEYHAEGGKVPIKWMALESILHRIYTHQSDVWSYGVTVWELMTFGSKPYDGIPASEISSILEKGERLPQPPICTIDVYMIMVKCWMIDADSRPKFRELIIEFSKMARDPQRYLVIQGDERMHLPSPTDSNFYRALMDEEDMDDVVDADEYLIPQQGFFSSPSTSRTPLLSSLSATSNNSTVACIDRNGLQSCPIKEDSFLQRYSSDPTGALTEDSIDDTFLPVPEYINQSVPKRPAGSVQNPVYHNQPLNPAPSRDPHYQDPHSTAVGNPEYLNTVQPTCVNSTFDSPAHWAQKGSHQISLDNPDYQQDFFPKEAKPNGIFKGSTAENAEYLRVAPQSSEFIGA\n"]}]}]}