[db6163]: / data_prep / construct_kg / build_graph.ipynb

Download this file

4909 lines (4908 with data), 167.3 kB

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:17:42.529843Z",
     "start_time": "2021-08-06T12:17:37.821976Z"
    }
   },
   "outputs": [],
   "source": [
    "from tqdm.notebook import tqdm\n",
    "import re\n",
    "import os \n",
    "import shutil\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import igraph as ig\n",
    "from scipy.sparse import lil_matrix, save_npz\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "import torch\n",
    "from transformers import AutoTokenizer, AutoModel, pipeline\n",
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "#data_path = '../../datasets/'\n",
    "#save_path = data_path +'kg/'\n",
    "data_path = '/n/data1/hms/dbmi/zaklab/emily/rare_disease_dx/data/8.9.21_kg/raw/sources/'\n",
    "save_path = '/n/data1/hms/dbmi/zaklab/emily/rare_disease_dx/data/8.9.21_kg/our_kg/'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "# Read datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:17:42.583094Z",
     "start_time": "2021-08-06T12:17:42.532233Z"
    },
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def assert_dtypes(df): \n",
    "    all_string = True\n",
    "    for i, x in enumerate(df.dtypes.values): \n",
    "        if x != np.dtype('O'): \n",
    "            all_string = False\n",
    "            print(df.columns[i], x)\n",
    "    if not all_string: assert False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:17:58.905940Z",
     "start_time": "2021-08-06T12:17:42.598948Z"
    },
    "hidden": true
   },
   "outputs": [],
   "source": [
    "df_ppi = pd.read_csv(data_path+'ppi/protein_protein.csv', low_memory=False).dropna()\n",
    "df_ppi = df_ppi.astype({'proteinA_entrezid':int}).astype({'proteinA_entrezid':str})\n",
    "df_ppi = df_ppi.astype({'proteinB_entrezid':int}).astype({'proteinB_entrezid':str})\n",
    "assert_dtypes(df_ppi)\n",
    "\n",
    "df_drugbank = pd.read_csv(data_path+'drugbank/drug_protein.csv', low_memory=False)\n",
    "df_drugbank = df_drugbank.get(['DrugBank', 'relation', 'NCBIGeneID','DrugBankName']).dropna()\n",
    "df_drugbank = df_drugbank.astype({'NCBIGeneID':int}).astype({'NCBIGeneID':str})\n",
    "assert_dtypes(df_drugbank)\n",
    "\n",
    "df_disgenet = pd.read_csv(data_path+'disgenet/curated_gene_disease_associations.tsv', sep='\\t', low_memory=False)\n",
    "df_disgenet = df_disgenet.astype({'geneId':int}).astype({'geneId':str})\n",
    "\n",
    "df_mondo_terms = pd.read_csv(data_path+'mondo/mondo_terms.csv', low_memory=False)\n",
    "df_mondo_terms = df_mondo_terms.astype({'id':int}).astype({'id':str})\n",
    "\n",
    "df_mondo_xref = pd.read_csv(data_path+'mondo/mondo_references.csv', low_memory=False)\n",
    "df_mondo_xref = df_mondo_xref.astype({'mondo_id':int}).astype({'mondo_id':str})\n",
    "assert_dtypes(df_mondo_xref)\n",
    "\n",
    "df_mondo_parents = pd.read_csv(data_path+'mondo/mondo_parents.csv', low_memory=False)\n",
    "df_mondo_parents = df_mondo_parents.astype({'parent':int}).astype({'parent':str})\n",
    "df_mondo_parents = df_mondo_parents.astype({'child':int}).astype({'child':str})\n",
    "assert_dtypes(df_mondo_parents)\n",
    "\n",
    "df_drug_central = pd.read_csv(data_path+'drugcentral/drug_disease.csv', low_memory=False)\n",
    "df_drug_central = df_drug_central.get(['cas_reg_no','relationship_name', 'umls_cui']) # 'concept_id', 'concept_name', 'snomed_conceptid'\n",
    "df_drug_central = df_drug_central.query('not @df_drug_central.cas_reg_no.isna()')\n",
    "df_drug_central = df_drug_central.query('not @df_drug_central.umls_cui.isna()')\n",
    "assert_dtypes(df_drug_central)\n",
    "\n",
    "df_ddi = pd.read_csv(data_path+'drugbank/drug_drug.csv', low_memory=False)\n",
    "assert_dtypes(df_ddi)\n",
    "\n",
    "df_hp_terms = pd.read_csv(data_path+'hpo/hp_terms.csv', low_memory=False)\n",
    "df_hp_terms = df_hp_terms.astype({'id':int}).astype({'id':str})\n",
    "\n",
    "df_hp_xref = pd.read_csv(data_path+'hpo/hp_references.csv', low_memory=False)\n",
    "df_hp_xref = df_hp_xref.astype({'hp_id':int}).astype({'hp_id':str})\n",
    "\n",
    "df_hp_parents = pd.read_csv(data_path+'hpo/hp_parents.csv', low_memory=False)\n",
    "df_hp_parents = df_hp_parents.astype({'parent':int}).astype({'parent':str})\n",
    "df_hp_parents = df_hp_parents.astype({'child':int}).astype({'child':str})\n",
    "assert_dtypes(df_hp_parents)\n",
    "\n",
    "df_hpoa_pos = pd.read_csv(data_path+'hpo/disease_phenotype_pos.csv', low_memory=False)\n",
    "df_hpoa_pos = df_hpoa_pos.astype({'hp_id':int}).astype({'hp_id':str})\n",
    "df_hpoa_pos = df_hpoa_pos.astype({'disease_ontology_id':int}).astype({'disease_ontology_id':str})\n",
    "assert_dtypes(df_hpoa_pos)\n",
    "\n",
    "df_hpoa_neg = pd.read_csv(data_path+'hpo/disease_phenotype_neg.csv', low_memory=False)\n",
    "df_hpoa_neg = df_hpoa_neg.astype({'hp_id':int}).astype({'hp_id':str})\n",
    "df_hpoa_neg = df_hpoa_neg.astype({'disease_ontology_id':int}).astype({'disease_ontology_id':str})\n",
    "assert_dtypes(df_hpoa_neg)\n",
    "\n",
    "df_sider = pd.read_csv(data_path+'sider/sider.csv', low_memory=False)\n",
    "assert_dtypes(df_sider)\n",
    "\n",
    "df_go_terms = pd.read_csv(data_path+'go/go_terms_info.csv', low_memory=False)\n",
    "df_go_terms = df_go_terms.astype({'go_term_id':int}).astype({'go_term_id':str})\n",
    "assert_dtypes(df_go_terms)\n",
    "\n",
    "df_go_edges = pd.read_csv(data_path+'go/go_terms_relations.csv', low_memory=False)\n",
    "df_go_edges = df_go_edges.astype({'x':int}).astype({'x':str})\n",
    "df_go_edges = df_go_edges.astype({'y':int}).astype({'y':str})\n",
    "assert_dtypes(df_go_edges)\n",
    "\n",
    "df_gene2go = pd.read_csv(data_path+'ncbigene/protein_go_associations.csv', low_memory=False)\n",
    "df_gene2go = df_gene2go.astype({'ncbi_gene_id':int}).astype({'ncbi_gene_id':str})\n",
    "df_gene2go = df_gene2go.astype({'go_term_id':int}).astype({'go_term_id':str})\n",
    "assert_dtypes(df_gene2go)\n",
    "\n",
    "df_exposures = pd.read_csv(data_path+'ctd/exposure_data.csv', low_memory=False)\n",
    "df_exposures = df_exposures.get(['exposurestressorname', 'exposurestressorid',\n",
    "                  'exposuremarker', 'exposuremarkerid',\n",
    "                  'diseasename', 'diseaseid',\n",
    "                  'phenotypename', 'phenotypeid'])\n",
    "assert_dtypes(df_exposures)\n",
    "\n",
    "df_uberon_terms = pd.read_csv(data_path+'uberon/uberon_terms.csv', low_memory=False)\n",
    "df_uberon_terms = df_uberon_terms.astype({'id':int}).astype({'id':str})\n",
    "assert_dtypes(df_uberon_terms)\n",
    "\n",
    "df_uberon_is_a = pd.read_csv(data_path+'uberon/uberon_is_a.csv', low_memory=False)\n",
    "df_uberon_is_a = df_uberon_is_a.astype({'id':int}).astype({'id':str})\n",
    "df_uberon_is_a = df_uberon_is_a.astype({'is_a':int}).astype({'is_a':str})\n",
    "assert_dtypes(df_uberon_is_a)\n",
    "\n",
    "df_uberon_rels = pd.read_csv(data_path+'uberon/uberon_rels.csv', low_memory=False)\n",
    "df_uberon_rels = df_uberon_rels.astype({'id':int}).astype({'id':str})\n",
    "df_uberon_rels = df_uberon_rels.astype({'relation_id':int}).astype({'relation_id':str})\n",
    "assert_dtypes(df_uberon_rels)\n",
    "\n",
    "df_bgee = pd.read_csv(data_path+'bgee/anatomy_gene.csv', low_memory=False)\n",
    "df_bgee = df_bgee.astype({'expression_rank':int}).astype({'expression_rank':str})\n",
    "df_bgee = df_bgee.astype({'anatomy_id':int}).astype({'anatomy_id':str})\n",
    "assert_dtypes(df_bgee)\n",
    "\n",
    "df_reactome_terms = pd.read_csv(data_path+'reactome/reactome_terms.csv', low_memory=False)\n",
    "assert_dtypes(df_reactome_terms)\n",
    "\n",
    "df_reactome_rels = pd.read_csv(data_path+'reactome/reactome_relations.csv', low_memory=False)\n",
    "assert_dtypes(df_reactome_rels)\n",
    "\n",
    "df_reactome_ncbi = pd.read_csv(data_path+'reactome/reactome_ncbi.csv', low_memory=False)\n",
    "df_reactome_ncbi = df_reactome_ncbi[df_reactome_ncbi.ncbi_id.str.isnumeric()]\n",
    "assert_dtypes(df_reactome_ncbi)\n",
    "\n",
    "df_umls_mondo = pd.read_csv(data_path+'vocab/umls_mondo.csv', low_memory=False)\n",
    "df_umls_mondo = df_umls_mondo.astype({'mondo_id':int}).astype({'mondo_id':str})\n",
    "assert_dtypes(df_umls_mondo)\n",
    "\n",
    "df_prot_names = pd.read_csv(data_path+'vocab/gene_names.csv', low_memory=False, sep='\\t')\n",
    "df_prot_names = df_prot_names.rename(columns={'NCBI Gene ID(supplied by NCBI)':'ncbi_id', 'NCBI Gene ID':'ncbi_id2', 'Approved symbol':'symbol', 'Approved name':'name'})\n",
    "df_prot_names = df_prot_names.get(['ncbi_id', 'symbol']).dropna()\n",
    "df_prot_names = df_prot_names.astype({'ncbi_id':int}).astype({'ncbi_id':str})\n",
    "assert_dtypes(df_prot_names)\n",
    "\n",
    "db_vocab = pd.read_csv(data_path+'vocab/drugbank_vocabulary.csv', low_memory=False)\n",
    "assert_dtypes(db_vocab)\n",
    "\n",
    "df_db_atc = pd.read_csv(data_path+'vocab/drugbank_atc_codes.csv', low_memory=False).get(['atc_code','parent_key'])\n",
    "assert_dtypes(df_db_atc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "# Converting databases into graph edges"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:17:59.008491Z",
     "start_time": "2021-08-06T12:17:58.909175Z"
    },
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def clean_edges(df): \n",
    "    df = df.get(['relation', 'display_relation', 'x_id','x_type', 'x_name', 'x_source','y_id','y_type', 'y_name', 'y_source'])\n",
    "    df = df.dropna()\n",
    "    df = df.drop_duplicates()\n",
    "    df = df.query('not ((x_id == y_id) and (x_type == y_type) and (x_source == y_source) and (x_name == y_name))')\n",
    "    return df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "## Basic"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "### Protein protein interactions (NCBI)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:18:00.653072Z",
     "start_time": "2021-08-06T12:17:59.013365Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>relation</th>\n",
       "      <th>display_relation</th>\n",
       "      <th>x_id</th>\n",
       "      <th>x_type</th>\n",
       "      <th>x_name</th>\n",
       "      <th>x_source</th>\n",
       "      <th>y_id</th>\n",
       "      <th>y_type</th>\n",
       "      <th>y_name</th>\n",
       "      <th>y_source</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>protein_protein</td>\n",
       "      <td>ppi</td>\n",
       "      <td>9796</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>PHYHIP</td>\n",
       "      <td>NCBI</td>\n",
       "      <td>56992</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>KIF15</td>\n",
       "      <td>NCBI</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          relation display_relation  x_id        x_type  x_name x_source  \\\n",
       "0  protein_protein              ppi  9796  gene/protein  PHYHIP     NCBI   \n",
       "\n",
       "    y_id        y_type y_name y_source  \n",
       "0  56992  gene/protein  KIF15     NCBI  "
      ]
     },
     "execution_count": 134,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_prot_prot = pd.merge(df_ppi, df_prot_names, 'left', left_on='proteinA_entrezid', right_on='ncbi_id').rename(columns={'symbol':'symbolA'})\n",
    "df_prot_prot = pd.merge(df_prot_prot, df_prot_names, 'left', left_on='proteinB_entrezid', right_on='ncbi_id').rename(columns={'symbol':'symbolB'})\n",
    "\n",
    "df_prot_prot = df_prot_prot.rename(columns={'proteinA_entrezid':'x_id', 'proteinB_entrezid':'y_id', 'symbolA':'x_name', 'symbolB':'y_name'})\n",
    "df_prot_prot['x_type'] = 'gene/protein'\n",
    "df_prot_prot['x_source'] = 'NCBI'\n",
    "df_prot_prot['y_type'] = 'gene/protein'\n",
    "df_prot_prot['y_source'] = 'NCBI'\n",
    "df_prot_prot['relation'] = 'protein_protein'\n",
    "df_prot_prot['display_relation'] = 'ppi'\n",
    "df_prot_prot = clean_edges(df_prot_prot)\n",
    "df_prot_prot.head(1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "### Drug protein interactions (DrugBank)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:18:00.937639Z",
     "start_time": "2021-08-06T12:18:00.656244Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>relation</th>\n",
       "      <th>display_relation</th>\n",
       "      <th>x_id</th>\n",
       "      <th>x_type</th>\n",
       "      <th>x_name</th>\n",
       "      <th>x_source</th>\n",
       "      <th>y_id</th>\n",
       "      <th>y_type</th>\n",
       "      <th>y_name</th>\n",
       "      <th>y_source</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>drug_protein</td>\n",
       "      <td>carrier</td>\n",
       "      <td>DB09130</td>\n",
       "      <td>drug</td>\n",
       "      <td>Copper</td>\n",
       "      <td>DrugBank</td>\n",
       "      <td>2157</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>F8</td>\n",
       "      <td>NCBI</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       relation display_relation     x_id x_type  x_name  x_source  y_id  \\\n",
       "0  drug_protein          carrier  DB09130   drug  Copper  DrugBank  2157   \n",
       "\n",
       "         y_type y_name y_source  \n",
       "0  gene/protein     F8     NCBI  "
      ]
     },
     "execution_count": 135,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_prot_drug = pd.merge(df_drugbank, df_prot_names, 'left', left_on='NCBIGeneID', right_on='ncbi_id')\n",
    "\n",
    "df_prot_drug = df_prot_drug.rename(columns={'DrugBank':'x_id', 'NCBIGeneID':'y_id', 'DrugBankName':'x_name', 'symbol':'y_name'})\n",
    "df_prot_drug['x_type'] = 'drug'\n",
    "df_prot_drug['x_source'] = 'DrugBank'\n",
    "df_prot_drug['y_type'] = 'gene/protein'\n",
    "df_prot_drug['y_source'] = 'NCBI'\n",
    "df_prot_drug['display_relation'] = df_prot_drug.get('relation').values\n",
    "df_prot_drug['relation'] = 'drug_protein' # combine targets, carrier, enzyme and transporter\n",
    "df_prot_drug = clean_edges(df_prot_drug)\n",
    "df_prot_drug.head(1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "### Drug disease interactions (DiseaseCentral) –– PENDING"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 136,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:18:01.357506Z",
     "start_time": "2021-08-06T12:18:00.940724Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>relation</th>\n",
       "      <th>display_relation</th>\n",
       "      <th>x_id</th>\n",
       "      <th>x_type</th>\n",
       "      <th>x_name</th>\n",
       "      <th>x_source</th>\n",
       "      <th>y_id</th>\n",
       "      <th>y_type</th>\n",
       "      <th>y_name</th>\n",
       "      <th>y_source</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>contraindication</td>\n",
       "      <td>contraindication</td>\n",
       "      <td>DB05271</td>\n",
       "      <td>drug</td>\n",
       "      <td>Rotigotine</td>\n",
       "      <td>DrugBank</td>\n",
       "      <td>5044</td>\n",
       "      <td>disease</td>\n",
       "      <td>hypertensive disorder</td>\n",
       "      <td>MONDO</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           relation  display_relation     x_id x_type      x_name  x_source  \\\n",
       "0  contraindication  contraindication  DB05271   drug  Rotigotine  DrugBank   \n",
       "\n",
       "   y_id   y_type                 y_name y_source  \n",
       "0  5044  disease  hypertensive disorder    MONDO  "
      ]
     },
     "execution_count": 136,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_drug_dis = pd.merge(df_drug_central, db_vocab, 'left', left_on='cas_reg_no', right_on='CAS')\n",
    "df_drug_dis = pd.merge(df_drug_dis, df_umls_mondo, 'inner', left_on='umls_cui', right_on='umls_id')\n",
    "df_drug_dis = pd.merge(df_drug_dis, df_mondo_terms, 'left', left_on='mondo_id', right_on='id')\n",
    "\n",
    "df_drug_dis = df_drug_dis.get(['relationship_name','DrugBank ID', 'Common name', 'mondo_id', 'name'])\n",
    "df_drug_dis = df_drug_dis.dropna().drop_duplicates()\n",
    "\n",
    "df_drug_dis = df_drug_dis.rename(columns={'DrugBank ID':'x_id', 'mondo_id':'y_id', 'Common name':'x_name', 'name':'y_name', 'relationship_name':'relation'})\n",
    "df_drug_dis['x_type'] = 'drug'\n",
    "df_drug_dis['x_source'] = 'DrugBank'\n",
    "df_drug_dis['y_type'] = 'disease'\n",
    "df_drug_dis['y_source'] = 'MONDO'\n",
    "df_drug_dis['display_relation'] = df_drug_dis.get('relation').values\n",
    "df_drug_dis = clean_edges(df_drug_dis)\n",
    "df_drug_dis.head(1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "### Disease protein interactions (DisGenNet)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 137,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:18:02.363099Z",
     "start_time": "2021-08-06T12:18:01.381960Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>relation</th>\n",
       "      <th>display_relation</th>\n",
       "      <th>x_id</th>\n",
       "      <th>x_type</th>\n",
       "      <th>x_name</th>\n",
       "      <th>x_source</th>\n",
       "      <th>y_id</th>\n",
       "      <th>y_type</th>\n",
       "      <th>y_name</th>\n",
       "      <th>y_source</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>disease_protein</td>\n",
       "      <td>associated with</td>\n",
       "      <td>5090</td>\n",
       "      <td>disease</td>\n",
       "      <td>schizophrenia (disease)</td>\n",
       "      <td>MONDO</td>\n",
       "      <td>1</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>A1BG</td>\n",
       "      <td>NCBI</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          relation display_relation  x_id   x_type                   x_name  \\\n",
       "0  disease_protein  associated with  5090  disease  schizophrenia (disease)   \n",
       "\n",
       "  x_source y_id        y_type y_name y_source  \n",
       "0    MONDO    1  gene/protein   A1BG     NCBI  "
      ]
     },
     "execution_count": 137,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_dis_prot1 = df_disgenet.query('diseaseType==\"disease\"')\n",
    "\n",
    "df_dis_prot1 = pd.merge(df_dis_prot1, df_umls_mondo, 'inner', left_on='diseaseId', right_on='umls_id')\n",
    "df_dis_prot1 = pd.merge(df_dis_prot1, df_mondo_terms, 'left', left_on='mondo_id', right_on='id')\n",
    "\n",
    "df_dis_prot1 = df_dis_prot1.rename(columns={'geneId':'y_id', 'geneSymbol':'y_name', 'mondo_id':'x_id', 'name':'x_name'})\n",
    "df_dis_prot1['x_type'] = 'disease'\n",
    "df_dis_prot1['x_source'] = 'MONDO'\n",
    "df_dis_prot1['y_type'] = 'gene/protein'\n",
    "df_dis_prot1['y_source'] = 'NCBI'\n",
    "df_dis_prot1['relation'] = 'disease_protein'\n",
    "df_dis_prot1['display_relation'] = 'associated with'\n",
    "df_dis_prot1 = clean_edges(df_dis_prot1)\n",
    "df_dis_prot1.head(1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "### Disease disease interations (MONDO)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 138,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:18:02.612068Z",
     "start_time": "2021-08-06T12:18:02.367422Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>relation</th>\n",
       "      <th>display_relation</th>\n",
       "      <th>x_id</th>\n",
       "      <th>x_type</th>\n",
       "      <th>x_name</th>\n",
       "      <th>x_source</th>\n",
       "      <th>y_id</th>\n",
       "      <th>y_type</th>\n",
       "      <th>y_name</th>\n",
       "      <th>y_source</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>disease_disease</td>\n",
       "      <td>parent-child</td>\n",
       "      <td>2816</td>\n",
       "      <td>disease</td>\n",
       "      <td>adrenal cortex disease</td>\n",
       "      <td>MONDO</td>\n",
       "      <td>4</td>\n",
       "      <td>disease</td>\n",
       "      <td>adrenocortical insufficiency</td>\n",
       "      <td>MONDO</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          relation display_relation  x_id   x_type                  x_name  \\\n",
       "0  disease_disease     parent-child  2816  disease  adrenal cortex disease   \n",
       "\n",
       "  x_source y_id   y_type                        y_name y_source  \n",
       "0    MONDO    4  disease  adrenocortical insufficiency    MONDO  "
      ]
     },
     "execution_count": 138,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_dis_dis1 = pd.merge(df_mondo_parents, df_mondo_terms, 'left', left_on='parent', right_on='id')\n",
    "df_dis_dis1 = df_dis_dis1.rename(columns={'parent':'x_id', 'name':'x_name'})\n",
    "df_dis_dis1 = pd.merge(df_dis_dis1, df_mondo_terms, 'left', left_on='child', right_on='id')\n",
    "df_dis_dis1 = df_dis_dis1.rename(columns={'child':'y_id', 'name':'y_name'})\n",
    "df_dis_dis1['x_type'] = 'disease'\n",
    "df_dis_dis1['x_source'] = 'MONDO'\n",
    "df_dis_dis1['y_type'] = 'disease'\n",
    "df_dis_dis1['y_source'] = 'MONDO'\n",
    "df_dis_dis1['relation'] = 'disease_disease'\n",
    "df_dis_dis1['display_relation'] = 'parent-child'\n",
    "df_dis_dis1 = clean_edges(df_dis_dis1)\n",
    "df_dis_dis1.head(1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "### Drug drug interactions (DrugBank)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 139,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:18:15.697033Z",
     "start_time": "2021-08-06T12:18:02.616417Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>relation</th>\n",
       "      <th>display_relation</th>\n",
       "      <th>x_id</th>\n",
       "      <th>x_type</th>\n",
       "      <th>x_name</th>\n",
       "      <th>x_source</th>\n",
       "      <th>y_id</th>\n",
       "      <th>y_type</th>\n",
       "      <th>y_name</th>\n",
       "      <th>y_source</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>drug_drug</td>\n",
       "      <td>synergistic interaction</td>\n",
       "      <td>DB00001</td>\n",
       "      <td>drug</td>\n",
       "      <td>Lepirudin</td>\n",
       "      <td>DrugBank</td>\n",
       "      <td>DB06605</td>\n",
       "      <td>drug</td>\n",
       "      <td>Apixaban</td>\n",
       "      <td>DrugBank</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    relation         display_relation     x_id x_type     x_name  x_source  \\\n",
       "0  drug_drug  synergistic interaction  DB00001   drug  Lepirudin  DrugBank   \n",
       "\n",
       "      y_id y_type    y_name  y_source  \n",
       "0  DB06605   drug  Apixaban  DrugBank  "
      ]
     },
     "execution_count": 139,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_drug_drug = pd.merge(df_ddi, db_vocab, 'inner', left_on='drug1', right_on='DrugBank ID')\n",
    "df_drug_drug = df_drug_drug.rename(columns={'drug1':'x_id', 'Common name':'x_name'})\n",
    "df_drug_drug = pd.merge(df_drug_drug.astype({'drug2':'str'}), db_vocab, 'inner', left_on='drug2', right_on='DrugBank ID')\n",
    "df_drug_drug = df_drug_drug.rename(columns={'drug2':'y_id', 'Common name':'y_name'})\n",
    "df_drug_drug['x_type'] = 'drug'\n",
    "df_drug_drug['x_source'] = 'DrugBank'\n",
    "df_drug_drug['y_type'] = 'drug'\n",
    "df_drug_drug['y_source'] = 'DrugBank'\n",
    "df_drug_drug['relation'] = 'drug_drug'\n",
    "df_drug_drug['display_relation'] = 'synergistic interaction'\n",
    "df_drug_drug = clean_edges(df_drug_drug)\n",
    "df_drug_drug.head(1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "## Effect/Phenotype"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "### Effect protein interactions (DisGenNet)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:18:16.132855Z",
     "start_time": "2021-08-06T12:18:15.701935Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>relation</th>\n",
       "      <th>display_relation</th>\n",
       "      <th>x_id</th>\n",
       "      <th>x_type</th>\n",
       "      <th>x_name</th>\n",
       "      <th>x_source</th>\n",
       "      <th>y_id</th>\n",
       "      <th>y_type</th>\n",
       "      <th>y_name</th>\n",
       "      <th>y_source</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>phenotype_protein</td>\n",
       "      <td>associated with</td>\n",
       "      <td>2240</td>\n",
       "      <td>effect/phenotype</td>\n",
       "      <td>Hepatomegaly</td>\n",
       "      <td>HPO</td>\n",
       "      <td>1</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>A1BG</td>\n",
       "      <td>NCBI</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            relation display_relation  x_id            x_type        x_name  \\\n",
       "0  phenotype_protein  associated with  2240  effect/phenotype  Hepatomegaly   \n",
       "\n",
       "  x_source y_id        y_type y_name y_source  \n",
       "0      HPO    1  gene/protein   A1BG     NCBI  "
      ]
     },
     "execution_count": 140,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_phe_prot = df_disgenet.query('diseaseType==\"phenotype\"')\n",
    "\n",
    "df_phe_prot = pd.merge(df_phe_prot, df_hp_xref, 'inner', left_on='diseaseId', right_on='ontology_id')\n",
    "df_phe_prot = pd.merge(df_phe_prot, df_hp_terms, 'left', left_on='hp_id', right_on='id')\n",
    "\n",
    "df_phe_prot = df_phe_prot.rename(columns={'geneId':'y_id', 'geneSymbol':'y_name', 'hp_id':'x_id', 'name':'x_name'})\n",
    "df_phe_prot['x_type'] = 'effect/phenotype'\n",
    "df_phe_prot['x_source'] = 'HPO'\n",
    "df_phe_prot['y_type'] = 'gene/protein'\n",
    "df_phe_prot['y_source'] = 'NCBI'\n",
    "df_phe_prot['relation'] = 'phenotype_protein'\n",
    "df_phe_prot['display_relation'] = 'associated with'\n",
    "df_phe_prot = clean_edges(df_phe_prot)\n",
    "df_phe_prot.head(1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "### Effect effect interactions (HPO)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 141,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:18:16.566142Z",
     "start_time": "2021-08-06T12:18:16.138257Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>relation</th>\n",
       "      <th>display_relation</th>\n",
       "      <th>x_id</th>\n",
       "      <th>x_type</th>\n",
       "      <th>x_name</th>\n",
       "      <th>x_source</th>\n",
       "      <th>y_id</th>\n",
       "      <th>y_type</th>\n",
       "      <th>y_name</th>\n",
       "      <th>y_source</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>phenotype_phenotype</td>\n",
       "      <td>parent-child</td>\n",
       "      <td>1507</td>\n",
       "      <td>effect/phenotype</td>\n",
       "      <td>Growth abnormality</td>\n",
       "      <td>HPO</td>\n",
       "      <td>2</td>\n",
       "      <td>effect/phenotype</td>\n",
       "      <td>Abnormality of body height</td>\n",
       "      <td>HPO</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              relation display_relation  x_id            x_type  \\\n",
       "0  phenotype_phenotype     parent-child  1507  effect/phenotype   \n",
       "\n",
       "               x_name x_source y_id            y_type  \\\n",
       "0  Growth abnormality      HPO    2  effect/phenotype   \n",
       "\n",
       "                       y_name y_source  \n",
       "0  Abnormality of body height      HPO  "
      ]
     },
     "execution_count": 141,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_phe_phe = pd.merge(df_hp_parents, df_hp_terms, 'left', left_on='parent', right_on='id')\n",
    "df_phe_phe = df_phe_phe.rename(columns={'name':'parent_name'})\n",
    "df_phe_phe = pd.merge(df_phe_phe, df_hp_terms, 'left', left_on='child', right_on='id')\n",
    "df_phe_phe = df_phe_phe.rename(columns={'name':'child_name'})\n",
    "df_phe_phe = df_phe_phe.get(['parent', 'child', 'parent_name', 'child_name'])\n",
    "\n",
    "df_phe_phe = df_phe_phe.rename(columns={'parent':'x_id', 'child':'y_id', 'parent_name':'x_name', 'child_name':'y_name'})\n",
    "df_phe_phe['x_type'] = 'effect/phenotype'\n",
    "df_phe_phe['x_source'] = 'HPO'\n",
    "df_phe_phe['y_type'] = 'effect/phenotype'\n",
    "df_phe_phe['y_source'] = 'HPO'\n",
    "df_phe_phe['relation'] = 'phenotype_phenotype'\n",
    "df_phe_phe['display_relation'] = 'parent-child'\n",
    "df_phe_phe = clean_edges(df_phe_phe)\n",
    "df_phe_phe.head(1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "### Disease effect interactions (HPO-A)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 142,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:18:18.049528Z",
     "start_time": "2021-08-06T12:18:16.571436Z"
    },
    "hidden": true,
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>relation</th>\n",
       "      <th>display_relation</th>\n",
       "      <th>x_id</th>\n",
       "      <th>x_type</th>\n",
       "      <th>x_name</th>\n",
       "      <th>x_source</th>\n",
       "      <th>y_id</th>\n",
       "      <th>y_type</th>\n",
       "      <th>y_name</th>\n",
       "      <th>y_source</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>disease_phenotype_positive</td>\n",
       "      <td>phenotype present</td>\n",
       "      <td>10761</td>\n",
       "      <td>disease</td>\n",
       "      <td>retinitis pigmentosa Y-linked</td>\n",
       "      <td>MONDO</td>\n",
       "      <td>510</td>\n",
       "      <td>effect/phenotype</td>\n",
       "      <td>Rod-cone dystrophy</td>\n",
       "      <td>HPO</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                     relation   display_relation   x_id   x_type  \\\n",
       "0  disease_phenotype_positive  phenotype present  10761  disease   \n",
       "\n",
       "                          x_name x_source y_id            y_type  \\\n",
       "0  retinitis pigmentosa Y-linked    MONDO  510  effect/phenotype   \n",
       "\n",
       "               y_name y_source  \n",
       "0  Rod-cone dystrophy      HPO  "
      ]
     },
     "execution_count": 142,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_dis_phe_pos1 = pd.merge(df_hpoa_pos, df_mondo_xref, 'left', left_on='disease_ontology_id', right_on='ontology_id')\n",
    "df_dis_phe_pos1 = df_dis_phe_pos1.query('(disease_ontology==ontology) or (disease_ontology==\"ORPHA\" and ontology==\"Orphanet\")')\n",
    "df_dis_phe_pos1 = pd.merge(df_dis_phe_pos1, df_hp_terms, 'left', left_on='hp_id', right_on='id').rename(columns={'name':'hp_name'})\n",
    "df_dis_phe_pos1 = pd.merge(df_dis_phe_pos1, df_mondo_terms, 'left', left_on='mondo_id', right_on='id').rename(columns={'name':'mondo_name'})\n",
    "df_dis_phe_pos1 = df_dis_phe_pos1.get(['mondo_id', 'mondo_name', 'hp_id', 'hp_name'])\n",
    "df_dis_phe_pos1 = df_dis_phe_pos1.rename(columns={'mondo_id':'x_id', 'mondo_name':'x_name', 'hp_id': 'y_id', 'hp_name':'y_name'})\n",
    "df_dis_phe_pos1.loc[:, 'x_source'] = 'MONDO'\n",
    "df_dis_phe_pos1.loc[:, 'x_type'] = 'disease'\n",
    "df_dis_phe_pos1.loc[:, 'y_source'] = 'HPO'\n",
    "df_dis_phe_pos1.loc[:, 'y_type'] = 'effect/phenotype'\n",
    "df_dis_phe_pos1.loc[:, 'relation'] = 'disease_phenotype_positive'\n",
    "df_dis_phe_pos1.loc[:, 'display_relation'] = 'phenotype present'\n",
    "df_dis_phe_pos1 = clean_edges(df_dis_phe_pos1)\n",
    "df_dis_phe_pos1.head(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 143,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:18:18.294348Z",
     "start_time": "2021-08-06T12:18:18.051937Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>relation</th>\n",
       "      <th>display_relation</th>\n",
       "      <th>x_id</th>\n",
       "      <th>x_type</th>\n",
       "      <th>x_name</th>\n",
       "      <th>x_source</th>\n",
       "      <th>y_id</th>\n",
       "      <th>y_type</th>\n",
       "      <th>y_name</th>\n",
       "      <th>y_source</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>disease_phenotype_negative</td>\n",
       "      <td>phenotype absent</td>\n",
       "      <td>13924</td>\n",
       "      <td>disease</td>\n",
       "      <td>osteogenesis imperfecta type 13</td>\n",
       "      <td>MONDO</td>\n",
       "      <td>365</td>\n",
       "      <td>effect/phenotype</td>\n",
       "      <td>Hearing impairment</td>\n",
       "      <td>HPO</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                     relation  display_relation   x_id   x_type  \\\n",
       "0  disease_phenotype_negative  phenotype absent  13924  disease   \n",
       "\n",
       "                            x_name x_source y_id            y_type  \\\n",
       "0  osteogenesis imperfecta type 13    MONDO  365  effect/phenotype   \n",
       "\n",
       "               y_name y_source  \n",
       "0  Hearing impairment      HPO  "
      ]
     },
     "execution_count": 143,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_dis_phe_neg = pd.merge(df_hpoa_neg, df_mondo_xref, 'left', left_on='disease_ontology_id', right_on='ontology_id')\n",
    "df_dis_phe_neg = df_dis_phe_neg.query('(disease_ontology==ontology) or (disease_ontology==\"ORPHA\" and ontology==\"Orphanet\")')\n",
    "df_dis_phe_neg = pd.merge(df_dis_phe_neg, df_hp_terms, 'left', left_on='hp_id', right_on='id').rename(columns={'name':'hp_name'})\n",
    "df_dis_phe_neg = pd.merge(df_dis_phe_neg, df_mondo_terms, 'left', left_on='mondo_id', right_on='id').rename(columns={'name':'mondo_name'})\n",
    "df_dis_phe_neg = df_dis_phe_neg.get(['mondo_id', 'mondo_name', 'hp_id', 'hp_name'])\n",
    "df_dis_phe_neg = df_dis_phe_neg.rename(columns={'mondo_id':'x_id', 'mondo_name':'x_name', 'hp_id': 'y_id', 'hp_name':'y_name'})\n",
    "df_dis_phe_neg.loc[:, 'x_source'] = 'MONDO'\n",
    "df_dis_phe_neg.loc[:, 'x_type'] = 'disease'\n",
    "df_dis_phe_neg.loc[:, 'y_source'] = 'HPO'\n",
    "df_dis_phe_neg.loc[:, 'y_type'] = 'effect/phenotype'\n",
    "df_dis_phe_neg.loc[:, 'relation'] = 'disease_phenotype_negative'\n",
    "df_dis_phe_neg.loc[:, 'display_relation'] = 'phenotype absent'\n",
    "df_dis_phe_neg = clean_edges(df_dis_phe_neg)\n",
    "df_dis_phe_neg.head(1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "### Remove MONDO nodes if they exist in HPO (Modified)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 144,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:18:18.395112Z",
     "start_time": "2021-08-06T12:18:18.296744Z"
    },
    "code_folding": [],
    "hidden": true,
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ema30/zaklab/envs/rare_disease/lib/python3.8/site-packages/pandas/core/indexing.py:1843: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  self.obj[item_labels[indexer[info_axis]]] = value\n"
     ]
    }
   ],
   "source": [
    "# phenotypes that are actually diseases in MONDO\n",
    "# avoid duplicate nodes and convert disease nodes to phenotype nodes\n",
    "mondo_xref_hp_subset = df_mondo_xref.query('ontology==\"HP\"')\n",
    "mondo_xref_hp_subset.loc[:, 'ontology_id'] = mondo_xref_hp_subset.get('ontology_id').astype(int).astype(str).values\n",
    "merged_mondo_hpo = pd.merge(mondo_xref_hp_subset, df_hp_terms, 'inner', left_on='ontology_id', right_on='id')\n",
    "\n",
    "merged_mondo_hpo[['ontology_id', 'mondo_id']].to_csv(save_path+'auxillary/mondo2hpo.csv', index=False)\n",
    "mondo_r_hp_ids = merged_mondo_hpo.get('mondo_id').values\n",
    "\n",
    "def replace_mondo_w_hpo(df, mondo_id_col, drop_cols=[]): \n",
    "    cols = list(df.columns.values)\n",
    "    cols.extend(['ontology_id', 'ontology_name'])\n",
    "    [cols.remove(x) for x in drop_cols]\n",
    "    df = pd.merge(df, mondo_xref_hp_subset, 'left', left_on=mondo_id_col, right_on='mondo_id')\n",
    "    df = pd.merge(df, df_hp_terms, 'left', left_on='ontology_id', right_on='id')\n",
    "    df = df.rename(columns={'name':'ontology_name'}).get(cols)\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 145,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# HANDLE DISEASE DISEASE --> EFFECT EFFECT\n",
    "\n",
    "df_phe_phe2 = df_dis_dis1.query('x_id in @mondo_r_hp_ids and y_id in @mondo_r_hp_ids')\n",
    "df_phe_phe2 = replace_mondo_w_hpo(df=df_phe_phe2, mondo_id_col='x_id', drop_cols=[c for c in df_phe_phe2.columns.values if 'x_' in c])\n",
    "df_phe_phe2 = df_phe_phe2.rename(columns={'ontology_id':'x_id', 'ontology_name':'x_name'})\n",
    "df_phe_phe2 = replace_mondo_w_hpo(df=df_phe_phe2, mondo_id_col='y_id', drop_cols=[c for c in df_phe_phe2.columns.values if 'y_' in c])\n",
    "df_phe_phe2 = df_phe_phe2.rename(columns={'ontology_id':'y_id', 'ontology_name':'y_name'})\n",
    "df_phe_phe2.loc[:, 'x_source'] = 'HPO'\n",
    "df_phe_phe2.loc[:, 'x_type'] = 'effect/phenotype'\n",
    "df_phe_phe2.loc[:, 'y_source'] = 'HPO'\n",
    "df_phe_phe2.loc[:, 'y_type'] = 'effect/phenotype'\n",
    "df_phe_phe2.loc[:,'relation'] = 'phenotype_phenotype'\n",
    "df_phe_phe2.loc[:,'display_relation'] = 'parent-child'\n",
    "df_phe_phe2 = clean_edges(df_phe_phe2)\n",
    "\n",
    "# drop relations in DIS DIS if either DIS is in HPO\n",
    "# disease disease should have no phenotype nodes\n",
    "df_dis_dis = df_dis_dis1.query('x_id not in @mondo_r_hp_ids and y_id not in @mondo_r_hp_ids')\n",
    "\n",
    "# ensure that none of the disease nodes (source or target) are hpo nodes\n",
    "assert len(df_dis_dis.query('x_id in @mondo_r_hp_ids')) == 0\n",
    "assert len(df_dis_dis.query('y_id in @mondo_r_hp_ids')) == 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 146,
   "metadata": {},
   "outputs": [],
   "source": [
    "# HANDLE DISEASE EFFECT NEGATIVE --> EFFECT EFFECT\n",
    "df_phe_phe3 = df_dis_phe_neg.query('x_id in @mondo_r_hp_ids')\n",
    "df_phe_phe3 = replace_mondo_w_hpo(df=df_phe_phe3, mondo_id_col='x_id', drop_cols=[c for c in df_phe_phe3.columns.values if 'x_' in c])\n",
    "df_phe_phe3 = df_phe_phe3.rename(columns={'ontology_id':'x_id', 'ontology_name':'x_name'})\n",
    "\n",
    "df_phe_phe3.loc[:, 'x_source'] = 'HPO'\n",
    "df_phe_phe3.loc[:, 'x_type'] = 'effect/phenotype'\n",
    "df_phe_phe3.loc[:, 'relation'] = 'phenotype_phenotype'\n",
    "df_phe_phe3.loc[:, 'display_relation'] = 'parent-child'\n",
    "df_phe_phe3 = clean_edges(df_phe_phe3)\n",
    "\n",
    "# drop relations in DIS PHE if DIS is in HPO\n",
    "df_dis_phe_neg = df_dis_phe_neg.query('x_id not in @mondo_r_hp_ids')\n",
    "\n",
    "# HANDLE DISEASE EFFECT POSITIVE --> EFFECT EFFECT\n",
    "df_phe_phe4 = df_dis_phe_pos1.query('x_id in @mondo_r_hp_ids')\n",
    "df_phe_phe4 = replace_mondo_w_hpo(df=df_phe_phe4, mondo_id_col='x_id', drop_cols=[c for c in df_phe_phe4.columns.values if 'x_' in c])\n",
    "df_phe_phe4 = df_phe_phe4.rename(columns={'ontology_id':'x_id', 'ontology_name':'x_name'})\n",
    "\n",
    "df_phe_phe4.loc[:, 'x_source'] = 'HPO'\n",
    "df_phe_phe4.loc[:, 'x_type'] = 'effect/phenotype'\n",
    "df_phe_phe4.loc[:,'relation'] = 'phenotype_phenotype'\n",
    "df_phe_phe4.loc[:,'display_relation'] = 'parent-child'\n",
    "df_phe_phe4 = clean_edges(df_phe_phe4)\n",
    "\n",
    "# drop relations in DIS PHE if DIS is in HPO\n",
    "df_dis_phe_pos = df_dis_phe_pos1.query('x_id not in @mondo_r_hp_ids')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 147,
   "metadata": {},
   "outputs": [],
   "source": [
    "# HANDLE DISEASE PROTEIN --> EFFECT PROTEIN\n",
    "df_phe_prot2 = df_dis_prot1.query('x_id in @mondo_r_hp_ids')\n",
    "df_phe_prot2 = replace_mondo_w_hpo(df=df_phe_prot2, mondo_id_col='x_id', drop_cols=[c for c in df_phe_prot2.columns.values if 'x_' in c])\n",
    "df_phe_prot2 = df_phe_prot2.rename(columns={'ontology_id':'x_id', 'ontology_name':'x_name'})\n",
    "\n",
    "df_phe_prot2.loc[:, 'x_source'] = 'HPO'\n",
    "df_phe_prot2.loc[:, 'x_type'] = 'effect/phenotype'\n",
    "df_phe_prot2.loc[:, 'relation'] = 'phenotype_protein'\n",
    "df_phe_prot2.loc[:, 'display_relation'] = 'associated with'\n",
    "df_phe_prot2 = clean_edges(df_phe_prot2)\n",
    "\n",
    "# drop relations in DIS GENE if DIS is in HPO\n",
    "df_dis_prot = df_dis_prot1.query('x_id not in @mondo_r_hp_ids')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 148,
   "metadata": {},
   "outputs": [],
   "source": [
    "# HANDLE DISEASE DRUG --> Remove (does not make sense to have EFFECT DRUG)\n",
    "\n",
    "df_drug_dis = df_drug_dis.query('y_id not in @mondo_r_hp_ids')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 149,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Empty DataFrame\n",
      "Columns: [relation, display_relation, x_id, x_type, x_name, x_source, y_id, y_type, y_name, y_source]\n",
      "Index: []\n",
      "Empty DataFrame\n",
      "Columns: [relation, display_relation, x_id, x_type, x_name, x_source, y_id, y_type, y_name, y_source]\n",
      "Index: []\n"
     ]
    }
   ],
   "source": [
    "# COMBINE DATAFRAMES\n",
    "\n",
    "df_phe_phe = pd.concat([df_phe_phe, df_phe_phe2, df_phe_phe3, df_phe_phe4], ignore_index=True).drop_duplicates()\n",
    "df_phe_prot = pd.concat([df_phe_prot, df_phe_prot2], ignore_index=True).drop_duplicates()\n",
    "\n",
    "print(df_phe_phe.query('x_source == \"MONDO\" and y_source == \"MONDO\" and x_id in @mondo_r_hp_ids and y_id in @mondo_r_hp_ids'))\n",
    "print(df_phe_prot.query('x_source == \"MONDO\" and x_id in @mondo_r_hp_ids'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "### Drug effect interactions (SIDER)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 150,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:18:20.396548Z",
     "start_time": "2021-08-06T12:18:19.535589Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>relation</th>\n",
       "      <th>display_relation</th>\n",
       "      <th>x_id</th>\n",
       "      <th>x_type</th>\n",
       "      <th>x_name</th>\n",
       "      <th>x_source</th>\n",
       "      <th>y_id</th>\n",
       "      <th>y_type</th>\n",
       "      <th>y_name</th>\n",
       "      <th>y_source</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>drug_effect</td>\n",
       "      <td>side effect</td>\n",
       "      <td>DB00583</td>\n",
       "      <td>drug</td>\n",
       "      <td>Levocarnitine</td>\n",
       "      <td>DrugBank</td>\n",
       "      <td>2027</td>\n",
       "      <td>effect/phenotype</td>\n",
       "      <td>Abdominal pain</td>\n",
       "      <td>HPO</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      relation display_relation     x_id x_type         x_name  x_source  \\\n",
       "0  drug_effect      side effect  DB00583   drug  Levocarnitine  DrugBank   \n",
       "\n",
       "   y_id            y_type          y_name y_source  \n",
       "0  2027  effect/phenotype  Abdominal pain      HPO  "
      ]
     },
     "execution_count": 150,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_drug_effect = pd.merge(df_sider, df_db_atc, 'left', left_on='atc', right_on='atc_code')\n",
    "df_drug_effect = df_drug_effect.rename(columns={'parent_key':'DrugBank', 'UMLS_from_meddra':'UMLS'})\n",
    "df_drug_effect = pd.merge(df_drug_effect, db_vocab, 'left', left_on='DrugBank', right_on='DrugBank ID')\n",
    "df_drug_effect = pd.merge(df_drug_effect, df_hp_xref, 'left', left_on='UMLS' , right_on='ontology_id')\n",
    "df_drug_effect = pd.merge(df_drug_effect, df_hp_terms, 'left', left_on='hp_id' , right_on='id')\n",
    "df_drug_effect = df_drug_effect.get(['DrugBank ID','Common name','hp_id', 'name'])\n",
    "df_drug_effect = df_drug_effect.dropna().drop_duplicates()\n",
    "\n",
    "df_drug_effect = df_drug_effect.rename(columns={'DrugBank ID':'x_id', 'Common name':'x_name', 'hp_id':'y_id', 'name':'y_name'})\n",
    "df_drug_effect['x_type'] = 'drug'\n",
    "df_drug_effect['x_source'] = 'DrugBank'\n",
    "df_drug_effect['y_type'] = 'effect/phenotype'\n",
    "df_drug_effect['y_source'] = 'HPO'\n",
    "df_drug_effect['relation'] = 'drug_effect'\n",
    "df_drug_effect['display_relation'] = 'side effect'\n",
    "#df_drug_effect = df_drug_effect.query('y_id not in @hp_ids_r_mondo')\n",
    "df_drug_effect = clean_edges(df_drug_effect)\n",
    "df_drug_effect.head(1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "## GO Terms"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "### Go terms interactions (GO)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 151,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:18:20.828806Z",
     "start_time": "2021-08-06T12:18:20.400380Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>relation</th>\n",
       "      <th>display_relation</th>\n",
       "      <th>x_id</th>\n",
       "      <th>x_type</th>\n",
       "      <th>x_name</th>\n",
       "      <th>x_source</th>\n",
       "      <th>y_id</th>\n",
       "      <th>y_type</th>\n",
       "      <th>y_name</th>\n",
       "      <th>y_source</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>bioprocess_bioprocess</td>\n",
       "      <td>parent-child</td>\n",
       "      <td>51581</td>\n",
       "      <td>biological_process</td>\n",
       "      <td>negative regulation of neurotransmitter uptake</td>\n",
       "      <td>GO</td>\n",
       "      <td>51612</td>\n",
       "      <td>biological_process</td>\n",
       "      <td>negative regulation of serotonin uptake</td>\n",
       "      <td>GO</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                relation display_relation   x_id              x_type  \\\n",
       "0  bioprocess_bioprocess     parent-child  51581  biological_process   \n",
       "\n",
       "                                           x_name x_source   y_id  \\\n",
       "0  negative regulation of neurotransmitter uptake       GO  51612   \n",
       "\n",
       "               y_type                                   y_name y_source  \n",
       "0  biological_process  negative regulation of serotonin uptake       GO  "
      ]
     },
     "execution_count": 151,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "bp = df_go_terms.query('go_term_type==\"biological_process\"')\n",
    "df_bp_bp = pd.merge(df_go_edges, bp, 'inner', left_on='x', right_on='go_term_id')\n",
    "df_bp_bp = df_bp_bp.rename(columns={'go_term_id':'x_id','go_term_name':'x_name','go_term_type':'x_type'})\n",
    "df_bp_bp = pd.merge(df_bp_bp, bp, 'inner', left_on='y', right_on='go_term_id')\n",
    "df_bp_bp = df_bp_bp.rename(columns={'go_term_id':'y_id','go_term_name':'y_name','go_term_type':'y_type'})\n",
    "df_bp_bp['relation'] = 'bioprocess_bioprocess'\n",
    "df_bp_bp['x_source'] = 'GO'\n",
    "df_bp_bp['y_source'] = 'GO'\n",
    "df_bp_bp['display_relation'] = 'parent-child'\n",
    "df_bp_bp = clean_edges(df_bp_bp)\n",
    "df_bp_bp.head(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 152,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:18:20.988390Z",
     "start_time": "2021-08-06T12:18:20.831955Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>relation</th>\n",
       "      <th>display_relation</th>\n",
       "      <th>x_id</th>\n",
       "      <th>x_type</th>\n",
       "      <th>x_name</th>\n",
       "      <th>x_source</th>\n",
       "      <th>y_id</th>\n",
       "      <th>y_type</th>\n",
       "      <th>y_name</th>\n",
       "      <th>y_source</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>molfunc_molfunc</td>\n",
       "      <td>parent-child</td>\n",
       "      <td>8168</td>\n",
       "      <td>molecular_function</td>\n",
       "      <td>methyltransferase activity</td>\n",
       "      <td>GO</td>\n",
       "      <td>102130</td>\n",
       "      <td>molecular_function</td>\n",
       "      <td>malonyl-CoA methyltransferase activity</td>\n",
       "      <td>GO</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          relation display_relation  x_id              x_type  \\\n",
       "0  molfunc_molfunc     parent-child  8168  molecular_function   \n",
       "\n",
       "                       x_name x_source    y_id              y_type  \\\n",
       "0  methyltransferase activity       GO  102130  molecular_function   \n",
       "\n",
       "                                   y_name y_source  \n",
       "0  malonyl-CoA methyltransferase activity       GO  "
      ]
     },
     "execution_count": 152,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mf = df_go_terms.query('go_term_type==\"molecular_function\"')\n",
    "df_mf_mf = pd.merge(df_go_edges, mf, 'inner', left_on='x', right_on='go_term_id')\n",
    "df_mf_mf = df_mf_mf.rename(columns={'go_term_id':'x_id','go_term_name':'x_name','go_term_type':'x_type'})\n",
    "df_mf_mf = pd.merge(df_mf_mf, mf, 'inner', left_on='y', right_on='go_term_id')\n",
    "df_mf_mf = df_mf_mf.rename(columns={'go_term_id':'y_id','go_term_name':'y_name','go_term_type':'y_type'})\n",
    "df_mf_mf['relation'] = 'molfunc_molfunc'\n",
    "df_mf_mf['display_relation'] = 'parent-child'\n",
    "df_mf_mf['x_source'] = 'GO'\n",
    "df_mf_mf['y_source'] = 'GO'\n",
    "df_mf_mf = clean_edges(df_mf_mf)\n",
    "df_mf_mf.head(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 153,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:18:21.149592Z",
     "start_time": "2021-08-06T12:18:20.996132Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>relation</th>\n",
       "      <th>display_relation</th>\n",
       "      <th>x_id</th>\n",
       "      <th>x_type</th>\n",
       "      <th>x_name</th>\n",
       "      <th>x_source</th>\n",
       "      <th>y_id</th>\n",
       "      <th>y_type</th>\n",
       "      <th>y_name</th>\n",
       "      <th>y_source</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>cellcomp_cellcomp</td>\n",
       "      <td>parent-child</td>\n",
       "      <td>110165</td>\n",
       "      <td>cellular_component</td>\n",
       "      <td>cellular anatomical entity</td>\n",
       "      <td>GO</td>\n",
       "      <td>90553</td>\n",
       "      <td>cellular_component</td>\n",
       "      <td>unicellular trichome tip</td>\n",
       "      <td>GO</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            relation display_relation    x_id              x_type  \\\n",
       "0  cellcomp_cellcomp     parent-child  110165  cellular_component   \n",
       "\n",
       "                       x_name x_source   y_id              y_type  \\\n",
       "0  cellular anatomical entity       GO  90553  cellular_component   \n",
       "\n",
       "                     y_name y_source  \n",
       "0  unicellular trichome tip       GO  "
      ]
     },
     "execution_count": 153,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cc = df_go_terms.query('go_term_type==\"cellular_component\"')\n",
    "df_cc_cc = pd.merge(df_go_edges, cc, 'inner', left_on='x', right_on='go_term_id')\n",
    "df_cc_cc = df_cc_cc.rename(columns={'go_term_id':'x_id','go_term_name':'x_name','go_term_type':'x_type'})\n",
    "df_cc_cc = pd.merge(df_cc_cc, cc, 'inner', left_on='y', right_on='go_term_id')\n",
    "df_cc_cc = df_cc_cc.rename(columns={'go_term_id':'y_id','go_term_name':'y_name','go_term_type':'y_type'})\n",
    "df_cc_cc['relation'] = 'cellcomp_cellcomp'\n",
    "df_cc_cc['display_relation'] = 'parent-child'\n",
    "df_cc_cc['x_source'] = 'GO'\n",
    "df_cc_cc['y_source'] = 'GO'\n",
    "df_cc_cc = clean_edges(df_cc_cc)\n",
    "df_cc_cc.head(1)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "### Go protein interactions (Gene2GO)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 154,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:18:22.059413Z",
     "start_time": "2021-08-06T12:18:21.156638Z"
    },
    "hidden": true
   },
   "outputs": [],
   "source": [
    "df_prot_path = pd.merge(df_gene2go, df_go_terms, 'inner', 'go_term_id').rename(columns={'go_term_type_x':'go_term_type'})\n",
    "df_prot_path = pd.merge(df_prot_path, df_prot_names, 'left', left_on='ncbi_gene_id', right_on='ncbi_id')\n",
    "df_prot_path = df_prot_path.rename(columns={'ncbi_gene_id':'x_id', 'symbol':'x_name', \n",
    "                             'go_term_id':'y_id','go_term_name':'y_name', 'go_term_type':'y_type'})\n",
    "df_prot_path['x_type'] = 'gene/protein'\n",
    "df_prot_path['x_source'] = 'NCBI'\n",
    "df_prot_path['y_source'] = 'GO'\n",
    "df_prot_path = df_prot_path.get(['x_id','x_type', 'x_name', 'x_source','y_id','y_type', 'y_name', 'y_source'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 155,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:18:22.492514Z",
     "start_time": "2021-08-06T12:18:22.065000Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>relation</th>\n",
       "      <th>display_relation</th>\n",
       "      <th>x_id</th>\n",
       "      <th>x_type</th>\n",
       "      <th>x_name</th>\n",
       "      <th>x_source</th>\n",
       "      <th>y_id</th>\n",
       "      <th>y_type</th>\n",
       "      <th>y_name</th>\n",
       "      <th>y_source</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>protein_molfunc</td>\n",
       "      <td>interacts with</td>\n",
       "      <td>2</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>A2M</td>\n",
       "      <td>NCBI</td>\n",
       "      <td>19966</td>\n",
       "      <td>molecular_function</td>\n",
       "      <td>interleukin-1 binding</td>\n",
       "      <td>GO</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          relation display_relation x_id        x_type x_name x_source   y_id  \\\n",
       "0  protein_molfunc   interacts with    2  gene/protein    A2M     NCBI  19966   \n",
       "\n",
       "               y_type                 y_name y_source  \n",
       "0  molecular_function  interleukin-1 binding       GO  "
      ]
     },
     "execution_count": 155,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_prot_mf = df_prot_path.query('y_type==\"molecular_function\"').copy()\n",
    "df_prot_mf['relation'] = 'protein_molfunc'\n",
    "df_prot_mf['display_relation'] = 'interacts with'\n",
    "df_prot_mf = clean_edges(df_prot_mf)\n",
    "df_prot_mf.head(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 156,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:18:22.897652Z",
     "start_time": "2021-08-06T12:18:22.499167Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>relation</th>\n",
       "      <th>display_relation</th>\n",
       "      <th>x_id</th>\n",
       "      <th>x_type</th>\n",
       "      <th>x_name</th>\n",
       "      <th>x_source</th>\n",
       "      <th>y_id</th>\n",
       "      <th>y_type</th>\n",
       "      <th>y_name</th>\n",
       "      <th>y_source</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>214459</th>\n",
       "      <td>protein_cellcomp</td>\n",
       "      <td>interacts with</td>\n",
       "      <td>1</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>A1BG</td>\n",
       "      <td>NCBI</td>\n",
       "      <td>1904813</td>\n",
       "      <td>cellular_component</td>\n",
       "      <td>ficolin-1-rich granule lumen</td>\n",
       "      <td>GO</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                relation display_relation x_id        x_type x_name x_source  \\\n",
       "214459  protein_cellcomp   interacts with    1  gene/protein   A1BG     NCBI   \n",
       "\n",
       "           y_id              y_type                        y_name y_source  \n",
       "214459  1904813  cellular_component  ficolin-1-rich granule lumen       GO  "
      ]
     },
     "execution_count": 156,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_prot_cc = df_prot_path.query('y_type==\"cellular_component\"').copy()\n",
    "df_prot_cc['relation'] = 'protein_cellcomp'\n",
    "df_prot_cc['display_relation'] = 'interacts with'\n",
    "df_prot_cc = clean_edges(df_prot_cc)\n",
    "df_prot_cc.head(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 157,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:18:23.584170Z",
     "start_time": "2021-08-06T12:18:22.904825Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>relation</th>\n",
       "      <th>display_relation</th>\n",
       "      <th>x_id</th>\n",
       "      <th>x_type</th>\n",
       "      <th>x_name</th>\n",
       "      <th>x_source</th>\n",
       "      <th>y_id</th>\n",
       "      <th>y_type</th>\n",
       "      <th>y_name</th>\n",
       "      <th>y_source</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>69588</th>\n",
       "      <td>protein_bioprocess</td>\n",
       "      <td>interacts with</td>\n",
       "      <td>1</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>A1BG</td>\n",
       "      <td>NCBI</td>\n",
       "      <td>43312</td>\n",
       "      <td>biological_process</td>\n",
       "      <td>neutrophil degranulation</td>\n",
       "      <td>GO</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                 relation display_relation x_id        x_type x_name x_source  \\\n",
       "69588  protein_bioprocess   interacts with    1  gene/protein   A1BG     NCBI   \n",
       "\n",
       "        y_id              y_type                    y_name y_source  \n",
       "69588  43312  biological_process  neutrophil degranulation       GO  "
      ]
     },
     "execution_count": 157,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_prot_bp = df_prot_path.query('y_type==\"biological_process\"').copy()\n",
    "df_prot_bp['relation'] = 'protein_bioprocess'\n",
    "df_prot_bp['display_relation'] = 'interacts with'\n",
    "df_prot_bp = clean_edges(df_prot_bp)\n",
    "df_prot_bp.head(1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "## Exposure"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "### Exposure protein interactions (CTD)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 158,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:18:42.557753Z",
     "start_time": "2021-08-06T12:18:23.586715Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>relation</th>\n",
       "      <th>display_relation</th>\n",
       "      <th>x_id</th>\n",
       "      <th>x_type</th>\n",
       "      <th>x_name</th>\n",
       "      <th>x_source</th>\n",
       "      <th>y_id</th>\n",
       "      <th>y_type</th>\n",
       "      <th>y_name</th>\n",
       "      <th>y_source</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>exposure_protein</td>\n",
       "      <td>interacts with</td>\n",
       "      <td>C092102</td>\n",
       "      <td>exposure</td>\n",
       "      <td>1-hydroxyphenanthrene</td>\n",
       "      <td>CTD</td>\n",
       "      <td>1401</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>CRP</td>\n",
       "      <td>NCBI</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           relation display_relation     x_id    x_type  \\\n",
       "0  exposure_protein   interacts with  C092102  exposure   \n",
       "\n",
       "                  x_name x_source  y_id        y_type y_name y_source  \n",
       "0  1-hydroxyphenanthrene      CTD  1401  gene/protein    CRP     NCBI  "
      ]
     },
     "execution_count": 158,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_exp_prot = df_exposures.get(['exposurestressorname', 'exposurestressorid','exposuremarker', 'exposuremarkerid'])\n",
    "df_exp_prot = df_exp_prot.loc[df_exp_prot.get(['exposuremarkerid']).dropna().index, :]\n",
    "\n",
    "gene_row_index = []\n",
    "for idx, data in df_exp_prot.iterrows():\n",
    "    if data.exposuremarkerid.isnumeric(): \n",
    "        gene_row_index.append(idx)\n",
    "\n",
    "df_exp_prot = df_exp_prot.loc[gene_row_index, :].astype({'exposuremarkerid': 'int'}).astype({'exposuremarkerid': 'str'})\n",
    "df_exp_prot = pd.merge(df_exp_prot, df_prot_names, 'left', left_on='exposuremarkerid', right_on='ncbi_id')\n",
    "\n",
    "df_exp_prot = df_exp_prot.rename(columns={'exposurestressorid':'x_id', 'exposurestressorname':'x_name', 'ncbi_id':'y_id', 'symbol':'y_name'})\n",
    "df_exp_prot['x_type'] = 'exposure'\n",
    "df_exp_prot['x_source'] = 'CTD'\n",
    "df_exp_prot['y_type'] = 'gene/protein'\n",
    "df_exp_prot['y_source'] = 'NCBI'\n",
    "df_exp_prot['relation'] = 'exposure_protein'\n",
    "df_exp_prot['display_relation'] = 'interacts with'\n",
    "df_exp_prot = clean_edges(df_exp_prot)\n",
    "df_exp_prot.head(1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "### Exposure disease interactions (CTD)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 159,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:18:43.131769Z",
     "start_time": "2021-08-06T12:18:42.744281Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>relation</th>\n",
       "      <th>display_relation</th>\n",
       "      <th>x_id</th>\n",
       "      <th>x_type</th>\n",
       "      <th>x_name</th>\n",
       "      <th>x_source</th>\n",
       "      <th>y_id</th>\n",
       "      <th>y_type</th>\n",
       "      <th>y_name</th>\n",
       "      <th>y_source</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>exposure_disease</td>\n",
       "      <td>linked to</td>\n",
       "      <td>C024566</td>\n",
       "      <td>exposure</td>\n",
       "      <td>1,1,1-trichloroethane</td>\n",
       "      <td>CTD</td>\n",
       "      <td>4976</td>\n",
       "      <td>disease</td>\n",
       "      <td>amyotrophic lateral sclerosis</td>\n",
       "      <td>MONDO</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           relation display_relation     x_id    x_type  \\\n",
       "0  exposure_disease        linked to  C024566  exposure   \n",
       "\n",
       "                  x_name x_source  y_id   y_type  \\\n",
       "0  1,1,1-trichloroethane      CTD  4976  disease   \n",
       "\n",
       "                          y_name y_source  \n",
       "0  amyotrophic lateral sclerosis    MONDO  "
      ]
     },
     "execution_count": 159,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_exp_dis = df_exposures.get(['exposurestressorname', 'exposurestressorid','diseasename', 'diseaseid'])\n",
    "df_exp_dis = df_exp_dis.loc[df_exp_dis.get(['diseaseid']).dropna().index, :]\n",
    "df_exp_dis = pd.merge(df_exp_dis, df_mondo_xref.query('ontology==\"MESH\"'), 'left', left_on='diseaseid', right_on='ontology_id')\n",
    "df_exp_dis = pd.merge(df_exp_dis, df_mondo_terms, 'left', left_on='mondo_id', right_on= 'id')\n",
    "\n",
    "df_exp_dis = df_exp_dis.rename(columns={'exposurestressorid':'x_id', 'exposurestressorname':'x_name', 'mondo_id':'y_id', 'name':'y_name'})\n",
    "df_exp_dis['x_type'] = 'exposure'\n",
    "df_exp_dis['x_source'] = 'CTD'\n",
    "df_exp_dis['y_type'] = 'disease'\n",
    "df_exp_dis['y_source'] = 'MONDO'\n",
    "df_exp_dis['relation'] = 'exposure_disease'\n",
    "df_exp_dis['display_relation'] = 'linked to'\n",
    "df_exp_dis = df_exp_dis.query('y_id not in @mondo_r_hp_ids') # Michelle added\n",
    "df_exp_dis = clean_edges(df_exp_dis)\n",
    "df_exp_dis.head(1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "### Exposure exposure interactions (CTD)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 160,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:18:43.918651Z",
     "start_time": "2021-08-06T12:18:43.138177Z"
    },
    "hidden": true
   },
   "outputs": [],
   "source": [
    "exposures = np.unique(df_exposures.get('exposurestressorid').values)\n",
    "df_exp_exp = df_exposures.query('exposuremarkerid in @exposures')\n",
    "\n",
    "df_exp_exp = df_exp_exp.get(['exposurestressorname', 'exposurestressorid','exposuremarker', 'exposuremarkerid'])\n",
    "df_exp_exp = df_exp_exp.loc[df_exp_exp.get(['exposuremarkerid']).dropna().index, :]\n",
    "df_exp_exp = df_exp_exp.drop_duplicates()\n",
    "\n",
    "df_exp_exp = df_exp_exp.rename(columns={'exposurestressorid':'x_id', 'exposurestressorname':'x_name', 'exposuremarker':'y_name', 'exposuremarkerid':'y_id'})\n",
    "df_exp_exp['x_type'] = 'exposure'\n",
    "df_exp_exp['x_source'] = 'CTD'\n",
    "df_exp_exp['y_type'] = 'exposure'\n",
    "df_exp_exp['y_source'] = 'CTD'\n",
    "df_exp_exp['relation'] = 'exposure_exposure'\n",
    "df_exp_exp['display_relation'] = 'parent-child'\n",
    "df_exp_exp = clean_edges(df_exp_exp)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "### Exposure pathway interactions (CTD)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 161,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:18:44.045651Z",
     "start_time": "2021-08-06T12:18:43.924387Z"
    },
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# phenotypes are actually pathways \n",
    "\n",
    "df_exp_path = df_exposures.get(['exposurestressorname', 'exposurestressorid','phenotypename', 'phenotypeid'])\n",
    "df_exp_path = df_exp_path.loc[df_exp_path.get(['phenotypeid']).dropna().index, :]\n",
    "df_exp_path.loc[:, 'phenotypeid'] = [str(int(x.split(':')[1])) for x in df_exp_path.get(['phenotypeid']).values.reshape(-1)]\n",
    "df_exp_path = df_exp_path.drop_duplicates()\n",
    "df_exp_path = pd.merge(df_exp_path, df_go_terms, 'inner', left_on='phenotypeid', right_on='go_term_id')\n",
    "df_exp_path = df_exp_path.rename(columns={'exposurestressorid':'x_id', 'exposurestressorname':'x_name', \n",
    "                                          'go_term_id':'y_id', 'go_term_name':'y_name', 'go_term_type':'y_type'})\n",
    "df_exp_path['x_type'] = 'exposure'\n",
    "df_exp_path['x_source'] = 'CTD'\n",
    "df_exp_path['y_source'] = 'GO'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 162,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:18:44.139834Z",
     "start_time": "2021-08-06T12:18:44.048929Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>relation</th>\n",
       "      <th>display_relation</th>\n",
       "      <th>x_id</th>\n",
       "      <th>x_type</th>\n",
       "      <th>x_name</th>\n",
       "      <th>x_source</th>\n",
       "      <th>y_id</th>\n",
       "      <th>y_type</th>\n",
       "      <th>y_name</th>\n",
       "      <th>y_source</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>exposure_bioprocess</td>\n",
       "      <td>interacts with</td>\n",
       "      <td>C046839</td>\n",
       "      <td>exposure</td>\n",
       "      <td>1,2,3,4,6,7,8-heptachlorodibenzodioxin</td>\n",
       "      <td>CTD</td>\n",
       "      <td>8217</td>\n",
       "      <td>biological_process</td>\n",
       "      <td>regulation of blood pressure</td>\n",
       "      <td>GO</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              relation display_relation     x_id    x_type  \\\n",
       "0  exposure_bioprocess   interacts with  C046839  exposure   \n",
       "\n",
       "                                   x_name x_source  y_id              y_type  \\\n",
       "0  1,2,3,4,6,7,8-heptachlorodibenzodioxin      CTD  8217  biological_process   \n",
       "\n",
       "                         y_name y_source  \n",
       "0  regulation of blood pressure       GO  "
      ]
     },
     "execution_count": 162,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_exp_bp = df_exp_path.query('y_type==\"biological_process\"').copy()\n",
    "df_exp_bp['relation'] = 'exposure_bioprocess'\n",
    "df_exp_bp['display_relation'] = 'interacts with'\n",
    "df_exp_bp = clean_edges(df_exp_bp)\n",
    "df_exp_bp.head(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 163,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:18:44.221350Z",
     "start_time": "2021-08-06T12:18:44.143253Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>relation</th>\n",
       "      <th>display_relation</th>\n",
       "      <th>x_id</th>\n",
       "      <th>x_type</th>\n",
       "      <th>x_name</th>\n",
       "      <th>x_source</th>\n",
       "      <th>y_id</th>\n",
       "      <th>y_type</th>\n",
       "      <th>y_name</th>\n",
       "      <th>y_source</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>527</th>\n",
       "      <td>exposure_molfunc</td>\n",
       "      <td>interacts with</td>\n",
       "      <td>C014024</td>\n",
       "      <td>exposure</td>\n",
       "      <td>2,4,5,2',4',5'-hexachlorobiphenyl</td>\n",
       "      <td>CTD</td>\n",
       "      <td>19766</td>\n",
       "      <td>molecular_function</td>\n",
       "      <td>IgA receptor activity</td>\n",
       "      <td>GO</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             relation display_relation     x_id    x_type  \\\n",
       "527  exposure_molfunc   interacts with  C014024  exposure   \n",
       "\n",
       "                                x_name x_source   y_id              y_type  \\\n",
       "527  2,4,5,2',4',5'-hexachlorobiphenyl      CTD  19766  molecular_function   \n",
       "\n",
       "                    y_name y_source  \n",
       "527  IgA receptor activity       GO  "
      ]
     },
     "execution_count": 163,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_exp_mf = df_exp_path.query('y_type==\"molecular_function\"').copy()\n",
    "df_exp_mf['relation'] = 'exposure_molfunc'\n",
    "df_exp_mf['display_relation'] = 'interacts with'\n",
    "df_exp_mf = clean_edges(df_exp_mf)\n",
    "df_exp_mf.head(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 164,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:18:44.291345Z",
     "start_time": "2021-08-06T12:18:44.223480Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>relation</th>\n",
       "      <th>display_relation</th>\n",
       "      <th>x_id</th>\n",
       "      <th>x_type</th>\n",
       "      <th>x_name</th>\n",
       "      <th>x_source</th>\n",
       "      <th>y_id</th>\n",
       "      <th>y_type</th>\n",
       "      <th>y_name</th>\n",
       "      <th>y_source</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>833</th>\n",
       "      <td>exposure_cellcomp</td>\n",
       "      <td>interacts with</td>\n",
       "      <td>D000393</td>\n",
       "      <td>exposure</td>\n",
       "      <td>Air Pollutants</td>\n",
       "      <td>CTD</td>\n",
       "      <td>71743</td>\n",
       "      <td>cellular_component</td>\n",
       "      <td>IgE immunoglobulin complex, circulating</td>\n",
       "      <td>GO</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              relation display_relation     x_id    x_type          x_name  \\\n",
       "833  exposure_cellcomp   interacts with  D000393  exposure  Air Pollutants   \n",
       "\n",
       "    x_source   y_id              y_type  \\\n",
       "833      CTD  71743  cellular_component   \n",
       "\n",
       "                                      y_name y_source  \n",
       "833  IgE immunoglobulin complex, circulating       GO  "
      ]
     },
     "execution_count": 164,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_exp_cc = df_exp_path.query('y_type==\"cellular_component\"').copy()\n",
    "df_exp_cc['relation'] = 'exposure_cellcomp'\n",
    "df_exp_cc['display_relation'] = 'interacts with'\n",
    "df_exp_cc = clean_edges(df_exp_cc)\n",
    "df_exp_cc.head(1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "## Anatomy"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "### Anatomy anatomy interactions (UBERON) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 165,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:18:44.470250Z",
     "start_time": "2021-08-06T12:18:44.294967Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>relation</th>\n",
       "      <th>display_relation</th>\n",
       "      <th>x_id</th>\n",
       "      <th>x_type</th>\n",
       "      <th>x_name</th>\n",
       "      <th>x_source</th>\n",
       "      <th>y_id</th>\n",
       "      <th>y_type</th>\n",
       "      <th>y_name</th>\n",
       "      <th>y_source</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>anatomy_anatomy</td>\n",
       "      <td>parent-child</td>\n",
       "      <td>2</td>\n",
       "      <td>anatomy</td>\n",
       "      <td>uterine cervix</td>\n",
       "      <td>UBERON</td>\n",
       "      <td>5156</td>\n",
       "      <td>anatomy</td>\n",
       "      <td>reproductive structure</td>\n",
       "      <td>UBERON</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          relation display_relation x_id   x_type          x_name x_source  \\\n",
       "0  anatomy_anatomy     parent-child    2  anatomy  uterine cervix   UBERON   \n",
       "\n",
       "   y_id   y_type                  y_name y_source  \n",
       "0  5156  anatomy  reproductive structure   UBERON  "
      ]
     },
     "execution_count": 165,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_ana_ana = pd.merge(df_uberon_is_a, df_uberon_terms, 'left', left_on='id', right_on='id')\n",
    "df_ana_ana = df_ana_ana.rename(columns={'id':'x_id', 'name':'x_name'})\n",
    "df_ana_ana = pd.merge(df_ana_ana, df_uberon_terms, 'left', left_on='is_a', right_on='id')\n",
    "df_ana_ana = df_ana_ana.rename(columns={'id':'y_id', 'name':'y_name'})\n",
    "df_ana_ana['x_type'] = 'anatomy'\n",
    "df_ana_ana['x_source'] = 'UBERON'\n",
    "df_ana_ana['y_type'] = 'anatomy'\n",
    "df_ana_ana['y_source'] = 'UBERON'\n",
    "df_ana_ana['relation'] = 'anatomy_anatomy'\n",
    "df_ana_ana['display_relation'] = 'parent-child'\n",
    "df_ana_ana = clean_edges(df_ana_ana)\n",
    "df_ana_ana.head(1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "### Anatomy Protein (BGEE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 166,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:18:46.577643Z",
     "start_time": "2021-08-06T12:18:44.475187Z"
    },
    "hidden": true
   },
   "outputs": [],
   "source": [
    "df_bgee = pd.merge(df_bgee, df_prot_names, 'inner', left_on='gene_name', right_on='symbol')\n",
    "df_bgee = df_bgee.rename(columns={'ncbi_id':'x_id', 'symbol':'x_name', \n",
    "                                  'anatomy_id':'y_id', 'anatomy_name':'y_name'})\n",
    "df_bgee['x_source'] = 'NCBI'\n",
    "df_bgee['x_type'] = 'gene/protein'\n",
    "df_bgee['y_source'] = 'UBERON'\n",
    "df_bgee['y_type'] = 'anatomy'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 167,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:18:50.843665Z",
     "start_time": "2021-08-06T12:18:46.579406Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>relation</th>\n",
       "      <th>display_relation</th>\n",
       "      <th>x_id</th>\n",
       "      <th>x_type</th>\n",
       "      <th>x_name</th>\n",
       "      <th>x_source</th>\n",
       "      <th>y_id</th>\n",
       "      <th>y_type</th>\n",
       "      <th>y_name</th>\n",
       "      <th>y_source</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>protein_present_anatomy</td>\n",
       "      <td>expression present</td>\n",
       "      <td>7105</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>TSPAN6</td>\n",
       "      <td>NCBI</td>\n",
       "      <td>2</td>\n",
       "      <td>anatomy</td>\n",
       "      <td>uterine cervix</td>\n",
       "      <td>UBERON</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                  relation    display_relation  x_id        x_type  x_name  \\\n",
       "0  protein_present_anatomy  expression present  7105  gene/protein  TSPAN6   \n",
       "\n",
       "  x_source y_id   y_type          y_name y_source  \n",
       "0     NCBI    2  anatomy  uterine cervix   UBERON  "
      ]
     },
     "execution_count": 167,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Edges for BGEE rows whose expression value is \"present\".\n",
    "df_ana_prot_pos = (\n",
    "    df_bgee\n",
    "    .query('expression==\"present\"')\n",
    "    .assign(\n",
    "        relation='protein_present_anatomy',\n",
    "        display_relation='expression present',\n",
    "    )\n",
    "    .pipe(clean_edges)\n",
    ")\n",
    "df_ana_prot_pos.head(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 168,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:18:51.146611Z",
     "start_time": "2021-08-06T12:18:50.858329Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>relation</th>\n",
       "      <th>display_relation</th>\n",
       "      <th>x_id</th>\n",
       "      <th>x_type</th>\n",
       "      <th>x_name</th>\n",
       "      <th>x_source</th>\n",
       "      <th>y_id</th>\n",
       "      <th>y_type</th>\n",
       "      <th>y_name</th>\n",
       "      <th>y_source</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>507</th>\n",
       "      <td>protein_absent_anatomy</td>\n",
       "      <td>expression absent</td>\n",
       "      <td>2268</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>FGR</td>\n",
       "      <td>NCBI</td>\n",
       "      <td>1476</td>\n",
       "      <td>anatomy</td>\n",
       "      <td>deltoid</td>\n",
       "      <td>UBERON</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   relation   display_relation  x_id        x_type x_name  \\\n",
       "507  protein_absent_anatomy  expression absent  2268  gene/protein    FGR   \n",
       "\n",
       "    x_source  y_id   y_type   y_name y_source  \n",
       "507     NCBI  1476  anatomy  deltoid   UBERON  "
      ]
     },
     "execution_count": 168,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Edges for BGEE rows whose expression value is \"absent\".\n",
    "df_ana_prot_neg = (\n",
    "    df_bgee\n",
    "    .query('expression==\"absent\"')\n",
    "    .assign(\n",
    "        relation='protein_absent_anatomy',\n",
    "        display_relation='expression absent',\n",
    "    )\n",
    "    .pipe(clean_edges)\n",
    ")\n",
    "df_ana_prot_neg.head(1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "## Pathways"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 169,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:18:51.324577Z",
     "start_time": "2021-08-06T12:18:51.152492Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>relation</th>\n",
       "      <th>display_relation</th>\n",
       "      <th>x_id</th>\n",
       "      <th>x_type</th>\n",
       "      <th>x_name</th>\n",
       "      <th>x_source</th>\n",
       "      <th>y_id</th>\n",
       "      <th>y_type</th>\n",
       "      <th>y_name</th>\n",
       "      <th>y_source</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>pathway_pathway</td>\n",
       "      <td>parent-child</td>\n",
       "      <td>R-HSA-109581</td>\n",
       "      <td>pathway</td>\n",
       "      <td>Apoptosis</td>\n",
       "      <td>REACTOME</td>\n",
       "      <td>R-HSA-109606</td>\n",
       "      <td>pathway</td>\n",
       "      <td>Intrinsic Pathway for Apoptosis</td>\n",
       "      <td>REACTOME</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          relation display_relation          x_id   x_type     x_name  \\\n",
       "0  pathway_pathway     parent-child  R-HSA-109581  pathway  Apoptosis   \n",
       "\n",
       "   x_source          y_id   y_type                           y_name  y_source  \n",
       "0  REACTOME  R-HSA-109606  pathway  Intrinsic Pathway for Apoptosis  REACTOME  "
      ]
     },
     "execution_count": 169,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Pathway-pathway edges: resolve both ids of each Reactome relation against the terms table.\n",
    "# Inner joins are used, so relations with an id missing from df_reactome_terms are dropped.\n",
    "df_path_path = (\n",
    "    df_reactome_rels\n",
    "    .merge(df_reactome_terms, how='inner', left_on='reactome_id_1', right_on='reactome_id')\n",
    "    .rename(columns={'reactome_id': 'x_id', 'reactome_name': 'x_name'})\n",
    "    .merge(df_reactome_terms, how='inner', left_on='reactome_id_2', right_on='reactome_id')\n",
    "    .rename(columns={'reactome_id': 'y_id', 'reactome_name': 'y_name'})\n",
    "    .assign(\n",
    "        x_source='REACTOME',\n",
    "        x_type='pathway',\n",
    "        y_source='REACTOME',\n",
    "        y_type='pathway',\n",
    "        relation='pathway_pathway',\n",
    "        display_relation='parent-child',\n",
    "    )\n",
    "    .pipe(clean_edges)\n",
    ")\n",
    "df_path_path.head(1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "### Pathway protein interactions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 170,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:18:51.616244Z",
     "start_time": "2021-08-06T12:18:51.328730Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>relation</th>\n",
       "      <th>display_relation</th>\n",
       "      <th>x_id</th>\n",
       "      <th>x_type</th>\n",
       "      <th>x_name</th>\n",
       "      <th>x_source</th>\n",
       "      <th>y_id</th>\n",
       "      <th>y_type</th>\n",
       "      <th>y_name</th>\n",
       "      <th>y_source</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>protein_pathway</td>\n",
       "      <td>interacts with</td>\n",
       "      <td>1</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>A1BG</td>\n",
       "      <td>NCBI</td>\n",
       "      <td>R-HSA-114608</td>\n",
       "      <td>pathway</td>\n",
       "      <td>Platelet degranulation</td>\n",
       "      <td>REACTOME</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          relation display_relation x_id        x_type x_name x_source  \\\n",
       "0  protein_pathway   interacts with    1  gene/protein   A1BG     NCBI   \n",
       "\n",
       "           y_id   y_type                   y_name  y_source  \n",
       "0  R-HSA-114608  pathway  Platelet degranulation   REACTOME  "
      ]
     },
     "execution_count": 170,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Protein-pathway edges: inner-join the Reactome/NCBI table to df_prot_names on ncbi_id.\n",
    "df_path_prot = (\n",
    "    df_reactome_ncbi\n",
    "    .merge(df_prot_names, how='inner', on='ncbi_id')\n",
    "    .rename(columns={\n",
    "        'ncbi_id': 'x_id',\n",
    "        'symbol': 'x_name',\n",
    "        'reactome_id': 'y_id',\n",
    "        'reactome_name': 'y_name',\n",
    "    })\n",
    "    .assign(\n",
    "        x_source='NCBI',\n",
    "        x_type='gene/protein',\n",
    "        y_source='REACTOME',\n",
    "        y_type='pathway',\n",
    "        relation='protein_pathway',\n",
    "        display_relation='interacts with',\n",
    "    )\n",
    "    .pipe(clean_edges)\n",
    ")\n",
    "df_path_prot.head(1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "# Compiling knowledge graph"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 171,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "df_prot_prot\n",
      "['gene/protein']\n",
      "['gene/protein']\n",
      "['protein_protein']\n",
      "['ppi']\n",
      "\n",
      "df_prot_drug\n",
      "['drug']\n",
      "['gene/protein']\n",
      "['drug_protein']\n",
      "['carrier' 'enzyme' 'target' 'transporter']\n",
      "\n",
      "df_drug_dis\n",
      "['drug']\n",
      "['disease']\n",
      "['contraindication' 'indication' 'off-label use']\n",
      "['contraindication' 'indication' 'off-label use']\n",
      "\n",
      "df_drug_drug\n",
      "['drug']\n",
      "['drug']\n",
      "['drug_drug']\n",
      "['synergistic interaction']\n",
      "\n",
      "df_phe_prot\n",
      "['effect/phenotype']\n",
      "['gene/protein']\n",
      "['phenotype_protein']\n",
      "['associated with']\n",
      "\n",
      "df_phe_phe\n",
      "['effect/phenotype']\n",
      "['effect/phenotype']\n",
      "['phenotype_phenotype']\n",
      "['parent-child']\n",
      "\n",
      "df_dis_phe_neg\n",
      "['disease']\n",
      "['effect/phenotype']\n",
      "['disease_phenotype_negative']\n",
      "['phenotype absent']\n",
      "\n",
      "df_dis_phe_pos\n",
      "['disease']\n",
      "['effect/phenotype']\n",
      "['disease_phenotype_positive']\n",
      "['phenotype present']\n",
      "\n",
      "df_dis_prot\n",
      "['disease']\n",
      "['gene/protein']\n",
      "['disease_protein']\n",
      "['associated with']\n",
      "\n",
      "df_dis_dis\n",
      "['disease']\n",
      "['disease']\n",
      "['disease_disease']\n",
      "['parent-child']\n",
      "\n",
      "df_drug_effect\n",
      "['drug']\n",
      "['effect/phenotype']\n",
      "['drug_effect']\n",
      "['side effect']\n",
      "\n",
      "df_bp_bp\n",
      "['biological_process']\n",
      "['biological_process']\n",
      "['bioprocess_bioprocess']\n",
      "['parent-child']\n",
      "\n",
      "df_mf_mf\n",
      "['molecular_function']\n",
      "['molecular_function']\n",
      "['molfunc_molfunc']\n",
      "['parent-child']\n",
      "\n",
      "df_cc_cc\n",
      "['cellular_component']\n",
      "['cellular_component']\n",
      "['cellcomp_cellcomp']\n",
      "['parent-child']\n",
      "\n",
      "df_prot_mf\n",
      "['gene/protein']\n",
      "['molecular_function']\n",
      "['protein_molfunc']\n",
      "['interacts with']\n",
      "\n",
      "df_prot_cc\n",
      "['gene/protein']\n",
      "['cellular_component']\n",
      "['protein_cellcomp']\n",
      "['interacts with']\n",
      "\n",
      "df_prot_bp\n",
      "['gene/protein']\n",
      "['biological_process']\n",
      "['protein_bioprocess']\n",
      "['interacts with']\n",
      "\n",
      "df_exp_prot\n",
      "['exposure']\n",
      "['gene/protein']\n",
      "['exposure_protein']\n",
      "['interacts with']\n",
      "\n",
      "df_exp_dis\n",
      "['exposure']\n",
      "['disease']\n",
      "['exposure_disease']\n",
      "['linked to']\n",
      "\n",
      "df_exp_exp\n",
      "['exposure']\n",
      "['exposure']\n",
      "['exposure_exposure']\n",
      "['parent-child']\n",
      "\n",
      "df_exp_bp\n",
      "['exposure']\n",
      "['biological_process']\n",
      "['exposure_bioprocess']\n",
      "['interacts with']\n",
      "\n",
      "df_exp_mf\n",
      "['exposure']\n",
      "['molecular_function']\n",
      "['exposure_molfunc']\n",
      "['interacts with']\n",
      "\n",
      "df_exp_cc\n",
      "['exposure']\n",
      "['cellular_component']\n",
      "['exposure_cellcomp']\n",
      "['interacts with']\n",
      "\n",
      "df_path_path\n",
      "['pathway']\n",
      "['pathway']\n",
      "['pathway_pathway']\n",
      "['parent-child']\n",
      "\n",
      "df_path_prot\n",
      "['gene/protein']\n",
      "['pathway']\n",
      "['protein_pathway']\n",
      "['interacts with']\n",
      "\n",
      "df_ana_ana\n",
      "['anatomy']\n",
      "['anatomy']\n",
      "['anatomy_anatomy']\n",
      "['parent-child']\n",
      "\n",
      "df_ana_prot_pos\n",
      "['gene/protein']\n",
      "['anatomy']\n",
      "['protein_present_anatomy']\n",
      "['expression present']\n",
      "\n",
      "df_ana_prot_neg\n",
      "['gene/protein']\n",
      "['anatomy']\n",
      "['protein_absent_anatomy']\n",
      "['expression absent']\n"
     ]
    }
   ],
   "source": [
    "# Sanity check before compiling: for every edge table, list the distinct node types on each\n",
    "# side and the distinct relation / display_relation labels it contributes.\n",
    "edge_frames = {\n",
    "    'df_prot_prot': df_prot_prot,\n",
    "    'df_prot_drug': df_prot_drug,\n",
    "    'df_drug_dis': df_drug_dis,\n",
    "    'df_drug_drug': df_drug_drug,\n",
    "    'df_phe_prot': df_phe_prot,\n",
    "    'df_phe_phe': df_phe_phe,\n",
    "    'df_dis_phe_neg': df_dis_phe_neg,\n",
    "    'df_dis_phe_pos': df_dis_phe_pos,\n",
    "    'df_dis_prot': df_dis_prot,\n",
    "    'df_dis_dis': df_dis_dis,\n",
    "    'df_drug_effect': df_drug_effect,\n",
    "    'df_bp_bp': df_bp_bp,\n",
    "    'df_mf_mf': df_mf_mf,\n",
    "    'df_cc_cc': df_cc_cc,\n",
    "    'df_prot_mf': df_prot_mf,\n",
    "    'df_prot_cc': df_prot_cc,\n",
    "    'df_prot_bp': df_prot_bp,\n",
    "    'df_exp_prot': df_exp_prot,\n",
    "    'df_exp_dis': df_exp_dis,\n",
    "    'df_exp_exp': df_exp_exp,\n",
    "    'df_exp_bp': df_exp_bp,\n",
    "    'df_exp_mf': df_exp_mf,\n",
    "    'df_exp_cc': df_exp_cc,\n",
    "    'df_path_path': df_path_path,\n",
    "    'df_path_prot': df_path_prot,\n",
    "    'df_ana_ana': df_ana_ana,\n",
    "    'df_ana_prot_pos': df_ana_prot_pos,\n",
    "    'df_ana_prot_neg': df_ana_prot_neg,\n",
    "}\n",
    "summary_columns = ('x_type', 'y_type', 'relation', 'display_relation')\n",
    "\n",
    "for position, (frame_name, frame) in enumerate(edge_frames.items()):\n",
    "    # A blank line separates consecutive summaries; none is printed before the first one.\n",
    "    print(frame_name if position == 0 else '\\n' + frame_name)\n",
    "    for column in summary_columns:\n",
    "        print(frame[column].unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 172,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:19:32.597604Z",
     "start_time": "2021-08-06T12:18:51.620711Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>relation</th>\n",
       "      <th>display_relation</th>\n",
       "      <th>x_id</th>\n",
       "      <th>x_type</th>\n",
       "      <th>x_name</th>\n",
       "      <th>x_source</th>\n",
       "      <th>y_id</th>\n",
       "      <th>y_type</th>\n",
       "      <th>y_name</th>\n",
       "      <th>y_source</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1539160</th>\n",
       "      <td>protein_absent_anatomy</td>\n",
       "      <td>expression absent</td>\n",
       "      <td>140</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>ADORA3</td>\n",
       "      <td>NCBI</td>\n",
       "      <td>4720</td>\n",
       "      <td>anatomy</td>\n",
       "      <td>cerebellar vermis</td>\n",
       "      <td>UBERON</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1539470</th>\n",
       "      <td>protein_absent_anatomy</td>\n",
       "      <td>expression absent</td>\n",
       "      <td>105378952</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>KLF18</td>\n",
       "      <td>NCBI</td>\n",
       "      <td>1377</td>\n",
       "      <td>anatomy</td>\n",
       "      <td>quadriceps femoris</td>\n",
       "      <td>UBERON</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1539471</th>\n",
       "      <td>protein_absent_anatomy</td>\n",
       "      <td>expression absent</td>\n",
       "      <td>105378952</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>KLF18</td>\n",
       "      <td>NCBI</td>\n",
       "      <td>1379</td>\n",
       "      <td>anatomy</td>\n",
       "      <td>vastus lateralis</td>\n",
       "      <td>UBERON</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1539472</th>\n",
       "      <td>protein_absent_anatomy</td>\n",
       "      <td>expression absent</td>\n",
       "      <td>105378952</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>KLF18</td>\n",
       "      <td>NCBI</td>\n",
       "      <td>2084</td>\n",
       "      <td>anatomy</td>\n",
       "      <td>heart left ventricle</td>\n",
       "      <td>UBERON</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1539473</th>\n",
       "      <td>protein_absent_anatomy</td>\n",
       "      <td>expression absent</td>\n",
       "      <td>105378952</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>KLF18</td>\n",
       "      <td>NCBI</td>\n",
       "      <td>5384</td>\n",
       "      <td>anatomy</td>\n",
       "      <td>nasal cavity epithelium</td>\n",
       "      <td>UBERON</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                       relation   display_relation       x_id        x_type  \\\n",
       "1539160  protein_absent_anatomy  expression absent        140  gene/protein   \n",
       "1539470  protein_absent_anatomy  expression absent  105378952  gene/protein   \n",
       "1539471  protein_absent_anatomy  expression absent  105378952  gene/protein   \n",
       "1539472  protein_absent_anatomy  expression absent  105378952  gene/protein   \n",
       "1539473  protein_absent_anatomy  expression absent  105378952  gene/protein   \n",
       "\n",
       "         x_name x_source  y_id   y_type                   y_name y_source  \n",
       "1539160  ADORA3     NCBI  4720  anatomy        cerebellar vermis   UBERON  \n",
       "1539470   KLF18     NCBI  1377  anatomy       quadriceps femoris   UBERON  \n",
       "1539471   KLF18     NCBI  1379  anatomy         vastus lateralis   UBERON  \n",
       "1539472   KLF18     NCBI  2084  anatomy     heart left ventricle   UBERON  \n",
       "1539473   KLF18     NCBI  5384  anatomy  nasal cavity epithelium   UBERON  "
      ]
     },
     "execution_count": 172,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Stack the 28 edge tables, then drop exact duplicate rows, rows with any missing value,\n",
    "# and self loops. Reverse edges are not generated here (a disabled earlier step swapped the\n",
    "# x_*/y_* columns and suffixed the relation with '_rev').\n",
    "edge_tables = [\n",
    "    df_prot_prot, df_prot_drug, df_drug_dis, df_drug_drug, df_phe_prot,\n",
    "    df_phe_phe, df_dis_phe_neg, df_dis_phe_pos, df_dis_prot, df_dis_dis,\n",
    "    df_drug_effect, df_bp_bp, df_mf_mf, df_cc_cc, df_prot_mf,\n",
    "    df_prot_cc, df_prot_bp, df_exp_prot, df_exp_dis, df_exp_exp,\n",
    "    df_exp_bp, df_exp_mf, df_exp_cc, df_path_path, df_path_prot,\n",
    "    df_ana_ana, df_ana_prot_pos, df_ana_prot_neg,\n",
    "]  # 28 tables\n",
    "kg = (\n",
    "    pd.concat(edge_tables)\n",
    "    .drop_duplicates()\n",
    "    .dropna()\n",
    "    # a self loop is an edge whose two endpoints agree on id, type, source and name\n",
    "    .query('not ((x_id == y_id) and (x_type == y_type) and (x_source == y_source) and (x_name == y_name))')\n",
    ")\n",
    "kg.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 173,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['protein_protein' 'drug_protein' 'contraindication' 'indication'\n",
      " 'off-label use' 'drug_drug' 'phenotype_protein' 'phenotype_phenotype'\n",
      " 'disease_phenotype_negative' 'disease_phenotype_positive'\n",
      " 'disease_protein' 'disease_disease' 'drug_effect' 'bioprocess_bioprocess'\n",
      " 'molfunc_molfunc' 'cellcomp_cellcomp' 'protein_molfunc'\n",
      " 'protein_cellcomp' 'protein_bioprocess' 'exposure_protein'\n",
      " 'exposure_disease' 'exposure_exposure' 'exposure_bioprocess'\n",
      " 'exposure_molfunc' 'exposure_cellcomp' 'pathway_pathway'\n",
      " 'protein_pathway' 'anatomy_anatomy' 'protein_present_anatomy'\n",
      " 'protein_absent_anatomy']\n",
      "['ppi' 'carrier' 'enzyme' 'target' 'transporter' 'contraindication'\n",
      " 'indication' 'off-label use' 'synergistic interaction' 'associated with'\n",
      " 'parent-child' 'phenotype absent' 'phenotype present' 'side effect'\n",
      " 'interacts with' 'linked to' 'expression present' 'expression absent']\n",
      "5463048\n"
     ]
    }
   ],
   "source": [
    "# Summarise the compiled graph: distinct relation labels, distinct display labels, edge count.\n",
    "for label_column in ('relation', 'display_relation'):\n",
    "    print(kg[label_column].unique())\n",
    "print(len(kg))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:20:04.693646Z",
     "start_time": "2021-08-06T12:19:32.602234Z"
    },
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# Write the compiled edge list (no reverse edges) without the pandas index column.\n",
    "# 'auxillary' (sic) is part of the path as written; the spelling is kept unchanged.\n",
    "kg_raw_csv = save_path + 'auxillary/kg_raw.csv'\n",
    "kg.to_csv(kg_raw_csv, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 174,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Empty DataFrame\n",
      "Columns: [relation, display_relation, x_id, x_type, x_name, x_source, y_id, y_type, y_name, y_source]\n",
      "Index: []\n",
      "Empty DataFrame\n",
      "Columns: [relation, display_relation, x_id, x_type, x_name, x_source, y_id, y_type, y_name, y_source]\n",
      "Index: []\n"
     ]
    }
   ],
   "source": [
    "# Double check that none of the MONDO terms are still in HPO: select edges whose MONDO-sourced\n",
    "# endpoint id is listed in mondo_r_hp_ids, once per side. Both frames are expected to be empty.\n",
    "mondo_x_in_hpo = kg.query('x_source == \"MONDO\" and x_id in @mondo_r_hp_ids')\n",
    "mondo_y_in_hpo = kg.query('y_source == \"MONDO\" and y_id in @mondo_r_hp_ids')\n",
    "print(mondo_x_in_hpo)\n",
    "print(mondo_y_in_hpo)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "# Get giant component"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:20:20.238147Z",
     "start_time": "2021-08-06T12:20:04.696534Z"
    },
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# Reload the edge list from disk for the giant-component step.\n",
    "# NOTE(review): this loads kg_raw_orphanet.csv, not the kg_raw.csv written earlier in this\n",
    "# notebook -- presumably produced by a separate step; confirm it exists before Run All.\n",
    "kg_orphanet_csv = save_path + 'auxillary/kg_raw_orphanet.csv'\n",
    "kg = pd.read_csv(kg_orphanet_csv, low_memory=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                           relation   display_relation   x_id  \\\n",
      "0                   protein_protein                ppi   9796   \n",
      "1                   protein_protein                ppi   7918   \n",
      "2                   protein_protein                ppi   8233   \n",
      "3                   protein_protein                ppi   4899   \n",
      "4                   protein_protein                ppi   5297   \n",
      "...                             ...                ...    ...   \n",
      "5471989  disease_phenotype_positive  phenotype present  15942   \n",
      "5471990  disease_phenotype_positive  phenotype present  16355   \n",
      "5471991  disease_phenotype_positive  phenotype present   8294   \n",
      "5471992  disease_phenotype_positive  phenotype present  14412   \n",
      "5471993         phenotype_phenotype       parent-child   8255   \n",
      "\n",
      "                   x_type                                x_name x_source  \\\n",
      "0            gene/protein                                PHYHIP     NCBI   \n",
      "1            gene/protein                                GPANK1     NCBI   \n",
      "2            gene/protein                                 ZRSR2     NCBI   \n",
      "3            gene/protein                                  NRF1     NCBI   \n",
      "4            gene/protein                                 PI4KA     NCBI   \n",
      "...                   ...                                   ...      ...   \n",
      "5471989           disease           frontometaphyseal dysplasia    MONDO   \n",
      "5471990           disease           semilobar holoprosencephaly    MONDO   \n",
      "5471991           disease          acute intermittent porphyria    MONDO   \n",
      "5471992           disease         hyperlipoproteinemia, type 1D    MONDO   \n",
      "5471993  effect/phenotype  Transient neonatal diabetes mellitus      HPO   \n",
      "\n",
      "           y_id            y_type                                  y_name  \\\n",
      "0         56992      gene/protein                                   KIF15   \n",
      "1          9240      gene/protein                                   PNMA1   \n",
      "2         23548      gene/protein                                   TTC33   \n",
      "3         11253      gene/protein                                  MAN1B1   \n",
      "4          8601      gene/protein                                   RGS20   \n",
      "...         ...               ...                                     ...   \n",
      "5471989     218  effect/phenotype                             High palate   \n",
      "5471990     568  effect/phenotype                          Microphthalmia   \n",
      "5471991    2039  effect/phenotype                                Anorexia   \n",
      "5471992  100851  effect/phenotype        Abnormal emotion/affect behavior   \n",
      "5471993   10935  effect/phenotype  Abnormality of the upper urinary tract   \n",
      "\n",
      "        y_source  \n",
      "0           NCBI  \n",
      "1           NCBI  \n",
      "2           NCBI  \n",
      "3           NCBI  \n",
      "4           NCBI  \n",
      "...          ...  \n",
      "5471989      HPO  \n",
      "5471990      HPO  \n",
      "5471991      HPO  \n",
      "5471992      HPO  \n",
      "5471993      HPO  \n",
      "\n",
      "[5471994 rows x 10 columns]\n"
     ]
    }
   ],
   "source": [
    "# Quick look at the loaded edge table (pandas prints a truncated view followed by the shape).\n",
    "print(kg)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:21:48.910982Z",
     "start_time": "2021-08-06T12:20:20.245604Z"
    },
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# Restrict the KG to its largest connected component.\n",
    "#\n",
    "# 1) Node table: unique (id, type, name, source) records seen on either edge endpoint;\n",
    "#    node_idx is the row position after de-duplication.\n",
    "nodes = pd.concat([kg.get(['x_id','x_type', 'x_name','x_source']).rename(columns={'x_id':'node_id', 'x_type':'node_type', 'x_name':'node_name','x_source':'node_source'}), \n",
    "                   kg.get(['y_id','y_type', 'y_name','y_source']).rename(columns={'y_id':'node_id', 'y_type':'node_type', 'y_name':'node_name','y_source':'node_source'})])\n",
    "nodes = nodes.drop_duplicates().reset_index().drop('index',axis=1).reset_index().rename(columns={'index':'node_idx'})\n",
    "\n",
    "# 2) Edge table: look up node_idx for the x endpoint, then for the y endpoint, by joining on all four node attributes.\n",
    "edges = pd.merge(kg, nodes, 'left', left_on=['x_id','x_type', 'x_name','x_source'], right_on=['node_id','node_type','node_name','node_source'])\n",
    "edges = edges.rename(columns={'node_idx':'x_idx'})\n",
    "edges = pd.merge(edges, nodes, 'left', left_on=['y_id','y_type', 'y_name','y_source'], right_on=['node_id','node_type','node_name','node_source'])\n",
    "edges = edges.rename(columns={'node_idx':'y_idx'})\n",
    "edges = edges.get(['relation', 'display_relation','x_idx', 'y_idx'])\n",
    "# String key 'x_idx-y_idx'; not read again in this cell.\n",
    "# NOTE(review): presumably consumed or dropped by clean_edges - confirm.\n",
    "edges['combine_idx'] = edges['x_idx'].astype(str) + '-' + edges['y_idx'].astype(str)\n",
    "\n",
    "# 2 x n_edges array of endpoint indices\n",
    "edge_index = edges.get(['x_idx', 'y_idx']).values.T\n",
    "\n",
    "# 3) Build the igraph graph: one vertex per row of nodes, one edge per row of edges.\n",
    "#    The list passed to add_vertices is read back below through giant.vs['name'] and treated as node_idx values.\n",
    "graph = ig.Graph()\n",
    "graph.add_vertices(list(range(nodes.shape[0])))\n",
    "graph.add_edges([tuple(x) for x in edge_index.T])\n",
    "\n",
    "# Work on the undirected view of the graph.\n",
    "# NOTE(review): mode='collapse' presumably merges parallel edges between the same vertex pair - confirm against the igraph docs.\n",
    "graph = graph.as_undirected(mode='collapse')\n",
    "\n",
    "# 4) Connected components; keep the largest one.\n",
    "c = graph.components(mode='strong')\n",
    "giant = c.giant()\n",
    "\n",
    "#print('Nodes: %d' % giant.vcount())\n",
    "#print('Edges: %d' % giant.ecount())\n",
    "\n",
    "assert not giant.is_directed()\n",
    "assert giant.is_connected()\n",
    "\n",
    "# 5) Filter the node and edge tables down to the vertices kept in the giant component.\n",
    "giant_nodes = giant.vs['name']\n",
    "new_nodes = nodes.query('node_idx in @giant_nodes')\n",
    "assert new_nodes.shape[0] == giant.vcount()\n",
    "\n",
    "new_edges = edges.query('x_idx in @giant_nodes and y_idx in @giant_nodes').copy()\n",
    "# NOTE(review): this equality only holds if no two rows of edges connect the same vertex pair,\n",
    "# assuming the undirected conversion above merged such rows into a single graph edge - confirm.\n",
    "assert new_edges.shape[0] == giant.ecount()\n",
    "\n",
    "# 6) Re-attach the node attributes to both endpoints to get back a KG-shaped frame.\n",
    "new_kg = pd.merge(new_edges, new_nodes, 'left', left_on='x_idx', right_on='node_idx')\n",
    "new_kg = new_kg.rename(columns={'node_id':'x_id', 'node_type':'x_type', 'node_name':'x_name','node_source':'x_source'}) \n",
    "new_kg = pd.merge(new_kg, new_nodes, 'left', left_on='y_idx', right_on='node_idx')\n",
    "new_kg = new_kg.rename(columns={'node_id':'y_id', 'node_type':'y_type', 'node_name':'y_name','node_source':'y_source'}) \n",
    "# clean_edges is defined earlier in the notebook (not visible here).\n",
    "new_kg = clean_edges(new_kg)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:22:20.208631Z",
     "start_time": "2021-08-06T12:21:48.913545Z"
    },
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# Continue with the giant-component KG and checkpoint it to disk.\n",
    "kg = new_kg.copy()\n",
    "kg.to_csv(save_path+'auxillary/kg_giant_orphanet.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "# Collapse similar diseases"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:22:34.422640Z",
     "start_time": "2021-08-06T12:22:20.210477Z"
    },
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# Disease groupings are independent of the KG (requires only MONDO terms)\n",
    "\n",
    "# Start this section from the giant-component checkpoint on disk rather than from in-memory state.\n",
    "kg = pd.read_csv(save_path+'auxillary/kg_giant_orphanet.csv', low_memory=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "## Find Groups"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true,
    "hidden": true
   },
   "source": [
    "### Automated grouping"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:22:34.495911Z",
     "start_time": "2021-08-06T12:22:34.424764Z"
    },
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# Disabled (kept as a string literal, not executed): builds disease_nodes, the de-duplicated table of\n",
    "# disease-typed nodes from both edge endpoints, which the disabled grouping cells below operate on.\n",
    "'''\n",
    "disease_nodes = pd.concat([kg.get(['x_id','x_type', 'x_name','x_source']).rename(columns={'x_id':'node_id', 'x_type':'node_type', 'x_name':'node_name','x_source':'node_source'}), \n",
    "                   kg.get(['y_id','y_type', 'y_name','y_source']).rename(columns={'y_id':'node_id', 'y_type':'node_type', 'y_name':'node_name','y_source':'node_source'})])\n",
    "disease_nodes = disease_nodes.query('node_type==\"disease\"')\n",
    "disease_nodes = disease_nodes.drop_duplicates().reset_index().drop('index',axis=1).reset_index().rename(columns={'index':'node_idx'})\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:22:34.540058Z",
     "start_time": "2021-08-06T12:22:34.501005Z"
    },
    "hidden": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Disabled (kept as a string literal, not executed): rule-based grouping of disease names.\n",
    "# A name whose last token is at most 2 characters, numeric or a roman numeral is compared, with that\n",
    "# token removed, against every other name with its last token removed; matches are printed and\n",
    "# recorded in groups / idx2group. Names containing chromosome-related keywords are excluded.\n",
    "# NOTE(review): issingleletter() never returns True (nothing is returned after the length check) and is\n",
    "# not called; the local flag numeric is assigned but never read.\n",
    "'''\n",
    "groups = []\n",
    "seen = set()\n",
    "idx2group = {}\n",
    "no = set()\n",
    "\n",
    "def isroman(s):\n",
    "    return bool(re.search(r\"^M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$\",s))\n",
    "\n",
    "def issingleletter(s): \n",
    "    if len(s)>1: return False\n",
    "\n",
    "def same_words(s1, s2): \n",
    "    for word in s1.lower().split(' '): \n",
    "        word = word.split(',')[0]\n",
    "        if word!='type' and word!='(disease)' and word not in s2.lower(): \n",
    "            return False \n",
    "    for word in s2.lower().split(' '): \n",
    "        word = word.split(',')[0]\n",
    "        if word!='type' and word!='(disease)' and word not in s1.lower(): \n",
    "            return False\n",
    "    return True\n",
    "\n",
    "for i in range(disease_nodes.shape[0]):\n",
    "    i_name = disease_nodes.loc[i, 'node_name']\n",
    "    i_idx = disease_nodes.loc[i, 'node_idx']\n",
    "    for w in ['monosomy','disomy', 'trisomy', 'trisomy/tetrasomy', 'chromosome']: \n",
    "        if w in i_name: \n",
    "            no.add(i_idx)\n",
    "\n",
    "for i in range(disease_nodes.shape[0]):\n",
    "    i_idx = disease_nodes.loc[i, 'node_idx']\n",
    "    if i_idx in seen: continue \n",
    "    if i_idx in no: continue \n",
    "    i_name = disease_nodes.loc[i, 'node_name']\n",
    "    i_split = i_name.split(' ')\n",
    "    end = i_split[-1]\n",
    "    if len(end)<=2 or end.isnumeric() or isroman(end):  \n",
    "        main_text = ' '.join(i_split[:-1])\n",
    "        matches = [i_name]\n",
    "        matches_idx = [i_idx]\n",
    "        match_found = False\n",
    "        numeric = True\n",
    "        for j in range(disease_nodes.shape[0]):\n",
    "            j_idx = disease_nodes.loc[j, 'node_idx']\n",
    "            j_name = disease_nodes.loc[j, 'node_name']\n",
    "            m = ' '.join(j_name.split(' ')[:-1])\n",
    "            if m.lower() == main_text.lower() or same_words(m, main_text): \n",
    "                matches.append(j_name)\n",
    "                matches_idx.append(j_idx)\n",
    "                match_found = True\n",
    "        if match_found:\n",
    "            matches_idx = list(set(matches_idx))\n",
    "            matches = list(set(matches))\n",
    "            if len(matches) <= 1: continue \n",
    "            if main_text.endswith('type'): \n",
    "                main_text = main_text[:-4]\n",
    "            if main_text.endswith(','): \n",
    "                main_text = main_text[:-1]\n",
    "            if main_text.endswith(' '): \n",
    "                main_text = main_text[:-1]\n",
    "            print(main_text)\n",
    "            for x in sorted(matches): \n",
    "                print('-  ',x)\n",
    "            for x in matches_idx: \n",
    "                seen.add(x)\n",
    "                idx2group[x] = main_text\n",
    "            groups.append((main_text, matches_idx))\n",
    "\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:22:34.579734Z",
     "start_time": "2021-08-06T12:22:34.542417Z"
    },
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# Disabled (kept as a string literal, not executed): gives every disease node a group_name (its entry in\n",
    "# idx2group, otherwise its own node_name), builds disease_group_1 as the table of unique group names\n",
    "# with a group_idx, and merges that table back onto disease_nodes.\n",
    "'''\n",
    "disease_nodes.loc[:, 'group_name'] = ''\n",
    "for data in disease_nodes.itertuples():\n",
    "    if data.node_idx in idx2group.keys(): \n",
    "        disease_nodes.loc[data.Index, 'group_name'] = idx2group[data.node_idx]\n",
    "    else: \n",
    "        disease_nodes.loc[data.Index, 'group_name'] = data.node_name\n",
    "        \n",
    "disease_group_1 = disease_nodes.get(['group_name']).drop_duplicates().reset_index().rename(columns={'index':'group_idx'})\n",
    "disease_nodes = pd.merge(disease_nodes, disease_group_1, 'left', 'group_name')\n",
    "'''"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true,
    "hidden": true
   },
   "source": [
    "### Grouping with BERT"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:22:34.618408Z",
     "start_time": "2021-08-06T12:22:34.582228Z"
    },
    "code_folding": [],
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# generate embeddings \n",
    "# Disabled (kept as a string literal, not executed): embeds every group name in disease_group_1 with\n",
    "# Bio_ClinicalBERT (mean over the token axis of the model's first output), writing one .npy file per\n",
    "# batch into a temporary directory and then concatenating them into auxillary/kg_disease_bert_embeds.npy.\n",
    "# The batch embeddings are moved to the CPU before conversion: Tensor.numpy() raises for a tensor that\n",
    "# lives on cuda:0, which is the device selected whenever a GPU is available.\n",
    "'''\n",
    "input_text = list(disease_group_1.get('group_name').values)\n",
    "\n",
    "device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n",
    "#model_name='dmis-lab/biobert-large-cased-v1.1'\n",
    "model_name = 'emilyalsentzer/Bio_ClinicalBERT'\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
    "model = AutoModel.from_pretrained(model_name)\n",
    "model = model.to(device)\n",
    "model.eval()\n",
    "\n",
    "def batch(iterable, batch_size=4, return_idx=True):\n",
    "    l = len(iterable)\n",
    "    for ndx in range(0, l, batch_size):\n",
    "        if return_idx: \n",
    "            yield (ndx, min(ndx + batch_size, l))\n",
    "        else:\n",
    "            yield iterable[ndx:min(ndx + batch_size, l)]\n",
    "            \n",
    "tmp_dir = 'tmp/'\n",
    "if os.path.isdir(tmp_dir): \n",
    "    shutil.rmtree(tmp_dir)\n",
    "os.mkdir(tmp_dir)\n",
    "\n",
    "batch_size=32\n",
    "input_tokens = tokenizer(input_text, padding=True, return_tensors='pt', truncation=True, max_length=512)\n",
    "for i, (start, end) in tqdm(enumerate(batch(input_text, batch_size))):\n",
    "    input_ids = input_tokens['input_ids'][start:end, :].to(device)\n",
    "    attention_mask = input_tokens['attention_mask'][start:end, :].to(device)\n",
    "    with torch.no_grad():\n",
    "        outputs = model(input_ids=input_ids, attention_mask=attention_mask)\n",
    "        embeds = torch.mean(outputs[0], dim=1)\n",
    "    np.save(tmp_dir+str(i)+'.npy', embeds.cpu().numpy())\n",
    "    \n",
    "embeds = []\n",
    "for i, _ in enumerate(batch(input_text, batch_size)):\n",
    "    x = np.load(tmp_dir+str(i)+'.npy')\n",
    "    embeds.append(x)\n",
    "embeds = np.concatenate(embeds)\n",
    "\n",
    "np.save(save_path+'auxillary/kg_disease_bert_embeds.npy', embeds)\n",
    "if os.path.isdir(tmp_dir): \n",
    "    shutil.rmtree(tmp_dir)'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:22:34.656301Z",
     "start_time": "2021-08-06T12:22:34.621379Z"
    },
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# Disabled (kept as a string literal, not executed): interactive second grouping pass.\n",
    "# Loads the saved embeddings, computes pairwise cosine similarity, skips group names matching the\n",
    "# keyword lists below, and for each remaining group prints every group whose similarity exceeds\n",
    "# cutoff, then asks via input() for a merged group name. The typed answer becomes the group name;\n",
    "# an empty answer or one of 'on' / 'no' / 'No' / 'NO' rejects the proposal.\n",
    "# NOTE(review): the result depends on answers typed at run time, so it cannot be regenerated from code alone.\n",
    "# NOTE(review): each continue in the keyword loops is the last statement of its loop body and has no effect;\n",
    "# 'on' in the rejection list may be a typo - confirm.\n",
    "'''\n",
    "embeds = np.load(save_path+'auxillary/kg_disease_bert_embeds.npy')\n",
    "cos_sim = cosine_similarity(embeds, embeds)\n",
    "\n",
    "seen = set()\n",
    "groups = []\n",
    "idx2group = {}\n",
    "no = set()\n",
    "\n",
    "for i in range(disease_group_1.shape[0]):\n",
    "    i_name = disease_group_1.loc[i, 'group_name']\n",
    "    i_idx = disease_group_1.loc[i, 'group_idx']\n",
    "    for w in ['cardiomyopathy', 'syndrome', 'combined', 'complement', 'deficiency', \n",
    "              'factor', 'immunodeficiency', 'monosomy','disomy', 'trisomy', \n",
    "              'trisomy/tetrasomy', 'chromosome', 'neuroendocrine tumor', \n",
    "              'neuroendocrine neoplasm', 'cancer', 'tumor', 'neoplasm','carcinoma',\n",
    "              'lymphoma', 'lipoma']: \n",
    "        if w in i_name: \n",
    "            no.add(i_idx)\n",
    "            continue\n",
    "    for w in ['CDG']: \n",
    "        if i_name.endswith(w): \n",
    "            no.add(i_idx)\n",
    "            continue\n",
    "    for w in ['neurodevelopmental disorder', 'glycogen storage disease', \n",
    "              'congenital disorder of glycosylation', 'qualitative or quantitative defects']: \n",
    "        if i_name.startswith(w): \n",
    "            no.add(i_idx)\n",
    "            continue\n",
    "            \n",
    "cutoff = 0.98\n",
    "for i in range(disease_group_1.shape[0]):\n",
    "    i_name = disease_group_1.loc[i, 'group_name']\n",
    "    i_idx = disease_group_1.loc[i, 'group_idx']\n",
    "    if i_idx in no or i_idx in seen: continue\n",
    "    x = disease_group_1[cos_sim[i]>cutoff]\n",
    "    if x.shape[0]>1: \n",
    "        for v in x.get('group_name').values: \n",
    "            print(v)\n",
    "        main_text = input(' Ok? ')\n",
    "        if main_text not in ['','on','no', 'No', 'NO']: \n",
    "            for v in x.get('group_idx').values: \n",
    "                seen.add(v)\n",
    "                idx2group[v] = main_text\n",
    "            g = list(x.get('group_idx').values.reshape(-1))\n",
    "            groups.append((main_text, g)) # main_text contains group name\n",
    "        else: \n",
    "            no.add(i_idx)\n",
    "            print('Not added')\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:22:34.702232Z",
     "start_time": "2021-08-06T12:22:34.662051Z"
    },
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# Disabled (kept as a string literal, not executed): adds group_name_2 to disease_group_1 (the name stored\n",
    "# in idx2group for that group_idx, otherwise the existing group_name) and builds disease_group_2, the\n",
    "# table of unique group_name_2 values.\n",
    "'''\n",
    "disease_group_1.loc[:, 'group_name_2'] = ''\n",
    "for data in disease_group_1.itertuples(): \n",
    "    if data.group_idx in idx2group.keys():\n",
    "        disease_group_1.loc[data.Index, 'group_name_2'] = idx2group[data.group_idx]\n",
    "    else: \n",
    "        disease_group_1.loc[data.Index, 'group_name_2'] = data.group_name\n",
    "        \n",
    "disease_group_2 = disease_group_1.get(['group_name_2']).drop_duplicates().reset_index().rename(columns={'index':'group_idx_2'})\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:22:34.745711Z",
     "start_time": "2021-08-06T12:22:34.709900Z"
    },
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# Disabled (kept as a string literal, not executed): joins both grouping passes onto the disease nodes,\n",
    "# renames the group columns to group_name_auto / group_name_bert and writes\n",
    "# auxillary/kg_grouped_diseases.csv - the file that the 'Apply Groups' cell below reads.\n",
    "# The DataFrame index is written as well (no index=False is passed).\n",
    "'''\n",
    "df_disease_group = pd.merge(disease_nodes, disease_group_1, 'left', 'group_name')\n",
    "df_disease_group = df_disease_group.get(['node_id', 'node_type', 'node_name', 'node_source',\n",
    "       'group_name', 'group_name_2'])\n",
    "df_disease_group = df_disease_group.rename(columns={'group_name':'group_name_auto',\n",
    "        'group_name_2':'group_name_bert'}).astype({'node_id':str})\n",
    "df_disease_group.to_csv(save_path+'auxillary/kg_grouped_diseases.csv')\n",
    "'''"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "## Apply Groups"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:22:36.824640Z",
     "start_time": "2021-08-06T12:22:34.748321Z"
    },
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# Load the per-disease group assignments; node_id is forced to str so ids can be joined with '_'\n",
    "grouped_diseases = pd.read_csv(save_path+'auxillary/kg_grouped_diseases.csv').astype({'node_id':str})\n",
    "group_col = 'group_name_bert'\n",
    "id_col = group_col.replace('name','id')\n",
    "\n",
    "# Only groups with more than one member disease are collapsed\n",
    "groups = grouped_diseases.groupby(group_col).count().query('node_id>1').index.values\n",
    "set_groups = set(groups)\n",
    "grouped_diseases = grouped_diseases.query('{} in @set_groups'.format(group_col))\n",
    "\n",
    "# The id of a group is the '_'-joined node_ids of its members, in file order.\n",
    "# One groupby/agg builds the whole map; this replaces a loop that ran a DataFrame.query and a\n",
    "# .loc write per group and that relied on enlarging an empty frame via .loc[:, col] = array.\n",
    "group_map = (grouped_diseases.groupby(group_col)['node_id']\n",
    "             .agg('_'.join)\n",
    "             .rename(id_col)\n",
    "             .reset_index()\n",
    "             .get([id_col, group_col]))\n",
    "\n",
    "# Attach the group id to every member row (merge joins on the shared group_col column)\n",
    "grouped_diseases = pd.merge(grouped_diseases, group_map)\n",
    "grouped_diseases.to_csv(save_path+'auxillary/kg_grouped_diseases_bert_map.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:26:36.665730Z",
     "start_time": "2021-08-06T12:22:36.827448Z"
    },
    "hidden": true,
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "74fa05f65c9b4a269a95e9de21f0ab21",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/6392 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Collapse grouped diseases: every MONDO disease endpoint whose (id, name) pair appears in\n",
    "# grouped_diseases is rewritten to its group id / group name and re-tagged as MONDO_grouped.\n",
    "# One merge per edge side replaces the former row-by-row loop, which ran two full-table\n",
    "# queries for every grouped disease.\n",
    "# keep='last' mirrors that loop, where a later duplicate (node_id, node_name) row overwrote an earlier one.\n",
    "# NOTE(review): the merge needs the kg id columns and node_id to share a dtype (node_id is str) - confirm.\n",
    "group_lookup = (grouped_diseases\n",
    "                .get(['node_id', 'node_name', id_col, group_col])\n",
    "                .drop_duplicates(['node_id', 'node_name'], keep='last'))\n",
    "\n",
    "for side in ['x', 'y']:\n",
    "    id_c, name_c = side + '_id', side + '_name'\n",
    "    type_c, source_c = side + '_type', side + '_source'\n",
    "    # Rows eligible on this side; x-side rewrites never touch the y columns, so both passes\n",
    "    # match against the original values, as the pre-computed kg_x_dis / kg_y_dis frames did.\n",
    "    is_mondo_disease = (kg[type_c] == 'disease') & (kg[source_c] == 'MONDO')\n",
    "    # Carry the kg row label through the merge so results can be written back by label\n",
    "    hits = (kg.loc[is_mondo_disease, [id_c, name_c]]\n",
    "              .rename_axis('kg_row').reset_index()\n",
    "              .merge(group_lookup, left_on=[id_c, name_c], right_on=['node_id', 'node_name'])\n",
    "              .set_index('kg_row'))\n",
    "    kg.loc[hits.index, id_c] = hits[id_col].values\n",
    "    kg.loc[hits.index, name_c] = hits[group_col].values\n",
    "    kg.loc[hits.index, source_c] = 'MONDO_grouped'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:27:07.721261Z",
     "start_time": "2021-08-06T12:26:36.673297Z"
    },
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# Grouping can turn distinct edges into identical rows: de-duplicate, then drop incomplete rows\n",
    "kg = kg.drop_duplicates().dropna()\n",
    "\n",
    "# Reverse edges are not added here; the disabled recipe is kept for reference:\n",
    "#kg_rev = kg.copy().rename(columns={'x_id':'y_id','x_type':'y_type', 'x_name':'y_name', 'x_source':'y_source', 'y_id':'x_id','y_type':'x_type', 'y_name':'x_name', 'y_source':'x_source' }) #add reverse edges\n",
    "#kg_rev['relation'] = kg_rev['relation'] + \"_rev\"\n",
    "#print(kg_rev)\n",
    "#kg = pd.concat([kg, kg_rev])\n",
    "#kg = kg.drop_duplicates()\n",
    "\n",
    "# Remove self loops: edges whose two endpoints agree on id, type, source and name\n",
    "same_endpoint = ((kg['x_id'] == kg['y_id']) & (kg['x_type'] == kg['y_type']) &\n",
    "                 (kg['x_source'] == kg['y_source']) & (kg['x_name'] == kg['y_name']))\n",
    "kg = kg[~same_endpoint]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:27:41.975721Z",
     "start_time": "2021-08-06T12:27:07.723129Z"
    },
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# Checkpoint the grouped, de-duplicated KG (DataFrame index is not written).\n",
    "kg.to_csv(save_path+'auxillary/kg_grouped_orphanet.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Knowledge graph description"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:27:56.845094Z",
     "start_time": "2021-08-06T12:27:41.977822Z"
    }
   },
   "outputs": [],
   "source": [
    "# Start this section from the grouped-KG checkpoint on disk rather than from in-memory state.\n",
    "kg = pd.read_csv(save_path+'auxillary/kg_grouped_orphanet.csv', low_memory=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:28:19.775375Z",
     "start_time": "2021-08-06T12:27:56.847134Z"
    }
   },
   "outputs": [],
   "source": [
    "# nodes file \n",
    "nodes = pd.concat([kg.get(['x_id','x_type', 'x_name','x_source']).rename(columns={'x_id':'node_id', 'x_type':'node_type', 'x_name':'node_name', 'x_source':'node_source'}), \n",
    "                   kg.get(['y_id','y_type', 'y_name','y_source']).rename(columns={'y_id':'node_id', 'y_type':'node_type', 'y_name':'node_name', 'y_source':'node_source'})])\n",
    "nodes = nodes.drop_duplicates().reset_index().drop('index',axis=1).reset_index().rename(columns={'index':'node_index'})\n",
    "\n",
    "# assign index \n",
    "kg = pd.merge(kg, nodes.rename(columns={'node_index':'x_index',\n",
    "                                        'node_id':'x_id',\n",
    "                                        'node_type':'x_type',\n",
    "                                        'node_name':'x_name',\n",
    "                                        'node_source':'x_source'}), 'left').dropna()\n",
    "kg = pd.merge(kg, nodes.rename(columns={'node_index':'y_index',\n",
    "                                        'node_id':'y_id',\n",
    "                                        'node_type':'y_type',\n",
    "                                        'node_name':'y_name',\n",
    "                                        'node_source':'y_source'}), 'left').dropna()\n",
    "kg = kg.get(['relation', 'display_relation', 'x_index', 'x_id', 'x_type', 'x_name', 'x_source',\n",
    "       'y_index', 'y_id', 'y_type', 'y_name', 'y_source'])\n",
    "\n",
    "# edges file \n",
    "edges = kg.get(['relation', 'display_relation', 'x_index', 'y_index']).copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:29:17.530524Z",
     "start_time": "2021-08-06T12:28:19.778095Z"
    }
   },
   "outputs": [],
   "source": [
    "# Write the final tables: full KG, node list and index-only edge list\n",
    "for frame, fname in ((kg, 'kg.csv'), (nodes, 'nodes.csv'), (edges, 'edges.csv')):\n",
    "    frame.to_csv(save_path+fname, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:29:17.586693Z",
     "start_time": "2021-08-06T12:29:17.532592Z"
    },
    "code_folding": []
   },
   "outputs": [],
   "source": [
    "def kg_describe(df, by, count_col): \n",
    "    df = df.groupby(by).count().sort_values(by=count_col, ascending=False).rename(columns={count_col:'count'}).get(['count'])\n",
    "    total = np.sum(df.get('count').values)\n",
    "    df = df.eval('percent = 100*count/@total')\n",
    "    df = df.append(df.sum(0).rename('total'))\n",
    "    df['count'] = df.get(['count']).astype('int')\n",
    "    df['percent'] = df.get(['percent']).round(1)\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:29:17.776738Z",
     "start_time": "2021-08-06T12:29:17.591473Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>count</th>\n",
       "      <th>percent</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>node_type</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>biological_process</th>\n",
       "      <td>28642</td>\n",
       "      <td>22.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>gene/protein</th>\n",
       "      <td>27671</td>\n",
       "      <td>21.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>disease</th>\n",
       "      <td>16305</td>\n",
       "      <td>12.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>effect/phenotype</th>\n",
       "      <td>15874</td>\n",
       "      <td>12.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>anatomy</th>\n",
       "      <td>14035</td>\n",
       "      <td>10.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>molecular_function</th>\n",
       "      <td>11169</td>\n",
       "      <td>8.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>drug</th>\n",
       "      <td>7949</td>\n",
       "      <td>6.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>cellular_component</th>\n",
       "      <td>4176</td>\n",
       "      <td>3.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>pathway</th>\n",
       "      <td>2516</td>\n",
       "      <td>1.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>exposure</th>\n",
       "      <td>802</td>\n",
       "      <td>0.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>total</th>\n",
       "      <td>129139</td>\n",
       "      <td>100.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                     count  percent\n",
       "node_type                          \n",
       "biological_process   28642     22.2\n",
       "gene/protein         27671     21.4\n",
       "disease              16305     12.6\n",
       "effect/phenotype     15874     12.3\n",
       "anatomy              14035     10.9\n",
       "molecular_function   11169      8.6\n",
       "drug                  7949      6.2\n",
       "cellular_component    4176      3.2\n",
       "pathway               2516      1.9\n",
       "exposure               802      0.6\n",
       "total               129139    100.0"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Node counts and percentages per node type\n",
    "kg_describe(nodes, by='node_type', count_col='node_index')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-08-06T12:29:18.783745Z",
     "start_time": "2021-08-06T12:29:17.779433Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>count</th>\n",
       "      <th>percent</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>relation</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>drug_drug</th>\n",
       "      <td>2672628</td>\n",
       "      <td>49.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>protein_present_anatomy</th>\n",
       "      <td>1518203</td>\n",
       "      <td>28.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>protein_protein</th>\n",
       "      <td>321075</td>\n",
       "      <td>5.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>disease_phenotype_positive</th>\n",
       "      <td>172469</td>\n",
       "      <td>3.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>protein_bioprocess</th>\n",
       "      <td>144805</td>\n",
       "      <td>2.7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>protein_cellcomp</th>\n",
       "      <td>83402</td>\n",
       "      <td>1.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>drug_effect</th>\n",
       "      <td>79137</td>\n",
       "      <td>1.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>disease_protein</th>\n",
       "      <td>74752</td>\n",
       "      <td>1.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>protein_molfunc</th>\n",
       "      <td>69530</td>\n",
       "      <td>1.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>bioprocess_bioprocess</th>\n",
       "      <td>52886</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>protein_pathway</th>\n",
       "      <td>42646</td>\n",
       "      <td>0.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>disease_disease</th>\n",
       "      <td>30066</td>\n",
       "      <td>0.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>contraindication</th>\n",
       "      <td>25716</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>drug_protein</th>\n",
       "      <td>25653</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>phenotype_phenotype</th>\n",
       "      <td>21925</td>\n",
       "      <td>0.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>protein_absent_anatomy</th>\n",
       "      <td>19887</td>\n",
       "      <td>0.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>anatomy_anatomy</th>\n",
       "      <td>14032</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>molfunc_molfunc</th>\n",
       "      <td>13574</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>phenotype_protein</th>\n",
       "      <td>10518</td>\n",
       "      <td>0.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>indication</th>\n",
       "      <td>8115</td>\n",
       "      <td>0.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>cellcomp_cellcomp</th>\n",
       "      <td>4845</td>\n",
       "      <td>0.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>pathway_pathway</th>\n",
       "      <td>2535</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>off-label use</th>\n",
       "      <td>2299</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>exposure_exposure</th>\n",
       "      <td>2140</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>exposure_disease</th>\n",
       "      <td>1788</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>exposure_bioprocess</th>\n",
       "      <td>1625</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>disease_phenotype_negative</th>\n",
       "      <td>1318</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>exposure_protein</th>\n",
       "      <td>1212</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>exposure_molfunc</th>\n",
       "      <td>45</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>exposure_cellcomp</th>\n",
       "      <td>10</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>total</th>\n",
       "      <td>5418836</td>\n",
       "      <td>100.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                              count  percent\n",
       "relation                                    \n",
       "drug_drug                   2672628     49.3\n",
       "protein_present_anatomy     1518203     28.0\n",
       "protein_protein              321075      5.9\n",
       "disease_phenotype_positive   172469      3.2\n",
       "protein_bioprocess           144805      2.7\n",
       "protein_cellcomp              83402      1.5\n",
       "drug_effect                   79137      1.5\n",
       "disease_protein               74752      1.4\n",
       "protein_molfunc               69530      1.3\n",
       "bioprocess_bioprocess         52886      1.0\n",
       "protein_pathway               42646      0.8\n",
       "disease_disease               30066      0.6\n",
       "contraindication              25716      0.5\n",
       "drug_protein                  25653      0.5\n",
       "phenotype_phenotype           21925      0.4\n",
       "protein_absent_anatomy        19887      0.4\n",
       "anatomy_anatomy               14032      0.3\n",
       "molfunc_molfunc               13574      0.3\n",
       "phenotype_protein             10518      0.2\n",
       "indication                     8115      0.1\n",
       "cellcomp_cellcomp              4845      0.1\n",
       "pathway_pathway                2535      0.0\n",
       "off-label use                  2299      0.0\n",
       "exposure_exposure              2140      0.0\n",
       "exposure_disease               1788      0.0\n",
       "exposure_bioprocess            1625      0.0\n",
       "disease_phenotype_negative     1318      0.0\n",
       "exposure_protein               1212      0.0\n",
       "exposure_molfunc                 45      0.0\n",
       "exposure_cellcomp                10      0.0\n",
       "total                       5418836    100.0"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "kg_describe(edges,'relation','x_index')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "hide_input": false,
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  },
  "notify_time": "10",
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": false,
   "sideBar": false,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {
    "height": "549.5652465820312px",
    "left": "993.9918212890625px",
    "top": "49.45652389526367px",
    "width": "161.64402770996094px"
   },
   "toc_section_display": false,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}