4909 lines (4908 with data), 167.3 kB
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:17:42.529843Z",
"start_time": "2021-08-06T12:17:37.821976Z"
}
},
"outputs": [],
"source": [
"from tqdm.notebook import tqdm\n",
"import re\n",
"import os \n",
"import shutil\n",
"import numpy as np\n",
"import pandas as pd\n",
"import igraph as ig\n",
"from scipy.sparse import lil_matrix, save_npz\n",
"from sklearn.metrics.pairwise import cosine_similarity\n",
"import torch\n",
"from transformers import AutoTokenizer, AutoModel, pipeline\n",
"%load_ext autoreload\n",
"%autoreload 2\n",
"\n",
"# Input/output locations. The defaults are the original cluster paths; set the\n",
"# KG_DATA_PATH / KG_SAVE_PATH environment variables to run elsewhere, e.g.\n",
"# KG_DATA_PATH='../../datasets/' and KG_SAVE_PATH='../../datasets/kg/'.\n",
"# os.path.join(p, '') guarantees the trailing separator that the\n",
"# `data_path + 'ppi/...'` style string concatenations in later cells rely on.\n",
"data_path = os.path.join(os.environ.get('KG_DATA_PATH', '/n/data1/hms/dbmi/zaklab/emily/rare_disease_dx/data/8.9.21_kg/raw/sources/'), '')\n",
"save_path = os.path.join(os.environ.get('KG_SAVE_PATH', '/n/data1/hms/dbmi/zaklab/emily/rare_disease_dx/data/8.9.21_kg/our_kg/'), '')"
]
},
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true
},
"source": [
"# Read datasets"
]
},
{
"cell_type": "code",
"execution_count": 131,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:17:42.583094Z",
"start_time": "2021-08-06T12:17:42.532233Z"
},
"hidden": true
},
"outputs": [],
"source": [
"def assert_dtypes(df):\n",
"    '''Assert that every column of `df` has dtype object (np.dtype('O')).\n",
"\n",
"    Each offending column is printed as `<name> <dtype>`; if there is at least\n",
"    one, an AssertionError listing all of them is raised.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df : pandas.DataFrame\n",
"        Frame whose column dtypes are checked.\n",
"\n",
"    Raises\n",
"    ------\n",
"    AssertionError\n",
"        If any column has a dtype other than object.\n",
"    '''\n",
"    non_object = [(col, dtype) for col, dtype in zip(df.columns, df.dtypes.values)\n",
"                  if dtype != np.dtype('O')]\n",
"    for col, dtype in non_object:\n",
"        print(col, dtype)\n",
"    # Raise explicitly instead of `assert False`: the check then survives\n",
"    # `python -O` and the error message names the offending columns.\n",
"    if non_object:\n",
"        raise AssertionError('non-object dtype columns: ' + ', '.join(f'{col} ({dtype})' for col, dtype in non_object))"
]
},
{
"cell_type": "code",
"execution_count": 132,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:17:58.905940Z",
"start_time": "2021-08-06T12:17:42.598948Z"
},
"hidden": true
},
"outputs": [],
"source": [
"def _read_source(rel_path, **read_kwargs):\n",
"    '''Read one raw source table located at `data_path + rel_path`.\n",
"\n",
"    `low_memory=False` is always passed to `pd.read_csv`; extra keyword\n",
"    arguments (e.g. `sep`) are forwarded unchanged.\n",
"    '''\n",
"    return pd.read_csv(data_path + rel_path, low_memory=False, **read_kwargs)\n",
"\n",
"\n",
"def _ids_to_str(df, *cols):\n",
"    '''Return a copy of `df` with each column in `cols` cast to int, then to str.\n",
"\n",
"    Replaces the repeated `.astype({c: int}).astype({c: str})` pairs. The cast\n",
"    to int raises (as before) if a column contains NaN or non-numeric values.\n",
"    '''\n",
"    return df.astype({col: int for col in cols}).astype({col: str for col in cols})\n",
"\n",
"\n",
"# --- protein-protein interactions ---\n",
"df_ppi = _read_source('ppi/protein_protein.csv').dropna()\n",
"df_ppi = _ids_to_str(df_ppi, 'proteinA_entrezid', 'proteinB_entrezid')\n",
"assert_dtypes(df_ppi)\n",
"\n",
"# --- DrugBank drug-protein ---\n",
"df_drugbank = _read_source('drugbank/drug_protein.csv')\n",
"df_drugbank = df_drugbank.get(['DrugBank', 'relation', 'NCBIGeneID','DrugBankName']).dropna()\n",
"df_drugbank = _ids_to_str(df_drugbank, 'NCBIGeneID')\n",
"assert_dtypes(df_drugbank)\n",
"\n",
"# --- DisGeNET (no dtype assertion is applied to this table) ---\n",
"df_disgenet = _read_source('disgenet/curated_gene_disease_associations.tsv', sep='\\t')\n",
"df_disgenet = _ids_to_str(df_disgenet, 'geneId')\n",
"\n",
"# --- MONDO (no dtype assertion is applied to the terms table) ---\n",
"df_mondo_terms = _read_source('mondo/mondo_terms.csv')\n",
"df_mondo_terms = _ids_to_str(df_mondo_terms, 'id')\n",
"\n",
"df_mondo_xref = _read_source('mondo/mondo_references.csv')\n",
"df_mondo_xref = _ids_to_str(df_mondo_xref, 'mondo_id')\n",
"assert_dtypes(df_mondo_xref)\n",
"\n",
"df_mondo_parents = _read_source('mondo/mondo_parents.csv')\n",
"df_mondo_parents = _ids_to_str(df_mondo_parents, 'parent', 'child')\n",
"assert_dtypes(df_mondo_parents)\n",
"\n",
"# --- DrugCentral drug-disease ---\n",
"df_drug_central = _read_source('drugcentral/drug_disease.csv')\n",
"df_drug_central = df_drug_central.get(['cas_reg_no','relationship_name', 'umls_cui']) # 'concept_id', 'concept_name', 'snomed_conceptid'\n",
"# Keep only rows that have both a CAS number and a UMLS CUI.\n",
"df_drug_central = df_drug_central.dropna(subset=['cas_reg_no', 'umls_cui'])\n",
"assert_dtypes(df_drug_central)\n",
"\n",
"# --- DrugBank drug-drug ---\n",
"df_ddi = _read_source('drugbank/drug_drug.csv')\n",
"assert_dtypes(df_ddi)\n",
"\n",
"# --- HPO (no dtype assertion is applied to the terms / xref tables) ---\n",
"df_hp_terms = _read_source('hpo/hp_terms.csv')\n",
"df_hp_terms = _ids_to_str(df_hp_terms, 'id')\n",
"\n",
"df_hp_xref = _read_source('hpo/hp_references.csv')\n",
"df_hp_xref = _ids_to_str(df_hp_xref, 'hp_id')\n",
"\n",
"df_hp_parents = _read_source('hpo/hp_parents.csv')\n",
"df_hp_parents = _ids_to_str(df_hp_parents, 'parent', 'child')\n",
"assert_dtypes(df_hp_parents)\n",
"\n",
"df_hpoa_pos = _read_source('hpo/disease_phenotype_pos.csv')\n",
"df_hpoa_pos = _ids_to_str(df_hpoa_pos, 'hp_id', 'disease_ontology_id')\n",
"assert_dtypes(df_hpoa_pos)\n",
"\n",
"df_hpoa_neg = _read_source('hpo/disease_phenotype_neg.csv')\n",
"df_hpoa_neg = _ids_to_str(df_hpoa_neg, 'hp_id', 'disease_ontology_id')\n",
"assert_dtypes(df_hpoa_neg)\n",
"\n",
"# --- SIDER ---\n",
"df_sider = _read_source('sider/sider.csv')\n",
"assert_dtypes(df_sider)\n",
"\n",
"# --- Gene Ontology ---\n",
"df_go_terms = _read_source('go/go_terms_info.csv')\n",
"df_go_terms = _ids_to_str(df_go_terms, 'go_term_id')\n",
"assert_dtypes(df_go_terms)\n",
"\n",
"df_go_edges = _read_source('go/go_terms_relations.csv')\n",
"df_go_edges = _ids_to_str(df_go_edges, 'x', 'y')\n",
"assert_dtypes(df_go_edges)\n",
"\n",
"df_gene2go = _read_source('ncbigene/protein_go_associations.csv')\n",
"df_gene2go = _ids_to_str(df_gene2go, 'ncbi_gene_id', 'go_term_id')\n",
"assert_dtypes(df_gene2go)\n",
"\n",
"# --- CTD exposures ---\n",
"df_exposures = _read_source('ctd/exposure_data.csv')\n",
"df_exposures = df_exposures.get(['exposurestressorname', 'exposurestressorid',\n",
"                                 'exposuremarker', 'exposuremarkerid',\n",
"                                 'diseasename', 'diseaseid',\n",
"                                 'phenotypename', 'phenotypeid'])\n",
"assert_dtypes(df_exposures)\n",
"\n",
"# --- UBERON ---\n",
"df_uberon_terms = _read_source('uberon/uberon_terms.csv')\n",
"df_uberon_terms = _ids_to_str(df_uberon_terms, 'id')\n",
"assert_dtypes(df_uberon_terms)\n",
"\n",
"df_uberon_is_a = _read_source('uberon/uberon_is_a.csv')\n",
"df_uberon_is_a = _ids_to_str(df_uberon_is_a, 'id', 'is_a')\n",
"assert_dtypes(df_uberon_is_a)\n",
"\n",
"df_uberon_rels = _read_source('uberon/uberon_rels.csv')\n",
"df_uberon_rels = _ids_to_str(df_uberon_rels, 'id', 'relation_id')\n",
"assert_dtypes(df_uberon_rels)\n",
"\n",
"# --- Bgee ---\n",
"df_bgee = _read_source('bgee/anatomy_gene.csv')\n",
"df_bgee = _ids_to_str(df_bgee, 'expression_rank', 'anatomy_id')\n",
"assert_dtypes(df_bgee)\n",
"\n",
"# --- Reactome ---\n",
"df_reactome_terms = _read_source('reactome/reactome_terms.csv')\n",
"assert_dtypes(df_reactome_terms)\n",
"\n",
"df_reactome_rels = _read_source('reactome/reactome_relations.csv')\n",
"assert_dtypes(df_reactome_rels)\n",
"\n",
"df_reactome_ncbi = _read_source('reactome/reactome_ncbi.csv')\n",
"# Keep only rows whose ncbi_id consists solely of numeric characters.\n",
"df_reactome_ncbi = df_reactome_ncbi[df_reactome_ncbi.ncbi_id.str.isnumeric()]\n",
"assert_dtypes(df_reactome_ncbi)\n",
"\n",
"# --- vocabularies / id mappings ---\n",
"df_umls_mondo = _read_source('vocab/umls_mondo.csv')\n",
"df_umls_mondo = _ids_to_str(df_umls_mondo, 'mondo_id')\n",
"assert_dtypes(df_umls_mondo)\n",
"\n",
"df_prot_names = _read_source('vocab/gene_names.csv', sep='\\t')\n",
"df_prot_names = df_prot_names.rename(columns={'NCBI Gene ID(supplied by NCBI)':'ncbi_id', 'NCBI Gene ID':'ncbi_id2', 'Approved symbol':'symbol', 'Approved name':'name'})\n",
"df_prot_names = df_prot_names.get(['ncbi_id', 'symbol']).dropna()\n",
"df_prot_names = _ids_to_str(df_prot_names, 'ncbi_id')\n",
"assert_dtypes(df_prot_names)\n",
"\n",
"db_vocab = _read_source('vocab/drugbank_vocabulary.csv')\n",
"assert_dtypes(db_vocab)\n",
"\n",
"df_db_atc = _read_source('vocab/drugbank_atc_codes.csv').get(['atc_code','parent_key'])\n",
"assert_dtypes(df_db_atc)"
]
},
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true
},
"source": [
"# Converting databases into graph edges"
]
},
{
"cell_type": "code",
"execution_count": 133,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:17:59.008491Z",
"start_time": "2021-08-06T12:17:58.909175Z"
},
"hidden": true
},
"outputs": [],
"source": [
"def clean_edges(df):\n",
"    '''Reduce an edge table to the standard edge columns and drop unusable rows.\n",
"\n",
"    Selects the ten edge columns (in the order listed below), then removes rows\n",
"    containing any missing value, exact duplicate rows, and self-loops, i.e.\n",
"    rows whose x and y endpoints agree on id, type, source and name.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df : pandas.DataFrame\n",
"        Edge table; extra columns are discarded.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        Cleaned edges. The original row index labels are kept (no reset).\n",
"\n",
"    Raises\n",
"    ------\n",
"    KeyError\n",
"        If any of the ten edge columns is absent from `df`.\n",
"    '''\n",
"    edge_columns = ['relation', 'display_relation', 'x_id', 'x_type', 'x_name', 'x_source',\n",
"                    'y_id', 'y_type', 'y_name', 'y_source']\n",
"    # `DataFrame.get(list)` returns None when a key is missing, which used to\n",
"    # surface as an opaque AttributeError on `.dropna()`; fail loudly instead.\n",
"    missing = [col for col in edge_columns if col not in df.columns]\n",
"    if missing:\n",
"        raise KeyError(f'clean_edges: missing edge column(s): {missing}')\n",
"    edges = df[edge_columns].dropna().drop_duplicates()\n",
"    is_self_loop = (\n",
"        (edges['x_id'] == edges['y_id'])\n",
"        & (edges['x_type'] == edges['y_type'])\n",
"        & (edges['x_source'] == edges['y_source'])\n",
"        & (edges['x_name'] == edges['y_name'])\n",
"    )\n",
"    return edges[~is_self_loop]"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"## Basic"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"### Protein protein interactions (NCBI)"
]
},
{
"cell_type": "code",
"execution_count": 134,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:18:00.653072Z",
"start_time": "2021-08-06T12:17:59.013365Z"
},
"hidden": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>relation</th>\n",
" <th>display_relation</th>\n",
" <th>x_id</th>\n",
" <th>x_type</th>\n",
" <th>x_name</th>\n",
" <th>x_source</th>\n",
" <th>y_id</th>\n",
" <th>y_type</th>\n",
" <th>y_name</th>\n",
" <th>y_source</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>protein_protein</td>\n",
" <td>ppi</td>\n",
" <td>9796</td>\n",
" <td>gene/protein</td>\n",
" <td>PHYHIP</td>\n",
" <td>NCBI</td>\n",
" <td>56992</td>\n",
" <td>gene/protein</td>\n",
" <td>KIF15</td>\n",
" <td>NCBI</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" relation display_relation x_id x_type x_name x_source \\\n",
"0 protein_protein ppi 9796 gene/protein PHYHIP NCBI \n",
"\n",
" y_id y_type y_name y_source \n",
"0 56992 gene/protein KIF15 NCBI "
]
},
"execution_count": 134,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Left-join the gene symbol of each interaction partner, then relabel the\n",
"# columns as edge endpoints; rows without a symbol are removed by the\n",
"# dropna inside clean_edges.\n",
"df_prot_prot = clean_edges(\n",
"    df_ppi\n",
"    .merge(df_prot_names, how='left', left_on='proteinA_entrezid', right_on='ncbi_id')\n",
"    .rename(columns={'symbol':'symbolA'})\n",
"    .merge(df_prot_names, how='left', left_on='proteinB_entrezid', right_on='ncbi_id')\n",
"    .rename(columns={'symbol':'symbolB'})\n",
"    .rename(columns={'proteinA_entrezid':'x_id', 'proteinB_entrezid':'y_id', 'symbolA':'x_name', 'symbolB':'y_name'})\n",
"    .assign(\n",
"        x_type='gene/protein', x_source='NCBI',\n",
"        y_type='gene/protein', y_source='NCBI',\n",
"        relation='protein_protein', display_relation='ppi',\n",
"    )\n",
")\n",
"df_prot_prot.head(1)"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"### Drug protein interactions (DrugBank)"
]
},
{
"cell_type": "code",
"execution_count": 135,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:18:00.937639Z",
"start_time": "2021-08-06T12:18:00.656244Z"
},
"hidden": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>relation</th>\n",
" <th>display_relation</th>\n",
" <th>x_id</th>\n",
" <th>x_type</th>\n",
" <th>x_name</th>\n",
" <th>x_source</th>\n",
" <th>y_id</th>\n",
" <th>y_type</th>\n",
" <th>y_name</th>\n",
" <th>y_source</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>drug_protein</td>\n",
" <td>carrier</td>\n",
" <td>DB09130</td>\n",
" <td>drug</td>\n",
" <td>Copper</td>\n",
" <td>DrugBank</td>\n",
" <td>2157</td>\n",
" <td>gene/protein</td>\n",
" <td>F8</td>\n",
" <td>NCBI</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" relation display_relation x_id x_type x_name x_source y_id \\\n",
"0 drug_protein carrier DB09130 drug Copper DrugBank 2157 \n",
"\n",
" y_type y_name y_source \n",
"0 gene/protein F8 NCBI "
]
},
"execution_count": 135,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Drug (x, DrugBank) -> gene/protein (y, NCBI) edges with the gene symbol attached.\n",
"df_prot_drug = (\n",
"    df_drugbank\n",
"    .merge(df_prot_names, how='left', left_on='NCBIGeneID', right_on='ncbi_id')\n",
"    .rename(columns={'DrugBank':'x_id', 'NCBIGeneID':'y_id', 'DrugBankName':'x_name', 'symbol':'y_name'})\n",
"    .assign(x_type='drug', x_source='DrugBank', y_type='gene/protein', y_source='NCBI')\n",
")\n",
"# Copy the source relation into display_relation before overwriting `relation`\n",
"# with the single combined label.\n",
"source_relation = df_prot_drug.get('relation').values\n",
"df_prot_drug = clean_edges(\n",
"    df_prot_drug.assign(\n",
"        display_relation=source_relation,\n",
"        relation='drug_protein',  # combine targets, carrier, enzyme and transporter\n",
"    )\n",
")\n",
"df_prot_drug.head(1)"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"### Drug disease interactions (DrugCentral) –– PENDING"
]
},
{
"cell_type": "code",
"execution_count": 136,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:18:01.357506Z",
"start_time": "2021-08-06T12:18:00.940724Z"
},
"hidden": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>relation</th>\n",
" <th>display_relation</th>\n",
" <th>x_id</th>\n",
" <th>x_type</th>\n",
" <th>x_name</th>\n",
" <th>x_source</th>\n",
" <th>y_id</th>\n",
" <th>y_type</th>\n",
" <th>y_name</th>\n",
" <th>y_source</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>contraindication</td>\n",
" <td>contraindication</td>\n",
" <td>DB05271</td>\n",
" <td>drug</td>\n",
" <td>Rotigotine</td>\n",
" <td>DrugBank</td>\n",
" <td>5044</td>\n",
" <td>disease</td>\n",
" <td>hypertensive disorder</td>\n",
" <td>MONDO</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" relation display_relation x_id x_type x_name x_source \\\n",
"0 contraindication contraindication DB05271 drug Rotigotine DrugBank \n",
"\n",
" y_id y_type y_name y_source \n",
"0 5044 disease hypertensive disorder MONDO "
]
},
"execution_count": 136,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# CAS number -> DrugBank entry (left join), UMLS CUI -> MONDO id (inner join),\n",
"# MONDO id -> disease name (left join).\n",
"df_drug_dis = (\n",
"    df_drug_central\n",
"    .merge(db_vocab, how='left', left_on='cas_reg_no', right_on='CAS')\n",
"    .merge(df_umls_mondo, how='inner', left_on='umls_cui', right_on='umls_id')\n",
"    .merge(df_mondo_terms, how='left', left_on='mondo_id', right_on='id')\n",
"    .get(['relationship_name','DrugBank ID', 'Common name', 'mondo_id', 'name'])\n",
"    .dropna()\n",
"    .drop_duplicates()\n",
"    .rename(columns={'DrugBank ID':'x_id', 'mondo_id':'y_id', 'Common name':'x_name', 'name':'y_name', 'relationship_name':'relation'})\n",
"    .assign(x_type='drug', x_source='DrugBank', y_type='disease', y_source='MONDO')\n",
")\n",
"# The display label is a copy of the relation column.\n",
"df_drug_dis = clean_edges(\n",
"    df_drug_dis.assign(display_relation=df_drug_dis.get('relation').values)\n",
")\n",
"df_drug_dis.head(1)"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"### Disease protein interactions (DisGeNET)"
]
},
{
"cell_type": "code",
"execution_count": 137,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:18:02.363099Z",
"start_time": "2021-08-06T12:18:01.381960Z"
},
"hidden": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>relation</th>\n",
" <th>display_relation</th>\n",
" <th>x_id</th>\n",
" <th>x_type</th>\n",
" <th>x_name</th>\n",
" <th>x_source</th>\n",
" <th>y_id</th>\n",
" <th>y_type</th>\n",
" <th>y_name</th>\n",
" <th>y_source</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>disease_protein</td>\n",
" <td>associated with</td>\n",
" <td>5090</td>\n",
" <td>disease</td>\n",
" <td>schizophrenia (disease)</td>\n",
" <td>MONDO</td>\n",
" <td>1</td>\n",
" <td>gene/protein</td>\n",
" <td>A1BG</td>\n",
" <td>NCBI</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" relation display_relation x_id x_type x_name \\\n",
"0 disease_protein associated with 5090 disease schizophrenia (disease) \n",
"\n",
" x_source y_id y_type y_name y_source \n",
"0 MONDO 1 gene/protein A1BG NCBI "
]
},
"execution_count": 137,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# DisGeNET rows typed as \"disease\": map diseaseId -> MONDO id via the UMLS table\n",
"# (inner join), then attach the MONDO term name (left join).\n",
"df_dis_prot1 = clean_edges(\n",
"    df_disgenet\n",
"    .query('diseaseType==\"disease\"')\n",
"    .merge(df_umls_mondo, how='inner', left_on='diseaseId', right_on='umls_id')\n",
"    .merge(df_mondo_terms, how='left', left_on='mondo_id', right_on='id')\n",
"    .rename(columns={'geneId':'y_id', 'geneSymbol':'y_name', 'mondo_id':'x_id', 'name':'x_name'})\n",
"    .assign(\n",
"        x_type='disease', x_source='MONDO',\n",
"        y_type='gene/protein', y_source='NCBI',\n",
"        relation='disease_protein', display_relation='associated with',\n",
"    )\n",
")\n",
"df_dis_prot1.head(1)"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"### Disease disease interactions (MONDO)"
]
},
{
"cell_type": "code",
"execution_count": 138,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:18:02.612068Z",
"start_time": "2021-08-06T12:18:02.367422Z"
},
"hidden": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>relation</th>\n",
" <th>display_relation</th>\n",
" <th>x_id</th>\n",
" <th>x_type</th>\n",
" <th>x_name</th>\n",
" <th>x_source</th>\n",
" <th>y_id</th>\n",
" <th>y_type</th>\n",
" <th>y_name</th>\n",
" <th>y_source</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>disease_disease</td>\n",
" <td>parent-child</td>\n",
" <td>2816</td>\n",
" <td>disease</td>\n",
" <td>adrenal cortex disease</td>\n",
" <td>MONDO</td>\n",
" <td>4</td>\n",
" <td>disease</td>\n",
" <td>adrenocortical insufficiency</td>\n",
" <td>MONDO</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" relation display_relation x_id x_type x_name \\\n",
"0 disease_disease parent-child 2816 disease adrenal cortex disease \n",
"\n",
" x_source y_id y_type y_name y_source \n",
"0 MONDO 4 disease adrenocortical insufficiency MONDO "
]
},
"execution_count": 138,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Parent (x) -> child (y) edges between MONDO terms, with both term names attached.\n",
"df_dis_dis1 = clean_edges(\n",
"    df_mondo_parents\n",
"    .merge(df_mondo_terms, how='left', left_on='parent', right_on='id')\n",
"    .rename(columns={'parent':'x_id', 'name':'x_name'})\n",
"    .merge(df_mondo_terms, how='left', left_on='child', right_on='id')\n",
"    .rename(columns={'child':'y_id', 'name':'y_name'})\n",
"    .assign(\n",
"        x_type='disease', x_source='MONDO',\n",
"        y_type='disease', y_source='MONDO',\n",
"        relation='disease_disease', display_relation='parent-child',\n",
"    )\n",
")\n",
"df_dis_dis1.head(1)"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"### Drug drug interactions (DrugBank)"
]
},
{
"cell_type": "code",
"execution_count": 139,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:18:15.697033Z",
"start_time": "2021-08-06T12:18:02.616417Z"
},
"hidden": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>relation</th>\n",
" <th>display_relation</th>\n",
" <th>x_id</th>\n",
" <th>x_type</th>\n",
" <th>x_name</th>\n",
" <th>x_source</th>\n",
" <th>y_id</th>\n",
" <th>y_type</th>\n",
" <th>y_name</th>\n",
" <th>y_source</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>drug_drug</td>\n",
" <td>synergistic interaction</td>\n",
" <td>DB00001</td>\n",
" <td>drug</td>\n",
" <td>Lepirudin</td>\n",
" <td>DrugBank</td>\n",
" <td>DB06605</td>\n",
" <td>drug</td>\n",
" <td>Apixaban</td>\n",
" <td>DrugBank</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" relation display_relation x_id x_type x_name x_source \\\n",
"0 drug_drug synergistic interaction DB00001 drug Lepirudin DrugBank \n",
"\n",
" y_id y_type y_name y_source \n",
"0 DB06605 drug Apixaban DrugBank "
]
},
"execution_count": 139,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Inner joins keep only interactions whose two drugs both appear in the\n",
"# DrugBank vocabulary; each join supplies the common name of one endpoint.\n",
"df_drug_drug = clean_edges(\n",
"    df_ddi\n",
"    .merge(db_vocab, how='inner', left_on='drug1', right_on='DrugBank ID')\n",
"    .rename(columns={'drug1':'x_id', 'Common name':'x_name'})\n",
"    .astype({'drug2':'str'})\n",
"    .merge(db_vocab, how='inner', left_on='drug2', right_on='DrugBank ID')\n",
"    .rename(columns={'drug2':'y_id', 'Common name':'y_name'})\n",
"    .assign(\n",
"        x_type='drug', x_source='DrugBank',\n",
"        y_type='drug', y_source='DrugBank',\n",
"        relation='drug_drug', display_relation='synergistic interaction',\n",
"    )\n",
")\n",
"df_drug_drug.head(1)"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"## Effect/Phenotype"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"### Effect protein interactions (DisGeNET)"
]
},
{
"cell_type": "code",
"execution_count": 140,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:18:16.132855Z",
"start_time": "2021-08-06T12:18:15.701935Z"
},
"hidden": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>relation</th>\n",
" <th>display_relation</th>\n",
" <th>x_id</th>\n",
" <th>x_type</th>\n",
" <th>x_name</th>\n",
" <th>x_source</th>\n",
" <th>y_id</th>\n",
" <th>y_type</th>\n",
" <th>y_name</th>\n",
" <th>y_source</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>phenotype_protein</td>\n",
" <td>associated with</td>\n",
" <td>2240</td>\n",
" <td>effect/phenotype</td>\n",
" <td>Hepatomegaly</td>\n",
" <td>HPO</td>\n",
" <td>1</td>\n",
" <td>gene/protein</td>\n",
" <td>A1BG</td>\n",
" <td>NCBI</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" relation display_relation x_id x_type x_name \\\n",
"0 phenotype_protein associated with 2240 effect/phenotype Hepatomegaly \n",
"\n",
" x_source y_id y_type y_name y_source \n",
"0 HPO 1 gene/protein A1BG NCBI "
]
},
"execution_count": 140,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# DisGeNET rows typed as \"phenotype\": map diseaseId -> HPO id via the HPO\n",
"# cross-reference table (inner join), then attach the HPO term name (left join).\n",
"df_phe_prot = clean_edges(\n",
"    df_disgenet\n",
"    .query('diseaseType==\"phenotype\"')\n",
"    .merge(df_hp_xref, how='inner', left_on='diseaseId', right_on='ontology_id')\n",
"    .merge(df_hp_terms, how='left', left_on='hp_id', right_on='id')\n",
"    .rename(columns={'geneId':'y_id', 'geneSymbol':'y_name', 'hp_id':'x_id', 'name':'x_name'})\n",
"    .assign(\n",
"        x_type='effect/phenotype', x_source='HPO',\n",
"        y_type='gene/protein', y_source='NCBI',\n",
"        relation='phenotype_protein', display_relation='associated with',\n",
"    )\n",
")\n",
"df_phe_prot.head(1)"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"### Effect effect interactions (HPO)"
]
},
{
"cell_type": "code",
"execution_count": 141,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:18:16.566142Z",
"start_time": "2021-08-06T12:18:16.138257Z"
},
"hidden": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>relation</th>\n",
" <th>display_relation</th>\n",
" <th>x_id</th>\n",
" <th>x_type</th>\n",
" <th>x_name</th>\n",
" <th>x_source</th>\n",
" <th>y_id</th>\n",
" <th>y_type</th>\n",
" <th>y_name</th>\n",
" <th>y_source</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>phenotype_phenotype</td>\n",
" <td>parent-child</td>\n",
" <td>1507</td>\n",
" <td>effect/phenotype</td>\n",
" <td>Growth abnormality</td>\n",
" <td>HPO</td>\n",
" <td>2</td>\n",
" <td>effect/phenotype</td>\n",
" <td>Abnormality of body height</td>\n",
" <td>HPO</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" relation display_relation x_id x_type \\\n",
"0 phenotype_phenotype parent-child 1507 effect/phenotype \n",
"\n",
" x_name x_source y_id y_type \\\n",
"0 Growth abnormality HPO 2 effect/phenotype \n",
"\n",
" y_name y_source \n",
"0 Abnormality of body height HPO "
]
},
"execution_count": 141,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Parent (x) -> child (y) edges between HPO terms, with both term names attached.\n",
"df_phe_phe = clean_edges(\n",
"    df_hp_parents\n",
"    .merge(df_hp_terms, how='left', left_on='parent', right_on='id')\n",
"    .rename(columns={'name':'parent_name'})\n",
"    .merge(df_hp_terms, how='left', left_on='child', right_on='id')\n",
"    .rename(columns={'name':'child_name'})\n",
"    .get(['parent', 'child', 'parent_name', 'child_name'])\n",
"    .rename(columns={'parent':'x_id', 'child':'y_id', 'parent_name':'x_name', 'child_name':'y_name'})\n",
"    .assign(\n",
"        x_type='effect/phenotype', x_source='HPO',\n",
"        y_type='effect/phenotype', y_source='HPO',\n",
"        relation='phenotype_phenotype', display_relation='parent-child',\n",
"    )\n",
")\n",
"df_phe_phe.head(1)"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"### Disease effect interactions (HPO-A)"
]
},
{
"cell_type": "code",
"execution_count": 142,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:18:18.049528Z",
"start_time": "2021-08-06T12:18:16.571436Z"
},
"hidden": true,
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>relation</th>\n",
" <th>display_relation</th>\n",
" <th>x_id</th>\n",
" <th>x_type</th>\n",
" <th>x_name</th>\n",
" <th>x_source</th>\n",
" <th>y_id</th>\n",
" <th>y_type</th>\n",
" <th>y_name</th>\n",
" <th>y_source</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>disease_phenotype_positive</td>\n",
" <td>phenotype present</td>\n",
" <td>10761</td>\n",
" <td>disease</td>\n",
" <td>retinitis pigmentosa Y-linked</td>\n",
" <td>MONDO</td>\n",
" <td>510</td>\n",
" <td>effect/phenotype</td>\n",
" <td>Rod-cone dystrophy</td>\n",
" <td>HPO</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" relation display_relation x_id x_type \\\n",
"0 disease_phenotype_positive phenotype present 10761 disease \n",
"\n",
" x_name x_source y_id y_type \\\n",
"0 retinitis pigmentosa Y-linked MONDO 510 effect/phenotype \n",
"\n",
" y_name y_source \n",
"0 Rod-cone dystrophy HPO "
]
},
"execution_count": 142,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Map each annotated disease id to MONDO through the cross-reference table,\n",
"# keeping only matches within the same ontology (with \"ORPHA\" treated as\n",
"# \"Orphanet\"), then attach the HPO and MONDO term names.\n",
"df_dis_phe_pos1 = clean_edges(\n",
"    df_hpoa_pos\n",
"    .merge(df_mondo_xref, how='left', left_on='disease_ontology_id', right_on='ontology_id')\n",
"    .query('(disease_ontology==ontology) or (disease_ontology==\"ORPHA\" and ontology==\"Orphanet\")')\n",
"    .merge(df_hp_terms, how='left', left_on='hp_id', right_on='id')\n",
"    .rename(columns={'name':'hp_name'})\n",
"    .merge(df_mondo_terms, how='left', left_on='mondo_id', right_on='id')\n",
"    .rename(columns={'name':'mondo_name'})\n",
"    .get(['mondo_id', 'mondo_name', 'hp_id', 'hp_name'])\n",
"    .rename(columns={'mondo_id':'x_id', 'mondo_name':'x_name', 'hp_id': 'y_id', 'hp_name':'y_name'})\n",
"    .assign(\n",
"        x_source='MONDO', x_type='disease',\n",
"        y_source='HPO', y_type='effect/phenotype',\n",
"        relation='disease_phenotype_positive', display_relation='phenotype present',\n",
"    )\n",
")\n",
"df_dis_phe_pos1.head(1)"
]
},
{
"cell_type": "code",
"execution_count": 143,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:18:18.294348Z",
"start_time": "2021-08-06T12:18:18.051937Z"
},
"hidden": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>relation</th>\n",
" <th>display_relation</th>\n",
" <th>x_id</th>\n",
" <th>x_type</th>\n",
" <th>x_name</th>\n",
" <th>x_source</th>\n",
" <th>y_id</th>\n",
" <th>y_type</th>\n",
" <th>y_name</th>\n",
" <th>y_source</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>disease_phenotype_negative</td>\n",
" <td>phenotype absent</td>\n",
" <td>13924</td>\n",
" <td>disease</td>\n",
" <td>osteogenesis imperfecta type 13</td>\n",
" <td>MONDO</td>\n",
" <td>365</td>\n",
" <td>effect/phenotype</td>\n",
" <td>Hearing impairment</td>\n",
" <td>HPO</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" relation display_relation x_id x_type \\\n",
"0 disease_phenotype_negative phenotype absent 13924 disease \n",
"\n",
" x_name x_source y_id y_type \\\n",
"0 osteogenesis imperfecta type 13 MONDO 365 effect/phenotype \n",
"\n",
" y_name y_source \n",
"0 Hearing impairment HPO "
]
},
"execution_count": 143,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Disease -> phenotype edges for phenotypes annotated as absent (HPOA negative rows).\n",
"df_dis_phe_neg = (\n",
"    pd.merge(df_hpoa_neg, df_mondo_xref, how='left', left_on='disease_ontology_id', right_on='ontology_id')\n",
"    # keep a cross-reference only when its ontology matches the annotation's ontology\n",
"    # (the filter treats \"ORPHA\" and \"Orphanet\" as the same ontology)\n",
"    .query('(disease_ontology==ontology) or (disease_ontology==\"ORPHA\" and ontology==\"Orphanet\")')\n",
"    # attach the HPO term name, then the MONDO term name\n",
"    .merge(df_hp_terms, how='left', left_on='hp_id', right_on='id')\n",
"    .rename(columns={'name': 'hp_name'})\n",
"    .merge(df_mondo_terms, how='left', left_on='mondo_id', right_on='id')\n",
"    .rename(columns={'name': 'mondo_name'})\n",
"    .get(['mondo_id', 'mondo_name', 'hp_id', 'hp_name'])\n",
"    .rename(columns={'mondo_id': 'x_id', 'mondo_name': 'x_name', 'hp_id': 'y_id', 'hp_name': 'y_name'})\n",
"    .assign(\n",
"        x_source='MONDO',\n",
"        x_type='disease',\n",
"        y_source='HPO',\n",
"        y_type='effect/phenotype',\n",
"        relation='disease_phenotype_negative',\n",
"        display_relation='phenotype absent',\n",
"    )\n",
"    .pipe(clean_edges)\n",
")\n",
"df_dis_phe_neg.head(1)"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"### Remove MONDO nodes if they exist in HPO (Modified)"
]
},
{
"cell_type": "code",
"execution_count": 144,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:18:18.395112Z",
"start_time": "2021-08-06T12:18:18.296744Z"
},
"code_folding": [],
"hidden": true,
"scrolled": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ema30/zaklab/envs/rare_disease/lib/python3.8/site-packages/pandas/core/indexing.py:1843: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" self.obj[item_labels[indexer[info_axis]]] = value\n"
]
}
],
"source": [
"# Phenotypes that are actually diseases in MONDO: MONDO terms that cross-reference an HPO term.\n",
"# They are collected here so later cells can avoid duplicate nodes by converting those\n",
"# disease nodes to phenotype nodes.\n",
"# .copy() makes the subset an independent frame; assigning into the bare query() result\n",
"# raised SettingWithCopyWarning.\n",
"mondo_xref_hp_subset = df_mondo_xref.query('ontology==\"HP\"').copy()\n",
"# rewrite the HPO ids as integer-valued strings before joining them to df_hp_terms.id\n",
"mondo_xref_hp_subset['ontology_id'] = mondo_xref_hp_subset['ontology_id'].astype(int).astype(str).values\n",
"merged_mondo_hpo = pd.merge(mondo_xref_hp_subset, df_hp_terms, 'inner', left_on='ontology_id', right_on='id')\n",
"\n",
"merged_mondo_hpo[['ontology_id', 'mondo_id']].to_csv(save_path+'auxillary/mondo2hpo.csv', index=False)\n",
"# MONDO ids whose HPO cross-reference matched a row of df_hp_terms\n",
"mondo_r_hp_ids = merged_mondo_hpo.get('mondo_id').values\n",
"\n",
"def replace_mondo_w_hpo(df, mondo_id_col, drop_cols=()):\n",
"    \"\"\"Look up the HPO id and name for the MONDO ids stored in ``df[mondo_id_col]``.\n",
"\n",
"    ``df`` is left-joined to ``mondo_xref_hp_subset`` (on ``mondo_id``) and then to\n",
"    ``df_hp_terms`` (on the HPO id), so rows without a match are kept with missing values.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df : pandas.DataFrame\n",
"        Edge table containing ``mondo_id_col``.\n",
"    mondo_id_col : str\n",
"        Name of the column in ``df`` holding MONDO ids.\n",
"    drop_cols : iterable of str, optional\n",
"        Columns of ``df`` to leave out of the result.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        The columns of ``df`` minus ``drop_cols``, followed by ``ontology_id`` and\n",
"        ``ontology_name`` (the HPO id and the matching ``df_hp_terms`` name).\n",
"\n",
"    Raises\n",
"    ------\n",
"    ValueError\n",
"        If ``drop_cols`` names a column that ``df`` does not have.\n",
"    KeyError\n",
"        If an expected column is missing after the joins (previously this silently\n",
"        returned ``None`` because of ``DataFrame.get``).\n",
"    \"\"\"\n",
"    cols = list(df.columns) + ['ontology_id', 'ontology_name']\n",
"    for col in drop_cols:\n",
"        cols.remove(col)\n",
"    df = pd.merge(df, mondo_xref_hp_subset, 'left', left_on=mondo_id_col, right_on='mondo_id')\n",
"    df = pd.merge(df, df_hp_terms, 'left', left_on='ontology_id', right_on='id')\n",
"    return df.rename(columns={'name':'ontology_name'})[cols]"
]
},
{
"cell_type": "code",
"execution_count": 145,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# HANDLE DISEASE DISEASE --> EFFECT EFFECT\n",
"# Disease-disease edges whose two endpoints both map to HPO are rewritten as\n",
"# phenotype-phenotype edges.\n",
"df_phe_phe2 = (\n",
"    df_dis_dis1\n",
"    .query('x_id in @mondo_r_hp_ids and y_id in @mondo_r_hp_ids')\n",
"    # swap the MONDO source node (all x_* columns) for its HPO term\n",
"    .pipe(lambda edges: replace_mondo_w_hpo(\n",
"        df=edges, mondo_id_col='x_id',\n",
"        drop_cols=[col for col in edges.columns.values if 'x_' in col]))\n",
"    .rename(columns={'ontology_id': 'x_id', 'ontology_name': 'x_name'})\n",
"    # then do the same for the target node (all y_* columns)\n",
"    .pipe(lambda edges: replace_mondo_w_hpo(\n",
"        df=edges, mondo_id_col='y_id',\n",
"        drop_cols=[col for col in edges.columns.values if 'y_' in col]))\n",
"    .rename(columns={'ontology_id': 'y_id', 'ontology_name': 'y_name'})\n",
"    .assign(\n",
"        x_source='HPO',\n",
"        x_type='effect/phenotype',\n",
"        y_source='HPO',\n",
"        y_type='effect/phenotype',\n",
"        relation='phenotype_phenotype',\n",
"        display_relation='parent-child',\n",
"    )\n",
"    .pipe(clean_edges)\n",
")\n",
"\n",
"# Disease-disease edges must contain no phenotype nodes:\n",
"# drop every edge in which either endpoint maps to HPO.\n",
"df_dis_dis = df_dis_dis1.query('x_id not in @mondo_r_hp_ids and y_id not in @mondo_r_hp_ids')\n",
"\n",
"# neither the source nor the target of a remaining edge may be an HPO-mapped node\n",
"assert df_dis_dis.query('x_id in @mondo_r_hp_ids').shape[0] == 0\n",
"assert df_dis_dis.query('y_id in @mondo_r_hp_ids').shape[0] == 0"
]
},
{
"cell_type": "code",
"execution_count": 146,
"metadata": {},
"outputs": [],
"source": [
"# HANDLE DISEASE EFFECT NEGATIVE --> EFFECT EFFECT\n",
"# NOTE(review): these rows come from the \"phenotype absent\" table and are relabelled\n",
"# as parent-child phenotype edges below -- confirm that this is intended.\n",
"df_phe_phe3 = (\n",
"    df_dis_phe_neg\n",
"    .query('x_id in @mondo_r_hp_ids')\n",
"    # swap the MONDO source node (all x_* columns) for its HPO term\n",
"    .pipe(lambda edges: replace_mondo_w_hpo(\n",
"        df=edges, mondo_id_col='x_id',\n",
"        drop_cols=[col for col in edges.columns.values if 'x_' in col]))\n",
"    .rename(columns={'ontology_id': 'x_id', 'ontology_name': 'x_name'})\n",
"    .assign(\n",
"        x_source='HPO',\n",
"        x_type='effect/phenotype',\n",
"        relation='phenotype_phenotype',\n",
"        display_relation='parent-child',\n",
"    )\n",
"    .pipe(clean_edges)\n",
")\n",
"\n",
"# drop relations in DIS PHE if DIS is in HPO\n",
"# NOTE(review): df_dis_phe_neg is overwritten here, so re-running this cell without\n",
"# re-running the cell that builds it leaves df_phe_phe3 empty.\n",
"df_dis_phe_neg = df_dis_phe_neg.query('x_id not in @mondo_r_hp_ids')\n",
"\n",
"# HANDLE DISEASE EFFECT POSITIVE --> EFFECT EFFECT\n",
"df_phe_phe4 = (\n",
"    df_dis_phe_pos1\n",
"    .query('x_id in @mondo_r_hp_ids')\n",
"    # swap the MONDO source node (all x_* columns) for its HPO term\n",
"    .pipe(lambda edges: replace_mondo_w_hpo(\n",
"        df=edges, mondo_id_col='x_id',\n",
"        drop_cols=[col for col in edges.columns.values if 'x_' in col]))\n",
"    .rename(columns={'ontology_id': 'x_id', 'ontology_name': 'x_name'})\n",
"    .assign(\n",
"        x_source='HPO',\n",
"        x_type='effect/phenotype',\n",
"        relation='phenotype_phenotype',\n",
"        display_relation='parent-child',\n",
"    )\n",
"    .pipe(clean_edges)\n",
")\n",
"\n",
"# drop relations in DIS PHE if DIS is in HPO\n",
"df_dis_phe_pos = df_dis_phe_pos1.query('x_id not in @mondo_r_hp_ids')"
]
},
{
"cell_type": "code",
"execution_count": 147,
"metadata": {},
"outputs": [],
"source": [
"# HANDLE DISEASE PROTEIN --> EFFECT PROTEIN\n",
"# Disease-protein edges whose disease maps to HPO become phenotype-protein edges.\n",
"df_phe_prot2 = (\n",
"    df_dis_prot1\n",
"    .query('x_id in @mondo_r_hp_ids')\n",
"    # swap the MONDO source node (all x_* columns) for its HPO term\n",
"    .pipe(lambda edges: replace_mondo_w_hpo(\n",
"        df=edges, mondo_id_col='x_id',\n",
"        drop_cols=[col for col in edges.columns.values if 'x_' in col]))\n",
"    .rename(columns={'ontology_id': 'x_id', 'ontology_name': 'x_name'})\n",
"    .assign(\n",
"        x_source='HPO',\n",
"        x_type='effect/phenotype',\n",
"        relation='phenotype_protein',\n",
"        display_relation='associated with',\n",
"    )\n",
"    .pipe(clean_edges)\n",
")\n",
"\n",
"# drop relations in DIS GENE if DIS is in HPO\n",
"df_dis_prot = df_dis_prot1.query('x_id not in @mondo_r_hp_ids')"
]
},
{
"cell_type": "code",
"execution_count": 148,
"metadata": {},
"outputs": [],
"source": [
"# HANDLE DISEASE DRUG --> Remove (does not make sense to have EFFECT DRUG)\n",
"# Keep only the drug-disease edges whose disease (y_id) is not a MONDO id mapped to HPO.\n",
"df_drug_dis = df_drug_dis[~df_drug_dis['y_id'].isin(mondo_r_hp_ids)]"
]
},
{
"cell_type": "code",
"execution_count": 149,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Empty DataFrame\n",
"Columns: [relation, display_relation, x_id, x_type, x_name, x_source, y_id, y_type, y_name, y_source]\n",
"Index: []\n",
"Empty DataFrame\n",
"Columns: [relation, display_relation, x_id, x_type, x_name, x_source, y_id, y_type, y_name, y_source]\n",
"Index: []\n"
]
}
],
"source": [
"# COMBINE DATAFRAMES\n",
"# Append the edges converted from MONDO nodes to the existing phenotype edge tables\n",
"# and drop exact duplicate rows.\n",
"df_phe_phe = pd.concat(\n",
"    [df_phe_phe, df_phe_phe2, df_phe_phe3, df_phe_phe4],\n",
"    ignore_index=True,\n",
").drop_duplicates()\n",
"df_phe_prot = pd.concat(\n",
"    [df_phe_prot, df_phe_prot2],\n",
"    ignore_index=True,\n",
").drop_duplicates()\n",
"\n",
"# Sanity check: each print shows the rows still sourced from MONDO for an HPO-mapped id;\n",
"# both are expected to be empty frames.\n",
"print(df_phe_phe.query('x_source == \"MONDO\" and y_source == \"MONDO\" and x_id in @mondo_r_hp_ids and y_id in @mondo_r_hp_ids'))\n",
"print(df_phe_prot.query('x_source == \"MONDO\" and x_id in @mondo_r_hp_ids'))"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"### Drug effect interactions (SIDER)"
]
},
{
"cell_type": "code",
"execution_count": 150,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:18:20.396548Z",
"start_time": "2021-08-06T12:18:19.535589Z"
},
"hidden": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>relation</th>\n",
" <th>display_relation</th>\n",
" <th>x_id</th>\n",
" <th>x_type</th>\n",
" <th>x_name</th>\n",
" <th>x_source</th>\n",
" <th>y_id</th>\n",
" <th>y_type</th>\n",
" <th>y_name</th>\n",
" <th>y_source</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>drug_effect</td>\n",
" <td>side effect</td>\n",
" <td>DB00583</td>\n",
" <td>drug</td>\n",
" <td>Levocarnitine</td>\n",
" <td>DrugBank</td>\n",
" <td>2027</td>\n",
" <td>effect/phenotype</td>\n",
" <td>Abdominal pain</td>\n",
" <td>HPO</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" relation display_relation x_id x_type x_name x_source \\\n",
"0 drug_effect side effect DB00583 drug Levocarnitine DrugBank \n",
"\n",
" y_id y_type y_name y_source \n",
"0 2027 effect/phenotype Abdominal pain HPO "
]
},
"execution_count": 150,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Drug -> side-effect edges: SIDER rows are linked to DrugBank through ATC codes and to\n",
"# HPO through the UMLS cross-references in df_hp_xref.\n",
"df_drug_effect = (\n",
"    pd.merge(df_sider, df_db_atc, how='left', left_on='atc', right_on='atc_code')\n",
"    .rename(columns={'parent_key': 'DrugBank', 'UMLS_from_meddra': 'UMLS'})\n",
"    .merge(db_vocab, how='left', left_on='DrugBank', right_on='DrugBank ID')\n",
"    .merge(df_hp_xref, how='left', left_on='UMLS', right_on='ontology_id')\n",
"    .merge(df_hp_terms, how='left', left_on='hp_id', right_on='id')\n",
"    .get(['DrugBank ID', 'Common name', 'hp_id', 'name'])\n",
"    # the joins above are all left joins; rows that failed any of them carry NaN and go here\n",
"    .dropna()\n",
"    .drop_duplicates()\n",
"    .rename(columns={'DrugBank ID': 'x_id', 'Common name': 'x_name', 'hp_id': 'y_id', 'name': 'y_name'})\n",
"    .assign(\n",
"        x_type='drug',\n",
"        x_source='DrugBank',\n",
"        y_type='effect/phenotype',\n",
"        y_source='HPO',\n",
"        relation='drug_effect',\n",
"        display_relation='side effect',\n",
"    )\n",
"    # disabled filter, kept for reference: .query('y_id not in @hp_ids_r_mondo')\n",
"    .pipe(clean_edges)\n",
")\n",
"df_drug_effect.head(1)"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"## GO Terms"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"### GO term interactions (GO)"
]
},
{
"cell_type": "code",
"execution_count": 151,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:18:20.828806Z",
"start_time": "2021-08-06T12:18:20.400380Z"
},
"hidden": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>relation</th>\n",
" <th>display_relation</th>\n",
" <th>x_id</th>\n",
" <th>x_type</th>\n",
" <th>x_name</th>\n",
" <th>x_source</th>\n",
" <th>y_id</th>\n",
" <th>y_type</th>\n",
" <th>y_name</th>\n",
" <th>y_source</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>bioprocess_bioprocess</td>\n",
" <td>parent-child</td>\n",
" <td>51581</td>\n",
" <td>biological_process</td>\n",
" <td>negative regulation of neurotransmitter uptake</td>\n",
" <td>GO</td>\n",
" <td>51612</td>\n",
" <td>biological_process</td>\n",
" <td>negative regulation of serotonin uptake</td>\n",
" <td>GO</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" relation display_relation x_id x_type \\\n",
"0 bioprocess_bioprocess parent-child 51581 biological_process \n",
"\n",
" x_name x_source y_id \\\n",
"0 negative regulation of neurotransmitter uptake GO 51612 \n",
"\n",
" y_type y_name y_source \n",
"0 biological_process negative regulation of serotonin uptake GO "
]
},
"execution_count": 151,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# GO edges whose two endpoints are both biological_process terms.\n",
"bp = df_go_terms[df_go_terms['go_term_type'] == 'biological_process']\n",
"df_bp_bp = (\n",
"    df_go_edges\n",
"    # inner joins: an edge survives only if both x and y are found in bp\n",
"    .merge(bp, how='inner', left_on='x', right_on='go_term_id')\n",
"    .rename(columns={'go_term_id': 'x_id', 'go_term_name': 'x_name', 'go_term_type': 'x_type'})\n",
"    .merge(bp, how='inner', left_on='y', right_on='go_term_id')\n",
"    .rename(columns={'go_term_id': 'y_id', 'go_term_name': 'y_name', 'go_term_type': 'y_type'})\n",
"    .assign(\n",
"        relation='bioprocess_bioprocess',\n",
"        x_source='GO',\n",
"        y_source='GO',\n",
"        display_relation='parent-child',\n",
"    )\n",
"    .pipe(clean_edges)\n",
")\n",
"df_bp_bp.head(1)"
]
},
{
"cell_type": "code",
"execution_count": 152,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:18:20.988390Z",
"start_time": "2021-08-06T12:18:20.831955Z"
},
"hidden": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>relation</th>\n",
" <th>display_relation</th>\n",
" <th>x_id</th>\n",
" <th>x_type</th>\n",
" <th>x_name</th>\n",
" <th>x_source</th>\n",
" <th>y_id</th>\n",
" <th>y_type</th>\n",
" <th>y_name</th>\n",
" <th>y_source</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>molfunc_molfunc</td>\n",
" <td>parent-child</td>\n",
" <td>8168</td>\n",
" <td>molecular_function</td>\n",
" <td>methyltransferase activity</td>\n",
" <td>GO</td>\n",
" <td>102130</td>\n",
" <td>molecular_function</td>\n",
" <td>malonyl-CoA methyltransferase activity</td>\n",
" <td>GO</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" relation display_relation x_id x_type \\\n",
"0 molfunc_molfunc parent-child 8168 molecular_function \n",
"\n",
" x_name x_source y_id y_type \\\n",
"0 methyltransferase activity GO 102130 molecular_function \n",
"\n",
" y_name y_source \n",
"0 malonyl-CoA methyltransferase activity GO "
]
},
"execution_count": 152,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# GO edges whose two endpoints are both molecular_function terms.\n",
"mf = df_go_terms[df_go_terms['go_term_type'] == 'molecular_function']\n",
"df_mf_mf = (\n",
"    df_go_edges\n",
"    # inner joins: an edge survives only if both x and y are found in mf\n",
"    .merge(mf, how='inner', left_on='x', right_on='go_term_id')\n",
"    .rename(columns={'go_term_id': 'x_id', 'go_term_name': 'x_name', 'go_term_type': 'x_type'})\n",
"    .merge(mf, how='inner', left_on='y', right_on='go_term_id')\n",
"    .rename(columns={'go_term_id': 'y_id', 'go_term_name': 'y_name', 'go_term_type': 'y_type'})\n",
"    .assign(\n",
"        relation='molfunc_molfunc',\n",
"        display_relation='parent-child',\n",
"        x_source='GO',\n",
"        y_source='GO',\n",
"    )\n",
"    .pipe(clean_edges)\n",
")\n",
"df_mf_mf.head(1)"
]
},
{
"cell_type": "code",
"execution_count": 153,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:18:21.149592Z",
"start_time": "2021-08-06T12:18:20.996132Z"
},
"hidden": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>relation</th>\n",
" <th>display_relation</th>\n",
" <th>x_id</th>\n",
" <th>x_type</th>\n",
" <th>x_name</th>\n",
" <th>x_source</th>\n",
" <th>y_id</th>\n",
" <th>y_type</th>\n",
" <th>y_name</th>\n",
" <th>y_source</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>cellcomp_cellcomp</td>\n",
" <td>parent-child</td>\n",
" <td>110165</td>\n",
" <td>cellular_component</td>\n",
" <td>cellular anatomical entity</td>\n",
" <td>GO</td>\n",
" <td>90553</td>\n",
" <td>cellular_component</td>\n",
" <td>unicellular trichome tip</td>\n",
" <td>GO</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" relation display_relation x_id x_type \\\n",
"0 cellcomp_cellcomp parent-child 110165 cellular_component \n",
"\n",
" x_name x_source y_id y_type \\\n",
"0 cellular anatomical entity GO 90553 cellular_component \n",
"\n",
" y_name y_source \n",
"0 unicellular trichome tip GO "
]
},
"execution_count": 153,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# GO edges whose two endpoints are both cellular_component terms.\n",
"cc = df_go_terms[df_go_terms['go_term_type'] == 'cellular_component']\n",
"df_cc_cc = (\n",
"    df_go_edges\n",
"    # inner joins: an edge survives only if both x and y are found in cc\n",
"    .merge(cc, how='inner', left_on='x', right_on='go_term_id')\n",
"    .rename(columns={'go_term_id': 'x_id', 'go_term_name': 'x_name', 'go_term_type': 'x_type'})\n",
"    .merge(cc, how='inner', left_on='y', right_on='go_term_id')\n",
"    .rename(columns={'go_term_id': 'y_id', 'go_term_name': 'y_name', 'go_term_type': 'y_type'})\n",
"    .assign(\n",
"        relation='cellcomp_cellcomp',\n",
"        display_relation='parent-child',\n",
"        x_source='GO',\n",
"        y_source='GO',\n",
"    )\n",
"    .pipe(clean_edges)\n",
")\n",
"df_cc_cc.head(1)"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"### GO protein interactions (Gene2GO)"
]
},
{
"cell_type": "code",
"execution_count": 154,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:18:22.059413Z",
"start_time": "2021-08-06T12:18:21.156638Z"
},
"hidden": true
},
"outputs": [],
"source": [
"# Gene -> GO term table shared by the three protein/GO cells that follow.\n",
"df_prot_path = (\n",
"    pd.merge(df_gene2go, df_go_terms, how='inner', on='go_term_id')\n",
"    # pandas suffixes the left frame's column with _x on a name clash;\n",
"    # presumably both inputs carry go_term_type -- verify against the loaders\n",
"    .rename(columns={'go_term_type_x': 'go_term_type'})\n",
"    .merge(df_prot_names, how='left', left_on='ncbi_gene_id', right_on='ncbi_id')\n",
"    .rename(columns={\n",
"        'ncbi_gene_id': 'x_id',\n",
"        'symbol': 'x_name',\n",
"        'go_term_id': 'y_id',\n",
"        'go_term_name': 'y_name',\n",
"        'go_term_type': 'y_type',\n",
"    })\n",
"    .assign(\n",
"        x_type='gene/protein',\n",
"        x_source='NCBI',\n",
"        y_source='GO',\n",
"    )\n",
"    .get(['x_id','x_type', 'x_name', 'x_source','y_id','y_type', 'y_name', 'y_source'])\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 155,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:18:22.492514Z",
"start_time": "2021-08-06T12:18:22.065000Z"
},
"hidden": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>relation</th>\n",
" <th>display_relation</th>\n",
" <th>x_id</th>\n",
" <th>x_type</th>\n",
" <th>x_name</th>\n",
" <th>x_source</th>\n",
" <th>y_id</th>\n",
" <th>y_type</th>\n",
" <th>y_name</th>\n",
" <th>y_source</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>protein_molfunc</td>\n",
" <td>interacts with</td>\n",
" <td>2</td>\n",
" <td>gene/protein</td>\n",
" <td>A2M</td>\n",
" <td>NCBI</td>\n",
" <td>19966</td>\n",
" <td>molecular_function</td>\n",
" <td>interleukin-1 binding</td>\n",
" <td>GO</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" relation display_relation x_id x_type x_name x_source y_id \\\n",
"0 protein_molfunc interacts with 2 gene/protein A2M NCBI 19966 \n",
"\n",
" y_type y_name y_source \n",
"0 molecular_function interleukin-1 binding GO "
]
},
"execution_count": 155,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Protein -> molecular function edges; assign() returns a new frame, so df_prot_path is untouched.\n",
"df_prot_mf = clean_edges(\n",
"    df_prot_path[df_prot_path['y_type'] == 'molecular_function'].assign(\n",
"        relation='protein_molfunc',\n",
"        display_relation='interacts with',\n",
"    )\n",
")\n",
"df_prot_mf.head(1)"
]
},
{
"cell_type": "code",
"execution_count": 156,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:18:22.897652Z",
"start_time": "2021-08-06T12:18:22.499167Z"
},
"hidden": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>relation</th>\n",
" <th>display_relation</th>\n",
" <th>x_id</th>\n",
" <th>x_type</th>\n",
" <th>x_name</th>\n",
" <th>x_source</th>\n",
" <th>y_id</th>\n",
" <th>y_type</th>\n",
" <th>y_name</th>\n",
" <th>y_source</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>214459</th>\n",
" <td>protein_cellcomp</td>\n",
" <td>interacts with</td>\n",
" <td>1</td>\n",
" <td>gene/protein</td>\n",
" <td>A1BG</td>\n",
" <td>NCBI</td>\n",
" <td>1904813</td>\n",
" <td>cellular_component</td>\n",
" <td>ficolin-1-rich granule lumen</td>\n",
" <td>GO</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" relation display_relation x_id x_type x_name x_source \\\n",
"214459 protein_cellcomp interacts with 1 gene/protein A1BG NCBI \n",
"\n",
" y_id y_type y_name y_source \n",
"214459 1904813 cellular_component ficolin-1-rich granule lumen GO "
]
},
"execution_count": 156,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Protein -> cellular component edges; assign() returns a new frame, so df_prot_path is untouched.\n",
"df_prot_cc = clean_edges(\n",
"    df_prot_path[df_prot_path['y_type'] == 'cellular_component'].assign(\n",
"        relation='protein_cellcomp',\n",
"        display_relation='interacts with',\n",
"    )\n",
")\n",
"df_prot_cc.head(1)"
]
},
{
"cell_type": "code",
"execution_count": 157,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:18:23.584170Z",
"start_time": "2021-08-06T12:18:22.904825Z"
},
"hidden": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>relation</th>\n",
" <th>display_relation</th>\n",
" <th>x_id</th>\n",
" <th>x_type</th>\n",
" <th>x_name</th>\n",
" <th>x_source</th>\n",
" <th>y_id</th>\n",
" <th>y_type</th>\n",
" <th>y_name</th>\n",
" <th>y_source</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>69588</th>\n",
" <td>protein_bioprocess</td>\n",
" <td>interacts with</td>\n",
" <td>1</td>\n",
" <td>gene/protein</td>\n",
" <td>A1BG</td>\n",
" <td>NCBI</td>\n",
" <td>43312</td>\n",
" <td>biological_process</td>\n",
" <td>neutrophil degranulation</td>\n",
" <td>GO</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" relation display_relation x_id x_type x_name x_source \\\n",
"69588 protein_bioprocess interacts with 1 gene/protein A1BG NCBI \n",
"\n",
" y_id y_type y_name y_source \n",
"69588 43312 biological_process neutrophil degranulation GO "
]
},
"execution_count": 157,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Protein -> biological process edges; assign() returns a new frame, so df_prot_path is untouched.\n",
"df_prot_bp = clean_edges(\n",
"    df_prot_path[df_prot_path['y_type'] == 'biological_process'].assign(\n",
"        relation='protein_bioprocess',\n",
"        display_relation='interacts with',\n",
"    )\n",
")\n",
"df_prot_bp.head(1)"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"## Exposure"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"### Exposure protein interactions (CTD)"
]
},
{
"cell_type": "code",
"execution_count": 158,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:18:42.557753Z",
"start_time": "2021-08-06T12:18:23.586715Z"
},
"hidden": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>relation</th>\n",
" <th>display_relation</th>\n",
" <th>x_id</th>\n",
" <th>x_type</th>\n",
" <th>x_name</th>\n",
" <th>x_source</th>\n",
" <th>y_id</th>\n",
" <th>y_type</th>\n",
" <th>y_name</th>\n",
" <th>y_source</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>exposure_protein</td>\n",
" <td>interacts with</td>\n",
" <td>C092102</td>\n",
" <td>exposure</td>\n",
" <td>1-hydroxyphenanthrene</td>\n",
" <td>CTD</td>\n",
" <td>1401</td>\n",
" <td>gene/protein</td>\n",
" <td>CRP</td>\n",
" <td>NCBI</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" relation display_relation x_id x_type \\\n",
"0 exposure_protein interacts with C092102 exposure \n",
"\n",
" x_name x_source y_id y_type y_name y_source \n",
"0 1-hydroxyphenanthrene CTD 1401 gene/protein CRP NCBI "
]
},
"execution_count": 158,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Exposure -> gene/protein edges (CTD).\n",
"df_exp_prot = df_exposures.get(['exposurestressorname', 'exposurestressorid','exposuremarker', 'exposuremarkerid'])\n",
"df_exp_prot = df_exp_prot.dropna(subset=['exposuremarkerid'])\n",
"\n",
"# Keep only markers whose id is all digits; those ids are joined to df_prot_names.ncbi_id below.\n",
"# The vectorised .str.isnumeric() applies the same per-value test as the former\n",
"# row-by-row iterrows() loop, without the Python-level iteration.\n",
"df_exp_prot = df_exp_prot[df_exp_prot['exposuremarkerid'].str.isnumeric()]\n",
"\n",
"# the int -> str round trip rewrites each id as a plain integer string (e.g. no leading zeros)\n",
"df_exp_prot = df_exp_prot.astype({'exposuremarkerid': 'int'}).astype({'exposuremarkerid': 'str'})\n",
"df_exp_prot = pd.merge(df_exp_prot, df_prot_names, 'left', left_on='exposuremarkerid', right_on='ncbi_id')\n",
"\n",
"df_exp_prot = df_exp_prot.rename(columns={'exposurestressorid':'x_id', 'exposurestressorname':'x_name', 'ncbi_id':'y_id', 'symbol':'y_name'})\n",
"df_exp_prot['x_type'] = 'exposure'\n",
"df_exp_prot['x_source'] = 'CTD'\n",
"df_exp_prot['y_type'] = 'gene/protein'\n",
"df_exp_prot['y_source'] = 'NCBI'\n",
"df_exp_prot['relation'] = 'exposure_protein'\n",
"df_exp_prot['display_relation'] = 'interacts with'\n",
"df_exp_prot = clean_edges(df_exp_prot)\n",
"df_exp_prot.head(1)"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"### Exposure disease interactions (CTD)"
]
},
{
"cell_type": "code",
"execution_count": 159,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:18:43.131769Z",
"start_time": "2021-08-06T12:18:42.744281Z"
},
"hidden": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>relation</th>\n",
" <th>display_relation</th>\n",
" <th>x_id</th>\n",
" <th>x_type</th>\n",
" <th>x_name</th>\n",
" <th>x_source</th>\n",
" <th>y_id</th>\n",
" <th>y_type</th>\n",
" <th>y_name</th>\n",
" <th>y_source</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>exposure_disease</td>\n",
" <td>linked to</td>\n",
" <td>C024566</td>\n",
" <td>exposure</td>\n",
" <td>1,1,1-trichloroethane</td>\n",
" <td>CTD</td>\n",
" <td>4976</td>\n",
" <td>disease</td>\n",
" <td>amyotrophic lateral sclerosis</td>\n",
" <td>MONDO</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" relation display_relation x_id x_type \\\n",
"0 exposure_disease linked to C024566 exposure \n",
"\n",
" x_name x_source y_id y_type \\\n",
"0 1,1,1-trichloroethane CTD 4976 disease \n",
"\n",
" y_name y_source \n",
"0 amyotrophic lateral sclerosis MONDO "
]
},
"execution_count": 159,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Exposure -> disease edges (CTD): disease ids are mapped to MONDO through the MESH cross-references.\n",
"df_exp_dis = df_exposures.get(['exposurestressorname', 'exposurestressorid','diseasename', 'diseaseid'])\n",
"# dropna(subset=...) removes rows without a disease id directly, instead of re-selecting\n",
"# by index label (which would multiply rows if index labels repeat)\n",
"df_exp_dis = df_exp_dis.dropna(subset=['diseaseid'])\n",
"df_exp_dis = pd.merge(df_exp_dis, df_mondo_xref.query('ontology==\"MESH\"'), 'left', left_on='diseaseid', right_on='ontology_id')\n",
"df_exp_dis = pd.merge(df_exp_dis, df_mondo_terms, 'left', left_on='mondo_id', right_on= 'id')\n",
"\n",
"df_exp_dis = df_exp_dis.rename(columns={'exposurestressorid':'x_id', 'exposurestressorname':'x_name', 'mondo_id':'y_id', 'name':'y_name'})\n",
"df_exp_dis['x_type'] = 'exposure'\n",
"df_exp_dis['x_source'] = 'CTD'\n",
"df_exp_dis['y_type'] = 'disease'\n",
"df_exp_dis['y_source'] = 'MONDO'\n",
"df_exp_dis['relation'] = 'exposure_disease'\n",
"df_exp_dis['display_relation'] = 'linked to'\n",
"# exclude diseases whose MONDO id is in mondo_r_hp_ids (the HPO-mapped ids)\n",
"df_exp_dis = df_exp_dis.query('y_id not in @mondo_r_hp_ids') # Michelle added\n",
"df_exp_dis = clean_edges(df_exp_dis)\n",
"df_exp_dis.head(1)"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"### Exposure exposure interactions (CTD)"
]
},
{
"cell_type": "code",
"execution_count": 160,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:18:43.918651Z",
"start_time": "2021-08-06T12:18:43.138177Z"
},
"hidden": true
},
"outputs": [],
"source": [
"# Exposure -> exposure edges (CTD): rows whose exposure marker id is itself one of\n",
"# the exposure stressor ids.\n",
"exposures = np.unique(df_exposures['exposurestressorid'].values)\n",
"df_exp_exp = df_exposures[df_exposures['exposuremarkerid'].isin(exposures)]\n",
"df_exp_exp = df_exp_exp[['exposurestressorname', 'exposurestressorid', 'exposuremarker', 'exposuremarkerid']]\n",
"df_exp_exp = df_exp_exp.loc[df_exp_exp['exposuremarkerid'].dropna().index]\n",
"df_exp_exp = (\n",
"    df_exp_exp\n",
"    .drop_duplicates()\n",
"    .rename(columns={'exposurestressorid': 'x_id', 'exposurestressorname': 'x_name',\n",
"                     'exposuremarker': 'y_name', 'exposuremarkerid': 'y_id'})\n",
"    .assign(x_type='exposure', x_source='CTD', y_type='exposure', y_source='CTD',\n",
"            relation='exposure_exposure', display_relation='parent-child')\n",
")\n",
"df_exp_exp = clean_edges(df_exp_exp)"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"### Exposure pathway interactions (CTD)"
]
},
{
"cell_type": "code",
"execution_count": 161,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:18:44.045651Z",
"start_time": "2021-08-06T12:18:43.924387Z"
},
"hidden": true
},
"outputs": [],
"source": [
"# CTD labels these columns 'phenotype', but the ids are resolved against the GO\n",
"# term table below, so the resulting edges point at GO terms.\n",
"df_exp_path = df_exposures[['exposurestressorname', 'exposurestressorid', 'phenotypename', 'phenotypeid']]\n",
"df_exp_path = df_exp_path.loc[df_exp_path['phenotypeid'].dropna().index]\n",
"\n",
"# Keep only the part after the ':' and strip leading zeros via the int round-trip.\n",
"go_numbers = [str(int(raw_id.split(':')[1])) for raw_id in df_exp_path['phenotypeid']]\n",
"\n",
"df_exp_path = (\n",
"    df_exp_path\n",
"    .assign(phenotypeid=go_numbers)\n",
"    .drop_duplicates()\n",
"    .merge(df_go_terms, how='inner', left_on='phenotypeid', right_on='go_term_id')\n",
"    .rename(columns={'exposurestressorid': 'x_id', 'exposurestressorname': 'x_name',\n",
"                     'go_term_id': 'y_id', 'go_term_name': 'y_name', 'go_term_type': 'y_type'})\n",
"    .assign(x_type='exposure', x_source='CTD', y_source='GO')\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 162,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:18:44.139834Z",
"start_time": "2021-08-06T12:18:44.048929Z"
},
"hidden": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>relation</th>\n",
" <th>display_relation</th>\n",
" <th>x_id</th>\n",
" <th>x_type</th>\n",
" <th>x_name</th>\n",
" <th>x_source</th>\n",
" <th>y_id</th>\n",
" <th>y_type</th>\n",
" <th>y_name</th>\n",
" <th>y_source</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>exposure_bioprocess</td>\n",
" <td>interacts with</td>\n",
" <td>C046839</td>\n",
" <td>exposure</td>\n",
" <td>1,2,3,4,6,7,8-heptachlorodibenzodioxin</td>\n",
" <td>CTD</td>\n",
" <td>8217</td>\n",
" <td>biological_process</td>\n",
" <td>regulation of blood pressure</td>\n",
" <td>GO</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" relation display_relation x_id x_type \\\n",
"0 exposure_bioprocess interacts with C046839 exposure \n",
"\n",
" x_name x_source y_id y_type \\\n",
"0 1,2,3,4,6,7,8-heptachlorodibenzodioxin CTD 8217 biological_process \n",
"\n",
" y_name y_source \n",
"0 regulation of blood pressure GO "
]
},
"execution_count": 162,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Exposure -> GO biological process edges.\n",
"is_bioprocess = df_exp_path['y_type'] == 'biological_process'\n",
"df_exp_bp = clean_edges(\n",
"    df_exp_path[is_bioprocess].assign(relation='exposure_bioprocess', display_relation='interacts with')\n",
")\n",
"df_exp_bp.head(1)"
]
},
{
"cell_type": "code",
"execution_count": 163,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:18:44.221350Z",
"start_time": "2021-08-06T12:18:44.143253Z"
},
"hidden": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>relation</th>\n",
" <th>display_relation</th>\n",
" <th>x_id</th>\n",
" <th>x_type</th>\n",
" <th>x_name</th>\n",
" <th>x_source</th>\n",
" <th>y_id</th>\n",
" <th>y_type</th>\n",
" <th>y_name</th>\n",
" <th>y_source</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>527</th>\n",
" <td>exposure_molfunc</td>\n",
" <td>interacts with</td>\n",
" <td>C014024</td>\n",
" <td>exposure</td>\n",
" <td>2,4,5,2',4',5'-hexachlorobiphenyl</td>\n",
" <td>CTD</td>\n",
" <td>19766</td>\n",
" <td>molecular_function</td>\n",
" <td>IgA receptor activity</td>\n",
" <td>GO</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" relation display_relation x_id x_type \\\n",
"527 exposure_molfunc interacts with C014024 exposure \n",
"\n",
" x_name x_source y_id y_type \\\n",
"527 2,4,5,2',4',5'-hexachlorobiphenyl CTD 19766 molecular_function \n",
"\n",
" y_name y_source \n",
"527 IgA receptor activity GO "
]
},
"execution_count": 163,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Exposure -> GO molecular function edges.\n",
"is_molfunc = df_exp_path['y_type'] == 'molecular_function'\n",
"df_exp_mf = clean_edges(\n",
"    df_exp_path[is_molfunc].assign(relation='exposure_molfunc', display_relation='interacts with')\n",
")\n",
"df_exp_mf.head(1)"
]
},
{
"cell_type": "code",
"execution_count": 164,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:18:44.291345Z",
"start_time": "2021-08-06T12:18:44.223480Z"
},
"hidden": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>relation</th>\n",
" <th>display_relation</th>\n",
" <th>x_id</th>\n",
" <th>x_type</th>\n",
" <th>x_name</th>\n",
" <th>x_source</th>\n",
" <th>y_id</th>\n",
" <th>y_type</th>\n",
" <th>y_name</th>\n",
" <th>y_source</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>833</th>\n",
" <td>exposure_cellcomp</td>\n",
" <td>interacts with</td>\n",
" <td>D000393</td>\n",
" <td>exposure</td>\n",
" <td>Air Pollutants</td>\n",
" <td>CTD</td>\n",
" <td>71743</td>\n",
" <td>cellular_component</td>\n",
" <td>IgE immunoglobulin complex, circulating</td>\n",
" <td>GO</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" relation display_relation x_id x_type x_name \\\n",
"833 exposure_cellcomp interacts with D000393 exposure Air Pollutants \n",
"\n",
" x_source y_id y_type \\\n",
"833 CTD 71743 cellular_component \n",
"\n",
" y_name y_source \n",
"833 IgE immunoglobulin complex, circulating GO "
]
},
"execution_count": 164,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Exposure -> GO cellular component edges.\n",
"is_cellcomp = df_exp_path['y_type'] == 'cellular_component'\n",
"df_exp_cc = clean_edges(\n",
"    df_exp_path[is_cellcomp].assign(relation='exposure_cellcomp', display_relation='interacts with')\n",
")\n",
"df_exp_cc.head(1)"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"## Anatomy"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"### Anatomy anatomy interactions (UBERON)"
]
},
{
"cell_type": "code",
"execution_count": 165,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:18:44.470250Z",
"start_time": "2021-08-06T12:18:44.294967Z"
},
"hidden": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>relation</th>\n",
" <th>display_relation</th>\n",
" <th>x_id</th>\n",
" <th>x_type</th>\n",
" <th>x_name</th>\n",
" <th>x_source</th>\n",
" <th>y_id</th>\n",
" <th>y_type</th>\n",
" <th>y_name</th>\n",
" <th>y_source</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>anatomy_anatomy</td>\n",
" <td>parent-child</td>\n",
" <td>2</td>\n",
" <td>anatomy</td>\n",
" <td>uterine cervix</td>\n",
" <td>UBERON</td>\n",
" <td>5156</td>\n",
" <td>anatomy</td>\n",
" <td>reproductive structure</td>\n",
" <td>UBERON</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" relation display_relation x_id x_type x_name x_source \\\n",
"0 anatomy_anatomy parent-child 2 anatomy uterine cervix UBERON \n",
"\n",
" y_id y_type y_name y_source \n",
"0 5156 anatomy reproductive structure UBERON "
]
},
"execution_count": 165,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Anatomy -> anatomy edges (UBERON): x is the term in the `id` column of the is_a\n",
"# table, y is the term referenced by its `is_a` column. Names come from the term table.\n",
"df_ana_ana = (\n",
"    df_uberon_is_a\n",
"    .merge(df_uberon_terms, how='left', left_on='id', right_on='id')\n",
"    .rename(columns={'id': 'x_id', 'name': 'x_name'})\n",
"    .merge(df_uberon_terms, how='left', left_on='is_a', right_on='id')\n",
"    .rename(columns={'id': 'y_id', 'name': 'y_name'})\n",
"    .assign(x_type='anatomy', x_source='UBERON', y_type='anatomy', y_source='UBERON',\n",
"            relation='anatomy_anatomy', display_relation='parent-child')\n",
")\n",
"df_ana_ana = clean_edges(df_ana_ana)\n",
"df_ana_ana.head(1)"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"### Anatomy protein interactions (BGEE)"
]
},
{
"cell_type": "code",
"execution_count": 166,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:18:46.577643Z",
"start_time": "2021-08-06T12:18:44.475187Z"
},
"hidden": true
},
"outputs": [],
"source": [
"# Attach NCBI ids to the BGEE rows by gene symbol and label both endpoints.\n",
"# NOTE: df_bgee is rebound to its own merge result, so this cell is not idempotent;\n",
"# reload df_bgee before running it a second time.\n",
"df_bgee = (\n",
"    df_bgee\n",
"    .merge(df_prot_names, how='inner', left_on='gene_name', right_on='symbol')\n",
"    .rename(columns={'ncbi_id': 'x_id', 'symbol': 'x_name',\n",
"                     'anatomy_id': 'y_id', 'anatomy_name': 'y_name'})\n",
"    .assign(x_source='NCBI', x_type='gene/protein', y_source='UBERON', y_type='anatomy')\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 167,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:18:50.843665Z",
"start_time": "2021-08-06T12:18:46.579406Z"
},
"hidden": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>relation</th>\n",
" <th>display_relation</th>\n",
" <th>x_id</th>\n",
" <th>x_type</th>\n",
" <th>x_name</th>\n",
" <th>x_source</th>\n",
" <th>y_id</th>\n",
" <th>y_type</th>\n",
" <th>y_name</th>\n",
" <th>y_source</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>protein_present_anatomy</td>\n",
" <td>expression present</td>\n",
" <td>7105</td>\n",
" <td>gene/protein</td>\n",
" <td>TSPAN6</td>\n",
" <td>NCBI</td>\n",
" <td>2</td>\n",
" <td>anatomy</td>\n",
" <td>uterine cervix</td>\n",
" <td>UBERON</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" relation display_relation x_id x_type x_name \\\n",
"0 protein_present_anatomy expression present 7105 gene/protein TSPAN6 \n",
"\n",
" x_source y_id y_type y_name y_source \n",
"0 NCBI 2 anatomy uterine cervix UBERON "
]
},
"execution_count": 167,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Gene/protein -> anatomy edges for BGEE rows marked as expression \"present\".\n",
"is_present = df_bgee['expression'] == 'present'\n",
"df_ana_prot_pos = clean_edges(\n",
"    df_bgee[is_present].assign(relation='protein_present_anatomy', display_relation='expression present')\n",
")\n",
"df_ana_prot_pos.head(1)"
]
},
{
"cell_type": "code",
"execution_count": 168,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:18:51.146611Z",
"start_time": "2021-08-06T12:18:50.858329Z"
},
"hidden": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>relation</th>\n",
" <th>display_relation</th>\n",
" <th>x_id</th>\n",
" <th>x_type</th>\n",
" <th>x_name</th>\n",
" <th>x_source</th>\n",
" <th>y_id</th>\n",
" <th>y_type</th>\n",
" <th>y_name</th>\n",
" <th>y_source</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>507</th>\n",
" <td>protein_absent_anatomy</td>\n",
" <td>expression absent</td>\n",
" <td>2268</td>\n",
" <td>gene/protein</td>\n",
" <td>FGR</td>\n",
" <td>NCBI</td>\n",
" <td>1476</td>\n",
" <td>anatomy</td>\n",
" <td>deltoid</td>\n",
" <td>UBERON</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" relation display_relation x_id x_type x_name \\\n",
"507 protein_absent_anatomy expression absent 2268 gene/protein FGR \n",
"\n",
" x_source y_id y_type y_name y_source \n",
"507 NCBI 1476 anatomy deltoid UBERON "
]
},
"execution_count": 168,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Gene/protein -> anatomy edges for BGEE rows marked as expression \"absent\".\n",
"is_absent = df_bgee['expression'] == 'absent'\n",
"df_ana_prot_neg = clean_edges(\n",
"    df_bgee[is_absent].assign(relation='protein_absent_anatomy', display_relation='expression absent')\n",
")\n",
"df_ana_prot_neg.head(1)"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"## Pathways"
]
},
{
"cell_type": "code",
"execution_count": 169,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:18:51.324577Z",
"start_time": "2021-08-06T12:18:51.152492Z"
},
"hidden": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>relation</th>\n",
" <th>display_relation</th>\n",
" <th>x_id</th>\n",
" <th>x_type</th>\n",
" <th>x_name</th>\n",
" <th>x_source</th>\n",
" <th>y_id</th>\n",
" <th>y_type</th>\n",
" <th>y_name</th>\n",
" <th>y_source</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>pathway_pathway</td>\n",
" <td>parent-child</td>\n",
" <td>R-HSA-109581</td>\n",
" <td>pathway</td>\n",
" <td>Apoptosis</td>\n",
" <td>REACTOME</td>\n",
" <td>R-HSA-109606</td>\n",
" <td>pathway</td>\n",
" <td>Intrinsic Pathway for Apoptosis</td>\n",
" <td>REACTOME</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" relation display_relation x_id x_type x_name \\\n",
"0 pathway_pathway parent-child R-HSA-109581 pathway Apoptosis \n",
"\n",
" x_source y_id y_type y_name y_source \n",
"0 REACTOME R-HSA-109606 pathway Intrinsic Pathway for Apoptosis REACTOME "
]
},
"execution_count": 169,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Pathway -> pathway edges (REACTOME): resolve both pathway ids of every relation\n",
"# to a name. The inner joins drop relations whose ids are not in df_reactome_terms.\n",
"df_path_path = (\n",
"    df_reactome_rels\n",
"    .merge(df_reactome_terms, how='inner', left_on='reactome_id_1', right_on='reactome_id')\n",
"    .rename(columns={'reactome_id': 'x_id', 'reactome_name': 'x_name'})\n",
"    .merge(df_reactome_terms, how='inner', left_on='reactome_id_2', right_on='reactome_id')\n",
"    .rename(columns={'reactome_id': 'y_id', 'reactome_name': 'y_name'})\n",
"    .assign(x_source='REACTOME', x_type='pathway', y_source='REACTOME', y_type='pathway',\n",
"            relation='pathway_pathway', display_relation='parent-child')\n",
")\n",
"df_path_path = clean_edges(df_path_path)\n",
"df_path_path.head(1)"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"### Pathway protein interactions"
]
},
{
"cell_type": "code",
"execution_count": 170,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:18:51.616244Z",
"start_time": "2021-08-06T12:18:51.328730Z"
},
"hidden": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>relation</th>\n",
" <th>display_relation</th>\n",
" <th>x_id</th>\n",
" <th>x_type</th>\n",
" <th>x_name</th>\n",
" <th>x_source</th>\n",
" <th>y_id</th>\n",
" <th>y_type</th>\n",
" <th>y_name</th>\n",
" <th>y_source</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>protein_pathway</td>\n",
" <td>interacts with</td>\n",
" <td>1</td>\n",
" <td>gene/protein</td>\n",
" <td>A1BG</td>\n",
" <td>NCBI</td>\n",
" <td>R-HSA-114608</td>\n",
" <td>pathway</td>\n",
" <td>Platelet degranulation</td>\n",
" <td>REACTOME</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" relation display_relation x_id x_type x_name x_source \\\n",
"0 protein_pathway interacts with 1 gene/protein A1BG NCBI \n",
"\n",
" y_id y_type y_name y_source \n",
"0 R-HSA-114608 pathway Platelet degranulation REACTOME "
]
},
"execution_count": 170,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Gene/protein -> pathway edges: join the Reactome/NCBI table to the protein name\n",
"# table on ncbi_id, keeping only ids present in both.\n",
"df_path_prot = (\n",
"    df_reactome_ncbi\n",
"    .merge(df_prot_names, how='inner', on='ncbi_id')\n",
"    .rename(columns={'ncbi_id': 'x_id', 'symbol': 'x_name',\n",
"                     'reactome_id': 'y_id', 'reactome_name': 'y_name'})\n",
"    .assign(x_source='NCBI', x_type='gene/protein', y_source='REACTOME', y_type='pathway',\n",
"            relation='protein_pathway', display_relation='interacts with')\n",
")\n",
"df_path_prot = clean_edges(df_path_prot)\n",
"df_path_prot.head(1)"
]
},
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true
},
"source": [
"# Compiling knowledge graph"
]
},
{
"cell_type": "code",
"execution_count": 171,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"df_prot_prot\n",
"['gene/protein']\n",
"['gene/protein']\n",
"['protein_protein']\n",
"['ppi']\n",
"\n",
"df_prot_drug\n",
"['drug']\n",
"['gene/protein']\n",
"['drug_protein']\n",
"['carrier' 'enzyme' 'target' 'transporter']\n",
"\n",
"df_drug_dis\n",
"['drug']\n",
"['disease']\n",
"['contraindication' 'indication' 'off-label use']\n",
"['contraindication' 'indication' 'off-label use']\n",
"\n",
"df_drug_drug\n",
"['drug']\n",
"['drug']\n",
"['drug_drug']\n",
"['synergistic interaction']\n",
"\n",
"df_phe_prot\n",
"['effect/phenotype']\n",
"['gene/protein']\n",
"['phenotype_protein']\n",
"['associated with']\n",
"\n",
"df_phe_phe\n",
"['effect/phenotype']\n",
"['effect/phenotype']\n",
"['phenotype_phenotype']\n",
"['parent-child']\n",
"\n",
"df_dis_phe_neg\n",
"['disease']\n",
"['effect/phenotype']\n",
"['disease_phenotype_negative']\n",
"['phenotype absent']\n",
"\n",
"df_dis_phe_pos\n",
"['disease']\n",
"['effect/phenotype']\n",
"['disease_phenotype_positive']\n",
"['phenotype present']\n",
"\n",
"df_dis_prot\n",
"['disease']\n",
"['gene/protein']\n",
"['disease_protein']\n",
"['associated with']\n",
"\n",
"df_dis_dis\n",
"['disease']\n",
"['disease']\n",
"['disease_disease']\n",
"['parent-child']\n",
"\n",
"df_drug_effect\n",
"['drug']\n",
"['effect/phenotype']\n",
"['drug_effect']\n",
"['side effect']\n",
"\n",
"df_bp_bp\n",
"['biological_process']\n",
"['biological_process']\n",
"['bioprocess_bioprocess']\n",
"['parent-child']\n",
"\n",
"df_mf_mf\n",
"['molecular_function']\n",
"['molecular_function']\n",
"['molfunc_molfunc']\n",
"['parent-child']\n",
"\n",
"df_cc_cc\n",
"['cellular_component']\n",
"['cellular_component']\n",
"['cellcomp_cellcomp']\n",
"['parent-child']\n",
"\n",
"df_prot_mf\n",
"['gene/protein']\n",
"['molecular_function']\n",
"['protein_molfunc']\n",
"['interacts with']\n",
"\n",
"df_prot_cc\n",
"['gene/protein']\n",
"['cellular_component']\n",
"['protein_cellcomp']\n",
"['interacts with']\n",
"\n",
"df_prot_bp\n",
"['gene/protein']\n",
"['biological_process']\n",
"['protein_bioprocess']\n",
"['interacts with']\n",
"\n",
"df_exp_prot\n",
"['exposure']\n",
"['gene/protein']\n",
"['exposure_protein']\n",
"['interacts with']\n",
"\n",
"df_exp_dis\n",
"['exposure']\n",
"['disease']\n",
"['exposure_disease']\n",
"['linked to']\n",
"\n",
"df_exp_exp\n",
"['exposure']\n",
"['exposure']\n",
"['exposure_exposure']\n",
"['parent-child']\n",
"\n",
"df_exp_bp\n",
"['exposure']\n",
"['biological_process']\n",
"['exposure_bioprocess']\n",
"['interacts with']\n",
"\n",
"df_exp_mf\n",
"['exposure']\n",
"['molecular_function']\n",
"['exposure_molfunc']\n",
"['interacts with']\n",
"\n",
"df_exp_cc\n",
"['exposure']\n",
"['cellular_component']\n",
"['exposure_cellcomp']\n",
"['interacts with']\n",
"\n",
"df_path_path\n",
"['pathway']\n",
"['pathway']\n",
"['pathway_pathway']\n",
"['parent-child']\n",
"\n",
"df_path_prot\n",
"['gene/protein']\n",
"['pathway']\n",
"['protein_pathway']\n",
"['interacts with']\n",
"\n",
"df_ana_ana\n",
"['anatomy']\n",
"['anatomy']\n",
"['anatomy_anatomy']\n",
"['parent-child']\n",
"\n",
"df_ana_prot_pos\n",
"['gene/protein']\n",
"['anatomy']\n",
"['protein_present_anatomy']\n",
"['expression present']\n",
"\n",
"df_ana_prot_neg\n",
"['gene/protein']\n",
"['anatomy']\n",
"['protein_absent_anatomy']\n",
"['expression absent']\n"
]
}
],
"source": [
"# Sanity check: for every edge table, print the distinct node types on each side\n",
"# and the distinct relation / display_relation labels it contributes.\n",
"# The mapping keeps the tables in reporting order (dicts preserve insertion order).\n",
"edge_frames_by_name = {\n",
"    'df_prot_prot': df_prot_prot,\n",
"    'df_prot_drug': df_prot_drug,\n",
"    'df_drug_dis': df_drug_dis,\n",
"    'df_drug_drug': df_drug_drug,\n",
"    'df_phe_prot': df_phe_prot,\n",
"    'df_phe_phe': df_phe_phe,\n",
"    'df_dis_phe_neg': df_dis_phe_neg,\n",
"    'df_dis_phe_pos': df_dis_phe_pos,\n",
"    'df_dis_prot': df_dis_prot,\n",
"    'df_dis_dis': df_dis_dis,\n",
"    'df_drug_effect': df_drug_effect,\n",
"    'df_bp_bp': df_bp_bp,\n",
"    'df_mf_mf': df_mf_mf,\n",
"    'df_cc_cc': df_cc_cc,\n",
"    'df_prot_mf': df_prot_mf,\n",
"    'df_prot_cc': df_prot_cc,\n",
"    'df_prot_bp': df_prot_bp,\n",
"    'df_exp_prot': df_exp_prot,\n",
"    'df_exp_dis': df_exp_dis,\n",
"    'df_exp_exp': df_exp_exp,\n",
"    'df_exp_bp': df_exp_bp,\n",
"    'df_exp_mf': df_exp_mf,\n",
"    'df_exp_cc': df_exp_cc,\n",
"    'df_path_path': df_path_path,\n",
"    'df_path_prot': df_path_prot,\n",
"    'df_ana_ana': df_ana_ana,\n",
"    'df_ana_prot_pos': df_ana_prot_pos,\n",
"    'df_ana_prot_neg': df_ana_prot_neg,\n",
"}\n",
"summary_columns = ['x_type', 'y_type', 'relation', 'display_relation']\n",
"\n",
"for position, (frame_label, edge_frame) in enumerate(edge_frames_by_name.items()):\n",
"    # Blank line between tables, but not before the first one.\n",
"    print(frame_label if position == 0 else '\\n' + frame_label)\n",
"    for summary_col in summary_columns:\n",
"        print(edge_frame[summary_col].unique())"
]
},
{
"cell_type": "code",
"execution_count": 172,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:19:32.597604Z",
"start_time": "2021-08-06T12:18:51.620711Z"
},
"hidden": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>relation</th>\n",
" <th>display_relation</th>\n",
" <th>x_id</th>\n",
" <th>x_type</th>\n",
" <th>x_name</th>\n",
" <th>x_source</th>\n",
" <th>y_id</th>\n",
" <th>y_type</th>\n",
" <th>y_name</th>\n",
" <th>y_source</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1539160</th>\n",
" <td>protein_absent_anatomy</td>\n",
" <td>expression absent</td>\n",
" <td>140</td>\n",
" <td>gene/protein</td>\n",
" <td>ADORA3</td>\n",
" <td>NCBI</td>\n",
" <td>4720</td>\n",
" <td>anatomy</td>\n",
" <td>cerebellar vermis</td>\n",
" <td>UBERON</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1539470</th>\n",
" <td>protein_absent_anatomy</td>\n",
" <td>expression absent</td>\n",
" <td>105378952</td>\n",
" <td>gene/protein</td>\n",
" <td>KLF18</td>\n",
" <td>NCBI</td>\n",
" <td>1377</td>\n",
" <td>anatomy</td>\n",
" <td>quadriceps femoris</td>\n",
" <td>UBERON</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1539471</th>\n",
" <td>protein_absent_anatomy</td>\n",
" <td>expression absent</td>\n",
" <td>105378952</td>\n",
" <td>gene/protein</td>\n",
" <td>KLF18</td>\n",
" <td>NCBI</td>\n",
" <td>1379</td>\n",
" <td>anatomy</td>\n",
" <td>vastus lateralis</td>\n",
" <td>UBERON</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1539472</th>\n",
" <td>protein_absent_anatomy</td>\n",
" <td>expression absent</td>\n",
" <td>105378952</td>\n",
" <td>gene/protein</td>\n",
" <td>KLF18</td>\n",
" <td>NCBI</td>\n",
" <td>2084</td>\n",
" <td>anatomy</td>\n",
" <td>heart left ventricle</td>\n",
" <td>UBERON</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1539473</th>\n",
" <td>protein_absent_anatomy</td>\n",
" <td>expression absent</td>\n",
" <td>105378952</td>\n",
" <td>gene/protein</td>\n",
" <td>KLF18</td>\n",
" <td>NCBI</td>\n",
" <td>5384</td>\n",
" <td>anatomy</td>\n",
" <td>nasal cavity epithelium</td>\n",
" <td>UBERON</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" relation display_relation x_id x_type \\\n",
"1539160 protein_absent_anatomy expression absent 140 gene/protein \n",
"1539470 protein_absent_anatomy expression absent 105378952 gene/protein \n",
"1539471 protein_absent_anatomy expression absent 105378952 gene/protein \n",
"1539472 protein_absent_anatomy expression absent 105378952 gene/protein \n",
"1539473 protein_absent_anatomy expression absent 105378952 gene/protein \n",
"\n",
" x_name x_source y_id y_type y_name y_source \n",
"1539160 ADORA3 NCBI 4720 anatomy cerebellar vermis UBERON \n",
"1539470 KLF18 NCBI 1377 anatomy quadriceps femoris UBERON \n",
"1539471 KLF18 NCBI 1379 anatomy vastus lateralis UBERON \n",
"1539472 KLF18 NCBI 2084 anatomy heart left ventricle UBERON \n",
"1539473 KLF18 NCBI 5384 anatomy nasal cavity epithelium UBERON "
]
},
"execution_count": 172,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Stack all per-relation edge tables into a single edge list and drop exact duplicate rows.\n",
"kg = pd.concat([df_prot_prot, df_prot_drug, df_drug_dis, df_drug_drug, df_phe_prot,\n",
" df_phe_phe, df_dis_phe_neg, df_dis_phe_pos, df_dis_prot, df_dis_dis, \n",
" df_drug_effect, df_bp_bp, df_mf_mf, df_cc_cc, df_prot_mf, \n",
" df_prot_cc, df_prot_bp, df_exp_prot, df_exp_dis, df_exp_exp, \n",
" df_exp_bp, df_exp_mf, df_exp_cc, df_path_path, df_path_prot,\n",
" df_ana_ana, df_ana_prot_pos, df_ana_prot_neg]) #28\n",
"kg = kg.drop_duplicates()\n",
"# The reverse-edge generation below is commented out, so no '_rev' relations are added in this cell.\n",
"#kg_rev = kg.copy().rename(columns={'x_id':'y_id','x_type':'y_type', 'x_name':'y_name', 'x_source':'y_source', 'y_id':'x_id','y_type':'x_type', 'y_name':'x_name', 'y_source':'x_source' }) #add reverse edges\n",
"#kg_rev['relation'] = kg_rev['relation'] + \"_rev\"\n",
"#print(len(kg), len(kg_rev))\n",
"\n",
"#kg = pd.concat([kg, kg_rev])\n",
"#kg = kg.drop_duplicates()\n",
"# Drop every edge that has a missing value in any column.\n",
"kg = kg.dropna()\n",
"# remove self loops from edges \n",
"# (an edge counts as a self loop only when id, type, source and name all match on both ends)\n",
"kg = kg.query('not ((x_id == y_id) and (x_type == y_type) and (x_source == y_source) and (x_name == y_name))')\n",
"kg.tail()"
]
},
{
"cell_type": "code",
"execution_count": 173,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['protein_protein' 'drug_protein' 'contraindication' 'indication'\n",
" 'off-label use' 'drug_drug' 'phenotype_protein' 'phenotype_phenotype'\n",
" 'disease_phenotype_negative' 'disease_phenotype_positive'\n",
" 'disease_protein' 'disease_disease' 'drug_effect' 'bioprocess_bioprocess'\n",
" 'molfunc_molfunc' 'cellcomp_cellcomp' 'protein_molfunc'\n",
" 'protein_cellcomp' 'protein_bioprocess' 'exposure_protein'\n",
" 'exposure_disease' 'exposure_exposure' 'exposure_bioprocess'\n",
" 'exposure_molfunc' 'exposure_cellcomp' 'pathway_pathway'\n",
" 'protein_pathway' 'anatomy_anatomy' 'protein_present_anatomy'\n",
" 'protein_absent_anatomy']\n",
"['ppi' 'carrier' 'enzyme' 'target' 'transporter' 'contraindication'\n",
" 'indication' 'off-label use' 'synergistic interaction' 'associated with'\n",
" 'parent-child' 'phenotype absent' 'phenotype present' 'side effect'\n",
" 'interacts with' 'linked to' 'expression present' 'expression absent']\n",
"5463048\n"
]
}
],
"source": [
"print(kg['relation'].unique())\n",
"print(kg['display_relation'].unique())\n",
"print(len(kg))"
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:20:04.693646Z",
"start_time": "2021-08-06T12:19:32.602234Z"
},
"hidden": true
},
"outputs": [],
"source": [
"kg.to_csv(save_path+'auxillary/kg_raw.csv', index=False) # No reverse edges"
]
},
{
"cell_type": "code",
"execution_count": 174,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Empty DataFrame\n",
"Columns: [relation, display_relation, x_id, x_type, x_name, x_source, y_id, y_type, y_name, y_source]\n",
"Index: []\n",
"Empty DataFrame\n",
"Columns: [relation, display_relation, x_id, x_type, x_name, x_source, y_id, y_type, y_name, y_source]\n",
"Index: []\n"
]
}
],
"source": [
"# Double check that none of the MONDO terms are still in HPO\n",
"print(kg.query('x_source == \"MONDO\" and x_id in @mondo_r_hp_ids'))\n",
"print(kg.query('y_source == \"MONDO\" and y_id in @mondo_r_hp_ids'))"
]
},
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true
},
"source": [
"# Get giant component"
]
},
{
"cell_type": "code",
"execution_count": 121,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:20:20.238147Z",
"start_time": "2021-08-06T12:20:04.696534Z"
},
"hidden": true
},
"outputs": [],
"source": [
"# NOTE(review): an earlier cell in this notebook writes 'auxillary/kg_raw.csv', but this cell loads\n",
"# 'auxillary/kg_raw_orphanet.csv'. Presumably that file is produced by a separate Orphanet step -- confirm\n",
"# where it comes from before relying on Restart & Run All.\n",
"kg = pd.read_csv(save_path+'auxillary/kg_raw_orphanet.csv', low_memory=False)"
]
},
{
"cell_type": "code",
"execution_count": 122,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" relation display_relation x_id \\\n",
"0 protein_protein ppi 9796 \n",
"1 protein_protein ppi 7918 \n",
"2 protein_protein ppi 8233 \n",
"3 protein_protein ppi 4899 \n",
"4 protein_protein ppi 5297 \n",
"... ... ... ... \n",
"5471989 disease_phenotype_positive phenotype present 15942 \n",
"5471990 disease_phenotype_positive phenotype present 16355 \n",
"5471991 disease_phenotype_positive phenotype present 8294 \n",
"5471992 disease_phenotype_positive phenotype present 14412 \n",
"5471993 phenotype_phenotype parent-child 8255 \n",
"\n",
" x_type x_name x_source \\\n",
"0 gene/protein PHYHIP NCBI \n",
"1 gene/protein GPANK1 NCBI \n",
"2 gene/protein ZRSR2 NCBI \n",
"3 gene/protein NRF1 NCBI \n",
"4 gene/protein PI4KA NCBI \n",
"... ... ... ... \n",
"5471989 disease frontometaphyseal dysplasia MONDO \n",
"5471990 disease semilobar holoprosencephaly MONDO \n",
"5471991 disease acute intermittent porphyria MONDO \n",
"5471992 disease hyperlipoproteinemia, type 1D MONDO \n",
"5471993 effect/phenotype Transient neonatal diabetes mellitus HPO \n",
"\n",
" y_id y_type y_name \\\n",
"0 56992 gene/protein KIF15 \n",
"1 9240 gene/protein PNMA1 \n",
"2 23548 gene/protein TTC33 \n",
"3 11253 gene/protein MAN1B1 \n",
"4 8601 gene/protein RGS20 \n",
"... ... ... ... \n",
"5471989 218 effect/phenotype High palate \n",
"5471990 568 effect/phenotype Microphthalmia \n",
"5471991 2039 effect/phenotype Anorexia \n",
"5471992 100851 effect/phenotype Abnormal emotion/affect behavior \n",
"5471993 10935 effect/phenotype Abnormality of the upper urinary tract \n",
"\n",
" y_source \n",
"0 NCBI \n",
"1 NCBI \n",
"2 NCBI \n",
"3 NCBI \n",
"4 NCBI \n",
"... ... \n",
"5471989 HPO \n",
"5471990 HPO \n",
"5471991 HPO \n",
"5471992 HPO \n",
"5471993 HPO \n",
"\n",
"[5471994 rows x 10 columns]\n"
]
}
],
"source": [
"print(kg)"
]
},
{
"cell_type": "code",
"execution_count": 123,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:21:48.910982Z",
"start_time": "2021-08-06T12:20:20.245604Z"
},
"hidden": true
},
"outputs": [],
"source": [
"# Build the node table: stack the x-side and y-side endpoint columns, de-duplicate,\n",
"# and number the unique nodes 0..n-1 as node_idx.\n",
"nodes = pd.concat([kg.get(['x_id','x_type', 'x_name','x_source']).rename(columns={'x_id':'node_id', 'x_type':'node_type', 'x_name':'node_name','x_source':'node_source'}), \n",
" kg.get(['y_id','y_type', 'y_name','y_source']).rename(columns={'y_id':'node_id', 'y_type':'node_type', 'y_name':'node_name','y_source':'node_source'})])\n",
"nodes = nodes.drop_duplicates().reset_index().drop('index',axis=1).reset_index().rename(columns={'index':'node_idx'})\n",
"\n",
"# Translate each edge endpoint into its node_idx by joining on the full (id, type, name, source) key.\n",
"edges = pd.merge(kg, nodes, 'left', left_on=['x_id','x_type', 'x_name','x_source'], right_on=['node_id','node_type','node_name','node_source'])\n",
"edges = edges.rename(columns={'node_idx':'x_idx'})\n",
"edges = pd.merge(edges, nodes, 'left', left_on=['y_id','y_type', 'y_name','y_source'], right_on=['node_id','node_type','node_name','node_source'])\n",
"edges = edges.rename(columns={'node_idx':'y_idx'})\n",
"edges = edges.get(['relation', 'display_relation','x_idx', 'y_idx'])\n",
"edges['combine_idx'] = edges['x_idx'].astype(str) + '-' + edges['y_idx'].astype(str)\n",
"\n",
"edge_index = edges.get(['x_idx', 'y_idx']).values.T\n",
"\n",
"# Vertices are created from the node_idx values; they are read back below through giant.vs['name'].\n",
"graph = ig.Graph()\n",
"graph.add_vertices(list(range(nodes.shape[0])))\n",
"graph.add_edges([tuple(x) for x in edge_index.T])\n",
"\n",
"# NOTE(review): ig.Graph() is created without directed=True, so this conversion is presumably a no-op\n",
"# (parallel edges are kept, which is what the ecount assertion below relies on) -- confirm.\n",
"graph = graph.as_undirected(mode='collapse')\n",
"\n",
"# Split into connected components and keep the largest one.\n",
"c = graph.components(mode='strong')\n",
"giant = c.giant()\n",
"\n",
"#print('Nodes: %d' % giant.vcount())\n",
"#print('Edges: %d' % giant.ecount())\n",
"\n",
"assert not giant.is_directed()\n",
"assert giant.is_connected()\n",
"\n",
"# Restrict the node and edge tables to the giant component and check the counts agree with igraph.\n",
"giant_nodes = giant.vs['name']\n",
"new_nodes = nodes.query('node_idx in @giant_nodes')\n",
"assert new_nodes.shape[0] == giant.vcount()\n",
"\n",
"new_edges = edges.query('x_idx in @giant_nodes and y_idx in @giant_nodes').copy()\n",
"assert new_edges.shape[0] == giant.ecount()\n",
"\n",
"# Re-attach node attributes to both endpoints to get back to the kg column layout.\n",
"new_kg = pd.merge(new_edges, new_nodes, 'left', left_on='x_idx', right_on='node_idx')\n",
"new_kg = new_kg.rename(columns={'node_id':'x_id', 'node_type':'x_type', 'node_name':'x_name','node_source':'x_source'}) \n",
"new_kg = pd.merge(new_kg, new_nodes, 'left', left_on='y_idx', right_on='node_idx')\n",
"new_kg = new_kg.rename(columns={'node_id':'y_id', 'node_type':'y_type', 'node_name':'y_name','node_source':'y_source'}) \n",
"# clean_edges is defined elsewhere in the notebook (not visible in this section).\n",
"new_kg = clean_edges(new_kg)"
]
},
{
"cell_type": "code",
"execution_count": 124,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:22:20.208631Z",
"start_time": "2021-08-06T12:21:48.913545Z"
},
"hidden": true
},
"outputs": [],
"source": [
"kg = new_kg.copy()\n",
"kg.to_csv(save_path+'auxillary/kg_giant_orphanet.csv', index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true
},
"source": [
"# Collapse similar diseases"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:22:34.422640Z",
"start_time": "2021-08-06T12:22:20.210477Z"
},
"hidden": true
},
"outputs": [],
"source": [
"# Disease groupings are independent of the KG (requires only MONDO terms)\n",
"\n",
"kg = pd.read_csv(save_path+'auxillary/kg_giant_orphanet.csv', low_memory=False)"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"## Find Groups"
]
},
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true,
"hidden": true
},
"source": [
"### Automated grouping"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:22:34.495911Z",
"start_time": "2021-08-06T12:22:34.424764Z"
},
"hidden": true
},
"outputs": [],
"source": [
"'''\n",
"disease_nodes = pd.concat([kg.get(['x_id','x_type', 'x_name','x_source']).rename(columns={'x_id':'node_id', 'x_type':'node_type', 'x_name':'node_name','x_source':'node_source'}), \n",
" kg.get(['y_id','y_type', 'y_name','y_source']).rename(columns={'y_id':'node_id', 'y_type':'node_type', 'y_name':'node_name','y_source':'node_source'})])\n",
"disease_nodes = disease_nodes.query('node_type==\"disease\"')\n",
"disease_nodes = disease_nodes.drop_duplicates().reset_index().drop('index',axis=1).reset_index().rename(columns={'index':'node_idx'})\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:22:34.540058Z",
"start_time": "2021-08-06T12:22:34.501005Z"
},
"hidden": true,
"scrolled": true
},
"outputs": [],
"source": [
"'''\n",
"groups = []\n",
"seen = set()\n",
"idx2group = {}\n",
"no = set()\n",
"\n",
"def isroman(s):\n",
" return bool(re.search(r\"^M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$\",s))\n",
"\n",
"def issingleletter(s): \n",
"    # Return True when s is exactly one character long, False otherwise.\n",
"    # The previous version only returned False for longer strings and fell through\n",
"    # (returning None) for single characters, so it could never be truthy.\n",
"    return len(s) == 1\n",
"\n",
"def same_words(s1, s2): \n",
" for word in s1.lower().split(' '): \n",
" word = word.split(',')[0]\n",
" if word!='type' and word!='(disease)' and word not in s2.lower(): \n",
" return False \n",
" for word in s2.lower().split(' '): \n",
" word = word.split(',')[0]\n",
" if word!='type' and word!='(disease)' and word not in s1.lower(): \n",
" return False\n",
" return True\n",
"\n",
"for i in range(disease_nodes.shape[0]):\n",
" i_name = disease_nodes.loc[i, 'node_name']\n",
" i_idx = disease_nodes.loc[i, 'node_idx']\n",
" for w in ['monosomy','disomy', 'trisomy', 'trisomy/tetrasomy', 'chromosome']: \n",
" if w in i_name: \n",
" no.add(i_idx)\n",
"\n",
"for i in range(disease_nodes.shape[0]):\n",
" i_idx = disease_nodes.loc[i, 'node_idx']\n",
" if i_idx in seen: continue \n",
" if i_idx in no: continue \n",
" i_name = disease_nodes.loc[i, 'node_name']\n",
" i_split = i_name.split(' ')\n",
" end = i_split[-1]\n",
" if len(end)<=2 or end.isnumeric() or isroman(end): \n",
" main_text = ' '.join(i_split[:-1])\n",
" matches = [i_name]\n",
" matches_idx = [i_idx]\n",
" match_found = False\n",
" numeric = True\n",
" for j in range(disease_nodes.shape[0]):\n",
" j_idx = disease_nodes.loc[j, 'node_idx']\n",
" j_name = disease_nodes.loc[j, 'node_name']\n",
" m = ' '.join(j_name.split(' ')[:-1])\n",
" if m.lower() == main_text.lower() or same_words(m, main_text): \n",
" matches.append(j_name)\n",
" matches_idx.append(j_idx)\n",
" match_found = True\n",
" if match_found:\n",
" matches_idx = list(set(matches_idx))\n",
" matches = list(set(matches))\n",
" if len(matches) <= 1: continue \n",
" if main_text.endswith('type'): \n",
" main_text = main_text[:-4]\n",
" if main_text.endswith(','): \n",
" main_text = main_text[:-1]\n",
" if main_text.endswith(' '): \n",
" main_text = main_text[:-1]\n",
" print(main_text)\n",
" for x in sorted(matches): \n",
" print('- ',x)\n",
" for x in matches_idx: \n",
" seen.add(x)\n",
" idx2group[x] = main_text\n",
" groups.append((main_text, matches_idx))\n",
"\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:22:34.579734Z",
"start_time": "2021-08-06T12:22:34.542417Z"
},
"hidden": true
},
"outputs": [],
"source": [
"'''\n",
"disease_nodes.loc[:, 'group_name'] = ''\n",
"for data in disease_nodes.itertuples():\n",
" if data.node_idx in idx2group.keys(): \n",
" disease_nodes.loc[data.Index, 'group_name'] = idx2group[data.node_idx]\n",
" else: \n",
" disease_nodes.loc[data.Index, 'group_name'] = data.node_name\n",
" \n",
"disease_group_1 = disease_nodes.get(['group_name']).drop_duplicates().reset_index().rename(columns={'index':'group_idx'})\n",
"disease_nodes = pd.merge(disease_nodes, disease_group_1, 'left', 'group_name')\n",
"'''"
]
},
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true,
"hidden": true
},
"source": [
"### Grouping with BERT"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:22:34.618408Z",
"start_time": "2021-08-06T12:22:34.582228Z"
},
"code_folding": [],
"hidden": true
},
"outputs": [],
"source": [
"# generate embeddings \n",
"'''\n",
"input_text = list(disease_group_1.get('group_name').values)\n",
"\n",
"device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n",
"#model_name='dmis-lab/biobert-large-cased-v1.1'\n",
"model_name = 'emilyalsentzer/Bio_ClinicalBERT'\n",
"tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
"model = AutoModel.from_pretrained(model_name)\n",
"model = model.to(device)\n",
"model.eval()\n",
"\n",
"def batch(iterable, batch_size=4, return_idx=True):\n",
" l = len(iterable)\n",
" for ndx in range(0, l, batch_size):\n",
" if return_idx: \n",
" yield (ndx, min(ndx + batch_size, l))\n",
" else:\n",
" yield iterable[ndx:min(ndx + batch_size, l)]\n",
" \n",
"tmp_dir = 'tmp/'\n",
"if os.path.isdir(tmp_dir): \n",
" shutil.rmtree(tmp_dir)\n",
"os.mkdir(tmp_dir)\n",
"\n",
"batch_size=32\n",
"input_tokens = tokenizer(input_text, padding=True, return_tensors='pt', truncation=True, max_length=512)\n",
"# Embed the pre-tokenized texts batch by batch and spill each batch to tmp_dir as <batch number>.npy.\n",
"for i, (start, end) in tqdm(enumerate(batch(input_text, batch_size))):\n",
"    input_ids = input_tokens['input_ids'][start:end, :].to(device)\n",
"    attention_mask = input_tokens['attention_mask'][start:end, :].to(device)\n",
"    with torch.no_grad():\n",
"        outputs = model(input_ids=input_ids, attention_mask=attention_mask)\n",
"        # Average outputs[0] over dim 1 to get one vector per input text.\n",
"        embeds = torch.mean(outputs[0], dim=1)\n",
"    # Tensor.numpy() only works on CPU tensors; move the result off the GPU first so the\n",
"    # loop also runs when device is cuda (on CPU, .cpu() returns the tensor unchanged).\n",
"    np.save(tmp_dir+str(i)+'.npy', embeds.cpu().numpy())\n",
" \n",
"embeds = []\n",
"for i, _ in enumerate(batch(input_text, batch_size)):\n",
" x = np.load(tmp_dir+str(i)+'.npy')\n",
" embeds.append(x)\n",
"embeds = np.concatenate(embeds)\n",
"\n",
"np.save(save_path+'auxillary/kg_disease_bert_embeds.npy', embeds)\n",
"if os.path.isdir(tmp_dir): \n",
" shutil.rmtree(tmp_dir)'''"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:22:34.656301Z",
"start_time": "2021-08-06T12:22:34.621379Z"
},
"hidden": true
},
"outputs": [],
"source": [
"'''\n",
"embeds = np.load(save_path+'auxillary/kg_disease_bert_embeds.npy')\n",
"cos_sim = cosine_similarity(embeds, embeds)\n",
"\n",
"seen = set()\n",
"groups = []\n",
"idx2group = {}\n",
"no = set()\n",
"\n",
"for i in range(disease_group_1.shape[0]):\n",
" i_name = disease_group_1.loc[i, 'group_name']\n",
" i_idx = disease_group_1.loc[i, 'group_idx']\n",
" for w in ['cardiomyopathy', 'syndrome', 'combined', 'complement', 'deficiency', \n",
" 'factor', 'immunodeficiency', 'monosomy','disomy', 'trisomy', \n",
" 'trisomy/tetrasomy', 'chromosome', 'neuroendocrine tumor', \n",
" 'neuroendocrine neoplasm', 'cancer', 'tumor', 'neoplasm','carcinoma',\n",
" 'lymphoma', 'lipoma']: \n",
" if w in i_name: \n",
" no.add(i_idx)\n",
" continue\n",
" for w in ['CDG']: \n",
" if i_name.endswith(w): \n",
" no.add(i_idx)\n",
" continue\n",
" for w in ['neurodevelopmental disorder', 'glycogen storage disease', \n",
" 'congenital disorder of glycosylation', 'qualitative or quantitative defects']: \n",
" if i_name.startswith(w): \n",
" no.add(i_idx)\n",
" continue\n",
" \n",
"cutoff = 0.98\n",
"for i in range(disease_group_1.shape[0]):\n",
" i_name = disease_group_1.loc[i, 'group_name']\n",
" i_idx = disease_group_1.loc[i, 'group_idx']\n",
" if i_idx in no or i_idx in seen: continue\n",
" x = disease_group_1[cos_sim[i]>cutoff]\n",
" if x.shape[0]>1: \n",
" for v in x.get('group_name').values: \n",
" print(v)\n",
" main_text = input(' Ok? ')\n",
" if main_text not in ['','on','no', 'No', 'NO']: \n",
" for v in x.get('group_idx').values: \n",
" seen.add(v)\n",
" idx2group[v] = main_text\n",
" g = list(x.get('group_idx').values.reshape(-1))\n",
" groups.append((main_text, g)) # main_text contains group name\n",
" else: \n",
" no.add(i_idx)\n",
" print('Not added')\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:22:34.702232Z",
"start_time": "2021-08-06T12:22:34.662051Z"
},
"hidden": true
},
"outputs": [],
"source": [
"'''\n",
"disease_group_1.loc[:, 'group_name_2'] = ''\n",
"for data in disease_group_1.itertuples(): \n",
" if data.group_idx in idx2group.keys():\n",
" disease_group_1.loc[data.Index, 'group_name_2'] = idx2group[data.group_idx]\n",
" else: \n",
" disease_group_1.loc[data.Index, 'group_name_2'] = data.group_name\n",
" \n",
"disease_group_2 = disease_group_1.get(['group_name_2']).drop_duplicates().reset_index().rename(columns={'index':'group_idx_2'})\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:22:34.745711Z",
"start_time": "2021-08-06T12:22:34.709900Z"
},
"hidden": true
},
"outputs": [],
"source": [
"'''\n",
"df_disease_group = pd.merge(disease_nodes, disease_group_1, 'left', 'group_name')\n",
"df_disease_group = df_disease_group.get(['node_id', 'node_type', 'node_name', 'node_source',\n",
" 'group_name', 'group_name_2'])\n",
"df_disease_group = df_disease_group.rename(columns={'group_name':'group_name_auto',\n",
" 'group_name_2':'group_name_bert'}).astype({'node_id':str})\n",
"df_disease_group.to_csv(save_path+'auxillary/kg_grouped_diseases.csv')\n",
"'''"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"## Apply Groups"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:22:36.824640Z",
"start_time": "2021-08-06T12:22:34.748321Z"
},
"hidden": true
},
"outputs": [],
"source": [
"# Load the disease -> group assignments and build the grouped-id lookup used to collapse diseases.\n",
"grouped_diseases = pd.read_csv(save_path+'auxillary/kg_grouped_diseases.csv').astype({'node_id':str})\n",
"group_col = 'group_name_bert'\n",
"id_col = group_col.replace('name','id')\n",
"\n",
"# Keep only groups with more than one member; singleton groups are dropped from the mapping.\n",
"group_sizes = grouped_diseases.groupby(group_col)['node_id'].count()\n",
"groups = group_sizes[group_sizes > 1].index.values\n",
"set_groups = set(groups)\n",
"grouped_diseases = grouped_diseases[grouped_diseases[group_col].isin(set_groups)]\n",
"\n",
"# One row per group: the grouped id is the member node_ids joined with '_' in file order.\n",
"# A single groupby aggregate replaces filling an empty frame via .loc (which depends on the\n",
"# pandas version) and re-querying group_map once per group.\n",
"group_map = (grouped_diseases.groupby(group_col)['node_id']\n",
"             .agg('_'.join)\n",
"             .rename(id_col)\n",
"             .reset_index()\n",
"             .get([id_col, group_col]))\n",
"\n",
"# Attach the grouped id to every member disease and persist the mapping.\n",
"grouped_diseases = pd.merge(grouped_diseases, group_map, on=group_col)\n",
"grouped_diseases.to_csv(save_path+'auxillary/kg_grouped_diseases_bert_map.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:26:36.665730Z",
"start_time": "2021-08-06T12:22:36.827448Z"
},
"hidden": true,
"scrolled": true
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "74fa05f65c9b4a269a95e9de21f0ab21",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/6392 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Rewrite every grouped MONDO disease endpoint (on either side of an edge) to its group id / name\n",
"# and mark its source as 'MONDO_grouped'.\n",
"# One keyed lookup per side replaces the previous iterrows loop, which ran two full-table\n",
"# queries per grouped disease. Matching is still on the exact (node_id, node_name) pair, and\n",
"# keep='last' mirrors the loop's last-write-wins behaviour if a pair appears more than once.\n",
"group_lookup = (grouped_diseases\n",
"                .drop_duplicates(['node_id', 'node_name'], keep='last')\n",
"                .set_index(['node_id', 'node_name']))\n",
"\n",
"for side in ['x', 'y']:\n",
"    id_c, name_c, source_c, type_c = side+'_id', side+'_name', side+'_source', side+'_type'\n",
"    # Candidate rows are decided from the current values before any rewrite on this side;\n",
"    # the other side's columns are never touched here, so the two passes are independent.\n",
"    is_mondo = ((kg[type_c] == 'disease') & (kg[source_c] == 'MONDO')).values\n",
"    keys = pd.MultiIndex.from_frame(kg.loc[is_mondo, [id_c, name_c]])\n",
"    matched = group_lookup.reindex(keys)\n",
"    # Rows with no entry in the lookup come back as NaN and are left unchanged.\n",
"    hit = matched[id_col].notna().values\n",
"    rows = kg.index[is_mondo][hit]\n",
"    kg.loc[rows, id_c] = matched[id_col].values[hit]\n",
"    kg.loc[rows, name_c] = matched[group_col].values[hit]\n",
"    kg.loc[rows, source_c] = 'MONDO_grouped'"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:27:07.721261Z",
"start_time": "2021-08-06T12:26:36.673297Z"
},
"hidden": true
},
"outputs": [],
"source": [
"kg = kg.drop_duplicates()\n",
"#kg_rev = kg.copy().rename(columns={'x_id':'y_id','x_type':'y_type', 'x_name':'y_name', 'x_source':'y_source', 'y_id':'x_id','y_type':'x_type', 'y_name':'x_name', 'y_source':'x_source' }) #add reverse edges\n",
"#kg_rev['relation'] = kg_rev['relation'] + \"_rev\"\n",
"#print(kg_rev)\n",
"\n",
"#kg = pd.concat([kg, kg_rev])\n",
"#kg = kg.drop_duplicates()\n",
"kg = kg.dropna()\n",
"# remove self loops from edges \n",
"kg = kg.query('not ((x_id == y_id) and (x_type == y_type) and (x_source == y_source) and (x_name == y_name))')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:27:41.975721Z",
"start_time": "2021-08-06T12:27:07.723129Z"
},
"hidden": true
},
"outputs": [],
"source": [
"kg.to_csv(save_path+'auxillary/kg_grouped_orphanet.csv', index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Knowledge graph description"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:27:56.845094Z",
"start_time": "2021-08-06T12:27:41.977822Z"
}
},
"outputs": [],
"source": [
"kg = pd.read_csv(save_path+'auxillary/kg_grouped_orphanet.csv', low_memory=False)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:28:19.775375Z",
"start_time": "2021-08-06T12:27:56.847134Z"
}
},
"outputs": [],
"source": [
"# nodes file \n",
"# Unique nodes = every distinct (id, type, name, source) seen on either end of an edge,\n",
"# numbered 0..n-1 as node_index.\n",
"nodes = pd.concat([kg.get(['x_id','x_type', 'x_name','x_source']).rename(columns={'x_id':'node_id', 'x_type':'node_type', 'x_name':'node_name', 'x_source':'node_source'}), \n",
" kg.get(['y_id','y_type', 'y_name','y_source']).rename(columns={'y_id':'node_id', 'y_type':'node_type', 'y_name':'node_name', 'y_source':'node_source'})])\n",
"nodes = nodes.drop_duplicates().reset_index().drop('index',axis=1).reset_index().rename(columns={'index':'node_index'})\n",
"\n",
"# assign index \n",
"# No `on=` is given, so each merge joins on all columns the two frames share, i.e. the four\n",
"# x_* (then y_*) endpoint columns. The trailing dropna() removes any row containing a NaN.\n",
"kg = pd.merge(kg, nodes.rename(columns={'node_index':'x_index',\n",
" 'node_id':'x_id',\n",
" 'node_type':'x_type',\n",
" 'node_name':'x_name',\n",
" 'node_source':'x_source'}), 'left').dropna()\n",
"kg = pd.merge(kg, nodes.rename(columns={'node_index':'y_index',\n",
" 'node_id':'y_id',\n",
" 'node_type':'y_type',\n",
" 'node_name':'y_name',\n",
" 'node_source':'y_source'}), 'left').dropna()\n",
"# Fix the final column order of the exported knowledge graph.\n",
"kg = kg.get(['relation', 'display_relation', 'x_index', 'x_id', 'x_type', 'x_name', 'x_source',\n",
" 'y_index', 'y_id', 'y_type', 'y_name', 'y_source'])\n",
"\n",
"# edges file \n",
"# Index-only view of the graph: relation labels plus the two endpoint node indices.\n",
"edges = kg.get(['relation', 'display_relation', 'x_index', 'y_index']).copy()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:29:17.530524Z",
"start_time": "2021-08-06T12:28:19.778095Z"
}
},
"outputs": [],
"source": [
"kg.to_csv(save_path+'kg.csv', index=False)\n",
"nodes.to_csv(save_path+'nodes.csv', index=False)\n",
"edges.to_csv(save_path+'edges.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:29:17.586693Z",
"start_time": "2021-08-06T12:29:17.532592Z"
},
"code_folding": []
},
"outputs": [],
"source": [
"def kg_describe(df, by, count_col): \n",
"    \"\"\"Count rows of `df` per `by` group and report each group's share of the total.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df : pandas.DataFrame\n",
"        Table to summarize (e.g. the nodes or edges table).\n",
"    by : str\n",
"        Column to group on; it becomes the index of the result.\n",
"    count_col : str\n",
"        Column whose non-null entries are counted within each group.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        Columns 'count' (int) and 'percent' (rounded to 1 decimal), sorted by count\n",
"        descending, with a final row labelled 'total' holding the column sums.\n",
"    \"\"\"\n",
"    counts = (df.groupby(by).count()\n",
"              .sort_values(by=count_col, ascending=False)\n",
"              .rename(columns={count_col:'count'})\n",
"              .get(['count']))\n",
"    total = counts['count'].sum()\n",
"    counts = counts.assign(percent=100 * counts['count'] / total)\n",
"    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; build the 'total' row\n",
"    # explicitly and concatenate it, keeping the index name so the output header is unchanged.\n",
"    total_row = counts.sum(0).rename('total').to_frame().T.rename_axis(counts.index.names)\n",
"    summary = pd.concat([counts, total_row])\n",
"    # Concatenating with the float-valued total row upcasts 'count'; cast it back to int.\n",
"    summary['count'] = summary['count'].astype('int')\n",
"    summary['percent'] = summary['percent'].round(1)\n",
"    return summary"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:29:17.776738Z",
"start_time": "2021-08-06T12:29:17.591473Z"
},
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>count</th>\n",
" <th>percent</th>\n",
" </tr>\n",
" <tr>\n",
" <th>node_type</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>biological_process</th>\n",
" <td>28642</td>\n",
" <td>22.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>gene/protein</th>\n",
" <td>27671</td>\n",
" <td>21.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>disease</th>\n",
" <td>16305</td>\n",
" <td>12.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>effect/phenotype</th>\n",
" <td>15874</td>\n",
" <td>12.3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>anatomy</th>\n",
" <td>14035</td>\n",
" <td>10.9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>molecular_function</th>\n",
" <td>11169</td>\n",
" <td>8.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>drug</th>\n",
" <td>7949</td>\n",
" <td>6.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>cellular_component</th>\n",
" <td>4176</td>\n",
" <td>3.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>pathway</th>\n",
" <td>2516</td>\n",
" <td>1.9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>exposure</th>\n",
" <td>802</td>\n",
" <td>0.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>total</th>\n",
" <td>129139</td>\n",
" <td>100.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" count percent\n",
"node_type \n",
"biological_process 28642 22.2\n",
"gene/protein 27671 21.4\n",
"disease 16305 12.6\n",
"effect/phenotype 15874 12.3\n",
"anatomy 14035 10.9\n",
"molecular_function 11169 8.6\n",
"drug 7949 6.2\n",
"cellular_component 4176 3.2\n",
"pathway 2516 1.9\n",
"exposure 802 0.6\n",
"total 129139 100.0"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"kg_describe(nodes,'node_type','node_index')"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"ExecuteTime": {
"end_time": "2021-08-06T12:29:18.783745Z",
"start_time": "2021-08-06T12:29:17.779433Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>count</th>\n",
" <th>percent</th>\n",
" </tr>\n",
" <tr>\n",
" <th>relation</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>drug_drug</th>\n",
" <td>2672628</td>\n",
" <td>49.3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>protein_present_anatomy</th>\n",
" <td>1518203</td>\n",
" <td>28.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>protein_protein</th>\n",
" <td>321075</td>\n",
" <td>5.9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>disease_phenotype_positive</th>\n",
" <td>172469</td>\n",
" <td>3.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>protein_bioprocess</th>\n",
" <td>144805</td>\n",
" <td>2.7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>protein_cellcomp</th>\n",
" <td>83402</td>\n",
" <td>1.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>drug_effect</th>\n",
" <td>79137</td>\n",
" <td>1.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>disease_protein</th>\n",
" <td>74752</td>\n",
" <td>1.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>protein_molfunc</th>\n",
" <td>69530</td>\n",
" <td>1.3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>bioprocess_bioprocess</th>\n",
" <td>52886</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>protein_pathway</th>\n",
" <td>42646</td>\n",
" <td>0.8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>disease_disease</th>\n",
" <td>30066</td>\n",
" <td>0.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>contraindication</th>\n",
" <td>25716</td>\n",
" <td>0.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>drug_protein</th>\n",
" <td>25653</td>\n",
" <td>0.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>phenotype_phenotype</th>\n",
" <td>21925</td>\n",
" <td>0.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>protein_absent_anatomy</th>\n",
" <td>19887</td>\n",
" <td>0.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>anatomy_anatomy</th>\n",
" <td>14032</td>\n",
" <td>0.3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>molfunc_molfunc</th>\n",
" <td>13574</td>\n",
" <td>0.3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>phenotype_protein</th>\n",
" <td>10518</td>\n",
" <td>0.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>indication</th>\n",
" <td>8115</td>\n",
" <td>0.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>cellcomp_cellcomp</th>\n",
" <td>4845</td>\n",
" <td>0.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>pathway_pathway</th>\n",
" <td>2535</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>off-label use</th>\n",
" <td>2299</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>exposure_exposure</th>\n",
" <td>2140</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>exposure_disease</th>\n",
" <td>1788</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>exposure_bioprocess</th>\n",
" <td>1625</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>disease_phenotype_negative</th>\n",
" <td>1318</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>exposure_protein</th>\n",
" <td>1212</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>exposure_molfunc</th>\n",
" <td>45</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>exposure_cellcomp</th>\n",
" <td>10</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>total</th>\n",
" <td>5418836</td>\n",
" <td>100.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" count percent\n",
"relation \n",
"drug_drug 2672628 49.3\n",
"protein_present_anatomy 1518203 28.0\n",
"protein_protein 321075 5.9\n",
"disease_phenotype_positive 172469 3.2\n",
"protein_bioprocess 144805 2.7\n",
"protein_cellcomp 83402 1.5\n",
"drug_effect 79137 1.5\n",
"disease_protein 74752 1.4\n",
"protein_molfunc 69530 1.3\n",
"bioprocess_bioprocess 52886 1.0\n",
"protein_pathway 42646 0.8\n",
"disease_disease 30066 0.6\n",
"contraindication 25716 0.5\n",
"drug_protein 25653 0.5\n",
"phenotype_phenotype 21925 0.4\n",
"protein_absent_anatomy 19887 0.4\n",
"anatomy_anatomy 14032 0.3\n",
"molfunc_molfunc 13574 0.3\n",
"phenotype_protein 10518 0.2\n",
"indication 8115 0.1\n",
"cellcomp_cellcomp 4845 0.1\n",
"pathway_pathway 2535 0.0\n",
"off-label use 2299 0.0\n",
"exposure_exposure 2140 0.0\n",
"exposure_disease 1788 0.0\n",
"exposure_bioprocess 1625 0.0\n",
"disease_phenotype_negative 1318 0.0\n",
"exposure_protein 1212 0.0\n",
"exposure_molfunc 45 0.0\n",
"exposure_cellcomp 10 0.0\n",
"total 5418836 100.0"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"kg_describe(edges,'relation','x_index')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"hide_input": false,
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
},
"notify_time": "10",
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": false,
"sideBar": false,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {
"height": "549.5652465820312px",
"left": "993.9918212890625px",
"top": "49.45652389526367px",
"width": "161.64402770996094px"
},
"toc_section_display": false,
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 4
}