529 lines (528 with data), 23.4 kB
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pickle\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\mulyadi\\TempRepo\\hackathon\\AIAgents4Pharma\\venv\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"# Read the pickled knowledge graph from the repository's test files.\n",
"# pickle can execute arbitrary code while loading, so only point this at trusted files.\n",
"pyg_file = \"../../../aiagents4pharma/talk2knowledgegraphs/tests/files/primekg_ibd_pyg_graph.pkl\"\n",
"with open(pyg_file, mode=\"rb\") as graph_fh:\n",
"    pyg_data = pickle.load(graph_fh)\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Data(x=[3426, 768], edge_index=[2, 12752], edge_attr=[12752, 768], node_id=[3426], node_name=[3426], node_type=[3426], enriched_node=[3426], key=[12752], head_id=[12752], head_name=[12752], tail_id=[12752], tail_name=[12752], edge_type=[12752], enriched_edge=[12752])"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pyg_data"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>node_id</th>\n",
" <th>node_name</th>\n",
" <th>node_type</th>\n",
" <th>enriched_node</th>\n",
" <th>embedded_node</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>SMAD3_(144)</td>\n",
" <td>SMAD3</td>\n",
" <td>gene/protein</td>\n",
" <td>SMAD3 belongs to gene/protein category. The SM...</td>\n",
" <td>[0.02653600461781025, 0.05420931056141853, -0....</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>IL10RB_(179)</td>\n",
" <td>IL10RB</td>\n",
" <td>gene/protein</td>\n",
" <td>IL10RB belongs to gene/protein category. The p...</td>\n",
" <td>[0.02476494573056698, 0.02278200164437294, -0....</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>GNA12_(192)</td>\n",
" <td>GNA12</td>\n",
" <td>gene/protein</td>\n",
" <td>GNA12 belongs to gene/protein category. Predic...</td>\n",
" <td>[0.00479594711214304, 0.04921527951955795, -0....</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>HNF4A_(279)</td>\n",
" <td>HNF4A</td>\n",
" <td>gene/protein</td>\n",
" <td>HNF4A belongs to gene/protein category. The pr...</td>\n",
" <td>[0.013905026949942112, 0.032602787017822266, -...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>VCAM1_(417)</td>\n",
" <td>VCAM1</td>\n",
" <td>gene/protein</td>\n",
" <td>VCAM1 belongs to gene/protein category. This g...</td>\n",
" <td>[0.04729974642395973, 0.03262118622660637, -0....</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3421</th>\n",
" <td>IRAK2 mediated activation of TAK1 complex upon...</td>\n",
" <td>IRAK2 mediated activation of TAK1 complex upon...</td>\n",
" <td>pathway</td>\n",
" <td>IRAK2 mediated activation of TAK1 complex upon...</td>\n",
" <td>[-0.014931154437363148, 0.03044624999165535, -...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3422</th>\n",
" <td>TRAF6 mediated IRF7 activation in TLR7/8 or 9 ...</td>\n",
" <td>TRAF6 mediated IRF7 activation in TLR7/8 or 9 ...</td>\n",
" <td>pathway</td>\n",
" <td>TRAF6 mediated IRF7 activation in TLR7/8 or 9 ...</td>\n",
" <td>[0.03156436234712601, 0.05540117993950844, -0....</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3423</th>\n",
" <td>Antigen processing: Ubiquitination & Proteasom...</td>\n",
" <td>Antigen processing: Ubiquitination & Proteasom...</td>\n",
" <td>pathway</td>\n",
" <td>Antigen processing: Ubiquitination & Proteasom...</td>\n",
" <td>[0.04519890621304512, 0.029452601447701454, -0...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3424</th>\n",
" <td>Antigen Presentation: Folding, assembly and pe...</td>\n",
" <td>Antigen Presentation: Folding, assembly and pe...</td>\n",
" <td>pathway</td>\n",
" <td>Antigen Presentation: Folding, assembly and pe...</td>\n",
" <td>[0.014839296229183674, 0.04876236990094185, -0...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3425</th>\n",
" <td>Kinesins_(129367)</td>\n",
" <td>Kinesins</td>\n",
" <td>pathway</td>\n",
" <td>Kinesins belongs to pathway category. This pat...</td>\n",
" <td>[0.038248274475336075, 0.07633280754089355, -0...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3426 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" node_id \\\n",
"0 SMAD3_(144) \n",
"1 IL10RB_(179) \n",
"2 GNA12_(192) \n",
"3 HNF4A_(279) \n",
"4 VCAM1_(417) \n",
"... ... \n",
"3421 IRAK2 mediated activation of TAK1 complex upon... \n",
"3422 TRAF6 mediated IRF7 activation in TLR7/8 or 9 ... \n",
"3423 Antigen processing: Ubiquitination & Proteasom... \n",
"3424 Antigen Presentation: Folding, assembly and pe... \n",
"3425 Kinesins_(129367) \n",
"\n",
" node_name node_type \\\n",
"0 SMAD3 gene/protein \n",
"1 IL10RB gene/protein \n",
"2 GNA12 gene/protein \n",
"3 HNF4A gene/protein \n",
"4 VCAM1 gene/protein \n",
"... ... ... \n",
"3421 IRAK2 mediated activation of TAK1 complex upon... pathway \n",
"3422 TRAF6 mediated IRF7 activation in TLR7/8 or 9 ... pathway \n",
"3423 Antigen processing: Ubiquitination & Proteasom... pathway \n",
"3424 Antigen Presentation: Folding, assembly and pe... pathway \n",
"3425 Kinesins pathway \n",
"\n",
" enriched_node \\\n",
"0 SMAD3 belongs to gene/protein category. The SM... \n",
"1 IL10RB belongs to gene/protein category. The p... \n",
"2 GNA12 belongs to gene/protein category. Predic... \n",
"3 HNF4A belongs to gene/protein category. The pr... \n",
"4 VCAM1 belongs to gene/protein category. This g... \n",
"... ... \n",
"3421 IRAK2 mediated activation of TAK1 complex upon... \n",
"3422 TRAF6 mediated IRF7 activation in TLR7/8 or 9 ... \n",
"3423 Antigen processing: Ubiquitination & Proteasom... \n",
"3424 Antigen Presentation: Folding, assembly and pe... \n",
"3425 Kinesins belongs to pathway category. This pat... \n",
"\n",
" embedded_node \n",
"0 [0.02653600461781025, 0.05420931056141853, -0.... \n",
"1 [0.02476494573056698, 0.02278200164437294, -0.... \n",
"2 [0.00479594711214304, 0.04921527951955795, -0.... \n",
"3 [0.013905026949942112, 0.032602787017822266, -... \n",
"4 [0.04729974642395973, 0.03262118622660637, -0.... \n",
"... ... \n",
"3421 [-0.014931154437363148, 0.03044624999165535, -... \n",
"3422 [0.03156436234712601, 0.05540117993950844, -0.... \n",
"3423 [0.04519890621304512, 0.029452601447701454, -0... \n",
"3424 [0.014839296229183674, 0.04876236990094185, -0... \n",
"3425 [0.038248274475336075, 0.07633280754089355, -0... \n",
"\n",
"[3426 rows x 5 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Node table: one row per node; the feature matrix `x` is stored as one Python list per row\n",
"df_nodes = pd.DataFrame(\n",
"    dict(\n",
"        node_id=pyg_data.node_id,\n",
"        node_name=pyg_data.node_name,\n",
"        node_type=pyg_data.node_type,\n",
"        enriched_node=pyg_data.enriched_node,\n",
"        embedded_node=pyg_data.x.tolist(),\n",
"    )\n",
")\n",
"df_nodes"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>head_id</th>\n",
" <th>head_name</th>\n",
" <th>edge_type</th>\n",
" <th>tail_id</th>\n",
" <th>tail_name</th>\n",
" <th>enriched_edge</th>\n",
" <th>embedded_edge</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>SMAD3_(144)</td>\n",
" <td>SMAD3</td>\n",
" <td>(gene/protein, associated with, disease)</td>\n",
" <td>Crohn disease_(37784)</td>\n",
" <td>Crohn disease</td>\n",
" <td>SMAD3 (gene/protein) has a direct relationship...</td>\n",
" <td>[0.052218832075595856, 0.011464782059192657, -...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>SMAD3_(144)</td>\n",
" <td>SMAD3</td>\n",
" <td>(gene/protein, associated with, disease)</td>\n",
" <td>inflammatory bowel disease_(28158)</td>\n",
" <td>inflammatory bowel disease</td>\n",
" <td>SMAD3 (gene/protein) has a direct relationship...</td>\n",
" <td>[0.04878539964556694, 0.027767326682806015, -0...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>SMAD3_(144)</td>\n",
" <td>SMAD3</td>\n",
" <td>(gene/protein, associated with, disease)</td>\n",
" <td>Crohn's colitis_(83770)</td>\n",
" <td>Crohn's colitis</td>\n",
" <td>SMAD3 (gene/protein) has a direct relationship...</td>\n",
" <td>[0.04968055710196495, 0.013924038037657738, -0...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>SMAD3_(144)</td>\n",
" <td>SMAD3</td>\n",
" <td>(gene/protein, associated with, disease)</td>\n",
" <td>Crohn ileitis and jejunitis_(35814)</td>\n",
" <td>Crohn ileitis and jejunitis</td>\n",
" <td>SMAD3 (gene/protein) has a direct relationship...</td>\n",
" <td>[0.03398257866501808, 0.014872003346681595, -0...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>SMAD3_(144)</td>\n",
" <td>SMAD3</td>\n",
" <td>(gene/protein, interacts with, pathway)</td>\n",
" <td>Signaling by NODAL_(62373)</td>\n",
" <td>Signaling by NODAL</td>\n",
" <td>SMAD3 (gene/protein) has a direct relationship...</td>\n",
" <td>[0.01159461960196495, 0.01849970780313015, -0....</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12747</th>\n",
" <td>IRAK2 mediated activation of TAK1 complex upon...</td>\n",
" <td>IRAK2 mediated activation of TAK1 complex upon...</td>\n",
" <td>(pathway, interacts with, gene/protein)</td>\n",
" <td>TLR4_(3259)</td>\n",
" <td>TLR4</td>\n",
" <td>IRAK2 mediated activation of TAK1 complex upon...</td>\n",
" <td>[-0.00019741167488973588, 0.006676936056464910...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12748</th>\n",
" <td>TRAF6 mediated IRF7 activation in TLR7/8 or 9 ...</td>\n",
" <td>TRAF6 mediated IRF7 activation in TLR7/8 or 9 ...</td>\n",
" <td>(pathway, interacts with, gene/protein)</td>\n",
" <td>TLR9_(10113)</td>\n",
" <td>TLR9</td>\n",
" <td>TRAF6 mediated IRF7 activation in TLR7/8 or 9 ...</td>\n",
" <td>[0.03718600049614906, 0.01651887036859989, -0....</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12749</th>\n",
" <td>Antigen processing: Ubiquitination & Proteasom...</td>\n",
" <td>Antigen processing: Ubiquitination & Proteasom...</td>\n",
" <td>(pathway, interacts with, gene/protein)</td>\n",
" <td>HERC2_(1777)</td>\n",
" <td>HERC2</td>\n",
" <td>Antigen processing: Ubiquitination & Proteasom...</td>\n",
" <td>[0.057375308126211166, 0.009233011864125729, -...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12750</th>\n",
" <td>Antigen Presentation: Folding, assembly and pe...</td>\n",
" <td>Antigen Presentation: Folding, assembly and pe...</td>\n",
" <td>(pathway, interacts with, gene/protein)</td>\n",
" <td>ERAP2_(12763)</td>\n",
" <td>ERAP2</td>\n",
" <td>Antigen Presentation: Folding, assembly and pe...</td>\n",
" <td>[0.008740102872252464, 0.007800932973623276, -...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12751</th>\n",
" <td>Kinesins_(129367)</td>\n",
" <td>Kinesins</td>\n",
" <td>(pathway, interacts with, gene/protein)</td>\n",
" <td>KIF21B_(8564)</td>\n",
" <td>KIF21B</td>\n",
" <td>Kinesins (pathway) has a direct relationship o...</td>\n",
" <td>[0.01051196176558733, 0.04535209387540817, -0....</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>12752 rows × 7 columns</p>\n",
"</div>"
],
"text/plain": [
" head_id \\\n",
"0 SMAD3_(144) \n",
"1 SMAD3_(144) \n",
"2 SMAD3_(144) \n",
"3 SMAD3_(144) \n",
"4 SMAD3_(144) \n",
"... ... \n",
"12747 IRAK2 mediated activation of TAK1 complex upon... \n",
"12748 TRAF6 mediated IRF7 activation in TLR7/8 or 9 ... \n",
"12749 Antigen processing: Ubiquitination & Proteasom... \n",
"12750 Antigen Presentation: Folding, assembly and pe... \n",
"12751 Kinesins_(129367) \n",
"\n",
" head_name \\\n",
"0 SMAD3 \n",
"1 SMAD3 \n",
"2 SMAD3 \n",
"3 SMAD3 \n",
"4 SMAD3 \n",
"... ... \n",
"12747 IRAK2 mediated activation of TAK1 complex upon... \n",
"12748 TRAF6 mediated IRF7 activation in TLR7/8 or 9 ... \n",
"12749 Antigen processing: Ubiquitination & Proteasom... \n",
"12750 Antigen Presentation: Folding, assembly and pe... \n",
"12751 Kinesins \n",
"\n",
" edge_type \\\n",
"0 (gene/protein, associated with, disease) \n",
"1 (gene/protein, associated with, disease) \n",
"2 (gene/protein, associated with, disease) \n",
"3 (gene/protein, associated with, disease) \n",
"4 (gene/protein, interacts with, pathway) \n",
"... ... \n",
"12747 (pathway, interacts with, gene/protein) \n",
"12748 (pathway, interacts with, gene/protein) \n",
"12749 (pathway, interacts with, gene/protein) \n",
"12750 (pathway, interacts with, gene/protein) \n",
"12751 (pathway, interacts with, gene/protein) \n",
"\n",
" tail_id tail_name \\\n",
"0 Crohn disease_(37784) Crohn disease \n",
"1 inflammatory bowel disease_(28158) inflammatory bowel disease \n",
"2 Crohn's colitis_(83770) Crohn's colitis \n",
"3 Crohn ileitis and jejunitis_(35814) Crohn ileitis and jejunitis \n",
"4 Signaling by NODAL_(62373) Signaling by NODAL \n",
"... ... ... \n",
"12747 TLR4_(3259) TLR4 \n",
"12748 TLR9_(10113) TLR9 \n",
"12749 HERC2_(1777) HERC2 \n",
"12750 ERAP2_(12763) ERAP2 \n",
"12751 KIF21B_(8564) KIF21B \n",
"\n",
" enriched_edge \\\n",
"0 SMAD3 (gene/protein) has a direct relationship... \n",
"1 SMAD3 (gene/protein) has a direct relationship... \n",
"2 SMAD3 (gene/protein) has a direct relationship... \n",
"3 SMAD3 (gene/protein) has a direct relationship... \n",
"4 SMAD3 (gene/protein) has a direct relationship... \n",
"... ... \n",
"12747 IRAK2 mediated activation of TAK1 complex upon... \n",
"12748 TRAF6 mediated IRF7 activation in TLR7/8 or 9 ... \n",
"12749 Antigen processing: Ubiquitination & Proteasom... \n",
"12750 Antigen Presentation: Folding, assembly and pe... \n",
"12751 Kinesins (pathway) has a direct relationship o... \n",
"\n",
" embedded_edge \n",
"0 [0.052218832075595856, 0.011464782059192657, -... \n",
"1 [0.04878539964556694, 0.027767326682806015, -0... \n",
"2 [0.04968055710196495, 0.013924038037657738, -0... \n",
"3 [0.03398257866501808, 0.014872003346681595, -0... \n",
"4 [0.01159461960196495, 0.01849970780313015, -0.... \n",
"... ... \n",
"12747 [-0.00019741167488973588, 0.006676936056464910... \n",
"12748 [0.03718600049614906, 0.01651887036859989, -0.... \n",
"12749 [0.057375308126211166, 0.009233011864125729, -... \n",
"12750 [0.008740102872252464, 0.007800932973623276, -... \n",
"12751 [0.01051196176558733, 0.04535209387540817, -0.... \n",
"\n",
"[12752 rows x 7 columns]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Convert the PyG data to a pandas DataFrame for edges:\n",
"# one row per edge (head, edge type, tail); `edge_attr` is stored as one Python list per row\n",
"df_edges = pd.DataFrame({\n",
"    \"head_id\": pyg_data.head_id,\n",
"    \"head_name\": pyg_data.head_name,\n",
"    \"edge_type\": pyg_data.edge_type,\n",
"    \"tail_id\": pyg_data.tail_id,\n",
"    \"tail_name\": pyg_data.tail_name,\n",
"    \"enriched_edge\": pyg_data.enriched_edge,\n",
"    \"embedded_edge\": pyg_data.edge_attr.tolist(),\n",
"})\n",
"df_edges"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}