In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
import os, sys

In [3]:
import pylab as plt
import matplotlib

import numpy as np
import pandas as pd

In [4]:
sys.path.append('..')

from pyMultiOmics.base import SingleOmicsData, MultiOmicsData
from pyMultiOmics.constants import HOMO_SAPIENS, PROTEINS, COMPOUNDS, REACTIONS
from pyMultiOmics.mapping import Mapper
from pyMultiOmics.common import set_log_level_info, set_log_level_debug, download_file, extract_zip_file

2022-03-24 14:40:01.677 | INFO     | pyMultiOmics.reactome:get_neo4j_driver:24 - Created graph database driver for bolt://localhost:7687 (neo4j)


# Demonstration of pyMultiOmics mapping

## Load the processed COVID-19 data from [1]

[1] [Shen, Bo, et al. "Proteomic and metabolomic characterization of COVID-19 patient sera." Cell 182.1 (2020): 59-72.](https://www.sciencedirect.com/science/article/pii/S0092867420306279)

In [5]:
# Fetch the zipped example dataset from the pyMultiOmics-data repository and unpack it.
# The log output of this cell shows the archive being downloaded, extracted and then
# deleted; later cells read from a local 'covid19_dualomics_data' folder.
# NOTE(review): nothing here checks for an existing copy, so the download presumably
# repeats on every run — confirm against download_file.
url = 'https://github.com/glasgowcompbio/pyMultiOmics-data/raw/main/covid19_dualomics_data.zip'
out_file = download_file(url)  # presumably the local path of the downloaded zip — confirm
extract_zip_file(out_file)

2022-03-24 14:40:02.949 | INFO     | pyMultiOmics.common:download_file:59 - Downloading covid19_dualomics_data.zip
551KB [00:00, 8.89kKB/s]                                                                                                
2022-03-24 14:40:03.061 | INFO     | pyMultiOmics.common:extract_zip_file:71 - Extracting covid19_dualomics_data.zip
100%|█████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 1191.03it/s]
2022-03-24 14:40:03.075 | INFO     | pyMultiOmics.common:extract_zip_file:77 - Deleting covid19_dualomics_data.zip


In [6]:
# Absolute path of the extracted dataset folder, resolved against the current
# working directory; the trailing bare name echoes it as the cell output.
DATA_FOLDER = os.path.abspath('covid19_dualomics_data')
DATA_FOLDER

'/Users/joewandy/Work/git/pyMultiOmics/notebooks/covid19_dualomics_data'

Read proteomics data

In [7]:
# Locate the proteomics measurement and design tables inside the dataset folder.
protein_data_path = os.path.join(DATA_FOLDER, 'protein_data.csv')
protein_design_path = os.path.join(DATA_FOLDER, 'protein_design.csv')

# The measurement table is indexed by its 'Identifier' column, the design table by 'sample'.
protein_df = pd.read_csv(protein_data_path, index_col='Identifier')
protein_design = pd.read_csv(protein_design_path, index_col='sample')

In [8]:
# Preview the first rows of the protein measurements (head() defaults to 5 rows);
# rows are protein identifiers and columns are samples.
protein_df.head()

Unnamed: 0_level_0,h_F1_131N,h_F1_131C,h_F1_132C,h_F2_131N,h_F2_131C,h_F2_132C,h_F3_131N,h_F3_131C,h_F3_132C,h_F4_131N,...,s_F3_128N,s_F3_128C,s_F3_129C,s_F4_128N,s_F4_128C,s_F5_128N,s_F5_128C,s_F6_128N,s_F6_128C,s_F6_133N
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P04114,0.75,0.853,0.822,1.191,1.175,1.078,0.693,0.947,0.931,1.057,...,1.044,1.305,1.657,1.323,1.624,1.17,0.981,0.791,1.029,1.195
P01024,0.782,1.057,0.994,0.864,0.917,0.79,0.823,1.152,0.816,0.92,...,1.1,0.986,1.114,1.21,1.289,1.104,1.111,1.007,1.159,0.979
P02768,1.183,1.101,1.045,1.086,1.041,1.187,1.234,1.079,1.011,1.099,...,0.786,0.706,0.947,0.831,0.717,0.795,0.776,0.938,0.903,0.743
P01023,1.066,1.278,0.959,0.811,0.789,0.931,0.971,0.769,1.011,0.866,...,0.817,0.728,0.861,0.798,0.751,0.917,0.809,0.78,1.195,0.706
P02751,1.085,0.947,0.993,1.343,1.13,0.778,0.731,1.084,1.107,0.909,...,0.566,0.854,1.109,0.63,0.85,0.661,0.848,0.829,0.76,0.811


In [9]:
# Show the protein design table: indexed by sample, with a 'group' label per sample.
protein_design

Unnamed: 0_level_0,group
sample,Unnamed: 1_level_1
h_F1_131N,healthy
h_F1_131C,healthy
h_F1_132C,healthy
h_F2_131N,healthy
h_F2_131C,healthy
...,...
s_F5_128N,severe
s_F5_128C,severe
s_F6_128N,severe
s_F6_128C,severe


Read metabolomics data

In [10]:
# Locate the metabolomics measurement and design tables inside the dataset folder.
compound_data_path = os.path.join(DATA_FOLDER, 'compound_data.csv')
compound_design_path = os.path.join(DATA_FOLDER, 'compound_design.csv')

# The measurement table is indexed by its 'Identifier' column, the design table by 'sample'.
compound_df = pd.read_csv(compound_data_path, index_col='Identifier')
compound_design = pd.read_csv(compound_design_path, index_col='sample')

In [11]:
# Preview the first rows of the compound measurements (head() defaults to 5 rows);
# the output shows that some entries are missing (NaN).
compound_df.head()

Unnamed: 0_level_0,h_jkdz1,h_jkdz2,h_jkdz3,h_jkdz4,h_jkdz5,h_jkdz6,h_jkdz7,h_jkdz8,h_jkdz9,h_jkdz10,...,s_ZX12,s_ZX13,s_ZX14,s_ZX15,s_ZX16,s_ZX17,s_ZX18,s_ZX19,s_ZX20,s_ZX21
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C21482,19413052.0,6381812.0,9748316.0,5326872.0,19980720.0,3580375.0,8256121.0,8079382.0,15596590.0,15203630.0,...,1904349.0,3226016.0,737814.7,2817698.0,3329101.0,3206752.75,1466174.0,2779301.0,2117668.0,2184310.0
C18218,2711915.25,2056393.0,1445594.0,2038765.0,2536996.0,2638198.0,2285757.0,1973140.0,2015425.0,2290842.0,...,1409720.0,1413307.0,3218834.0,1602131.0,1317878.0,2930312.75,1168094.0,2946776.0,1417311.0,1474166.0
C05127,87727.25,,92387.06,,159787.9,,,,90551.3,121411.4,...,,,,,,,,138278.8,,
C01152,58832828.0,58439340.0,55521330.0,45162140.0,54789520.0,39412590.0,29878760.0,67517260.0,46660310.0,91185240.0,...,28813140.0,31643580.0,25387670.0,33076040.0,39156980.0,24400592.0,25933750.0,64138680.0,40205880.0,49044880.0
C02918,,181554.9,224039.2,160939.7,320619.4,717655.7,326818.2,513581.0,273458.2,,...,333724.5,,434715.2,35321.18,,655827.25,835970.6,4034381.0,283935.8,80621.6


In [12]:
# Show the compound design table: indexed by sample, with a 'group' label per sample.
compound_design

Unnamed: 0_level_0,group
sample,Unnamed: 1_level_1
h_jkdz1,healthy
h_jkdz2,healthy
h_jkdz3,healthy
h_jkdz4,healthy
h_jkdz5,healthy
...,...
s_ZX17,severe
s_ZX18,severe
s_ZX19,severe
s_ZX20,severe


In [13]:
# Switch pyMultiOmics logging to INFO level. The call returns a value that would
# otherwise be echoed as the cell output; the trailing semicolon suppresses it.
set_log_level_info();

1

## Create multi-omics container object

In [14]:
# Wrap the protein measurements and their sample design in a SingleOmicsData
# object tagged with the PROTEINS data type.
protein_data = SingleOmicsData(PROTEINS, protein_df, protein_design)
protein_data  # echo the object's one-line summary

proteins data with (791, 70) measurements

In [15]:
# Wrap the compound measurements and their sample design in a SingleOmicsData
# object tagged with the COMPOUNDS data type.
compound_data = SingleOmicsData(COMPOUNDS, compound_df, compound_design)
compound_data  # echo the object's one-line summary

compounds data with (220, 96) measurements

In [16]:
# Provenance metadata for the dataset: the paper's title and its landing page.
# Note: `url` is rebound here — it previously held the dataset download link.
publication = 'Proteomic and Metabolomic Characterization of COVID-19 Patient Sera'
url = 'https://www.cell.com/cell/fulltext/S0092-8674(20)30627-9'

In [17]:
# Create the multi-omics container, annotated with the publication metadata,
# and register both single-omics data objects with it.
mo = MultiOmicsData(publication=publication, url=url)
mo.add_data([protein_data, compound_data])
mo  # echo the container summary (publication, URL and the registered views)

Multi-omics data container
- publication: Proteomic and Metabolomic Characterization of COVID-19 Patient Sera
- URL: https://www.cell.com/cell/fulltext/S0092-8674(20)30627-9
- Views: 2 modalities
	 - proteins data with (791, 70) measurements
	 - compounds data with (220, 96) measurements

## Create a mapping object

The mapping object uses Reactome to map the different biological entities in the data:
- Transcripts (or genes) are connected to the proteins they encode
- Proteins and compounds are connected to reactions they're involved in
- Reactions are connected to pathways

In [18]:
# Create the mapper over the multi-omics container for HOMO_SAPIENS and build the mapping.
# metabolic_pathway_only=False — presumably also keeps non-metabolic pathways; TODO confirm.
# NOTE(review): the import log shows a neo4j driver for bolt://localhost:7687, so build()
# presumably needs a reachable Reactome graph database — confirm.
m = Mapper(mo, HOMO_SAPIENS, metabolic_pathway_only=False)
m.build()  # the cell output shows this call returns a Mapper object

2022-03-24 14:40:03.534 | INFO     | pyMultiOmics.functions:remove_dupes:385 - Removing 9 rows with duplicate identifiers
2022-03-24 14:40:03.535 | INFO     | pyMultiOmics.functions:reactome_mapping:78 - There are 211 observed compound ids
2022-03-24 14:40:03.535 | INFO     | pyMultiOmics.functions:reactome_mapping:81 - Mapping genes -> proteins
2022-03-24 14:40:06.215 | INFO     | pyMultiOmics.functions:reactome_mapping:86 - Mapping proteins -> reactions
2022-03-24 14:40:07.489 | INFO     | pyMultiOmics.functions:reactome_mapping:94 - Mapping compounds -> reactions
2022-03-24 14:40:11.539 | INFO     | pyMultiOmics.functions:reactome_mapping:100 - Mapping reactions -> pathways
2022-03-24 14:40:12.580 | INFO     | pyMultiOmics.functions:reactome_mapping:111 - Mapping reactions -> proteins
2022-03-24 14:40:18.631 | INFO     | pyMultiOmics.functions:reactome_mapping:118 - Mapping reactions -> compounds
2022-03-24 14:40:21.469 | INFO     | pyMultiOmics.functions:reactome_mapping:130 - Mapp

<pyMultiOmics.mapping.Mapper at 0x1762619f0>

In [19]:
# Display the mapper; it renders as the plain default object repr.
m

<pyMultiOmics.mapping.Mapper at 0x1762619f0>

## Query mapping object

Below are some example queries we can perform with the mapping object.

In [20]:
# Summarise every reaction that links at least one observed protein to at
# least one observed compound.
reactions = m.get_nodes(types=REACTIONS)
summary_columns = ['reaction_id', 'reaction_name', 'num_proteins', 'num_compounds']

data = []
for reaction_id, reaction_data in reactions:
    reaction_name = reaction_data['display_name']
    proteins = m.get_connected(reaction_id, dest_type=PROTEINS, observed=True)
    compounds = m.get_connected(reaction_id, dest_type=COMPOUNDS, observed=True)

    # Skip reactions that lack observed entities of either type.
    if len(proteins) == 0 or len(compounds) == 0:
        continue

    data.append([reaction_id, reaction_name, len(proteins), len(compounds)])

# One row per retained reaction, with the counts of its observed neighbours.
df = pd.DataFrame(data, columns=summary_columns)
df

Unnamed: 0,reaction_id,reaction_name,num_proteins,num_compounds
0,R-HSA-114552,Thrombin-activated pars activate g12/13,1,1
1,R-HSA-114558,Thrombin-activated pars activate gq,1,1
2,R-HSA-1214188,Prdm9 trimethylates histone h3,3,1
3,R-HSA-1605591,Glucosylceramidase cleaves the glucosidic bond...,1,1
4,R-HSA-163432,Cholesterol ester + h2o -> cholesterol + fatty...,1,1
...,...,...,...,...
223,R-HSA-9710490,The gsdme gene promoter is hypermethylated,3,1
224,R-HSA-9733545,Bile salts and acids bind alb,1,5
225,R-HSA-9733960,Bile salts and acids dissociate from alb,1,5
226,R-HSA-977071,Sialyltransferase i can add sialic acid to the...,1,1


##### List all entities connected to reaction R-HSA-194153

In [21]:
# Look up everything connected to a single reaction. With no dest_type given,
# the output lists connected genes, proteins and compounds together.
# The `observed` column presumably flags entities present in the loaded data — confirm.
query_id = 'R-HSA-194153'
m.get_connected(query_id)

Unnamed: 0_level_0,display_name,data_type,observed,source_id
entity_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000108846,Abcc3,genes,False,R-HSA-194153
O15438,O15438,proteins,False,R-HSA-194153
28865,Taurocholic acid,compounds,True,R-HSA-194153
30616,Atp(4-),compounds,False,R-HSA-194153
17687,Glycocholic acid,compounds,False,R-HSA-194153
36274,Glycochenodeoxycholic acid,compounds,True,R-HSA-194153
16525,Taurochenodeoxycholic acid,compounds,False,R-HSA-194153
43474,Hydrogenphosphate,compounds,False,R-HSA-194153
456216,Adp(3-),compounds,False,R-HSA-194153
16359,Cholic acid,compounds,True,R-HSA-194153


##### Query the connections between proteins and compounds (through their shared reactions)

In [22]:
# Starting from protein P02768, list only the connected compounds.
# `query_id` is reused from the previous query cell.
query_id = 'P02768'
m.get_connected(query_id, dest_type=COMPOUNDS)

Unnamed: 0_level_0,display_name,data_type,observed,source_id
entity_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
18421,Superoxide,compounds,False,P02768
17033,Biliverdin,compounds,True,P02768
16990,Bilirubin ixalpha,compounds,True,P02768
17627,Ferroheme b,compounds,False,P02768
36144,Ferriheme b,compounds,False,P02768
456216,Adp(3-),compounds,False,P02768
30616,Atp(4-),compounds,False,P02768
36274,Glycochenodeoxycholic acid,compounds,True,P02768
16755,Chenodeoxycholic acid,compounds,True,P02768
16359,Cholic acid,compounds,True,P02768


In [23]:
# Reverse direction: starting from compound 16990 (shown as 'Bilirubin ixalpha'
# in the previous output), list only the connected proteins.
query_id = '16990'
m.get_connected(query_id, dest_type=PROTEINS)

Unnamed: 0_level_0,display_name,data_type,observed,source_id
entity_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Q9NPD5,Q9NPD5,proteins,False,16990
P33527,P33527,proteins,False,16990
Q9BYK8,Q9BYK8,proteins,False,16990
Q96RS0,Q96RS0,proteins,False,16990
Q92793,Q92793,proteins,False,16990
...,...,...,...,...
P08047,P08047,proteins,False,16990
P30043,P30043,proteins,True,16990
P53004,P53004,proteins,False,16990
O75182,O75182,proteins,False,16990
