In [1]:
# Reload imported modules automatically before each cell executes, so edits to
# module source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import os, sys

In [3]:
import pandas as pd

In [4]:
# Add the parent of the working directory to the import path so pyMultiOmics
# can be imported from a source checkout.
# NOTE(review): append() puts '..' at the END of sys.path, so an installed
# pyMultiOmics would be found first - confirm that is the intended precedence.
sys.path.append('..')

from pyMultiOmics.base import SingleOmicsData, MultiOmicsData
from pyMultiOmics.constants import GENES, PROTEINS, COMPOUNDS, DANIO_RERIO, REACTIONS, PATHWAYS
from pyMultiOmics.mapping import Mapper
from pyMultiOmics.common import set_log_level_info, set_log_level_debug, download_file, extract_zip_file

2022-03-24 14:40:16.459 | INFO     | pyMultiOmics.reactome:get_neo4j_driver:24 - Created graph database driver for bolt://localhost:7687 (neo4j)


# Demonstration of pyMultiOmics mapping

## Load the processed Zebrafish data from [1]

[1] [Rabinowitz, Jeremy S., et al. "Transcriptomic, proteomic, and metabolomic landscape of positional memory in the caudal fin of zebrafish." Proceedings of the National Academy of Sciences 114.5 (2017): E717-E726.](https://www.pnas.org/content/114/5/E717.short)

In [5]:
# Fetch and unpack the zebrafish example data set.
# The archive unpacks into a 'zebrafish_data' folder in the working directory
# (the folder the following cells read from). Skip the download when that
# folder already exists, so Restart & Run All does not hit the network on
# every run and also works offline. Delete the folder to force a fresh copy.
url = 'https://github.com/glasgowcompbio/pyMultiOmics-data/raw/main/zebrafish_data.zip'
if not os.path.isdir('zebrafish_data'):
    out_file = download_file(url)
    extract_zip_file(out_file)

2022-03-24 14:40:25.643 | INFO     | pyMultiOmics.common:download_file:59 - Downloading zebrafish_data.zip
1.75kKB [00:00, 22.0kKB/s]                                                                                              
2022-03-24 14:40:25.771 | INFO     | pyMultiOmics.common:extract_zip_file:71 - Extracting zebrafish_data.zip
100%|██████████████████████████████████████████████████████████████████████████████████| 17/17 [00:00<00:00, 349.96it/s]
2022-03-24 14:40:25.823 | INFO     | pyMultiOmics.common:extract_zip_file:77 - Deleting zebrafish_data.zip


In [6]:
# Absolute path of the extracted data folder, resolved against the current
# working directory. The trailing expression shows it as the cell output.
DATA_FOLDER = os.path.abspath('zebrafish_data')
DATA_FOLDER

'/Users/joewandy/Work/git/pyMultiOmics/notebooks/zebrafish_data'

Read transcriptomics data (identified by their gene ids)

In [7]:
# Load the gene-level table (indexed by its 'Identifier' column) and the
# matching design table (indexed by its 'sample' column).
gene_data = pd.read_csv(os.path.join(DATA_FOLDER, 'gene_data_combined.csv'), index_col='Identifier')
gene_design = pd.read_csv(os.path.join(DATA_FOLDER, 'gene_design.csv'), index_col='sample')

In [8]:
gene_data.head()

Unnamed: 0_level_0,US-1584693,US-1584700,US-1584706,US-1584712,US-1584722,US-1584724,US-1584725,US-1584732,US-1584738,US-1584744,...,US-1584753,US-1584754,US-1584758,US-1584765,FC_distal_vs_proximal,padj_distal_vs_proximal,FC_distal_vs_middle,padj_distal_vs_middle,FC_middle_vs_proximal,padj_middle_vs_proximal
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSDARG00000000001,51,40,69,78,89,47,88,86,110,55,...,58,104,43,72,0.869331,8e-06,0.748943,4.38e-05,0.114026,0.630834
ENSDARG00000000002,283,129,164,269,211,171,146,256,283,150,...,142,272,260,256,0.287823,0.031298,1.005337,1.31e-13,-0.724987,1e-06
ENSDARG00000000018,545,503,547,387,332,559,623,499,436,488,...,462,287,495,299,-0.437271,0.000389,-0.40477,0.00068684,-0.040193,0.760679
ENSDARG00000000019,437,469,538,557,550,404,544,443,623,502,...,470,460,329,480,0.521291,1.5e-05,0.271082,0.01936266,0.242435,0.041606
ENSDARG00000000068,266,249,247,236,195,247,283,259,299,232,...,231,236,274,241,0.06482,0.595522,0.142243,0.2579239,-0.084764,0.528336


In [9]:
gene_design

Unnamed: 0_level_0,group
sample,Unnamed: 1_level_1
US-1584752,Proximal
US-1584732,Proximal
US-1584724,Proximal
US-1584693,Proximal
US-1584758,Proximal
US-1584725,Middle
US-1584706,Middle
US-1584700,Middle
US-1584744,Middle
US-1584753,Middle


Read proteomics data

In [10]:
# Load the protein-level table (indexed by its 'Uniprot' column) and the
# matching design table (indexed by its 'sample' column).
protein_data = pd.read_csv(os.path.join(DATA_FOLDER, 'protein_data.csv'), index_col='Uniprot')
protein_design = pd.read_csv(os.path.join(DATA_FOLDER, 'protein_design.csv'), index_col='sample')

In [11]:
protein_data.head()

Unnamed: 0_level_0,Distal#3_01,Distal#3_02,Distal#3_03,Distal#3_04,Middle#3_01,Middle#3_02,Middle#3_03,Middle#3_04,Proximal#3_01,Proximal#3_02,Proximal#3_03,Proximal#3_04
Uniprot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
A0A0A0MPL4,995526.4,4946580.0,1377194.0,2208140.0,2907807.0,4231976.0,1560849.0,2852904.0,1781795.086,2668135.0,3079148.0,2840473.0
A0A0B4J1A5,2982519.0,8816655.0,7668431.0,4632309.0,7672153.0,7776017.0,6633781.0,8242783.0,5475654.544,5703832.0,8294364.0,13348740.0
A0A0B4J1A7,15530490.0,1037155.0,18561370.0,17678590.0,13757360.0,17479980.0,15175070.0,23944650.0,3157387.719,17947750.0,23004300.0,20638000.0
A0AUQ3,2012699.0,3088982.0,2455865.0,944833.1,2866780.0,2661669.0,2100352.0,2133662.0,1738244.989,2629396.0,2900560.0,2416018.0
A0AUR9,3640487.0,25884770.0,34159890.0,2868569.0,1971142.0,2472776.0,5615177.0,1303356.0,3263299.566,6866769.0,2465929.0,4515643.0


In [12]:
protein_design

Unnamed: 0_level_0,group
sample,Unnamed: 1_level_1
Distal#3_01,Distal
Distal#3_02,Distal
Distal#3_03,Distal
Distal#3_04,Distal
Middle#3_01,Middle
Middle#3_02,Middle
Middle#3_03,Middle
Middle#3_04,Middle
Proximal#3_01,Proximal
Proximal#3_02,Proximal


Read metabolomics data

In [13]:
# Load the compound-level table (indexed by its 'Identifier' column) and the
# matching design table (indexed by its 'sample' column).
compound_data = pd.read_csv(os.path.join(DATA_FOLDER, 'compound_data_kegg.csv'), index_col='Identifier')
compound_design = pd.read_csv(os.path.join(DATA_FOLDER, 'compound_design.csv'), index_col='sample')

In [14]:
compound_data.head()

Unnamed: 0_level_0,distal_M1,distal_M2,distal_M3,distal_F1,distal_F2,distal_F3,middle_M1,middle_M2,middle_M3,middle_F1,middle_F2,middle_F3,proximal_M1,proximal_M2,proximal_M3,proximal_F1,proximal_F2,proximal_F3
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
C00565,75170.0,57052,39170.0,84057,38608.0,64126.0,50214.0,75680,165178,121856,77061,98015.0,113765,96098,84198,117644,169459,169669
C00037,64511.0,33658,23565.0,52102,49508.0,37498.0,30417.0,55728,88519,103871,45974,73101.0,72725,66008,54220,95341,110192,291598
C01104,5787534.0,4351239,4401036.0,8187282,8431125.0,5082056.0,5138937.0,7341351,7837293,9256269,9934066,10243285.0,7344406,5524811,4809250,9279874,9047339,9211255
C00134,3430897.0,1877785,1225710.0,2326620,2421267.0,2595529.0,2003627.0,2120053,2269318,3220850,4596854,3155377.0,3760854,2658833,2488025,2506550,4000703,3292566
C00213,112845.0,129977,122292.0,63219,50113.0,100343.0,156651.0,176682,379322,160906,56802,107161.0,235982,181200,142994,116132,94589,167280


In [15]:
compound_design

Unnamed: 0_level_0,group
sample,Unnamed: 1_level_1
distal_M1,Distal
distal_M2,Distal
distal_M3,Distal
distal_F1,Distal
distal_F2,Distal
distal_F3,Distal
middle_M1,Middle
middle_M2,Middle
middle_M3,Middle
middle_F1,Middle


In [16]:
# Set pyMultiOmics logging to INFO level (going by the helper's name; its
# implementation is not visible in this notebook).
set_log_level_info()

1

## Create omics data

In [17]:
# Wrap the gene table and its design table in a SingleOmicsData tagged as
# GENES; the trailing expression displays the object's repr.
transcript_data = SingleOmicsData(GENES, gene_data, gene_design)
transcript_data

genes data with (31953, 15) measurements

In [18]:
# Wrap the protein table and its design table in a SingleOmicsData tagged as
# PROTEINS; the trailing expression displays the object's repr.
# NOTE(review): this rebinds `protein_data` from the raw DataFrame to the
# SingleOmicsData wrapper, so the cell is not idempotent - running it a second
# time passes the wrapper (not the DataFrame) back into SingleOmicsData.
# Re-run the cell that loads the protein CSV first if this needs repeating.
protein_data = SingleOmicsData(PROTEINS, protein_data, protein_design)
protein_data

proteins data with (3061, 12) measurements

In [19]:
# Wrap the compound table and its design table in a SingleOmicsData tagged as
# COMPOUNDS; the trailing expression displays the object's repr.
# NOTE(review): this rebinds `compound_data` from the raw DataFrame to the
# SingleOmicsData wrapper, so the cell is not idempotent - running it a second
# time passes the wrapper (not the DataFrame) back into SingleOmicsData.
# Re-run the cell that loads the compound CSV first if this needs repeating.
compound_data = SingleOmicsData(COMPOUNDS, compound_data, compound_design)
compound_data

compounds data with (130, 18) measurements

In [20]:
# Citation text and link for the source study, attached to the MultiOmicsData
# container created in the next cell.
# NOTE(review): `url` was earlier bound to the data-download URL; this
# assignment replaces that value.
publication = 'Rabinowitz, Jeremy S., et al. "Transcriptomic, proteomic, and metabolomic landscape of positional memory in the caudal fin of zebrafish." Proceedings of the National Academy of Sciences 114.5 (2017): E717-E726.'
url = 'https://www.pnas.org/content/114/5/E717.short'

In [21]:
# Create the multi-omics container with the publication metadata, register the
# three single-omics objects with it, and display the container's repr.
mo = MultiOmicsData(publication=publication, url=url)
mo.add_data([transcript_data, protein_data, compound_data])
mo

Multi-omics data container
- publication: Rabinowitz, Jeremy S., et al. "Transcriptomic, proteomic, and metabolomic landscape of positional memory in the caudal fin of zebrafish." Proceedings of the National Academy of Sciences 114.5 (2017): E717-E726.
- URL: https://www.pnas.org/content/114/5/E717.short
- Views: 3 modalities
	 - genes data with (31953, 15) measurements
	 - proteins data with (3061, 12) measurements
	 - compounds data with (130, 18) measurements

## Create a mapping object

The mapping object uses Reactome to map the different biological entities in the data:
- Transcripts (or genes) are connected to the proteins they encode
- Proteins and compounds are connected to reactions they're involved in
- Reactions are connected to pathways

In [22]:
# Create the mapper over the multi-omics container for DANIO_RERIO with
# metabolic_pathway_only=True, then build the mapping.
# NOTE(review): the import log reports a neo4j driver for
# bolt://localhost:7687, so build() presumably needs a local Reactome graph
# database to be running - confirm before running on a fresh machine.
m = Mapper(mo, DANIO_RERIO, metabolic_pathway_only=True)
m.build()

2022-03-24 14:40:26.569 | INFO     | pyMultiOmics.functions:remove_dupes:385 - Removing 2 rows with duplicate identifiers
2022-03-24 14:40:26.570 | INFO     | pyMultiOmics.functions:reactome_mapping:78 - There are 128 observed compound ids
2022-03-24 14:40:26.571 | INFO     | pyMultiOmics.functions:reactome_mapping:81 - Mapping genes -> proteins
2022-03-24 14:40:34.971 | INFO     | pyMultiOmics.functions:reactome_mapping:86 - Mapping proteins -> reactions
2022-03-24 14:40:46.041 | INFO     | pyMultiOmics.functions:reactome_mapping:94 - Mapping compounds -> reactions
2022-03-24 14:40:49.516 | INFO     | pyMultiOmics.functions:reactome_mapping:100 - Mapping reactions -> pathways
2022-03-24 14:40:50.558 | INFO     | pyMultiOmics.functions:reactome_mapping:111 - Mapping reactions -> proteins
2022-03-24 14:40:57.492 | INFO     | pyMultiOmics.functions:reactome_mapping:118 - Mapping reactions -> compounds
2022-03-24 14:41:01.232 | INFO     | pyMultiOmics.functions:reactome_mapping:130 - Mapp

<pyMultiOmics.mapping.Mapper at 0x174b0a440>

In [23]:
m

<pyMultiOmics.mapping.Mapper at 0x174b0a440>

## Querying mapping object

Below are some example queries we can perform with the mapping object.

##### Find reactions that are connected to at least one observed gene, protein and compound in the data

In [24]:
# Summarise every reaction that has at least one observed gene, protein and
# compound connected to it, recording how many of each kind it has.
reactions = m.get_nodes(types=REACTIONS)

data = []
for reaction_id, reaction_data in reactions:
    reaction_name = reaction_data['display_name']

    # Number of observed neighbours per entity type, in output-column order.
    counts = [
        len(m.get_connected(reaction_id, dest_type=entity_type, observed=True))
        for entity_type in (GENES, PROTEINS, COMPOUNDS)
    ]

    # Skip reactions that lack any one of the three kinds of observed entity.
    if min(counts) == 0:
        continue
    data.append([reaction_id, reaction_name, *counts])

df = pd.DataFrame(
    data,
    columns=['reaction_id', 'reaction_name', 'num_genes', 'num_proteins', 'num_compounds'],
)
df

Unnamed: 0,reaction_id,reaction_name,num_genes,num_proteins,num_compounds
0,R-DRE-109278,"Nt5e:zn2+ hydrolyses amp,damp,gmp, imp",1,1,3
1,R-DRE-109291,"Cmp or tmp or ump + h2o => cytidine, thymidine...",1,1,2
2,R-DRE-109415,Amp + h2o => adenosine + orthophosphate [nt5c1b],1,1,1
3,R-DRE-109624,(2-deoxy)adenosine + atp => (d)amp + adp (adk),1,1,1
4,R-DRE-1237160,Mta is cleaved and phosphorylated,1,1,1
5,R-DRE-1247910,Cndp2:2mn2+ dimer hydrolyses cysgly,1,1,1
6,R-DRE-139970,Fmo3:fad n-oxidises tma to tmao,3,1,2
7,R-DRE-1482976,Cdp-dag is converted to pi by cdipt,1,1,1
8,R-DRE-1614583,Pxlp-k212-cth cleaves l-cystathionine,1,1,1
9,R-DRE-174401,Ahcy:nad+ tetramer hydrolyses adohcy,1,1,1


##### Find everything connected to protein 'F1QAA7'

In [25]:
# No dest_type or observed filter is passed, so the result is not restricted
# to one entity type.
query_id = 'F1QAA7'
m.get_connected(query_id)

Unnamed: 0_level_0,display_name,data_type,observed,source_id
entity_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSDARG00000037781,Acss2,genes,True,F1QAA7
456215,Adenosine 5-monophosphate(2-),compounds,False,F1QAA7
33019,Diphosphate(3-),compounds,False,F1QAA7
57288,Acetyl-coa(4-),compounds,False,F1QAA7
57287,Coenzyme a(4-),compounds,False,F1QAA7
30616,Atp(4-),compounds,False,F1QAA7
15366,Acetic acid,compounds,False,F1QAA7
R-DRE-71735,Acetate + coa + atp => acetyl-coa + amp + pyro...,reactions,,F1QAA7
R-DRE-71384,Ethanol oxidation,pathways,,F1QAA7


##### Find all compounds (observed in the data or not) connected to protein 'F1QAA7'

In [26]:
# dest_type limits the result to compounds. No observed filter is passed, so
# compounds are returned whether or not they were measured in the data.
query_id = 'F1QAA7'
m.get_connected(query_id, dest_type=COMPOUNDS)

Unnamed: 0_level_0,display_name,data_type,observed,source_id
entity_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
456215,Adenosine 5-monophosphate(2-),compounds,False,F1QAA7
33019,Diphosphate(3-),compounds,False,F1QAA7
57288,Acetyl-coa(4-),compounds,False,F1QAA7
57287,Coenzyme a(4-),compounds,False,F1QAA7
30616,Atp(4-),compounds,False,F1QAA7
15366,Acetic acid,compounds,False,F1QAA7


##### Find observed genes and proteins connected to compound '33019'

In [27]:
# Request both genes and proteins connected to the compound, keeping only
# entries flagged as observed.
# NOTE(review): the result is bound to `genes`, but it holds proteins as well
# because dest_type lists both GENES and PROTEINS.
query_id = '33019'
genes = m.get_connected(query_id, dest_type=[GENES, PROTEINS], observed=True)
genes

Unnamed: 0_level_0,display_name,data_type,observed,source_id
entity_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSDARG00000058162,Pcyt1ba,genes,True,33019
ENSDARG00000011233,Pcyt1aa,genes,True,33019
ENSDARG00000004517,Ppat,genes,True,33019
ENSDARG00000039934,Hlcs,genes,True,33019
ENSDARG00000061994,Acacb,genes,True,33019
...,...,...,...,...
B0V0X1,B0V0X1,proteins,True,33019
B8JLW8,B8JLW8,proteins,True,33019
F1QYS7,F1QYS7,proteins,True,33019
B0S5C4,B0S5C4,proteins,True,33019


##### Find pathways connected to gene 'ENSDARG00000087927'

In [28]:
# dest_type limits the result to pathways connected to this gene id.
query_id = 'ENSDARG00000087927'
m.get_connected(query_id, dest_type=PATHWAYS)

Unnamed: 0_level_0,display_name,data_type,observed,source_id
entity_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
R-DRE-2393930,Phosphate bond hydrolysis by nudt proteins,pathways,,ENSDARG00000087927


##### Find genes, proteins, compounds and pathways connected to reaction 'R-DRE-2395818'

In [29]:
# No dest_type or observed filter is passed, so the result is not restricted
# to one entity type.
m.get_connected('R-DRE-2395818')

Unnamed: 0_level_0,display_name,data_type,observed,source_id
entity_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSDARG00000030573,Nudt1,genes,True,R-DRE-2395818
Q7ZWC3,Q7ZWC3,proteins,False,R-DRE-2395818
15377,Water,compounds,False,R-DRE-2395818
18420,Magnesium(2+),compounds,False,R-DRE-2395818
63212,2-hydroxy-damp(2-),compounds,False,R-DRE-2395818
77897,2-hydroxy-datp(4-),compounds,False,R-DRE-2395818
15378,Hydron,compounds,False,R-DRE-2395818
33019,Diphosphate(3-),compounds,False,R-DRE-2395818
R-DRE-2393930,Phosphate bond hydrolysis by nudt proteins,pathways,,R-DRE-2395818


##### Find observed genes, proteins and compounds involved in the pathway 'R-DRE-2393930'

In [30]:
# Request genes, proteins and compounds connected to the pathway, keeping only
# entries flagged as observed. REACTIONS is not in dest_type, so reactions are
# not requested here.
m.get_connected('R-DRE-2393930', dest_type=[GENES, PROTEINS, COMPOUNDS], observed=True)

Unnamed: 0_level_0,display_name,data_type,observed,source_id
entity_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSDARG00000030573,Nudt1,genes,True,R-DRE-2393930
ENSDARG00000019503,Zgc:103759,genes,True,R-DRE-2393930
ENSDARG00000078073,Nudt5,genes,True,R-DRE-2393930
ENSDARG00000087927,Nudt9,genes,True,R-DRE-2393930
ENSDARG00000041576,Nudt18,genes,True,R-DRE-2393930
ENSDARG00000026090,Adprm,genes,True,R-DRE-2393930
Q6IQ66,Q6IQ66,proteins,True,R-DRE-2393930
F1QL34,F1QL34,proteins,True,R-DRE-2393930
