In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
import os, sys

In [3]:
import pylab as plt
import matplotlib

import numpy as np
import pandas as pd

In [4]:
# Show where the notebook is running and which directories Python searches for
# modules. The working directory must be printed explicitly: only the last
# bare expression of a cell is displayed, so a bare call here would be discarded.
print(os.getcwd())
print(sys.path)

['/Users/joewandy/Work/git/pyMultiOmics/notebooks', '/opt/anaconda3/envs/pyMultiOmics/lib/python310.zip', '/opt/anaconda3/envs/pyMultiOmics/lib/python3.10', '/opt/anaconda3/envs/pyMultiOmics/lib/python3.10/lib-dynload', '', '/opt/anaconda3/envs/pyMultiOmics/lib/python3.10/site-packages']


In [5]:
# Put the parent directory on the import path so the pyMultiOmics imports
# below can be resolved when the notebook is run from a sub-folder of the
# repository (presumably the checkout's notebooks/ folder — verify locally).
# Note that running this cell also logs the creation of a neo4j graph
# database driver by pyMultiOmics.reactome (see the cell output).
sys.path.append('..')

from pyMultiOmics.base import SingleOmicsData, MultiOmicsData
from pyMultiOmics.mapping import Mapper
from pyMultiOmics.common import set_log_level_info, set_log_level_debug, download_file, extract_zip_file
from pyMultiOmics.constants import IDENTIFIER_COL, SAMPLE_COL
from pyMultiOmics.constants import GENES, PROTEINS, COMPOUNDS

2022-03-24 14:39:13.236 | INFO     | pyMultiOmics.reactome:get_neo4j_driver:24 - Created graph database driver for bolt://localhost:7687 (neo4j)


# Demonstration of pyMultiOmics base classes

### Load the multi-omics COVID-19 data from [1]

[1] [Overmyer, Katherine A., et al. "Large-scale multi-omic analysis of COVID-19 severity." Cell systems 12.1 (2021): 23-40.](https://www.sciencedirect.com/science/article/pii/S2405471220303719)

In [6]:
# Location of the zipped COVID-19 multi-omics example data.
url = 'https://github.com/glasgowcompbio/pyMultiOmics-data/raw/main/covid19_multiomics_data.zip'
# The value returned by download_file is handed straight to extract_zip_file.
# According to the log output, the archive is downloaded, extracted and then
# deleted. There is no guard here, so this happens on every run of the cell.
out_file = download_file(url)
extract_zip_file(out_file)

2022-03-24 14:39:14.454 | INFO     | pyMultiOmics.common:download_file:59 - Downloading covid19_multiomics_data.zip
2.36kKB [00:00, 15.6kKB/s]                                                                                              
2022-03-24 14:39:14.634 | INFO     | pyMultiOmics.common:extract_zip_file:71 - Extracting covid19_multiomics_data.zip
100%|██████████████████████████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 256.77it/s]
2022-03-24 14:39:14.692 | INFO     | pyMultiOmics.common:extract_zip_file:77 - Deleting covid19_multiomics_data.zip


In [7]:
# Absolute path of the folder holding the extracted example data, resolved
# against the current working directory. (os.path.join with a single
# argument returns it unchanged, so abspath is applied directly.)
DATA_FOLDER = os.path.abspath('covid19_multiomics_data')
DATA_FOLDER

'/Users/joewandy/Work/git/pyMultiOmics/notebooks/covid19_multiomics_data'

### Read the individual dataframes

In [8]:
# Transcript measurements, indexed by the 'Identifier' column, and the
# matching sample design table, indexed by the 'sample' column.
# NOTE(review): IDENTIFIER_COL and SAMPLE_COL are imported from
# pyMultiOmics.constants but the column names are hard-coded here —
# confirm they agree, or use the constants.
transcript_df = pd.read_csv(os.path.join(DATA_FOLDER, 'covid_genes_include_p.csv'), index_col='Identifier')
transcript_design = pd.read_csv(os.path.join(DATA_FOLDER, 'covid_genes_design.csv'), index_col='sample')

In [9]:
# Protein measurements, indexed by the 'Identifier' column, and the
# matching sample design table, indexed by the 'sample' column.
protein_df = pd.read_csv(os.path.join(DATA_FOLDER, 'covid_protein_include_p.csv'), index_col='Identifier')
protein_design = pd.read_csv(os.path.join(DATA_FOLDER, 'covid_protein_design.csv'), index_col='sample')

In [10]:
# Compound measurements, indexed by the 'Identifier' column, and the
# matching sample design table, indexed by the 'sample' column.
compound_df = pd.read_csv(os.path.join(DATA_FOLDER, 'covid_compounds_include_p.csv'), index_col='Identifier')
compound_design = pd.read_csv(os.path.join(DATA_FOLDER, 'covid_compounds_design.csv'), index_col='sample')

### Create single omics data container objects

Some data cleaning is done upon loading in `SingleOmicsData`:
- Duplicate values are removed from the rows and columns
- Duplicate sample names are removed
- Measurements with missing metadata are removed
- Metadata with missing measurements are removed too

In [11]:
# Wrap the transcript measurements and their design table in a container
# labelled with the GENES constant; the bare name then displays its summary.
transcript_data = SingleOmicsData(GENES, transcript_df, transcript_design)
transcript_data



genes data with (13028, 125) measurements

In [12]:
# Wrap the protein measurements and their design table in a container
# labelled with the PROTEINS constant; the bare name then displays its summary.
protein_data = SingleOmicsData(PROTEINS, protein_df, protein_design)
protein_data

proteins data with (1499, 129) measurements

In [13]:
# Wrap the compound measurements and their design table in a container
# labelled with the COMPOUNDS constant; the bare name then displays its summary.
compound_data = SingleOmicsData(COMPOUNDS, compound_df, compound_design)
compound_data

compounds data with (46, 129) measurements

### Getting values

You can get data out of the container by using the `data_df` and `design_df` attributes.

Notice that after the data is loaded and cleaned, the number of samples in the measurement dataframe (`data_df`) is the same as the number of rows in the sample metadata dataframe (`design_df`).

In [14]:
# Display the measurement table held by the protein container
# (identifiers as rows, samples as columns in the output below).
protein_data.data_df

Unnamed: 0_level_0,sample_1,sample_2,sample_3,sample_4,sample_5,sample_6,sample_7,sample_8,sample_9,sample_10,...,sample_120,sample_121,sample_122,sample_123,sample_124,sample_125,sample_126,sample_127,sample_128,sample_129
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A0A024R6I7,37.995543,37.353091,37.527875,37.673128,37.983542,37.489959,37.615303,37.534702,37.525762,37.844902,...,37.730180,38.082377,37.661959,37.195828,37.365447,37.564109,37.683884,37.282698,37.655909,37.754401
A0A0G2JRN3,37.995543,37.353091,37.527875,37.673128,37.983542,37.489959,37.615303,37.534702,37.525762,37.844902,...,37.730180,38.082377,37.661959,37.195828,37.365447,37.564109,37.683884,37.282698,37.655909,37.754401
A0A075B6H9,27.176361,29.419228,27.174171,28.879702,27.262485,29.535232,27.657446,30.323779,28.194885,29.253091,...,27.731627,27.621776,27.249976,27.403813,27.133006,27.793462,26.876193,27.715091,26.965212,28.376997
A0A075B6I0,28.294477,29.048510,28.953215,29.527460,28.605867,29.251895,26.943355,29.914134,28.947290,29.108465,...,28.977874,27.961132,29.085045,28.319069,29.418182,29.003245,28.002763,28.660543,28.665092,29.839541
A0A075B6I4,24.783368,27.573331,25.619161,23.893562,18.440251,25.033513,19.384221,22.054705,27.642444,27.163654,...,27.365780,26.182814,27.649039,25.888229,26.724164,26.577443,27.996742,21.713502,19.659693,20.145198
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
V9GYE3,30.157740,29.011826,30.755732,31.037739,30.690791,29.544621,29.991859,29.005067,28.611893,29.039570,...,30.307260,30.378135,29.925498,30.471018,31.777999,31.486205,31.822441,31.144855,32.149526,28.001153
V9GYG9,30.157740,29.011826,30.755732,31.037739,30.690791,29.544621,29.991859,29.005067,28.611893,29.039570,...,30.307260,30.378135,29.925498,30.471018,31.777999,31.486205,31.822441,31.144855,32.149526,28.001153
X6R8F3,22.821315,20.147061,21.002496,19.830364,18.213300,22.831885,21.686202,23.468367,20.292526,24.971321,...,20.178151,21.893118,24.072429,18.115715,20.282632,21.229028,23.293898,20.400931,18.845678,26.863133
P80188,22.821315,20.147061,21.002496,19.830364,18.213300,22.831885,21.686202,23.468367,20.292526,24.971321,...,20.178151,21.893118,24.072429,18.115715,20.282632,21.229028,23.293898,20.400931,18.845678,26.863133


In [15]:
# Display the sample metadata table held by the protein container
# (one row per sample with its group label in the output below).
protein_data.design_df

Unnamed: 0_level_0,group
sample,Unnamed: 1_level_1
sample_1,covid
sample_2,covid
sample_3,covid
sample_4,covid
sample_5,covid
...,...
sample_125,non_covid
sample_126,non_covid
sample_127,non_covid
sample_128,non_covid


### Create a multi-omics data container object

In [16]:
# Citation and link for the study the data come from; both are passed to
# MultiOmicsData when the multi-omics container is created.
publication = 'Overmyer, Katherine A., et al. "Large-scale multi-omic analysis of COVID-19 severity." Cell systems 12.1 (2021): 23-40.'
# NOTE: this rebinds `url`, which earlier in the notebook held the data
# download link; from here on it refers to the publication page.
url = 'https://www.sciencedirect.com/science/article/pii/S2405471220303719'

In [17]:
# Create the multi-omics container with the publication metadata, add the
# three single-omics containers to it, and display its summary.
mo = MultiOmicsData(publication=publication, url=url)
mo.add_data([transcript_data, protein_data, compound_data])
mo

Multi-omics data container
- publication: Overmyer, Katherine A., et al. "Large-scale multi-omic analysis of COVID-19 severity." Cell systems 12.1 (2021): 23-40.
- URL: https://www.sciencedirect.com/science/article/pii/S2405471220303719
- Views: 3 modalities
	 - genes data with (13028, 125) measurements
	 - proteins data with (1499, 129) measurements
	 - compounds data with (46, 129) measurements