|
a |
|
b/scripts/analysis_zebrafish.py |
|
|
1 |
import os, sys |
|
|
2 |
|
|
|
3 |
import pandas as pd |
|
|
4 |
from loguru import logger |
|
|
5 |
|
|
|
6 |
from pyMultiOmics.base import SingleOmicsData, MultiOmicsData |
|
|
7 |
|
|
|
8 |
sys.path.append('..') |
|
|
9 |
|
|
|
10 |
from pyMultiOmics.constants import * |
|
|
11 |
from pyMultiOmics.mapping import Mapper |
|
|
12 |
from pyMultiOmics.common import set_log_level_info |
|
|
13 |
from pyMultiOmics.analysis import * |
|
|
14 |
from pyMultiOmics.query import * |
|
|
15 |
from pyMultiOmics.pipelines import * |
|
|
16 |
|
|
|
17 |
|
|
|
18 |
DATA_FOLDER = os.path.abspath(os.path.join('..', 'notebooks', 'zebrafish_data')) |
|
|
19 |
gene_data = pd.read_csv(os.path.join(DATA_FOLDER, 'gene_data_combined.csv'), index_col='Identifier') |
|
|
20 |
gene_design = pd.read_csv(os.path.join(DATA_FOLDER, 'gene_design.csv'), index_col='sample') |
|
|
21 |
protein_data = pd.read_csv(os.path.join(DATA_FOLDER, 'protein_data.csv'), index_col='Uniprot') |
|
|
22 |
protein_design = pd.read_csv(os.path.join(DATA_FOLDER, 'protein_design.csv'), index_col='sample') |
|
|
23 |
compound_data = pd.read_csv(os.path.join(DATA_FOLDER, 'compound_data_kegg.csv'), index_col='Identifier') |
|
|
24 |
compound_design = pd.read_csv(os.path.join(DATA_FOLDER, 'compound_design.csv'), index_col='sample') |
|
|
25 |
|
|
|
26 |
set_log_level_info() |
|
|
27 |
|
|
|
28 |
gene_so = SingleOmicsData(GENES, gene_data, gene_design) |
|
|
29 |
prot_so = SingleOmicsData(GENES, protein_data, protein_design) |
|
|
30 |
comp_so = SingleOmicsData(GENES, compound_data, compound_design) |
|
|
31 |
|
|
|
32 |
mo = MultiOmicsData() |
|
|
33 |
mo.add_data([gene_so, prot_so, comp_so]) |
|
|
34 |
|
|
|
35 |
m = Mapper(mo, DANIO_RERIO, metabolic_pathway_only=True) \ |
|
|
36 |
.build() |
|
|
37 |
|
|
|
38 |
ap = AnalysisPipeline(mo, m) |
|
|
39 |
|
|
|
40 |
# method = INFERENCE_DESEQ |
|
|
41 |
method = INFERENCE_T_TEST |
|
|
42 |
ap.run_de(method, GENES, 'Distal', 'Proximal') |
|
|
43 |
ap.run_de(method, GENES, 'Distal', 'Middle') |
|
|
44 |
ap.run_de(method, GENES, 'Proximal', 'Middle') |
|
|
45 |
|
|
|
46 |
method = INFERENCE_T_TEST |
|
|
47 |
ap.run_de(method, PROTEINS, 'Distal', 'Proximal') |
|
|
48 |
ap.run_de(method, PROTEINS, 'Distal', 'Middle') |
|
|
49 |
ap.run_de(method, PROTEINS, 'Proximal', 'Middle') |
|
|
50 |
ap.run_de(method, COMPOUNDS, 'Distal', 'Proximal') |
|
|
51 |
ap.run_de(method, COMPOUNDS, 'Distal', 'Middle') |
|
|
52 |
ap.run_de(method, COMPOUNDS, 'Proximal', 'Middle') |
|
|
53 |
|
|
|
54 |
# Retrieve a single node |
|
|
55 |
node_id = '15366' |
|
|
56 |
res = QueryBuilder(ap) \ |
|
|
57 |
.add(Entity(node_id)) \ |
|
|
58 |
.run() |
|
|
59 |
logger.info(res) |
|
|
60 |
|
|
|
61 |
# Retrieve multiple nodes |
|
|
62 |
node_id = ['15366', 'ENSDARG00000037781', 'F1QAA7'] |
|
|
63 |
res = QueryBuilder(ap) \ |
|
|
64 |
.add(Entity(node_id)) \ |
|
|
65 |
.run() |
|
|
66 |
logger.info(res) |
|
|
67 |
|
|
|
68 |
# Retrieve nodes connected to a single node |
|
|
69 |
query_id = 'F1QAA7' |
|
|
70 |
res = QueryBuilder(ap) \ |
|
|
71 |
.add(Entity(query_id)) \ |
|
|
72 |
.add(Connected()) \ |
|
|
73 |
.run() |
|
|
74 |
logger.info(res) |
|
|
75 |
|
|
|
76 |
# Retrieve top-10 significantly changing genes |
|
|
77 |
case = 'Distal' |
|
|
78 |
control = 'Proximal' |
|
|
79 |
pval = 0.05 |
|
|
80 |
fc_lte = -2 |
|
|
81 |
fc_gte = 2 |
|
|
82 |
N = 20 |
|
|
83 |
res = QueryBuilder(ap) \ |
|
|
84 |
.add(Select(GENES)) \ |
|
|
85 |
.add(SignificantDE(case, control, pval, fc_lte=fc_lte, fc_gte=fc_gte, N=N)) \ |
|
|
86 |
.run() |
|
|
87 |
logger.info(res) |
|
|
88 |
|
|
|
89 |
# Find the compounds that are connected to the DE genes above |
|
|
90 |
|
|
|
91 |
res = QueryBuilder(ap) \ |
|
|
92 |
.add(Select(GENES)) \ |
|
|
93 |
.add(SignificantDE(case, control, pval, fc_lte=fc_lte, fc_gte=fc_gte, N=N)) \ |
|
|
94 |
.add(Connected(data_type=COMPOUNDS)) \ |
|
|
95 |
.run() |
|
|
96 |
logger.info(res) |
|
|
97 |
|
|
|
98 |
# Plot some heatmap using Plotly |
|
|
99 |
|
|
|
100 |
res = QueryBuilder(ap) \ |
|
|
101 |
.add(Select(GENES)) \ |
|
|
102 |
.add(SignificantDE(case, control, pval, fc_lte=fc_lte, fc_gte=fc_gte, N=N)) \ |
|
|
103 |
.run() |
|
|
104 |
logger.info(res) |
|
|
105 |
|
|
|
106 |
data_type = GENES |
|
|
107 |
analysis = ap.get_de_analysis(data_type, case, control) |
|
|
108 |
wi = analysis.wi |
|
|
109 |
data_df, design_df = wi.data_df, wi.design_df |
|
|
110 |
|
|
|
111 |
case_group = design_df[design_df['group'] == case].index.values.tolist() |
|
|
112 |
control_group = design_df[design_df['group'] == control].index.values.tolist() |
|
|
113 |
selection = case_group + control_group |
|
|
114 |
|
|
|
115 |
heatmap_df = wi.data_df.loc[res.index.values] |
|
|
116 |
heatmap_df = heatmap_df[selection] |
|
|
117 |
heatmap_df |
|
|
118 |
|
|
|
119 |
from plotly import express as px |
|
|
120 |
px.imshow(wi._normalise_df(heatmap_df, log=True)) |