|
a |
|
b/mowgli/tl.py |
|
|
1 |
# Biology imports. |
|
|
2 |
import scanpy as sc |
|
|
3 |
import mudata as md |
|
|
4 |
|
|
|
5 |
# Typing imports. |
|
|
6 |
from typing import Iterable |
|
|
7 |
|
|
|
8 |
# Matrix operations. |
|
|
9 |
import numpy as np |
|
|
10 |
|
|
|
11 |
|
|
|
12 |
def top_features( |
|
|
13 |
mdata: md.MuData, |
|
|
14 |
mod: str = "rna", |
|
|
15 |
uns: str = "H_OT", |
|
|
16 |
dim: int = 0, |
|
|
17 |
threshold: float = 0.2, |
|
|
18 |
) -> Iterable: |
|
|
19 |
"""Returns the top features for a given modality and latent dimension. |
|
|
20 |
|
|
|
21 |
Args: |
|
|
22 |
mdata (md.MuData): The input data |
|
|
23 |
mod (str, optional): The modality. Defaults to 'rna'. |
|
|
24 |
uns (str, optional): Where to look for H. Defaults to 'H_OT'. |
|
|
25 |
dim (int, optional): The latent dimension. Defaults to 0. |
|
|
26 |
n_features (int, optional): The number of top features. Defaults to 5. |
|
|
27 |
|
|
|
28 |
Returns: |
|
|
29 |
Iterable: A list of features names. |
|
|
30 |
""" |
|
|
31 |
# TODO: put variable names in uns! |
|
|
32 |
|
|
|
33 |
# Compue the number of features needed. |
|
|
34 |
n_features = np.sum(np.cumsum(np.sort(mdata[mod].uns[uns][:, dim])) > threshold) |
|
|
35 |
|
|
|
36 |
# Get names for highly variable features. |
|
|
37 |
idx = mdata[mod].var.highly_variable |
|
|
38 |
var_names = mdata[mod].var_names[idx] |
|
|
39 |
|
|
|
40 |
# Sort them by contribution. |
|
|
41 |
var_idx = np.argsort(mdata[mod].uns[uns][:, dim])[::-1] |
|
|
42 |
|
|
|
43 |
# Return the top ones. |
|
|
44 |
return var_names[var_idx[:n_features]].tolist() |
|
|
45 |
|
|
|
46 |
|
|
|
47 |
def enrich( |
|
|
48 |
mdata: md.MuData, |
|
|
49 |
mod: str = "rna", |
|
|
50 |
uns: str = "H_OT", |
|
|
51 |
n_genes: int = 200, |
|
|
52 |
sources: Iterable[str] = ["GO:MF", "GO:CC", "GO:BP"], |
|
|
53 |
ordered: bool = True, |
|
|
54 |
domain_scope="custom_annotated", |
|
|
55 |
): |
|
|
56 |
"""Return Gene Set Enrichment Analysis results for each dimension. |
|
|
57 |
|
|
|
58 |
Args: |
|
|
59 |
mdata (md.MuData): Input data. |
|
|
60 |
mod (str, optional): Modality that contains genes. Defaults to 'rna'. |
|
|
61 |
uns (str, optional): Name of H matrix. Defaults to 'H_OT'. |
|
|
62 |
n_genes (int, optional): |
|
|
63 |
Number of top genes by dimension. Defaults to 200. |
|
|
64 |
sources (Iterable[str], optional): |
|
|
65 |
Enrichment sources. Defaults to ['GO:MF', 'GO:CC', 'GO:BP']. |
|
|
66 |
ordered (bool, optional): |
|
|
67 |
Make query with ordered genes. Defaults to True. |
|
|
68 |
|
|
|
69 |
Returns: |
|
|
70 |
Pandas dataframe with the results of the queries, as well |
|
|
71 |
as average best p_value across dimensions. |
|
|
72 |
""" |
|
|
73 |
|
|
|
74 |
# Initialize ordered genes dictionary. |
|
|
75 |
ordered_genes = {} |
|
|
76 |
|
|
|
77 |
# Get background genes. |
|
|
78 |
background = mdata[mod].var.index.tolist() |
|
|
79 |
|
|
|
80 |
# For each dimension, |
|
|
81 |
for dim in range(mdata[mod].uns[uns].shape[1]): |
|
|
82 |
|
|
|
83 |
# Sort the gene indices by weight. |
|
|
84 |
idx_sorted = mdata[mod].uns[uns][:, dim].argsort()[::-1] |
|
|
85 |
|
|
|
86 |
if n_genes == "auto": |
|
|
87 |
nn = np.sum(np.cumsum(np.sort(mdata[mod].uns[uns][:, dim])) > 0.1) |
|
|
88 |
else: |
|
|
89 |
nn = n_genes |
|
|
90 |
|
|
|
91 |
# Select the `n_genes` highest genes. |
|
|
92 |
gene_list = mdata[mod].var[mdata[mod].var.highly_variable].index |
|
|
93 |
gene_list = gene_list[idx_sorted].tolist()[:nn] |
|
|
94 |
|
|
|
95 |
# Input them in the dictionary. |
|
|
96 |
ordered_genes["dimension " + str(dim)] = gene_list |
|
|
97 |
|
|
|
98 |
# Make the queries to gProfiler, specifying if genes are ordered. |
|
|
99 |
if "custom" in domain_scope: |
|
|
100 |
enr = sc.queries.enrich( |
|
|
101 |
ordered_genes, |
|
|
102 |
gprofiler_kwargs={ |
|
|
103 |
"ordered": ordered, |
|
|
104 |
"sources": sources, |
|
|
105 |
"domain_scope": domain_scope, |
|
|
106 |
"background": background, |
|
|
107 |
"no_evidences": True, |
|
|
108 |
}, |
|
|
109 |
) |
|
|
110 |
else: |
|
|
111 |
enr = sc.queries.enrich( |
|
|
112 |
ordered_genes, |
|
|
113 |
gprofiler_kwargs={ |
|
|
114 |
"ordered": ordered, |
|
|
115 |
"sources": sources, |
|
|
116 |
"domain_scope": domain_scope, |
|
|
117 |
"no_evidences": True, |
|
|
118 |
}, |
|
|
119 |
) |
|
|
120 |
|
|
|
121 |
# Compute the average of the best p_values for each dimension. |
|
|
122 |
mean_best_p = enr.groupby("query")["p_value"].min().mean() |
|
|
123 |
|
|
|
124 |
# Return the results of the queries and the average best p_value. |
|
|
125 |
return enr, mean_best_p |