[28d9d2]: / mowgli / tl.py

Download this file

126 lines (101 with data), 3.9 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# Biology imports.
import scanpy as sc
import mudata as md
# Typing imports.
from typing import Iterable
# Matrix operations.
import numpy as np
def top_features(
mdata: md.MuData,
mod: str = "rna",
uns: str = "H_OT",
dim: int = 0,
threshold: float = 0.2,
) -> Iterable:
"""Returns the top features for a given modality and latent dimension.
Args:
mdata (md.MuData): The input data
mod (str, optional): The modality. Defaults to 'rna'.
uns (str, optional): Where to look for H. Defaults to 'H_OT'.
dim (int, optional): The latent dimension. Defaults to 0.
n_features (int, optional): The number of top features. Defaults to 5.
Returns:
Iterable: A list of features names.
"""
# TODO: put variable names in uns!
# Compue the number of features needed.
n_features = np.sum(np.cumsum(np.sort(mdata[mod].uns[uns][:, dim])) > threshold)
# Get names for highly variable features.
idx = mdata[mod].var.highly_variable
var_names = mdata[mod].var_names[idx]
# Sort them by contribution.
var_idx = np.argsort(mdata[mod].uns[uns][:, dim])[::-1]
# Return the top ones.
return var_names[var_idx[:n_features]].tolist()
def enrich(
mdata: md.MuData,
mod: str = "rna",
uns: str = "H_OT",
n_genes: int = 200,
sources: Iterable[str] = ["GO:MF", "GO:CC", "GO:BP"],
ordered: bool = True,
domain_scope="custom_annotated",
):
"""Return Gene Set Enrichment Analysis results for each dimension.
Args:
mdata (md.MuData): Input data.
mod (str, optional): Modality that contains genes. Defaults to 'rna'.
uns (str, optional): Name of H matrix. Defaults to 'H_OT'.
n_genes (int, optional):
Number of top genes by dimension. Defaults to 200.
sources (Iterable[str], optional):
Enrichment sources. Defaults to ['GO:MF', 'GO:CC', 'GO:BP'].
ordered (bool, optional):
Make query with ordered genes. Defaults to True.
Returns:
Pandas dataframe with the results of the queries, as well
as average best p_value across dimensions.
"""
# Initialize ordered genes dictionary.
ordered_genes = {}
# Get background genes.
background = mdata[mod].var.index.tolist()
# For each dimension,
for dim in range(mdata[mod].uns[uns].shape[1]):
# Sort the gene indices by weight.
idx_sorted = mdata[mod].uns[uns][:, dim].argsort()[::-1]
if n_genes == "auto":
nn = np.sum(np.cumsum(np.sort(mdata[mod].uns[uns][:, dim])) > 0.1)
else:
nn = n_genes
# Select the `n_genes` highest genes.
gene_list = mdata[mod].var[mdata[mod].var.highly_variable].index
gene_list = gene_list[idx_sorted].tolist()[:nn]
# Input them in the dictionary.
ordered_genes["dimension " + str(dim)] = gene_list
# Make the queries to gProfiler, specifying if genes are ordered.
if "custom" in domain_scope:
enr = sc.queries.enrich(
ordered_genes,
gprofiler_kwargs={
"ordered": ordered,
"sources": sources,
"domain_scope": domain_scope,
"background": background,
"no_evidences": True,
},
)
else:
enr = sc.queries.enrich(
ordered_genes,
gprofiler_kwargs={
"ordered": ordered,
"sources": sources,
"domain_scope": domain_scope,
"no_evidences": True,
},
)
# Compute the average of the best p_values for each dimension.
mean_best_p = enr.groupby("query")["p_value"].min().mean()
# Return the results of the queries and the average best p_value.
return enr, mean_best_p