a b/mowgli/tl.py
1
# Biology imports.
2
import scanpy as sc
3
import mudata as md
4
5
# Typing imports.
6
from typing import Iterable
7
8
# Matrix operations.
9
import numpy as np
10
11
12
def top_features(
13
    mdata: md.MuData,
14
    mod: str = "rna",
15
    uns: str = "H_OT",
16
    dim: int = 0,
17
    threshold: float = 0.2,
18
) -> Iterable:
19
    """Returns the top features for a given modality and latent dimension.
20
21
    Args:
22
        mdata (md.MuData): The input data
23
        mod (str, optional): The modality. Defaults to 'rna'.
24
        uns (str, optional): Where to look for H. Defaults to 'H_OT'.
25
        dim (int, optional): The latent dimension. Defaults to 0.
26
        n_features (int, optional): The number of top features. Defaults to 5.
27
28
    Returns:
29
        Iterable: A list of features names.
30
    """
31
    # TODO: put variable names in uns!
32
33
    # Compue the number of features needed.
34
    n_features = np.sum(np.cumsum(np.sort(mdata[mod].uns[uns][:, dim])) > threshold)
35
36
    # Get names for highly variable features.
37
    idx = mdata[mod].var.highly_variable
38
    var_names = mdata[mod].var_names[idx]
39
40
    # Sort them by contribution.
41
    var_idx = np.argsort(mdata[mod].uns[uns][:, dim])[::-1]
42
43
    # Return the top ones.
44
    return var_names[var_idx[:n_features]].tolist()
45
46
47
def enrich(
48
    mdata: md.MuData,
49
    mod: str = "rna",
50
    uns: str = "H_OT",
51
    n_genes: int = 200,
52
    sources: Iterable[str] = ["GO:MF", "GO:CC", "GO:BP"],
53
    ordered: bool = True,
54
    domain_scope="custom_annotated",
55
):
56
    """Return Gene Set Enrichment Analysis results for each dimension.
57
58
    Args:
59
        mdata (md.MuData): Input data.
60
        mod (str, optional): Modality that contains genes. Defaults to 'rna'.
61
        uns (str, optional): Name of H matrix. Defaults to 'H_OT'.
62
        n_genes (int, optional):
63
            Number of top genes by dimension. Defaults to 200.
64
        sources (Iterable[str], optional):
65
            Enrichment sources. Defaults to ['GO:MF', 'GO:CC', 'GO:BP'].
66
        ordered (bool, optional):
67
            Make query with ordered genes. Defaults to True.
68
69
    Returns:
70
        Pandas dataframe with the results of the queries, as well
71
        as average best p_value across dimensions.
72
    """
73
74
    # Initialize ordered genes dictionary.
75
    ordered_genes = {}
76
77
    # Get background genes.
78
    background = mdata[mod].var.index.tolist()
79
80
    # For each dimension,
81
    for dim in range(mdata[mod].uns[uns].shape[1]):
82
83
        # Sort the gene indices by weight.
84
        idx_sorted = mdata[mod].uns[uns][:, dim].argsort()[::-1]
85
86
        if n_genes == "auto":
87
            nn = np.sum(np.cumsum(np.sort(mdata[mod].uns[uns][:, dim])) > 0.1)
88
        else:
89
            nn = n_genes
90
91
        # Select the `n_genes` highest genes.
92
        gene_list = mdata[mod].var[mdata[mod].var.highly_variable].index
93
        gene_list = gene_list[idx_sorted].tolist()[:nn]
94
95
        # Input them in the dictionary.
96
        ordered_genes["dimension " + str(dim)] = gene_list
97
98
    # Make the queries to gProfiler, specifying if genes are ordered.
99
    if "custom" in domain_scope:
100
        enr = sc.queries.enrich(
101
            ordered_genes,
102
            gprofiler_kwargs={
103
                "ordered": ordered,
104
                "sources": sources,
105
                "domain_scope": domain_scope,
106
                "background": background,
107
                "no_evidences": True,
108
            },
109
        )
110
    else:
111
        enr = sc.queries.enrich(
112
            ordered_genes,
113
            gprofiler_kwargs={
114
                "ordered": ordered,
115
                "sources": sources,
116
                "domain_scope": domain_scope,
117
                "no_evidences": True,
118
            },
119
        )
120
121
    # Compute the average of the best p_values for each dimension.
122
    mean_best_p = enr.groupby("query")["p_value"].min().mean()
123
124
    # Return the results of the queries and the average best p_value.
125
    return enr, mean_best_p