Mowgli / Git / [28d9d2] /mowgli/tl.py

Models:
AlyssaS/
Mowgli
Downloads: 1
[28d9d2]: / mowgli / tl.py
History
Download this file
126 lines (101 with data), 3.9 kB

# Biology imports.
import scanpy as sc
import mudata as md

# Typing imports.
from typing import Iterable

# Matrix operations.
import numpy as np


def top_features(
    mdata: md.MuData,
    mod: str = "rna",
    uns: str = "H_OT",
    dim: int = 0,
    threshold: float = 0.2,
) -> Iterable:
    """Returns the top features for a given modality and latent dimension.

    Args:
        mdata (md.MuData): The input data
        mod (str, optional): The modality. Defaults to 'rna'.
        uns (str, optional): Where to look for H. Defaults to 'H_OT'.
        dim (int, optional): The latent dimension. Defaults to 0.
        n_features (int, optional): The number of top features. Defaults to 5.

    Returns:
        Iterable: A list of features names.
    """
    # TODO: put variable names in uns!

    # Compue the number of features needed.
    n_features = np.sum(np.cumsum(np.sort(mdata[mod].uns[uns][:, dim])) > threshold)

    # Get names for highly variable features.
    idx = mdata[mod].var.highly_variable
    var_names = mdata[mod].var_names[idx]

    # Sort them by contribution.
    var_idx = np.argsort(mdata[mod].uns[uns][:, dim])[::-1]

    # Return the top ones.
    return var_names[var_idx[:n_features]].tolist()


def enrich(
    mdata: md.MuData,
    mod: str = "rna",
    uns: str = "H_OT",
    n_genes: int = 200,
    sources: Iterable[str] = ["GO:MF", "GO:CC", "GO:BP"],
    ordered: bool = True,
    domain_scope="custom_annotated",
):
    """Return Gene Set Enrichment Analysis results for each dimension.

    Args:
        mdata (md.MuData): Input data.
        mod (str, optional): Modality that contains genes. Defaults to 'rna'.
        uns (str, optional): Name of H matrix. Defaults to 'H_OT'.
        n_genes (int, optional):
            Number of top genes by dimension. Defaults to 200.
        sources (Iterable[str], optional):
            Enrichment sources. Defaults to ['GO:MF', 'GO:CC', 'GO:BP'].
        ordered (bool, optional):
            Make query with ordered genes. Defaults to True.

    Returns:
        Pandas dataframe with the results of the queries, as well
        as average best p_value across dimensions.
    """

    # Initialize ordered genes dictionary.
    ordered_genes = {}

    # Get background genes.
    background = mdata[mod].var.index.tolist()

    # For each dimension,
    for dim in range(mdata[mod].uns[uns].shape[1]):

        # Sort the gene indices by weight.
        idx_sorted = mdata[mod].uns[uns][:, dim].argsort()[::-1]

        if n_genes == "auto":
            nn = np.sum(np.cumsum(np.sort(mdata[mod].uns[uns][:, dim])) > 0.1)
        else:
            nn = n_genes

        # Select the `n_genes` highest genes.
        gene_list = mdata[mod].var[mdata[mod].var.highly_variable].index
        gene_list = gene_list[idx_sorted].tolist()[:nn]

        # Input them in the dictionary.
        ordered_genes["dimension " + str(dim)] = gene_list

    # Make the queries to gProfiler, specifying if genes are ordered.
    if "custom" in domain_scope:
        enr = sc.queries.enrich(
            ordered_genes,
            gprofiler_kwargs={
                "ordered": ordered,
                "sources": sources,
                "domain_scope": domain_scope,
                "background": background,
                "no_evidences": True,
            },
        )
    else:
        enr = sc.queries.enrich(
            ordered_genes,
            gprofiler_kwargs={
                "ordered": ordered,
                "sources": sources,
                "domain_scope": domain_scope,
                "no_evidences": True,
            },
        )

    # Compute the average of the best p_values for each dimension.
    mean_best_p = enr.groupby("query")["p_value"].min().mean()

    # Return the results of the queries and the average best p_value.
    return enr, mean_best_p