--- a +++ b/mowgli/tl.py @@ -0,0 +1,125 @@ +# Biology imports. +import scanpy as sc +import mudata as md + +# Typing imports. +from typing import Iterable + +# Matrix operations. +import numpy as np + + +def top_features( + mdata: md.MuData, + mod: str = "rna", + uns: str = "H_OT", + dim: int = 0, + threshold: float = 0.2, +) -> Iterable: + """Returns the top features for a given modality and latent dimension. + + Args: + mdata (md.MuData): The input data + mod (str, optional): The modality. Defaults to 'rna'. + uns (str, optional): Where to look for H. Defaults to 'H_OT'. + dim (int, optional): The latent dimension. Defaults to 0. + n_features (int, optional): The number of top features. Defaults to 5. + + Returns: + Iterable: A list of features names. + """ + # TODO: put variable names in uns! + + # Compue the number of features needed. + n_features = np.sum(np.cumsum(np.sort(mdata[mod].uns[uns][:, dim])) > threshold) + + # Get names for highly variable features. + idx = mdata[mod].var.highly_variable + var_names = mdata[mod].var_names[idx] + + # Sort them by contribution. + var_idx = np.argsort(mdata[mod].uns[uns][:, dim])[::-1] + + # Return the top ones. + return var_names[var_idx[:n_features]].tolist() + + +def enrich( + mdata: md.MuData, + mod: str = "rna", + uns: str = "H_OT", + n_genes: int = 200, + sources: Iterable[str] = ["GO:MF", "GO:CC", "GO:BP"], + ordered: bool = True, + domain_scope="custom_annotated", +): + """Return Gene Set Enrichment Analysis results for each dimension. + + Args: + mdata (md.MuData): Input data. + mod (str, optional): Modality that contains genes. Defaults to 'rna'. + uns (str, optional): Name of H matrix. Defaults to 'H_OT'. + n_genes (int, optional): + Number of top genes by dimension. Defaults to 200. + sources (Iterable[str], optional): + Enrichment sources. Defaults to ['GO:MF', 'GO:CC', 'GO:BP']. + ordered (bool, optional): + Make query with ordered genes. Defaults to True. + + Returns: + Pandas dataframe with the results of the queries, as well + as average best p_value across dimensions. + """ + + # Initialize ordered genes dictionary. + ordered_genes = {} + + # Get background genes. + background = mdata[mod].var.index.tolist() + + # For each dimension, + for dim in range(mdata[mod].uns[uns].shape[1]): + + # Sort the gene indices by weight. + idx_sorted = mdata[mod].uns[uns][:, dim].argsort()[::-1] + + if n_genes == "auto": + nn = np.sum(np.cumsum(np.sort(mdata[mod].uns[uns][:, dim])) > 0.1) + else: + nn = n_genes + + # Select the `n_genes` highest genes. + gene_list = mdata[mod].var[mdata[mod].var.highly_variable].index + gene_list = gene_list[idx_sorted].tolist()[:nn] + + # Input them in the dictionary. + ordered_genes["dimension " + str(dim)] = gene_list + + # Make the queries to gProfiler, specifying if genes are ordered. + if "custom" in domain_scope: + enr = sc.queries.enrich( + ordered_genes, + gprofiler_kwargs={ + "ordered": ordered, + "sources": sources, + "domain_scope": domain_scope, + "background": background, + "no_evidences": True, + }, + ) + else: + enr = sc.queries.enrich( + ordered_genes, + gprofiler_kwargs={ + "ordered": ordered, + "sources": sources, + "domain_scope": domain_scope, + "no_evidences": True, + }, + ) + + # Compute the average of the best p_values for each dimension. + mean_best_p = enr.groupby("query")["p_value"].min().mean() + + # Return the results of the queries and the average best p_value. + return enr, mean_best_p