Mowgli / Git / [28d9d2] /mowgli/score.py

Models:
AlyssaS/
Mowgli
Downloads: 1
[28d9d2]: / mowgli / score.py
History
Download this file
146 lines (112 with data), 4.4 kB

from typing import Iterable

import anndata as ad
import numpy as np
import scanpy as sc
from scipy.spatial.distance import cdist
from sklearn.metrics import adjusted_rand_score as ARI
from sklearn.metrics import normalized_mutual_info_score as NMI
from sklearn.metrics import silhouette_score
from tqdm import tqdm


def embedding_silhouette_score(
    embedding: np.ndarray,
    labels: np.ndarray,
    metric: str = "euclidean",
) -> float:
    """Compute the silhouette score for an embedding.

    Args:
        embedding (np.ndarray):
            The embedding, shape (n_obs, n_latent)
        labels (np.ndarray):
            The labels, shape (n_obs,)
        metric (str, optional):
            The metric on the embedding space. Defaults to "euclidean".

    Returns:
        float: The silhouette score.
    """
    # Check the dimensions of the inputs.
    assert embedding.shape[0] == labels.shape[0]

    # Compute and return the silhouette score.
    return silhouette_score(embedding, labels, metric=metric)


def embedding_leiden_across_resolutions(
    embedding: np.ndarray,
    labels: np.ndarray,
    n_neighbors: int,
    resolutions: Iterable[float] = np.arange(0.1, 2.1, 0.1),
):
    """Compute the ARI and NMI for Leiden clustering on an embedding,
    for various resolutions.

    Args:
        embedding (np.ndarray):
            The embedding, shape (n_obs, n_latent)
        labels (np.ndarray):
            The labels, shape (n_obs,)
        n_neighbors (int):
            The number of neighbors to use for the kNN graph.
        resolutions (Iterable[float], optional):
            The resolutions to use for Leiden clustering. Defaults to
            np.arange(0.1, 2.1, 0.1).

    Returns:
        Tuple[Iterable[float], Iterable[float], Iterable[float]]:
            The resolutions, ARIs and NMIs.
    """
    # Create an AnnData object with the joint embedding.
    joint_embedding = ad.AnnData(embedding)

    # Initialize the results.
    aris, nmis = [], []

    # Compute neighbors on the joint embedding.
    sc.pp.neighbors(joint_embedding, use_rep="X", n_neighbors=n_neighbors)

    # For all resolutions,
    for resolution in tqdm(resolutions):

        # Perform Leiden clustering.
        sc.tl.leiden(joint_embedding, resolution=resolution)

        # Compute ARI and NMI
        aris.append(ARI(joint_embedding.obs["leiden"], labels))
        nmis.append(NMI(joint_embedding.obs["leiden"], labels))

    # Return ARI and NMI for various resolutions.
    return resolutions, aris, nmis


def knn_purity_score(knn: np.ndarray, labels: np.ndarray) -> float:
    """Compute the kNN purity score, averaged over all observations.
    For one observation, the purity score is the percentage of
    nearest neighbors that share its label.

    Args:
        knn (np.ndarray):
            The knn, shaped (n_obs, k). The i-th row should contain integers
            representing the indices of the k nearest neighbors.
        labels (np.ndarray):
            The labels, shaped (n_obs)

    Returns:
        float: The purity score.
    """
    # Check the dimensions of the input.
    assert knn.shape[0] == labels.shape[0]

    # Initialize a list of purity scores.
    score = 0

    # Iterate over the observations.
    for i, neighbors in enumerate(knn):

        # Do the neighbors have the same label as the observation?
        matches = labels[neighbors] == labels[i]

        # Add the purity rate to the scores.
        score += np.mean(matches) / knn.shape[0]

    # Return the average purity.
    return score


def embedding_to_knn(
    embedding: np.ndarray, k: int = 15, metric: str = "euclidean"
) -> np.ndarray:
    """Convert embedding to knn

    Args:
        embedding (np.ndarray): The embedding (n_obs, n_latent)
        k (int, optional): The number of nearest neighbors. Defaults to 15.
        metric (str, optional): The metric to compute neighbors with. Defaults to "euclidean".

    Returns:
        np.ndarray: The knn (n_obs, k)
    """
    # Initialize the knn graph.
    knn = np.zeros((embedding.shape[0], k), dtype=int)

    # Compute pariwise distances between observations.
    distances = cdist(embedding, embedding, metric=metric)

    # Iterate over observations.
    for i in range(distances.shape[0]):

        # Get the `max_neighbors` nearest neighbors.
        knn[i] = distances[i].argsort()[1 : k + 1]

    # Return the knn graph.
    return knn