a b/mowgli/score.py
1
from typing import Iterable
2
3
import anndata as ad
4
import numpy as np
5
import scanpy as sc
6
from scipy.spatial.distance import cdist
7
from sklearn.metrics import adjusted_rand_score as ARI
8
from sklearn.metrics import normalized_mutual_info_score as NMI
9
from sklearn.metrics import silhouette_score
10
from tqdm import tqdm
11
12
13
def embedding_silhouette_score(
14
    embedding: np.ndarray,
15
    labels: np.ndarray,
16
    metric: str = "euclidean",
17
) -> float:
18
    """Compute the silhouette score for an embedding.
19
20
    Args:
21
        embedding (np.ndarray):
22
            The embedding, shape (n_obs, n_latent)
23
        labels (np.ndarray):
24
            The labels, shape (n_obs,)
25
        metric (str, optional):
26
            The metric on the embedding space. Defaults to "euclidean".
27
28
    Returns:
29
        float: The silhouette score.
30
    """
31
    # Check the dimensions of the inputs.
32
    assert embedding.shape[0] == labels.shape[0]
33
34
    # Compute and return the silhouette score.
35
    return silhouette_score(embedding, labels, metric=metric)
36
37
38
def embedding_leiden_across_resolutions(
39
    embedding: np.ndarray,
40
    labels: np.ndarray,
41
    n_neighbors: int,
42
    resolutions: Iterable[float] = np.arange(0.1, 2.1, 0.1),
43
):
44
    """Compute the ARI and NMI for Leiden clustering on an embedding,
45
    for various resolutions.
46
47
    Args:
48
        embedding (np.ndarray):
49
            The embedding, shape (n_obs, n_latent)
50
        labels (np.ndarray):
51
            The labels, shape (n_obs,)
52
        n_neighbors (int):
53
            The number of neighbors to use for the kNN graph.
54
        resolutions (Iterable[float], optional):
55
            The resolutions to use for Leiden clustering. Defaults to
56
            np.arange(0.1, 2.1, 0.1).
57
58
    Returns:
59
        Tuple[Iterable[float], Iterable[float], Iterable[float]]:
60
            The resolutions, ARIs and NMIs.
61
    """
62
    # Create an AnnData object with the joint embedding.
63
    joint_embedding = ad.AnnData(embedding)
64
65
    # Initialize the results.
66
    aris, nmis = [], []
67
68
    # Compute neighbors on the joint embedding.
69
    sc.pp.neighbors(joint_embedding, use_rep="X", n_neighbors=n_neighbors)
70
71
    # For all resolutions,
72
    for resolution in tqdm(resolutions):
73
74
        # Perform Leiden clustering.
75
        sc.tl.leiden(joint_embedding, resolution=resolution)
76
77
        # Compute ARI and NMI
78
        aris.append(ARI(joint_embedding.obs["leiden"], labels))
79
        nmis.append(NMI(joint_embedding.obs["leiden"], labels))
80
81
    # Return ARI and NMI for various resolutions.
82
    return resolutions, aris, nmis
83
84
85
def knn_purity_score(knn: np.ndarray, labels: np.ndarray) -> float:
86
    """Compute the kNN purity score, averaged over all observations.
87
    For one observation, the purity score is the percentage of
88
    nearest neighbors that share its label.
89
90
    Args:
91
        knn (np.ndarray):
92
            The knn, shaped (n_obs, k). The i-th row should contain integers
93
            representing the indices of the k nearest neighbors.
94
        labels (np.ndarray):
95
            The labels, shaped (n_obs)
96
97
    Returns:
98
        float: The purity score.
99
    """
100
    # Check the dimensions of the input.
101
    assert knn.shape[0] == labels.shape[0]
102
103
    # Initialize a list of purity scores.
104
    score = 0
105
106
    # Iterate over the observations.
107
    for i, neighbors in enumerate(knn):
108
109
        # Do the neighbors have the same label as the observation?
110
        matches = labels[neighbors] == labels[i]
111
112
        # Add the purity rate to the scores.
113
        score += np.mean(matches) / knn.shape[0]
114
115
    # Return the average purity.
116
    return score
117
118
119
def embedding_to_knn(
120
    embedding: np.ndarray, k: int = 15, metric: str = "euclidean"
121
) -> np.ndarray:
122
    """Convert embedding to knn
123
124
    Args:
125
        embedding (np.ndarray): The embedding (n_obs, n_latent)
126
        k (int, optional): The number of nearest neighbors. Defaults to 15.
127
        metric (str, optional): The metric to compute neighbors with. Defaults to "euclidean".
128
129
    Returns:
130
        np.ndarray: The knn (n_obs, k)
131
    """
132
    # Initialize the knn graph.
133
    knn = np.zeros((embedding.shape[0], k), dtype=int)
134
135
    # Compute pariwise distances between observations.
136
    distances = cdist(embedding, embedding, metric=metric)
137
138
    # Iterate over observations.
139
    for i in range(distances.shape[0]):
140
141
        # Get the `max_neighbors` nearest neighbors.
142
        knn[i] = distances[i].argsort()[1 : k + 1]
143
144
    # Return the knn graph.
145
    return knn