Switch to unified view

a b/exseek/scripts/evaluation.py
1
import numpy as np
2
from sklearn.utils.linear_assignment_ import linear_assignment
3
from sklearn.mixture import GaussianMixture as GMM
4
from sklearn.cluster import KMeans
5
from sklearn.preprocessing import StandardScaler
6
7
def convert_label_to_int(sample_class):
    """Encode class labels as consecutive integers.

    Every label is replaced by its position in the sorted array of unique
    labels (as returned by ``np.unique``), so equal labels get equal codes
    and the codes run from 0 to ``n_classes - 1``.

    Parameters
    ----------
    sample_class : array-like
        Class labels (strings, numbers, ...). Multi-dimensional input is
        flattened. An empty input yields an empty result.

    Returns
    -------
    numpy.ndarray
        1-D integer array holding one code per input label.
    """
    labels = np.asarray(sample_class).reshape(-1)
    # ``return_inverse`` gives, for each element, its index into the sorted
    # unique values. That is the same encoding as comparing every label
    # against every class, but without materialising an
    # (n_samples, n_classes) boolean matrix.
    _, codes = np.unique(labels, return_inverse=True)
    return codes.reshape(-1)
11
12
def unsupervised_clustering_accuracy(y, y_pred):
    """Compute clustering accuracy under the best one-to-one label matching.

    Cluster ids are arbitrary, so predicted labels are first matched to true
    labels with the assignment that maximises the number of agreeing samples
    (a linear assignment problem); the accuracy is the fraction of samples
    that agree under that matching.

    Parameters
    ----------
    y : array-like
        True labels, one per sample.
    y_pred : array-like
        Predicted cluster labels, one per sample.

    Returns
    -------
    accuracy : float
        Fraction of samples whose predicted label maps onto their true label.
    ind : numpy.ndarray of shape (n_labels, 2)
        Optimal assignment as ``[row, column]`` pairs. Rows index predicted
        labels and columns index true labels, both as positions in the sorted
        union of all labels occurring in ``y`` and ``y_pred``.

    Raises
    ------
    ValueError
        If ``y`` and ``y_pred`` differ in length or are empty.
    """
    # Imported locally so this function does not depend on the module-level
    # ``sklearn.utils.linear_assignment_`` import: that helper was deprecated
    # in scikit-learn 0.21 and removed in 0.23 in favour of SciPy's solver.
    from scipy.optimize import linear_sum_assignment

    y = np.asarray(y).reshape(-1)
    y_pred = np.asarray(y_pred).reshape(-1)
    n_samples = y_pred.shape[0]
    if y.shape[0] != n_samples:
        raise ValueError(
            'y and y_pred must have the same length (got %d and %d)'
            % (y.shape[0], n_samples))
    if n_samples == 0:
        raise ValueError('y and y_pred must not be empty')

    # Encode both label vectors against one shared vocabulary so that the
    # reward matrix is square, as the assignment step expects.
    labels, codes = np.unique(np.concatenate((y, y_pred)),
                              return_inverse=True)
    codes = codes.reshape(-1)
    true_codes = codes[:n_samples]
    pred_codes = codes[n_samples:]

    n_clusters = labels.shape[0]
    reward_matrix = np.zeros((n_clusters, n_clusters), dtype=np.int64)
    # ``np.add.at`` accumulates repeated (pred, true) pairs; plain fancy
    # indexing with ``+=`` would count each distinct pair only once.
    np.add.at(reward_matrix, (pred_codes, true_codes), 1)

    # The solver minimises cost, so turn the agreement counts into costs.
    cost_matrix = reward_matrix.max() - reward_matrix
    row_ind, col_ind = linear_sum_assignment(cost_matrix)
    # Keep the (n, 2) pair layout returned by the former scikit-learn helper.
    ind = np.column_stack((row_ind, col_ind))

    accuracy = reward_matrix[row_ind, col_ind].sum() / float(n_samples)
    return accuracy, ind
24
25
def uca_score(X, y, prediction_algorithm='knn', random_state=None):
    """Score how well unsupervised clusters of ``X`` recover the labels ``y``.

    The features are standardised, the samples are clustered into as many
    clusters as there are distinct labels, and the result is scored with
    ``unsupervised_clustering_accuracy``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y : array-like of shape (n_samples,)
        Class labels (strings or numbers).
    prediction_algorithm : {'knn', 'kmeans', 'gmm'}
        Clustering method. ``'knn'`` is the historical name of the option and
        runs k-means (not a nearest-neighbour method); ``'kmeans'`` is an
        alias for it. ``'gmm'`` fits a Gaussian mixture model.
    random_state : int, RandomState instance or None
        Forwarded to the clustering estimator; set it for reproducible
        scores. The default (None) leaves the estimator unseeded.

    Returns
    -------
    float
        Clustering accuracy after optimally matching cluster ids to classes.

    Raises
    ------
    ValueError
        If ``prediction_algorithm`` is not one of the supported names.
    """
    # Reject unknown names before doing any work; otherwise no branch would
    # assign the predicted labels and the failure would surface later as an
    # unrelated unbound-variable error.
    if prediction_algorithm not in ('knn', 'kmeans', 'gmm'):
        raise ValueError(
            "unknown prediction_algorithm %r; expected 'knn', 'kmeans' or "
            "'gmm'" % (prediction_algorithm,))

    X_scale = StandardScaler().fit_transform(X)

    # Convert (possibly string) labels to integer codes in a single step.
    labels_int = convert_label_to_int(np.asarray(y))
    cluster_num = np.unique(labels_int).shape[0]

    if prediction_algorithm == 'gmm':
        gmm = GMM(n_components=cluster_num, random_state=random_state)
        gmm.fit(X_scale)
        labels_pred = gmm.predict(X_scale)
    else:
        kmeans = KMeans(n_clusters=cluster_num, n_init=200,
                        random_state=random_state)
        labels_pred = kmeans.fit_predict(X_scale)

    return unsupervised_clustering_accuracy(labels_int, labels_pred)[0]
44
45
def knn_score(X, y, K=10, n_shuffle=None):
    """Score how strongly samples of the same class are neighbours in ``X``.

    For every sample, the number of its ``K`` nearest neighbours that share
    its class is counted, then rescaled so that the chance-level count used
    here maps to 0 and the maximum count used here maps to 1. The function
    returns the mean of the rescaled values over all samples.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix; must expose ``.shape``.
    y : array-like of shape (n_samples,)
        Class labels.
    K : int
        Number of neighbours per sample; must be smaller than ``n_samples``.
    n_shuffle : int or None
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    float
        Mean normalised same-class neighbour count. The result is NaN/inf if
        the normalising denominator is zero for some sample (for example
        when ``K == n_samples - 1``).

    Raises
    ------
    ValueError
        If ``K`` is not smaller than the number of samples, or if ``y`` does
        not have one label per sample.
    """
    from sklearn.neighbors import NearestNeighbors

    y = np.asarray(y).reshape(-1)
    N = X.shape[0]
    if not K < N:
        raise ValueError(
            'K (%d) must be smaller than the number of samples (%d)' % (K, N))
    if y.shape[0] != N:
        raise ValueError(
            'X and y must have the same number of samples (got %d and %d)'
            % (N, y.shape[0]))

    # ``n_neighbors`` is passed by keyword because recent scikit-learn
    # releases reject it as a positional argument.
    nn = NearestNeighbors(n_neighbors=K)
    nn.fit(X)
    # Querying without X returns the neighbours of the training points with
    # each point excluded from its own neighbour list. This stays correct
    # when duplicate points exist, unlike dropping the first column of a
    # (K + 1)-neighbour query.
    indices = nn.kneighbors(return_distance=False)
    observed_r = np.sum(y[indices] == y[:, np.newaxis], axis=1)

    # Size of the class each sample belongs to.
    _, class_index, class_counts = np.unique(
        y, return_inverse=True, return_counts=True)
    class_sizes = class_counts[class_index.reshape(-1)]

    # NOTE(review): both the chance level and the maximum use the full class
    # size, which includes the sample itself, although a sample is never its
    # own neighbour. Left as is because changing it would alter reported
    # scores -- confirm whether ``class_sizes - 1`` was intended.
    mean_r = K / float(N - 1) * class_sizes
    max_r = np.minimum(K, class_sizes)

    scores = (observed_r - mean_r) / (max_r - mean_r)
    return scores.mean()