exseek/scripts/evaluation.py

import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn.mixture import GaussianMixture as GMM
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

def convert_label_to_int(sample_class):
    '''Convert an array of class labels to integer codes in [0, n_classes)'''
    classes = np.unique(sample_class)
    classes = np.argmax(sample_class.reshape((-1, 1)) == classes.reshape((1, -1)), axis=1)
    return classes

def unsupervised_clustering_accuracy(y, y_pred):
    '''Clustering accuracy: fraction of samples correctly assigned after
    optimally matching predicted clusters to true classes'''
    assert len(y_pred) == len(y)
    u = np.unique(np.concatenate((y, y_pred)))
    n_clusters = len(u)
    mapping = dict(zip(u, range(n_clusters)))
    # reward_matrix[i, j] counts samples in predicted cluster i with true class j
    reward_matrix = np.zeros((n_clusters, n_clusters), dtype=np.int64)
    for y_pred_, y_ in zip(y_pred, y):
        reward_matrix[mapping[y_pred_], mapping[y_]] += 1
    # convert the reward into a cost so the Hungarian solver maximizes agreement
    cost_matrix = reward_matrix.max() - reward_matrix
    row_ind, col_ind = linear_sum_assignment(cost_matrix)
    ind = np.column_stack((row_ind, col_ind))
    return reward_matrix[row_ind, col_ind].sum() / len(y_pred), ind

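# Example (illustrative, hand-computed): with y = [0, 0, 1, 1] and
# y_pred = [1, 1, 0, 0], the optimal matching pairs predicted cluster 1 with
# class 0 and cluster 0 with class 1, so unsupervised_clustering_accuracy
# returns 1.0 even though no raw label agrees.
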
def uca_score(X, y, prediction_algorithm='knn'):
    '''Unsupervised clustering accuracy of a clustering of X (samples x features)
    against class labels y'''
    #X_log = np.log2(X + 0.001).T
    X_scale = StandardScaler().fit_transform(X)
    # convert string labels to integer labels
    unique_classes = np.unique(y)
    labels = np.zeros(y.shape)
    for i, c in enumerate(unique_classes):
        labels[y == c] = i

    cluster_num = unique_classes.shape[0]
    if prediction_algorithm == 'knn':
        # note: 'knn' selects KMeans clustering (name kept for compatibility)
        labels_pred = KMeans(n_clusters=cluster_num, n_init=200).fit_predict(X_scale)
    elif prediction_algorithm == 'gmm':
        gmm = GMM(n_components=cluster_num)
        gmm.fit(X_scale)
        labels_pred = gmm.predict(X_scale)
    else:
        raise ValueError('unknown prediction_algorithm: {}'.format(prediction_algorithm))
    labels_int = convert_label_to_int(labels)
    score = unsupervised_clustering_accuracy(labels_int, labels_pred)[0]
    return score

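# Example (illustrative): uca_score is 1.0 when the KMeans (or GMM) clusters of
# the scaled matrix X align perfectly with the classes in y, and falls toward
# roughly 1/n_classes when the clustering is unrelated to the labels.
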
def knn_score(X, y, K=10, n_shuffle=None):
    '''Neighborhood purity score: how strongly the K nearest neighbors of each
    sample share its class, rescaled between chance (0) and perfect (1).
    n_shuffle is only used by the permutation-based estimate below, which is
    disabled in favor of the analytical expectation.'''
    from sklearn.neighbors import NearestNeighbors

    N = X.shape[0]
    assert K < N
    def knn_fractions(X, y):
        nn = NearestNeighbors(n_neighbors=K)
        nn.fit(X)
        # query K + 1 neighbors and drop column 0, which is each point itself
        distances, indices = nn.kneighbors(X, n_neighbors=K + 1)
        neighbor_classes = np.take(y, indices[:, 1:])
        # number of same-class samples among the K nearest neighbors
        return np.sum(neighbor_classes == y[:, np.newaxis], axis=1)

    def expected_fractions(X, y):
        # permutation-based estimate of the expected same-class neighbor count
        stats = np.zeros((n_shuffle, N))
        for i in range(n_shuffle):
            y = np.random.permutation(y)
            stats[i] = knn_fractions(X, y)
        return stats.mean(axis=0)

    classes, class_sizes = np.unique(y, return_counts=True)
    classes = np.argmax(y.reshape((-1, 1)) == classes.reshape((1, -1)), axis=1)
    class_sizes = np.take(class_sizes, classes)
    # expected same-class neighbor count under random labeling
    mean_r = K / (N - 1) * class_sizes
    observed_r = knn_fractions(X, y)
    #mean_r = expected_fractions(X, y)
    # best possible same-class neighbor count
    max_r = np.minimum(K, class_sizes)
    scores = (observed_r - mean_r) / (max_r - mean_r)
    return scores.mean()
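
# Minimal smoke test on synthetic data. This is an illustrative sketch, not
# part of the exSEEK pipeline: the blob parameters and the string-label cast
# below are arbitrary choices for demonstration.
if __name__ == '__main__':
    from sklearn.datasets import make_blobs

    X, y = make_blobs(n_samples=200, n_features=10, centers=3, random_state=0)
    y = y.astype(str)  # mimic string class labels, as handled by uca_score
    print('uca_score (kmeans):', uca_score(X, y, prediction_algorithm='knn'))
    print('uca_score (gmm):', uca_score(X, y, prediction_algorithm='gmm'))
    print('knn_score:', knn_score(X, convert_label_to_int(y), K=10))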