a b/src/evals.py
1
"""
2
Author: Ritambhara Singh, Pinar Demetci, Rebecca Santorella
3
19 February 2020
4
"""
5
import numpy as np
6
import random, math, os, sys
7
import matplotlib.pyplot as plt
8
from sklearn.preprocessing import normalize
9
from sklearn.metrics import roc_auc_score, silhouette_samples
10
from sklearn.decomposition import PCA
11
from sklearn.neighbors import KNeighborsClassifier
12
13
def calc_frac_idx(x1_mat,x2_mat):
    """
    Returns fraction closer than true match for each sample (as an array)

    For every row i of x1_mat, the Euclidean distance to every row of x2_mat
    is computed, and the number of those distances that are strictly smaller
    than the distance to row i of x2_mat (the "true match") is counted. The
    count is divided by (number of rows of x1_mat - 1).

    Parameters
    ----------
    x1_mat, x2_mat : 2-D numpy arrays with the same number of columns.
        Row i of x2_mat is treated as the true match of row i of x1_mat.

    Returns
    -------
    fracs : list of float
        One fraction per row of x1_mat.
    x : list of int
        The 1-based row indices 1..nsamp, in order.
    """
    nsamp = x1_mat.shape[0]
    # With a single sample there is nothing else to compare against; clamp the
    # denominator so the fraction is 0.0 rather than a ZeroDivisionError.
    denom = max(nsamp - 1, 1)

    fracs = []
    x = []
    for row_idx in range(nsamp):
        euc_dist = np.sqrt(np.sum(np.square(np.subtract(x1_mat[row_idx,:], x2_mat)), axis=1))
        true_nbr = euc_dist[row_idx]
        # Position of the true match in the sorted distances == number of
        # strictly smaller distances (ties do not count against the match).
        rank = int(np.count_nonzero(euc_dist < true_nbr))
        fracs.append(float(rank) / denom)
        x.append(row_idx + 1)

    return fracs,x
32
33
def calc_domainAveraged_FOSCTTM(x1_mat, x2_mat):
    """
    Outputs the FOSCTTM measure averaged over both domains.

    calc_frac_idx is evaluated once in each direction (x1 -> x2 and
    x2 -> x1); the two fractions obtained for each data point are then
    averaged. Returns a list with one averaged fraction per data point.
    """
    forward, _ = calc_frac_idx(x1_mat, x2_mat)
    backward, _ = calc_frac_idx(x2_mat, x1_mat)
    return [(fwd + bwd) / 2 for fwd, bwd in zip(forward, backward)]
45
46
def calc_sil(x1_mat,x2_mat,x1_lab,x2_lab):
    """
    Returns silhouette score for datasets with cell clusters

    The two domains (and their labels) are concatenated, a per-sample
    silhouette value is computed with sklearn's silhouette_samples, and the
    values are averaged overall and per cluster label.

    Parameters
    ----------
    x1_mat, x2_mat : arrays of samples, stacked row-wise with np.concatenate.
    x1_lab, x2_lab : cluster labels for the rows of x1_mat / x2_mat.

    Returns
    -------
    (avg, d0, d3, d7, d11, npc)
        avg is the mean over all samples. d0, d3, d7, d11 and npc are the
        means over the samples labelled 1, 2, 3, 4 and 5 respectively.
        A group with no samples yields NaN instead of raising
        ZeroDivisionError. Samples with any other label only contribute
        to avg.

    NOTE(review): calc_auc in this module groups by label values 0-4 while
    this function uses 1-5 -- confirm which encoding the callers pass.
    """
    x = np.concatenate((x1_mat,x2_mat))
    lab = np.concatenate((x1_lab,x2_lab))

    sil_score = silhouette_samples(x,lab)

    def _group_mean(label):
        # ravel so that column-vector labels still give a 1-D mask
        members = sil_score[np.ravel(lab == label)]
        return members.mean() if members.size else float("nan")

    avg = np.mean(sil_score)
    d0, d3, d7, d11, npc = (_group_mean(label) for label in (1, 2, 3, 4, 5))

    return avg,d0,d3,d7,d11,npc
83
84
def binarize_labels(label,x):
    """
    Helper function for calc_auc

    Builds a 0/1 integer array with the shape of x: entries of x equal to
    `label` become 0, all other entries become 1.

    x may be a numpy array or a plain sequence; it is converted with
    np.asarray so the comparison is element-wise (comparing a Python list
    to a scalar would yield a single False and mark nothing as a match).
    """
    labels = np.asarray(x)
    return (labels != label).astype(int)
93
    
94
def calc_auc(x1_mat, x2_mat, x1_lab, x2_lab):
    """
    calculate avg. ROC AUC scores for transformed data when there are >=2 number of clusters.

    For each row of x1_mat, the Euclidean distances to all rows of x2_mat are
    used as scores, and binarize_labels(x1_lab[row], x2_lab) supplies the
    targets (0 = same label as the row, 1 = different label). One
    roc_auc_score is computed per row.

    Parameters
    ----------
    x1_mat, x2_mat : 2-D numpy arrays with the same number of columns.
    x1_lab, x2_lab : cluster labels for the rows of x1_mat / x2_mat.

    Returns
    -------
    (avg, d0, d3, d7, d11, npc)
        avg is the mean AUC over all rows of x1_mat. d0, d3, d7, d11 and npc
        are the means over rows whose x1 label is 0, 1, 2, 3 and 4
        respectively. A group with no rows (or avg when x1_mat is empty)
        yields NaN instead of raising ZeroDivisionError.

    Any exception raised by roc_auc_score (per the scikit-learn docs, e.g.
    when the binarized targets contain a single class) propagates unchanged.

    NOTE(review): calc_sil in this module groups by label values 1-5 while
    this function uses 0-4 -- confirm which encoding the callers pass.
    """
    group_labels = (0, 1, 2, 3, 4)
    grouped = [[] for _ in group_labels]
    auc = []

    for row_idx in range(x1_mat.shape[0]):
        euc_dist = np.sqrt(np.sum(np.square(np.subtract(x1_mat[row_idx,:], x2_mat)), axis=1))
        y_true = binarize_labels(x1_lab[row_idx],x2_lab)

        auc_score = roc_auc_score(y_true, euc_dist)
        auc.append(auc_score)

        # Equality test (not a dict lookup) so float / numpy-scalar labels
        # match the integer group ids as they would in an if/elif chain.
        for bucket, group_label in zip(grouped, group_labels):
            if x1_lab[row_idx] == group_label:
                bucket.append(auc_score)
                break

    def _mean_or_nan(values):
        return sum(values) / len(values) if values else float("nan")

    avg = _mean_or_nan(auc)
    d0, d3, d7, d11, npc = (_mean_or_nan(bucket) for bucket in grouped)

    return avg,d0,d3,d7,d11,npc
134
135
def transfer_accuracy(domain1, domain2, type1, type2, n):
    """
    Metric from UnionCom: "Label Transfer Accuracy"

    A k-nearest-neighbour classifier with n neighbours is fitted on
    (domain2, type2) and used to predict labels for domain1. The predictions
    are written to "type1_predict.txt" in the current working directory via
    np.savetxt, and the fraction of predictions equal to type1 is returned.
    """
    classifier = KNeighborsClassifier(n_neighbors=n)
    classifier.fit(domain2, type2)
    type1_predict = classifier.predict(domain1)
    np.savetxt("type1_predict.txt", type1_predict)
    n_correct = sum(
        1 for predicted, actual in zip(type1_predict, type1) if predicted == actual
    )
    return n_correct / len(type1)