a b/src/.ipynb_checkpoints/evals-checkpoint.py
1
"""
Author: Ritambhara Singh, Pinar Demetci, Rebecca Santorella
19 February 2020
"""
5
import numpy as np
6
import random, math, os, sys
7
import matplotlib.pyplot as plt
8
from sklearn.preprocessing import normalize
9
from sklearn.metrics import roc_auc_score, silhouette_samples
10
from sklearn.decomposition import PCA
11
12
def calc_frac_idx(x1_mat, x2_mat):
    """
    Returns fraction closer than true match for each sample (as a list)

    For every row i of x1_mat, the Euclidean distance to every row of x2_mat
    is computed and row i of x2_mat is taken as the true match. The reported
    fraction is the number of x2_mat rows strictly closer than the true
    match, divided by (number of rows in x1_mat - 1).

    Parameters
    ----------
    x1_mat : 2-D numpy array, one sample per row.
    x2_mat : 2-D numpy array with the same number of columns; row i is the
        true match of row i in x1_mat, so it needs at least as many rows
        as x1_mat.

    Returns
    -------
    fracs : list of float, one fraction per row of x1_mat.
    x : list of int, the 1-based sample indices (1..nsamp).

    Raises
    ------
    ValueError
        If the distance to the true match is NaN (its rank is undefined).
    """
    nsamp = x1_mat.shape[0]
    # With a single sample nothing can be closer than the true match;
    # clamp the denominator so the result is 0.0 instead of dividing by zero.
    denom = max(nsamp - 1, 1)

    fracs = []
    for row_idx in range(nsamp):
        euc_dist = np.sqrt(
            np.sum(np.square(np.subtract(x1_mat[row_idx, :], x2_mat)), axis=1)
        )
        true_nbr = euc_dist[row_idx]
        if np.isnan(true_nbr):
            raise ValueError(
                "distance to the true match is NaN for sample %d" % row_idx
            )
        # Position of the true match in the sorted distances == number of
        # samples strictly closer than it (ties resolve to the lowest rank).
        rank = int(np.count_nonzero(euc_dist < true_nbr))
        fracs.append(float(rank) / denom)

    x = list(range(1, nsamp + 1))
    return fracs, x
31
32
def calc_domainAveraged_FOSCTTM(x1_mat, x2_mat):
    """
    Outputs the FOSCTTM measure per sample, averaged over both domains.

    The fraction of samples closer than the true match is computed twice
    with calc_frac_idx: once from x1_mat towards x2_mat and once from
    x2_mat towards x1_mat. The two fractions of each data point are then
    averaged.

    Returns a list with one averaged fraction per sample (not a single
    scalar).
    """
    forward, _ = calc_frac_idx(x1_mat, x2_mat)
    backward, _ = calc_frac_idx(x2_mat, x1_mat)
    return [(fwd + bwd) / 2 for fwd, bwd in zip(forward, backward)]
44
45
def calc_sil(x1_mat, x2_mat, x1_lab, x2_lab):
    """
    Returns silhouette score for datasets with cell clusters

    The two domains are stacked (samples and labels alike), a silhouette
    value is computed per sample with sklearn's silhouette_samples, and the
    values are averaged overall and per cluster.

    Parameters
    ----------
    x1_mat, x2_mat : 2-D arrays of samples (rows) from the two domains.
    x1_lab, x2_lab : cluster label of every row of x1_mat / x2_mat.

    Returns
    -------
    (avg, d0, d3, d7, d11, npc)
        avg is the mean silhouette over all samples; d0, d3, d7, d11 and npc
        are the means over the samples labelled 1, 2, 3, 4 and 5
        respectively. A cluster with no samples yields NaN rather than
        raising ZeroDivisionError.

    NOTE(review): calc_auc in this file keys the same five groups on labels
    0..4 while this function uses 1..5 -- confirm which encoding callers use.
    """
    x = np.concatenate((x1_mat, x2_mat))
    lab = np.concatenate((x1_lab, x2_lab))

    sil_score = silhouette_samples(x, lab)

    def cluster_mean(cluster_id):
        # Mean silhouette of one cluster; NaN when the cluster is absent.
        members = [
            score for score, lbl in zip(sil_score, lab) if lbl == cluster_id
        ]
        return sum(members) / len(members) if members else float("nan")

    avg = np.mean(sil_score)
    d0, d3, d7, d11, npc = (cluster_mean(cid) for cid in (1, 2, 3, 4, 5))

    return avg, d0, d3, d7, d11, npc
82
83
def binarize_labels(label, x):
    """
    Helper function for calc_auc

    Builds a binary target vector from the labels in x: entries equal to
    `label` become 0, every other entry becomes 1.

    Parameters
    ----------
    label : the label treated as the "same cluster" class.
    x : 1-D sequence of labels (numpy array, list or tuple).

    Returns
    -------
    1-D integer numpy array of the same length as x.
    """
    # Coerce to an array first: comparing a plain list with `==` gives a
    # single bool, which would silently leave every entry at 1.
    labels = np.asarray(x)
    bin_lab = np.ones(len(labels), dtype=int)
    bin_lab[labels == label] = 0
    return bin_lab
92
    
93
def calc_auc(x1_mat, x2_mat, x1_lab, x2_lab):
    """
    calculate avg. ROC AUC scores for transformed data when there are >=2 number of clusters.

    For every row of x1_mat, the Euclidean distances to all rows of x2_mat
    are used as scores and the binarized x2_lab (0 where the label equals
    the row's own label, 1 elsewhere) as the target of roc_auc_score.

    Parameters
    ----------
    x1_mat, x2_mat : 2-D numpy arrays of samples (rows) from the two domains.
    x1_lab, x2_lab : cluster label of every row of x1_mat / x2_mat.

    Returns
    -------
    (avg, d0, d3, d7, d11, npc)
        avg is the mean AUC over all rows of x1_mat; d0, d3, d7, d11 and npc
        are the means over the rows labelled 0, 1, 2, 3 and 4 respectively.
        A group with no rows yields NaN rather than raising
        ZeroDivisionError, so data with fewer than five clusters is
        accepted.
    """
    nsamp = x1_mat.shape[0]

    auc = []
    for row_idx in range(nsamp):
        euc_dist = np.sqrt(
            np.sum(np.square(np.subtract(x1_mat[row_idx, :], x2_mat)), axis=1)
        )
        # Larger distance should indicate "different cluster" (target 1).
        y_true = binarize_labels(x1_lab[row_idx], x2_lab)
        auc.append(roc_auc_score(y_true, euc_dist))

    def mean_or_nan(values):
        # Plain average; NaN when there is nothing to average.
        return sum(values) / len(values) if values else float("nan")

    def cluster_mean(cluster_id):
        # Mean AUC over the x1 rows that carry the given cluster label.
        return mean_or_nan(
            [score for idx, score in enumerate(auc) if x1_lab[idx] == cluster_id]
        )

    avg = mean_or_nan(auc)
    d0, d3, d7, d11, npc = (cluster_mean(cid) for cid in (0, 1, 2, 3, 4))

    return avg, d0, d3, d7, d11, npc
133
134
def transfer_accuracy(domain1, domain2, type1, type2, n):
    """
    Metric from UnionCom: "Label Transfer Accuracy"

    Fits a k-nearest-neighbour classifier on domain2 with labels type2,
    predicts a label for every sample of domain1 and returns the fraction
    of predictions that equal the corresponding entry of type1.

    Parameters
    ----------
    domain1 : samples whose labels are predicted.
    domain2 : samples used to fit the classifier.
    type1 : true labels of domain1.
    type2 : labels of domain2.
    n : number of neighbours used by the classifier.

    Returns
    -------
    Fraction of domain1 samples whose predicted label matches type1.

    Side effects
    ------------
    Writes the predicted labels to "type1_predict.txt" in the current
    working directory.
    """
    # Imported here because KNeighborsClassifier is not among the
    # module-level imports visible at the top of this file; without this
    # the call below raises NameError.
    from sklearn.neighbors import KNeighborsClassifier

    knn = KNeighborsClassifier(n_neighbors=n)
    knn.fit(domain2, type2)
    type1_predict = knn.predict(domain1)
    np.savetxt("type1_predict.txt", type1_predict)

    count = sum(
        1 for predicted, actual in zip(type1_predict, type1) if predicted == actual
    )
    return count / len(type1)