# util.py
from __future__ import division
import scipy.sparse as sp
import scipy.io
import numpy as np
import copy
from numpy import pi  #scipy.pi is deprecated in favor of numpy.pi
import sys
import pandas as pd
from os.path import join
import gzip
from scipy.io import mmwrite, mmread
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import normalized_mutual_info_score, adjusted_rand_score
from sklearn.metrics.cluster import homogeneity_score, adjusted_mutual_info_score
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler
import metric
import matplotlib
matplotlib.use('agg')  #non-interactive backend; must be set before importing pyplot
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid", {'axes.grid': False})
matplotlib.rc('xtick', labelsize=20)
matplotlib.rc('ytick', labelsize=20)
matplotlib.rcParams.update({'font.size': 22})

def compute_inertia(a, X, norm=True):
    #within-cluster dispersion of X under the cluster assignment a
    if norm:
        W = [np.sum(pairwise_distances(X[a == c, :]))/(2.*np.sum(a == c)) for c in np.unique(a)]
        return np.sum(W)
    else:
        W = [np.mean(pairwise_distances(X[a == c, :])) for c in np.unique(a)]
        return np.mean(W)

#gap statistic (Tibshirani et al., 2001): compare the within-cluster dispersion on
#the data with that on uniform reference datasets spanning the same ranges
def compute_gap(clustering, data, k_max=10, n_references=100):
    if len(data.shape) == 1:
        data = data.reshape(-1, 1)
    mins = np.min(data, axis=0)
    maxs = np.max(data, axis=0)
    reference_inertia = []
    for k in range(2, k_max+1):
        local_inertia = []
        for _ in range(n_references):
            #draw a fresh uniform reference dataset for every repetition
            reference = np.random.rand(*data.shape)*(maxs-mins) + mins
            clustering.n_clusters = k
            assignments = clustering.fit_predict(reference)
            local_inertia.append(compute_inertia(assignments, reference))
        reference_inertia.append(np.mean(local_inertia))

    ondata_inertia = []
    for k in range(2, k_max+1):
        clustering.n_clusters = k
        assignments = clustering.fit_predict(data)
        ondata_inertia.append(compute_inertia(assignments, data))

    gap = np.log(reference_inertia) - np.log(ondata_inertia)
    return gap, np.log(reference_inertia), np.log(ondata_inertia)
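

#Illustrative sketch (not part of the original code): choosing the number of
#clusters with compute_gap on synthetic data; every name below is local to
#this example.
def _demo_compute_gap():
    X = np.random.randn(300, 5)  #synthetic data with no real cluster structure
    km = KMeans(n_clusters=2, random_state=0)
    gap, log_ref, log_data = compute_gap(km, X, k_max=6, n_references=5)
    best_k = np.argmax(gap) + 2  #compute_gap evaluates k = 2, ..., k_max
    print('estimated number of clusters:', best_k)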


#scATAC-seq data
class scATAC_Sampler(object):
    def __init__(self, name, dim=20, low=0.03, has_label=True):
        self.name = name
        self.dim = dim
        self.has_label = has_label
        #sc_mat.txt: peaks x cells count matrix
        X = pd.read_csv('datasets/%s/sc_mat.txt'%name, sep='\t', header=0, index_col=[0]).values
        if has_label:
            labels = [item.strip() for item in open('datasets/%s/label.txt'%name).readlines()]
            uniq_labels = list(np.unique(labels))
            Y = np.array([uniq_labels.index(item) for item in labels])
            #X,Y = self.filter_cells(X,Y,min_peaks=10)
            self.Y = Y
        X = self.filter_peaks(X, low)
        #TF-IDF transformation
        #term frequency: each cell's counts divided by the cell's total count
        nfreqs = 1.0 * X / np.tile(np.sum(X, axis=0), (X.shape[0], 1))
        #IDF-like weight per peak: log(1 + n_cells / total count of the peak)
        X = nfreqs * np.tile(np.log(1 + 1.0 * X.shape[1] / np.sum(X, axis=1)).reshape(-1, 1), (1, X.shape[1]))
        X = X.T #(cells, peaks)
        X = MinMaxScaler().fit_transform(X)
        #PCA transformation
        pca = PCA(n_components=dim, random_state=3456).fit(X)
        X = pca.transform(X)
        self.X = X
        self.total_size = self.X.shape[0]

    def filter_peaks(self, X, ratio):
        #keep peaks observed in more than a fraction `ratio` of all cells
        ind = np.sum(X > 0, axis=1) > X.shape[1]*ratio
        return X[ind, :]

    def filter_cells(self, X, Y, min_peaks):
        #keep cells with more than `min_peaks` accessible peaks
        ind = np.sum(X > 0, axis=0) > min_peaks
        return X[:, ind], Y[ind]

    def correlation(self, X, Y, heatmap=False):
        #evaluate a KMeans clustering of X against the reference labels Y
        nb_classes = len(set(Y))
        print(nb_classes)
        km = KMeans(n_clusters=nb_classes, random_state=0).fit(X)
        label_kmeans = km.labels_
        purity = metric.compute_purity(label_kmeans, Y)
        nmi = normalized_mutual_info_score(Y, label_kmeans)
        ari = adjusted_rand_score(Y, label_kmeans)
        homogeneity = homogeneity_score(Y, label_kmeans)
        ami = adjusted_mutual_info_score(Y, label_kmeans)
        print('NMI = {}, ARI = {}, Purity = {}, AMI = {}, Homogeneity = {}'.format(nmi, ari, purity, ami, homogeneity))

    def train(self, batch_size):
        indx = np.random.randint(low=0, high=self.total_size, size=batch_size)
        if self.has_label:
            return self.X[indx, :], self.Y[indx]
        else:
            return self.X[indx, :]

    def load_all(self):
        if self.has_label:
            return self.X, self.Y
        else:
            return self.X
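

#Illustrative sketch (not part of the original code): typical use of scATAC_Sampler.
#'GMvsHL' is a placeholder for any directory under datasets/ that provides
#sc_mat.txt (peaks x cells) and label.txt (one label per cell).
def _demo_scatac_sampler():
    sampler = scATAC_Sampler('GMvsHL', dim=20)
    X_batch, Y_batch = sampler.train(batch_size=32) #random mini-batch of PCA features
    X_all, Y_all = sampler.load_all()
    sampler.correlation(X_all, Y_all) #prints NMI/ARI/Purity/AMI/Homogeneity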


#load data from the 10x Genomics paired ARC (Multiome) technology
class ARC_Sampler(object):
    def __init__(self, name='PBMC10k', n_components=50, scale=10000, filter_feat=True, filter_cell=False, random_seed=1234, mode=1, \
        min_rna_c=0, max_rna_c=None, min_atac_c=0, max_atac_c=None):
        #c: cell, g: gene, l: locus
        self.name = name
        self.mode = mode
        self.min_rna_c = min_rna_c
        self.max_rna_c = max_rna_c
        self.min_atac_c = min_atac_c
        self.max_atac_c = max_atac_c

        self.rna_mat, self.atac_mat, self.genes, self.peaks = self.load_data(filter_feat, filter_cell)

        #library-size normalization to `scale` counts per cell, then log transform
        self.rna_mat = self.rna_mat*scale/self.rna_mat.sum(1)
        self.rna_mat = np.log10(self.rna_mat+1)
        self.atac_mat = self.atac_mat*scale/self.atac_mat.sum(1)
        self.atac_mat = np.log10(self.atac_mat+1)

        self.rna_reducer = PCA(n_components=n_components, random_state=random_seed)
        self.rna_reducer.fit(self.rna_mat)
        self.pca_rna_mat = self.rna_reducer.transform(self.rna_mat)

        self.atac_reducer = PCA(n_components=n_components, random_state=random_seed)
        self.atac_reducer.fit(self.atac_mat)
        self.pca_atac_mat = self.atac_reducer.transform(self.atac_mat)

    def load_data(self, filter_feat, filter_cell):
        mtx_file = 'datasets/%s/matrix.mtx'%self.name
        feat_file = 'datasets/%s/features.tsv'%self.name
        barcode_file = 'datasets/%s/barcodes.tsv'%self.name
        combined_mat = mmread(mtx_file).T.tocsr() #(cells, genes+peaks)
        cells = [item.strip() for item in open(barcode_file).readlines()]
        genes = [item.split('\t')[1] for item in open(feat_file).readlines() if item.split('\t')[2]=="Gene Expression"]
        peaks = [item.split('\t')[1] for item in open(feat_file).readlines() if item.split('\t')[2]=="Peaks"]
        assert len(genes)+len(peaks) == combined_mat.shape[1]
        rna_mat = combined_mat[:, :len(genes)]
        atac_mat = combined_mat[:, len(genes):]
        print('scRNA-seq: ', rna_mat.shape, 'scATAC-seq: ', atac_mat.shape)

        if filter_feat:
            rna_mat, atac_mat, genes, peaks = self.filter_feats(rna_mat, atac_mat, genes, peaks)
            print('scRNA-seq filtered: ', rna_mat.shape, 'scATAC-seq filtered: ', atac_mat.shape)
        return rna_mat, atac_mat, genes, peaks

    def filter_feats(self, rna_mat_sp, atac_mat_sp, genes, peaks):
        #keep genes detected in more than min_rna_c (and fewer than max_rna_c) cells
        gene_select = np.array((rna_mat_sp>0).sum(axis=0)).squeeze() > self.min_rna_c
        if self.max_rna_c is not None:
            gene_select *= np.array((rna_mat_sp>0).sum(axis=0)).squeeze() < self.max_rna_c
        rna_mat_sp = rna_mat_sp[:, gene_select]
        genes = np.array(genes)[gene_select]
        #keep peaks accessible in more than min_atac_c (and fewer than max_atac_c) cells
        locus_select = np.array((atac_mat_sp>0).sum(axis=0)).squeeze() > self.min_atac_c
        if self.max_atac_c is not None:
            locus_select *= np.array((atac_mat_sp>0).sum(axis=0)).squeeze() < self.max_atac_c
        atac_mat_sp = atac_mat_sp[:, locus_select]
        peaks = np.array(peaks)[locus_select]
        return rna_mat_sp, atac_mat_sp, genes, peaks

    def get_batch(self, batch_size):
        idx = np.random.randint(low=0, high=self.atac_mat.shape[0], size=batch_size)
        if self.mode == 1:
            return self.pca_rna_mat[idx, :]
        elif self.mode == 2:
            return self.pca_atac_mat[idx, :]
        elif self.mode == 3:
            return np.hstack((self.pca_rna_mat, self.pca_atac_mat))[idx, :]
        else:
            print('Wrong mode!')
            sys.exit()

    def load_all(self):
        #mode: 1 scRNA-seq only, 2 scATAC-seq only, 3 both concatenated
        if self.mode == 1:
            return self.pca_rna_mat
        elif self.mode == 2:
            return self.pca_atac_mat
        elif self.mode == 3:
            return np.hstack((self.pca_rna_mat, self.pca_atac_mat))
        else:
            print('Wrong mode!')
            sys.exit()
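

#Illustrative sketch (not part of the original code): loading paired 10x Multiome
#data; assumes datasets/PBMC10k/ holds matrix.mtx, features.tsv and barcodes.tsv
#exported by Cell Ranger ARC.
def _demo_arc_sampler():
    sampler = ARC_Sampler(name='PBMC10k', n_components=50, mode=3)
    batch = sampler.get_batch(64) #(64, 100): 50 RNA PCs and 50 ATAC PCs, concatenated
    full = sampler.load_all()     #(n_cells, 100)
    print(batch.shape, full.shape)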


#sample continuous (Gaussian) and discrete (categorical) latent variables together
class Mixture_sampler(object):
    def __init__(self, nb_classes, N, dim, sd, scale=1):
        self.nb_classes = nb_classes
        self.total_size = N
        self.dim = dim
        self.sd = sd
        self.scale = scale
        np.random.seed(1024)
        #np.random.normal takes a standard deviation, so sd**2 gives N(0, sd**4);
        #train() below samples with unit standard deviation instead
        self.X_c = self.scale*np.random.normal(0, self.sd**2, (self.total_size, self.dim))
        #self.X_c = self.scale*np.random.uniform(-1, 1, (self.total_size, self.dim))
        self.label_idx = np.random.randint(low=0, high=self.nb_classes, size=self.total_size)
        self.X_d = np.eye(self.nb_classes)[self.label_idx]
        self.X = np.hstack((self.X_c, self.X_d))

    def train(self, batch_size, weights=None):
        X_batch_c = self.scale*np.random.normal(0, 1, (batch_size, self.dim))
        #X_batch_c = self.scale*np.random.uniform(-1, 1, (batch_size, self.dim))
        if weights is None:
            weights = np.ones(self.nb_classes, dtype=np.float64) / float(self.nb_classes)
        label_batch_idx = np.random.choice(self.nb_classes, size=batch_size, replace=True, p=weights)
        X_batch_d = np.eye(self.nb_classes)[label_batch_idx]
        return X_batch_c, X_batch_d

    def load_all(self):
        return self.X_c, self.X_d
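

#Illustrative sketch (not part of the original code): drawing a latent mini-batch
#made of a Gaussian part and a one-hot categorical part.
def _demo_mixture_sampler():
    prior = Mixture_sampler(nb_classes=10, N=10000, dim=10, sd=1)
    z_c, z_d = prior.train(batch_size=64) #(64, 10) Gaussian, (64, 10) one-hot
    print(z_c.shape, z_d.shape)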


#sample continuous (Gaussian mixture) and discrete (categorical) latent variables together
class Mixture_sampler_v2(object):
    def __init__(self, nb_classes, N, dim, weights=None, sd=0.5):
        self.nb_classes = nb_classes
        self.total_size = N
        self.dim = dim
        np.random.seed(1024)
        if nb_classes <= dim:
            self.mean = np.random.uniform(-5, 5, size=(nb_classes, dim))
            #self.mean = np.zeros((nb_classes, dim))
            #self.mean[:, :nb_classes] = np.eye(nb_classes)
        else:
            #place the component means evenly on the unit circle (first two dimensions)
            if dim == 2:
                self.mean = np.array([(np.cos(2*np.pi*idx/float(self.nb_classes)), np.sin(2*np.pi*idx/float(self.nb_classes))) for idx in range(self.nb_classes)])
            else:
                self.mean = np.zeros((nb_classes, dim))
                self.mean[:, :2] = np.array([(np.cos(2*np.pi*idx/float(self.nb_classes)), np.sin(2*np.pi*idx/float(self.nb_classes))) for idx in range(self.nb_classes)])
        self.cov = [sd**2*np.eye(dim) for item in range(nb_classes)]
        if weights is None:
            weights = np.ones(self.nb_classes, dtype=np.float64) / float(self.nb_classes)
        self.Y = np.random.choice(self.nb_classes, size=N, replace=True, p=weights)
        self.X_c = np.array([np.random.multivariate_normal(mean=self.mean[i], cov=self.cov[i]) for i in self.Y], dtype='float64')
        self.X_d = np.eye(self.nb_classes)[self.Y]
        self.X = np.hstack((self.X_c, self.X_d))

    def train(self, batch_size, label=False):
        indx = np.random.randint(low=0, high=self.total_size, size=batch_size)
        if label:
            return self.X_c[indx, :], self.X_d[indx, :], self.Y[indx] #self.Y is 1-D
        else:
            return self.X_c[indx, :], self.X_d[indx, :]

    def get_batch(self, batch_size, weights=None):
        if weights is None:
            weights = np.ones(self.nb_classes, dtype=np.float64) / float(self.nb_classes)
        label_batch_idx = np.random.choice(self.nb_classes, size=batch_size, replace=True, p=weights)
        return self.X_c[label_batch_idx, :], self.X_d[label_batch_idx, :]

    def predict_onepoint(self, array):
        #return the component index with maximum likelihood
        from scipy.stats import multivariate_normal
        assert len(array) == self.dim
        return np.argmax([multivariate_normal.pdf(array, self.mean[idx], self.cov[idx]) for idx in range(self.nb_classes)])

    def predict_multipoints(self, arrays):
        assert arrays.shape[-1] == self.dim
        return list(map(self.predict_onepoint, arrays)) #list() so the result is reusable under Python 3

    def load_all(self):
        return self.X_c, self.X_d, self.Y #labels live in self.Y (v2 has no label_idx)
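

#Illustrative sketch (not part of the original code): a Gaussian-mixture prior and
#maximum-likelihood recovery of a point's component.
def _demo_mixture_sampler_v2():
    prior = Mixture_sampler_v2(nb_classes=4, N=1000, dim=2, sd=0.1)
    z_c, z_d = prior.train(batch_size=16)
    comp = prior.predict_onepoint(z_c[0]) #index of the most likely component
    print(z_c.shape, z_d.shape, comp)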


#return either the current batch or one mixed with items from up to `maxsize`
#previous batches, adding stochasticity to training
class DataPool(object):
    def __init__(self, maxsize=50):
        self.maxsize = maxsize
        self.nb_batch = 0
        self.pool = []

    def __call__(self, data):
        #`data` is a list with one item (e.g., an array) per data stream
        if self.nb_batch < self.maxsize:
            self.pool.append(data)
            self.nb_batch += 1
            return data
        if np.random.rand() > 0.5:
            #with probability 0.5, swap each item with a randomly chosen pooled one
            results = []
            for i in range(len(data)):
                idx = int(np.random.rand()*self.maxsize)
                results.append(self.pool[idx][i])
                self.pool[idx][i] = data[i]
            return results
        else:
            return data
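

#Illustrative sketch (not part of the original code): mixing current batches with a
#history of earlier ones; each call takes a list with one array per data stream.
def _demo_data_pool():
    pool = DataPool(maxsize=50)
    for _ in range(100):
        batch = [np.random.randn(8, 4), np.random.randn(8, 2)]
        batch = pool(batch) #current batch, or one with items swapped from the pool
    print(batch[0].shape, batch[1].shape)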