b/util.py

from __future__ import division
import scipy.sparse as sp
import scipy.io
import numpy as np
import copy
from numpy import pi #scipy.pi was only an alias of numpy's pi and has been removed in recent SciPy
import sys
import pandas as pd
from os.path import join
import gzip
from scipy.io import mmwrite, mmread
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import normalized_mutual_info_score, adjusted_rand_score
from sklearn.metrics.cluster import homogeneity_score, adjusted_mutual_info_score
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler
import metric
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid", {'axes.grid': False})
matplotlib.rc('xtick', labelsize=20)
matplotlib.rc('ytick', labelsize=20)
matplotlib.rcParams.update({'font.size': 22})

def compute_inertia(a, X, norm=True):
    #within-cluster dispersion: per-cluster sum of pairwise distances,
    #divided by twice the cluster size since each pair is counted twice
    if norm:
        W = [np.sum(pairwise_distances(X[a == c, :]))/(2.*np.sum(a == c)) for c in np.unique(a)]
        return np.sum(W)
    else:
        W = [np.mean(pairwise_distances(X[a == c, :])) for c in np.unique(a)]
        return np.mean(W)

#gap statistic: compare the log inertia on uniform reference data (drawn from
#the bounding box of the data) against the log inertia on the data itself
def compute_gap(clustering, data, k_max=10, n_references=100):
    if len(data.shape) == 1:
        data = data.reshape(-1, 1)
    mins = np.min(data, axis=0)
    maxs = np.max(data, axis=0)
    reference_inertia = []
    for k in range(2, k_max+1):
        local_inertia = []
        for _ in range(n_references):
            #draw a fresh uniform reference sample for every replicate
            reference = np.random.rand(*data.shape)*(maxs-mins)+mins
            clustering.n_clusters = k
            assignments = clustering.fit_predict(reference)
            local_inertia.append(compute_inertia(assignments, reference))
        reference_inertia.append(np.mean(local_inertia))

    ondata_inertia = []
    for k in range(2, k_max+1):
        clustering.n_clusters = k
        assignments = clustering.fit_predict(data)
        ondata_inertia.append(compute_inertia(assignments, data))

    gap = np.log(reference_inertia)-np.log(ondata_inertia)
    return gap, np.log(reference_inertia), np.log(ondata_inertia)
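# Example usage (a minimal sketch; `X_embedded` is a hypothetical (cells, dims)
# array, e.g. the PCA output of one of the samplers below):
#   km = KMeans(random_state=0)
#   gap, log_ref, log_ondata = compute_gap(km, X_embedded, k_max=10, n_references=10)
#   best_k = np.argmax(gap) + 2  #candidate ks start at 2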

#scATAC data
class scATAC_Sampler(object):
    def __init__(self, name, dim=20, low=0.03, has_label=True):
        self.name = name
        self.dim = dim
        self.has_label = has_label
        #sc_mat.txt holds a (peaks, cells) count matrix with a header row and an index column
        X = pd.read_csv('datasets/%s/sc_mat.txt'%name, sep='\t', header=0, index_col=[0]).values
        if has_label:
            labels = [item.strip() for item in open('datasets/%s/label.txt'%name).readlines()]
            uniq_labels = list(np.unique(labels))
            Y = np.array([uniq_labels.index(item) for item in labels])
            #X,Y = self.filter_cells(X,Y,min_peaks=10)
            self.Y = Y
        X = self.filter_peaks(X, low)
        #TF-IDF transformation: per-cell term frequency times per-peak
        #inverse document frequency (X is still (peaks, cells) here)
        nfreqs = 1.0 * X / np.tile(np.sum(X, axis=0), (X.shape[0], 1))
        X = nfreqs * np.tile(np.log(1 + 1.0 * X.shape[1] / np.sum(X, axis=1)).reshape(-1, 1), (1, X.shape[1]))
        X = X.T #(cells, peaks)
        X = MinMaxScaler().fit_transform(X)
        #PCA transformation
        pca = PCA(n_components=dim, random_state=3456).fit(X)
        X = pca.transform(X)
        self.X = X
        self.total_size = self.X.shape[0]

    def filter_peaks(self, X, ratio):
        #keep peaks that are open in more than `ratio` of all cells
        ind = np.sum(X > 0, axis=1) > X.shape[1]*ratio
        return X[ind, :]

    def filter_cells(self, X, Y, min_peaks):
        #keep cells with more than `min_peaks` open peaks
        ind = np.sum(X > 0, axis=0) > min_peaks
        return X[:, ind], Y[ind]

    def correlation(self, X, Y, heatmap=False):
        #run KMeans on the embedding and report clustering metrics against the labels Y
        nb_classes = len(set(Y))
        print(nb_classes)
        km = KMeans(n_clusters=nb_classes, random_state=0).fit(X)
        label_kmeans = km.labels_
        purity = metric.compute_purity(label_kmeans, Y)
        nmi = normalized_mutual_info_score(Y, label_kmeans)
        ari = adjusted_rand_score(Y, label_kmeans)
        homogeneity = homogeneity_score(Y, label_kmeans)
        ami = adjusted_mutual_info_score(Y, label_kmeans)
        print('NMI = {}, ARI = {}, Purity = {}, AMI = {}, Homogeneity = {}'.format(nmi, ari, purity, ami, homogeneity))

    def train(self, batch_size):
        indx = np.random.randint(low=0, high=self.total_size, size=batch_size)
        if self.has_label:
            return self.X[indx, :], self.Y[indx]
        else:
            return self.X[indx, :]

    def load_all(self):
        if self.has_label:
            return self.X, self.Y
        else:
            return self.X
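# Example usage (a sketch; assumes datasets/<name>/sc_mat.txt and label.txt exist,
# where 'GMvsHL' is a hypothetical dataset name):
#   sampler = scATAC_Sampler('GMvsHL', dim=20)
#   X_batch, Y_batch = sampler.train(batch_size=64) #random labeled mini-batch
#   X_all, Y_all = sampler.load_all()
#   sampler.correlation(X_all, Y_all)               #prints NMI/ARI/Purity/AMI/Homogeneity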

#load data from the 10x Genomics paired ARC (Multiome) assay
class ARC_Sampler(object):
    def __init__(self, name='PBMC10k', n_components=50, scale=10000, filter_feat=True, filter_cell=False, random_seed=1234, mode=1, \
            min_rna_c=0, max_rna_c=None, min_atac_c=0, max_atac_c=None):
        #c: cell, g: gene, l: locus
        self.name = name
        self.mode = mode
        self.min_rna_c = min_rna_c
        self.max_rna_c = max_rna_c
        self.min_atac_c = min_atac_c
        self.max_atac_c = max_atac_c

        self.rna_mat, self.atac_mat, self.genes, self.peaks = self.load_data(filter_feat, filter_cell)

        #library-size normalization followed by a log transform
        self.rna_mat = self.rna_mat*scale/self.rna_mat.sum(1)
        self.rna_mat = np.log10(self.rna_mat+1)
        self.atac_mat = self.atac_mat*scale/self.atac_mat.sum(1)
        self.atac_mat = np.log10(self.atac_mat+1)

        self.rna_reducer = PCA(n_components=n_components, random_state=random_seed)
        self.rna_reducer.fit(self.rna_mat)
        self.pca_rna_mat = self.rna_reducer.transform(self.rna_mat)

        self.atac_reducer = PCA(n_components=n_components, random_state=random_seed)
        self.atac_reducer.fit(self.atac_mat)
        self.pca_atac_mat = self.atac_reducer.transform(self.atac_mat)

    def load_data(self, filter_feat, filter_cell):
        mtx_file = 'datasets/%s/matrix.mtx'%self.name
        feat_file = 'datasets/%s/features.tsv'%self.name
        barcode_file = 'datasets/%s/barcodes.tsv'%self.name
        combined_mat = mmread(mtx_file).T.tocsr() #(cells, genes+peaks)
        cells = [item.strip() for item in open(barcode_file).readlines()]
        genes = [item.split('\t')[1] for item in open(feat_file).readlines() if item.split('\t')[2]=="Gene Expression"]
        peaks = [item.split('\t')[1] for item in open(feat_file).readlines() if item.split('\t')[2]=="Peaks"]
        assert len(genes)+len(peaks) == combined_mat.shape[1]
        rna_mat = combined_mat[:, :len(genes)]
        atac_mat = combined_mat[:, len(genes):]
        print('scRNA-seq: ', rna_mat.shape, 'scATAC-seq: ', atac_mat.shape)

        if filter_feat:
            rna_mat, atac_mat, genes, peaks = self.filter_feats(rna_mat, atac_mat, genes, peaks)
            print('scRNA-seq filtered: ', rna_mat.shape, 'scATAC-seq filtered: ', atac_mat.shape)
        return rna_mat, atac_mat, genes, peaks

    def filter_feats(self, rna_mat_sp, atac_mat_sp, genes, peaks):
        #filter genes by the number of cells in which they are detected
        gene_select = np.array((rna_mat_sp>0).sum(axis=0)).squeeze() > self.min_rna_c
        if self.max_rna_c is not None:
            gene_select *= np.array((rna_mat_sp>0).sum(axis=0)).squeeze() < self.max_rna_c
        rna_mat_sp = rna_mat_sp[:, gene_select]
        genes = np.array(genes)[gene_select]
        #filter peaks the same way
        locus_select = np.array((atac_mat_sp>0).sum(axis=0)).squeeze() > self.min_atac_c
        if self.max_atac_c is not None:
            locus_select *= np.array((atac_mat_sp>0).sum(axis=0)).squeeze() < self.max_atac_c
        atac_mat_sp = atac_mat_sp[:, locus_select]
        peaks = np.array(peaks)[locus_select]
        return rna_mat_sp, atac_mat_sp, genes, peaks

    def get_batch(self, batch_size):
        idx = np.random.randint(low=0, high=self.atac_mat.shape[0], size=batch_size)
        if self.mode == 1:
            return self.pca_rna_mat[idx, :]
        elif self.mode == 2:
            return self.pca_atac_mat[idx, :]
        elif self.mode == 3:
            return np.hstack((self.pca_rna_mat, self.pca_atac_mat))[idx, :]
        else:
            print('Wrong mode!')
            sys.exit()

    def load_all(self):
        #mode: 1 only scRNA-seq, 2 only scATAC-seq, 3 both
        if self.mode == 1:
            return self.pca_rna_mat
        elif self.mode == 2:
            return self.pca_atac_mat
        elif self.mode == 3:
            return np.hstack((self.pca_rna_mat, self.pca_atac_mat))
        else:
            print('Wrong mode!')
            sys.exit()
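# Example usage (a sketch; assumes the 10x Multiome triplet
# datasets/PBMC10k/{matrix.mtx,features.tsv,barcodes.tsv}):
#   sampler = ARC_Sampler(name='PBMC10k', n_components=50, mode=3)
#   Z_all = sampler.load_all()       #(cells, 2*n_components) joint PCA embedding
#   Z_batch = sampler.get_batch(64)  #random mini-batch in the same space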

#sample continuous (Gaussian) and discrete (category) latent variables together
class Mixture_sampler(object):
    def __init__(self, nb_classes, N, dim, sd, scale=1):
        self.nb_classes = nb_classes
        self.total_size = N
        self.dim = dim
        self.sd = sd
        self.scale = scale
        np.random.seed(1024)
        #np.random.normal expects the standard deviation, not the variance
        self.X_c = self.scale*np.random.normal(0, self.sd, (self.total_size, self.dim))
        #self.X_c = self.scale*np.random.uniform(-1, 1, (self.total_size,self.dim))
        self.label_idx = np.random.randint(low=0, high=self.nb_classes, size=self.total_size)
        self.X_d = np.eye(self.nb_classes)[self.label_idx]
        self.X = np.hstack((self.X_c, self.X_d))

    def train(self, batch_size, weights=None):
        X_batch_c = self.scale*np.random.normal(0, 1, (batch_size, self.dim))
        #X_batch_c = self.scale*np.random.uniform(-1, 1, (batch_size,self.dim))
        if weights is None:
            weights = np.ones(self.nb_classes, dtype=np.float64) / float(self.nb_classes)
        label_batch_idx = np.random.choice(self.nb_classes, size=batch_size, replace=True, p=weights)
        X_batch_d = np.eye(self.nb_classes)[label_batch_idx]
        return X_batch_c, X_batch_d

    def load_all(self):
        return self.X_c, self.X_d
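# Example usage (a minimal sketch):
#   sampler = Mixture_sampler(nb_classes=10, N=10000, dim=10, sd=1)
#   z_c, z_d = sampler.train(batch_size=64)  #Gaussian part + one-hot part
#   z = np.hstack((z_c, z_d))                #concatenated latent code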

#sample continuous (Gaussian mixture) and discrete (category) latent variables together
class Mixture_sampler_v2(object):
    def __init__(self, nb_classes, N, dim, weights=None, sd=0.5):
        self.nb_classes = nb_classes
        self.total_size = N
        self.dim = dim
        np.random.seed(1024)
        if nb_classes <= dim:
            self.mean = np.random.uniform(-5, 5, size=(nb_classes, dim))
            #self.mean = np.zeros((nb_classes,dim))
            #self.mean[:,:nb_classes] = np.eye(nb_classes)
        else:
            #more classes than dimensions: place the component means on the unit circle
            if dim == 2:
                self.mean = np.array([(np.cos(2*np.pi*idx/float(self.nb_classes)), np.sin(2*np.pi*idx/float(self.nb_classes))) for idx in range(self.nb_classes)])
            else:
                self.mean = np.zeros((nb_classes, dim))
                self.mean[:, :2] = np.array([(np.cos(2*np.pi*idx/float(self.nb_classes)), np.sin(2*np.pi*idx/float(self.nb_classes))) for idx in range(self.nb_classes)])
        self.cov = [sd**2*np.eye(dim) for item in range(nb_classes)]
        if weights is None:
            weights = np.ones(self.nb_classes, dtype=np.float64) / float(self.nb_classes)
        self.Y = np.random.choice(self.nb_classes, size=N, replace=True, p=weights)
        self.X_c = np.array([np.random.multivariate_normal(mean=self.mean[i], cov=self.cov[i]) for i in self.Y], dtype='float64')
        self.X_d = np.eye(self.nb_classes)[self.Y]
        self.X = np.hstack((self.X_c, self.X_d))

    def train(self, batch_size, label=False):
        indx = np.random.randint(low=0, high=self.total_size, size=batch_size)
        if label:
            return self.X_c[indx, :], self.X_d[indx, :], self.Y[indx] #Y is 1-D, so index with a single axis
        else:
            return self.X_c[indx, :], self.X_d[indx, :]

    def get_batch(self, batch_size, weights=None):
        if weights is None:
            weights = np.ones(self.nb_classes, dtype=np.float64) / float(self.nb_classes)
        label_batch_idx = np.random.choice(self.nb_classes, size=batch_size, replace=True, p=weights)
        return self.X_c[label_batch_idx, :], self.X_d[label_batch_idx, :]

    def predict_onepoint(self, array):
        #return the component index with maximum likelihood
        from scipy.stats import multivariate_normal
        assert len(array) == self.dim
        return np.argmax([multivariate_normal.pdf(array, self.mean[idx], self.cov[idx]) for idx in range(self.nb_classes)])

    def predict_multipoints(self, arrays):
        assert arrays.shape[-1] == self.dim
        return list(map(self.predict_onepoint, arrays)) #list() so the result is reusable under Python 3

    def load_all(self):
        return self.X_c, self.X_d, self.Y #labels live in self.Y; this class has no self.label_idx
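# Example usage (a minimal sketch):
#   sampler = Mixture_sampler_v2(nb_classes=10, N=10000, dim=2)
#   z_c, z_d, y = sampler.train(batch_size=64, label=True)
#   comp = sampler.predict_onepoint(z_c[0])  #most likely mixture component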

#serve a batch drawn from up to the previous `maxsize` batches, with stochastic
#replacement of pooled items (a history buffer for stabilizing training)
class DataPool(object):
    def __init__(self, maxsize=50):
        self.maxsize = maxsize
        self.nb_batch = 0
        self.pool = []

    def __call__(self, data):
        #until the pool is full, store the batch and pass it straight through
        if self.nb_batch < self.maxsize:
            self.pool.append(data)
            self.nb_batch += 1
            return data
        if np.random.rand() > 0.5:
            #with probability 0.5, swap each item with one from a random pooled batch;
            #copy the pooled batch first so the extracted item survives the in-place replacement
            results = []
            for i in range(len(data)):
                idx = int(np.random.rand()*self.maxsize)
                results.append(copy.copy(self.pool[idx])[i])
                self.pool[idx][i] = data[i]
            return results
        else:
            return data
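# Example usage (a sketch; `fake_batch` is a hypothetical generator output,
# in the spirit of the image pool used in adversarial training):
#   pool = DataPool(maxsize=50)
#   mixed_batch = pool(fake_batch)  #the fresh batch, or one with items swapped from history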