--- a/util.py
+++ b/util.py
@@ -0,0 +1,307 @@
+from __future__ import division
+import scipy.sparse as sp
+import scipy.io
+import numpy as np
+import copy
+import sys
+import pandas as pd
+from os.path import join
+import gzip
+from scipy.io import mmwrite,mmread
+from scipy.stats import multivariate_normal
+from sklearn.decomposition import PCA,TruncatedSVD
+from sklearn.metrics.pairwise import cosine_similarity
+from sklearn.metrics import pairwise_distances
+from sklearn.cluster import KMeans
+from sklearn.metrics.cluster import normalized_mutual_info_score, adjusted_rand_score
+from sklearn.metrics.cluster import homogeneity_score, adjusted_mutual_info_score
+from sklearn.preprocessing import MinMaxScaler,MaxAbsScaler
+import metric
+import matplotlib
+matplotlib.use('agg')
+import matplotlib.pyplot as plt
+import seaborn as sns
+sns.set_style("whitegrid", {'axes.grid' : False})
+matplotlib.rc('xtick', labelsize=20)
+matplotlib.rc('ytick', labelsize=20)
+matplotlib.rcParams.update({'font.size': 22})
+
+def compute_inertia(a, X, norm=True):
+    #within-cluster dispersion of X under the assignment vector a
+    if norm:
+        W = [np.sum(pairwise_distances(X[a == c, :]))/(2.*sum(a == c)) for c in np.unique(a)]
+        return np.sum(W)
+    else:
+        W = [np.mean(pairwise_distances(X[a == c, :])) for c in np.unique(a)]
+        return np.mean(W)
+
+#gap statistic: compare the log within-cluster dispersion on the data against
+#its expectation under uniform reference datasets drawn over the same range
+def compute_gap(clustering, data, k_max=10, n_references=100):
+    if len(data.shape) == 1:
+        data = data.reshape(-1, 1)
+    mins = np.min(data, axis=0)
+    maxs = np.max(data, axis=0)
+    reference_inertia = []
+    for k in range(2, k_max+1):
+        local_inertia = []
+        for _ in range(n_references):
+            #draw a fresh uniform reference dataset for each replicate
+            reference = np.random.rand(*data.shape)*(maxs-mins)+mins
+            clustering.n_clusters = k
+            assignments = clustering.fit_predict(reference)
+            local_inertia.append(compute_inertia(assignments, reference))
+        reference_inertia.append(np.mean(local_inertia))
+
+    ondata_inertia = []
+    for k in range(2, k_max+1):
+        clustering.n_clusters = k
+        assignments = clustering.fit_predict(data)
+        ondata_inertia.append(compute_inertia(assignments, data))
+
+    gap = np.log(reference_inertia)-np.log(ondata_inertia)
+    return gap, np.log(reference_inertia), np.log(ondata_inertia)
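+
+#Usage sketch (illustrative only; `embedding` stands in for any (cells, dims)
+#array, e.g. the PCA output of the samplers below). k runs from 2, so the
+#estimated number of clusters is the argmax of the gap curve plus 2:
+#
+#    gap, ref_w, ondata_w = compute_gap(KMeans(), embedding, k_max=10)
+#    best_k = np.argmax(gap) + 2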
+
+
+#scATAC data
+class scATAC_Sampler(object):
+    def __init__(self,name,dim=20,low=0.03,has_label=True):
+        #low: minimum fraction of cells in which a peak must be detected
+        self.name = name
+        self.dim = dim
+        self.has_label = has_label
+        X = pd.read_csv('datasets/%s/sc_mat.txt'%name,sep='\t',header=0,index_col=[0]).values
+        if has_label:
+            labels = [item.strip() for item in open('datasets/%s/label.txt'%name).readlines()]
+            uniq_labels = list(np.unique(labels))
+            Y = np.array([uniq_labels.index(item) for item in labels])
+            #X,Y = self.filter_cells(X,Y,min_peaks=10)
+            self.Y = Y
+        X = self.filter_peaks(X,low)
+        #TF-IDF transformation: per-cell term frequency weighted by a smoothed
+        #log inverse frequency of each peak
+        nfreqs = 1.0 * X / np.tile(np.sum(X,axis=0), (X.shape[0],1))
+        X = nfreqs * np.tile(np.log(1 + 1.0 * X.shape[1] / np.sum(X,axis=1)).reshape(-1,1), (1,X.shape[1]))
+        X = X.T #(cells, peaks)
+        X = MinMaxScaler().fit_transform(X)
+        #PCA transformation
+        pca = PCA(n_components=dim, random_state=3456).fit(X)
+        X = pca.transform(X)
+        self.X = X
+        self.total_size = self.X.shape[0]
+
+    def filter_peaks(self,X,ratio):
+        #keep peaks detected in more than a `ratio` fraction of cells
+        ind = np.sum(X>0,axis=1) > X.shape[1]*ratio
+        return X[ind,:]
+
+    def filter_cells(self,X,Y,min_peaks):
+        #keep cells with more than `min_peaks` detected peaks
+        ind = np.sum(X>0,axis=0) > min_peaks
+        return X[:,ind], Y[ind]
+
+    def correlation(self,X,Y,heatmap=False):
+        #cluster X with KMeans and report agreement with the reference labels Y
+        nb_classes = len(set(Y))
+        print(nb_classes)
+        km = KMeans(n_clusters=nb_classes,random_state=0).fit(X)
+        label_kmeans = km.labels_
+        purity = metric.compute_purity(label_kmeans, Y)
+        nmi = normalized_mutual_info_score(Y, label_kmeans)
+        ari = adjusted_rand_score(Y, label_kmeans)
+        homogeneity = homogeneity_score(Y, label_kmeans)
+        ami = adjusted_mutual_info_score(Y, label_kmeans)
+        print('NMI = {}, ARI = {}, Purity = {}, AMI = {}, Homogeneity = {}'.format(nmi,ari,purity,ami,homogeneity))
+
+    def train(self, batch_size):
+        indx = np.random.randint(low = 0, high = self.total_size, size = batch_size)
+        if self.has_label:
+            return self.X[indx, :], self.Y[indx]
+        else:
+            return self.X[indx, :]
+
+    def load_all(self):
+        if self.has_label:
+            return self.X, self.Y
+        else:
+            return self.X
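+
+#Usage sketch (illustrative only; 'forebrain' is a placeholder dataset name,
+#assuming datasets/forebrain/sc_mat.txt and datasets/forebrain/label.txt exist
+#in the layout read above):
+#
+#    sampler = scATAC_Sampler('forebrain', dim=20)
+#    X, Y = sampler.load_all()
+#    sampler.correlation(X, Y)  #prints NMI, ARI, Purity, AMI, Homogeneity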
+
+
+#load data from the 10x Genomics paired ARC technology
+class ARC_Sampler(object):
+    def __init__(self,name='PBMC10k',n_components=50,scale=10000,filter_feat=True,filter_cell=False,random_seed=1234,mode=1, \
+        min_rna_c=0,max_rna_c=None,min_atac_c=0,max_atac_c=None):
+        #c: cell, g: gene, l: locus
+        #mode: 1 only scRNA-seq, 2 only scATAC-seq, 3 both
+        self.name = name
+        self.mode = mode
+        self.min_rna_c = min_rna_c
+        self.max_rna_c = max_rna_c
+        self.min_atac_c = min_atac_c
+        self.max_atac_c = max_atac_c
+
+        self.rna_mat, self.atac_mat, self.genes, self.peaks = self.load_data(filter_feat,filter_cell)
+
+        #library-size normalize each cell to `scale` counts, then log-transform
+        self.rna_mat = self.rna_mat*scale/self.rna_mat.sum(1)
+        self.rna_mat = np.log10(self.rna_mat+1)
+        self.atac_mat = self.atac_mat*scale/self.atac_mat.sum(1)
+        self.atac_mat = np.log10(self.atac_mat+1)
+
+        self.rna_reducer = PCA(n_components=n_components, random_state=random_seed)
+        self.rna_reducer.fit(self.rna_mat)
+        self.pca_rna_mat = self.rna_reducer.transform(self.rna_mat)
+
+        self.atac_reducer = PCA(n_components=n_components, random_state=random_seed)
+        self.atac_reducer.fit(self.atac_mat)
+        self.pca_atac_mat = self.atac_reducer.transform(self.atac_mat)
+
+    def load_data(self,filter_feat,filter_cell):
+        mtx_file = 'datasets/%s/matrix.mtx'%self.name
+        feat_file = 'datasets/%s/features.tsv'%self.name
+        barcode_file = 'datasets/%s/barcodes.tsv'%self.name
+        combined_mat = mmread(mtx_file).T.tocsr() #(cells, genes+peaks)
+        cells = [item.strip() for item in open(barcode_file).readlines()]
+        genes = [item.split('\t')[1] for item in open(feat_file).readlines() if item.split('\t')[2]=="Gene Expression"]
+        peaks = [item.split('\t')[1] for item in open(feat_file).readlines() if item.split('\t')[2]=="Peaks"]
+        assert len(genes)+len(peaks) == combined_mat.shape[1]
+        rna_mat = combined_mat[:,:len(genes)]
+        atac_mat = combined_mat[:,len(genes):]
+        print('scRNA-seq: ', rna_mat.shape, 'scATAC-seq: ', atac_mat.shape)
+
+        if filter_feat:
+            rna_mat, atac_mat, genes, peaks = self.filter_feats(rna_mat, atac_mat, genes, peaks)
+            print('scRNA-seq filtered: ', rna_mat.shape, 'scATAC-seq filtered: ', atac_mat.shape)
+        return rna_mat, atac_mat, genes, peaks
+
+    def filter_feats(self, rna_mat_sp, atac_mat_sp, genes, peaks):
+        #filter genes by the number of cells in which they are detected
+        gene_select = np.array((rna_mat_sp>0).sum(axis=0)).squeeze() > self.min_rna_c
+        if self.max_rna_c is not None:
+            gene_select *= np.array((rna_mat_sp>0).sum(axis=0)).squeeze() < self.max_rna_c
+        rna_mat_sp = rna_mat_sp[:,gene_select]
+        genes = np.array(genes)[gene_select]
+        #filter peaks by the number of cells in which they are detected
+        locus_select = np.array((atac_mat_sp>0).sum(axis=0)).squeeze() > self.min_atac_c
+        if self.max_atac_c is not None:
+            locus_select *= np.array((atac_mat_sp>0).sum(axis=0)).squeeze() < self.max_atac_c
+        atac_mat_sp = atac_mat_sp[:,locus_select]
+        peaks = np.array(peaks)[locus_select]
+        return rna_mat_sp, atac_mat_sp, genes, peaks
+
+    def get_batch(self,batch_size):
+        idx = np.random.randint(low = 0, high = self.atac_mat.shape[0], size = batch_size)
+        if self.mode == 1:
+            return self.pca_rna_mat[idx,:]
+        elif self.mode == 2:
+            return self.pca_atac_mat[idx,:]
+        elif self.mode == 3:
+            return np.hstack((self.pca_rna_mat, self.pca_atac_mat))[idx,:]
+        else:
+            print('Wrong mode!')
+            sys.exit()
+
+    def load_all(self):
+        if self.mode == 1:
+            return self.pca_rna_mat
+        elif self.mode == 2:
+            return self.pca_atac_mat
+        elif self.mode == 3:
+            return np.hstack((self.pca_rna_mat, self.pca_atac_mat))
+        else:
+            print('Wrong mode!')
+            sys.exit()
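+
+#Usage sketch (illustrative only, assuming datasets/PBMC10k/ holds the
+#matrix.mtx, features.tsv and barcodes.tsv produced by Cell Ranger ARC):
+#
+#    arc = ARC_Sampler(name='PBMC10k', n_components=50, mode=3)
+#    joint = arc.load_all()   #(cells, 100): RNA PCs stacked with ATAC PCs
+#    batch = arc.get_batch(64)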
+
+
+#sample continuous (Gaussian) and discrete (category) latent variables together
+class Mixture_sampler(object):
+    def __init__(self, nb_classes, N, dim, sd, scale=1):
+        self.nb_classes = nb_classes
+        self.total_size = N
+        self.dim = dim
+        self.sd = sd
+        self.scale = scale
+        np.random.seed(1024)
+        #np.random.normal takes the standard deviation, not the variance
+        self.X_c = self.scale*np.random.normal(0, self.sd, (self.total_size,self.dim))
+        #self.X_c = self.scale*np.random.uniform(-1, 1, (self.total_size,self.dim))
+        self.label_idx = np.random.randint(low = 0, high = self.nb_classes, size = self.total_size)
+        self.X_d = np.eye(self.nb_classes)[self.label_idx]
+        self.X = np.hstack((self.X_c,self.X_d))
+
+    def train(self,batch_size,weights=None):
+        X_batch_c = self.scale*np.random.normal(0, self.sd, (batch_size,self.dim))
+        #X_batch_c = self.scale*np.random.uniform(-1, 1, (batch_size,self.dim))
+        if weights is None:
+            weights = np.ones(self.nb_classes, dtype=np.float64) / float(self.nb_classes)
+        label_batch_idx = np.random.choice(self.nb_classes, size=batch_size, replace=True, p=weights)
+        X_batch_d = np.eye(self.nb_classes)[label_batch_idx]
+        return X_batch_c, X_batch_d
+
+    def load_all(self):
+        return self.X_c, self.X_d
+
+#sample continuous (Gaussian mixture) and discrete (category) latent variables together
+class Mixture_sampler_v2(object):
+    def __init__(self, nb_classes, N, dim, weights=None,sd=0.5):
+        self.nb_classes = nb_classes
+        self.total_size = N
+        self.dim = dim
+        np.random.seed(1024)
+        if nb_classes<=dim:
+            self.mean = np.random.uniform(-5,5,size=(nb_classes, dim))
+            #self.mean = np.zeros((nb_classes,dim))
+            #self.mean[:,:nb_classes] = np.eye(nb_classes)
+        else:
+            #place the component means evenly on a circle in the first two dimensions
+            if dim==2:
+                self.mean = np.array([(np.cos(2*np.pi*idx/float(self.nb_classes)),np.sin(2*np.pi*idx/float(self.nb_classes))) for idx in range(self.nb_classes)])
+            else:
+                self.mean = np.zeros((nb_classes,dim))
+                self.mean[:,:2] = np.array([(np.cos(2*np.pi*idx/float(self.nb_classes)),np.sin(2*np.pi*idx/float(self.nb_classes))) for idx in range(self.nb_classes)])
+        self.cov = [sd**2*np.eye(dim) for _ in range(nb_classes)]
+        if weights is None:
+            weights = np.ones(self.nb_classes, dtype=np.float64) / float(self.nb_classes)
+        self.Y = np.random.choice(self.nb_classes, size=N, replace=True, p=weights)
+        self.X_c = np.array([np.random.multivariate_normal(mean=self.mean[i],cov=self.cov[i]) for i in self.Y],dtype='float64')
+        self.X_d = np.eye(self.nb_classes)[self.Y]
+        self.X = np.hstack((self.X_c,self.X_d))
+
+    def train(self, batch_size, label = False):
+        indx = np.random.randint(low = 0, high = self.total_size, size = batch_size)
+        if label:
+            return self.X_c[indx, :], self.X_d[indx, :], self.Y[indx]
+        else:
+            return self.X_c[indx, :], self.X_d[indx, :]
+
+    def get_batch(self,batch_size,weights=None):
+        if weights is None:
+            weights = np.ones(self.nb_classes, dtype=np.float64) / float(self.nb_classes)
+        label_batch_idx = np.random.choice(self.nb_classes, size=batch_size, replace=True, p=weights)
+        return self.X_c[label_batch_idx, :], self.X_d[label_batch_idx, :]
+
+    def predict_onepoint(self,array):
+        #return the index of the component with maximum likelihood
+        assert len(array) == self.dim
+        return np.argmax([multivariate_normal.pdf(array,self.mean[idx],self.cov[idx]) for idx in range(self.nb_classes)])
+
+    def predict_multipoints(self,arrays):
+        assert arrays.shape[-1] == self.dim
+        return list(map(self.predict_onepoint,arrays))
+
+    def load_all(self):
+        return self.X_c, self.X_d, self.Y
+
+#serve a batch drawn from the last `maxsize` batches, with stochastic replacement
+class DataPool(object):
+    def __init__(self, maxsize=50):
+        self.maxsize = maxsize
+        self.nb_batch = 0
+        self.pool = []
+
+    def __call__(self, data):
+        #fill the pool first; afterwards, with probability 0.5, return pooled
+        #items and swap the incoming ones into their places
+        if self.nb_batch < self.maxsize:
+            self.pool.append(data)
+            self.nb_batch += 1
+            return data
+        if np.random.rand() > 0.5:
+            results = []
+            for i in range(len(data)):
+                idx = int(np.random.rand()*self.maxsize)
+                results.append(copy.copy(self.pool[idx][i]))
+                self.pool[idx][i] = data[i]
+            return results
+        else:
+            return data
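+
+#Usage sketch (illustrative only; `fake_batch` is a hypothetical list of
+#arrays, e.g. generator outputs; DataPool mutates pooled entries in place,
+#so `data` must be a mutable sequence):
+#
+#    prior = Mixture_sampler_v2(nb_classes=10, N=10000, dim=10)
+#    z_c, z_d = prior.train(batch_size=64)
+#    pool = DataPool(maxsize=50)
+#    mixed = pool(fake_batch)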