--- a +++ b/src/utils.py @@ -0,0 +1,177 @@ +import os +import sys +import math +import scanpy as sc +from scipy import stats, spatial, sparse +from scipy.linalg import norm +from sklearn.metrics.pairwise import euclidean_distances +import numpy as np +import random +import torch +import torch.nn as nn +import torch.nn.init as init +import torch.utils.data as data +from sklearn.neighbors import kneighbors_graph + +def cluster_acc(y_true, y_pred): + """ + Calculate clustering accuracy. Require scikit-learn installed + # Arguments + y: true labels, numpy.array with shape `(n_samples,)` + y_pred: predicted labels, numpy.array with shape `(n_samples,)` + # Return + accuracy, in [0,1] + """ + y_true = y_true.astype(np.int64) + assert y_pred.size == y_true.size + D = max(y_pred.max(), y_true.max()) + 1 + w = np.zeros((D, D), dtype=np.int64) + for i in range(y_pred.size): + w[y_pred[i], y_true[i]] += 1 + from sklearn.utils.linear_assignment_ import linear_assignment + ind = linear_assignment(w.max() - w) + return sum([w[i, j] for i, j in ind]) * 1.0 / y_pred.size + +def GetCluster(X, res, n): + adata0=sc.AnnData(X) + if adata0.shape[0]>200000: + np.random.seed(adata0.shape[0])#set seed + adata0=adata0[np.random.choice(adata0.shape[0],200000,replace=False)] + sc.pp.neighbors(adata0, n_neighbors=n, use_rep="X") + sc.tl.louvain(adata0,resolution=res) + Y_pred_init=adata0.obs['louvain'] + Y_pred_init=np.asarray(Y_pred_init,dtype=int) + if np.unique(Y_pred_init).shape[0]<=1: + #avoid only a cluster + exit("Error: There is only a cluster detected. The resolution:"+str(res)+"is too small, please choose a larger resolution!!") + else: + print("Estimated n_clusters is: ", np.shape(np.unique(Y_pred_init))[0]) + return(np.shape(np.unique(Y_pred_init))[0]) + +def torch_PCA(X, k, center=True, scale=False): + X = X.t() + n,p = X.size() + ones = torch.ones(n).cuda().view([n,1]) + h = ((1/n) * torch.mm(ones, ones.t())) if center else torch.zeros(n*n).view([n,n]) + H = torch.eye(n).cuda() - h + X_center = torch.mm(H.double(), X.double()) + covariance = 1/(n-1) * torch.mm(X_center.t(), X_center).view(p,p) + scaling = torch.sqrt(1/torch.diag(covariance)).double() if scale else torch.ones(p).cuda().double() + scaled_covariance = torch.mm(torch.diag(scaling).view(p,p), covariance) + eigenvalues, eigenvectors = torch.eig(scaled_covariance, True) + components = (eigenvectors[:, :k]) + #explained_variance = eigenvalues[:k, 0] + return components + +def best_map(L1,L2): + #L1 should be the groundtruth labels and L2 should be the clustering labels we got + Label1 = np.unique(L1) + nClass1 = len(Label1) + Label2 = np.unique(L2) + nClass2 = len(Label2) + nClass = np.maximum(nClass1,nClass2) + G = np.zeros((nClass,nClass)) + for i in range(nClass1): + ind_cla1 = L1 == Label1[i] + ind_cla1 = ind_cla1.astype(float) + for j in range(nClass2): + ind_cla2 = L2 == Label2[j] + ind_cla2 = ind_cla2.astype(float) + G[i,j] = np.sum(ind_cla2 * ind_cla1) + m = Munkres() + index = m.compute(-G.T) + index = np.array(index) + c = index[:,1] + newL2 = np.zeros(L2.shape) + for i in range(nClass2): + newL2[L2 == Label2[i]] = Label1[c[i]] + return newL2 + +def geneSelection(data, threshold=0, atleast=10, + yoffset=.02, xoffset=5, decay=1.5, n=None, + plot=True, markers=None, genes=None, figsize=(6,3.5), + markeroffsets=None, labelsize=10, alpha=1, verbose=1): + + if sparse.issparse(data): + zeroRate = 1 - np.squeeze(np.array((data>threshold).mean(axis=0))) + A = data.multiply(data>threshold) + A.data = np.log2(A.data) + meanExpr = np.zeros_like(zeroRate) * np.nan + detected = zeroRate < 1 + meanExpr[detected] = np.squeeze(np.array(A[:,detected].mean(axis=0))) / (1-zeroRate[detected]) + else: + zeroRate = 1 - np.mean(data>threshold, axis=0) + meanExpr = np.zeros_like(zeroRate) * np.nan + detected = zeroRate < 1 + mask = data[:,detected]>threshold + logs = np.zeros_like(data[:,detected]) * np.nan + logs[mask] = np.log2(data[:,detected][mask]) + meanExpr[detected] = np.nanmean(logs, axis=0) + + lowDetection = np.array(np.sum(data>threshold, axis=0)).squeeze() < atleast + zeroRate[lowDetection] = np.nan + meanExpr[lowDetection] = np.nan + + if n is not None: + up = 10 + low = 0 + for t in range(100): + nonan = ~np.isnan(zeroRate) + selected = np.zeros_like(zeroRate).astype(bool) + selected[nonan] = zeroRate[nonan] > np.exp(-decay*(meanExpr[nonan] - xoffset)) + yoffset + if np.sum(selected) == n: + break + elif np.sum(selected) < n: + up = xoffset + xoffset = (xoffset + low)/2 + else: + low = xoffset + xoffset = (xoffset + up)/2 + if verbose>0: + print('Chosen offset: {:.2f}'.format(xoffset)) + else: + nonan = ~np.isnan(zeroRate) + selected = np.zeros_like(zeroRate).astype(bool) + selected[nonan] = zeroRate[nonan] > np.exp(-decay*(meanExpr[nonan] - xoffset)) + yoffset + + if plot: + if figsize is not None: + plt.figure(figsize=figsize) + plt.ylim([0, 1]) + if threshold>0: + plt.xlim([np.log2(threshold), np.ceil(np.nanmax(meanExpr))]) + else: + plt.xlim([0, np.ceil(np.nanmax(meanExpr))]) + x = np.arange(plt.xlim()[0], plt.xlim()[1]+.1,.1) + y = np.exp(-decay*(x - xoffset)) + yoffset + if decay==1: + plt.text(.4, 0.2, '{} genes selected\ny = exp(-x+{:.2f})+{:.2f}'.format(np.sum(selected),xoffset, yoffset), + color='k', fontsize=labelsize, transform=plt.gca().transAxes) + else: + plt.text(.4, 0.2, '{} genes selected\ny = exp(-{:.1f}*(x-{:.2f}))+{:.2f}'.format(np.sum(selected),decay,xoffset, yoffset), + color='k', fontsize=labelsize, transform=plt.gca().transAxes) + + plt.plot(x, y, color=sns.color_palette()[1], linewidth=2) + xy = np.concatenate((np.concatenate((x[:,None],y[:,None]),axis=1), np.array([[plt.xlim()[1], 1]]))) + t = plt.matplotlib.patches.Polygon(xy, color=sns.color_palette()[1], alpha=.4) + plt.gca().add_patch(t) + + plt.scatter(meanExpr, zeroRate, s=1, alpha=alpha, rasterized=True) + if threshold==0: + plt.xlabel('Mean log2 nonzero expression') + plt.ylabel('Frequency of zero expression') + else: + plt.xlabel('Mean log2 nonzero expression') + plt.ylabel('Frequency of near-zero expression') + plt.tight_layout() + + if markers is not None and genes is not None: + if markeroffsets is None: + markeroffsets = [(0, 0) for g in markers] + for num,g in enumerate(markers): + i = np.where(genes==g)[0] + plt.scatter(meanExpr[i], zeroRate[i], s=10, color='k') + dx, dy = markeroffsets[num] + plt.text(meanExpr[i]+dx+.1, zeroRate[i]+dy, g, color='k', fontsize=labelsize) + + return selected