from pandas_plink import read_plink1_bin
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import pairwise_distances
import warnings
import modas.multiprocess as mp
import numpy as np
import pandas as pd
from collections import Counter
import struct
import subprocess
import re
warnings.filterwarnings("ignore")
def convert(g, n):
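    """Pack up to four genotype calls into a single PLINK .bed byte.

    Each sample takes two bits, first sample in the lowest-order bits:
    00 = homozygous for the first .bim allele (n[0]), 01 = missing ('NN'),
    10 = heterozygous, 11 = homozygous for the second allele. g is a list
    of genotype strings such as 'AA'; n is the [minor, major] allele pair.
    """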
c = [1, 4, 16, 64]
s = 0
for i in range(len(g)):
if g[i][0] != g[i][1]:
s += 2 * c[i]
elif g[i] == 'NN':
s += 1 * c[i]
elif g[i][0] == n[0]:
s += 0 * c[i]
else:
s += 3 * c[i]
return s
def hap2plink_bed(hap, plink):
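    """Convert a HapMap genotype file to PLINK bed/bim/fam files.

    Genotype columns start at field 12 of each HapMap row. Samples are
    written to the .fam file with missing parent/sex/phenotype codes,
    biallelic SNPs to .bim/.bed. Returns True on success, False if any
    exception is raised.
    """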
try:
with open(hap) as h, open(plink + '.bed', 'wb') as b, open(plink + '.bim', 'w') as bim, open(plink + '.fam','w') as fam:
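            # PLINK .bed magic number (0x6c, 0x1b) followed by 0x01 for SNP-major mode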
b.write(struct.pack('B', 108))
b.write(struct.pack('B', 27))
b.write(struct.pack('B', 1))
out = list()
            for line_num, l in enumerate(h):
                l = l.strip().split('\t')
                if line_num == 0:
samples = l[11:]
for s in samples:
fam.write(' '.join([s, s, '0', '0', '0', '-9']) + '\n')
else:
g_seq = ''.join(l[11:])
                    nlu_num = list(set(g_seq))
                    alleles = [x for x in nlu_num if x != 'N']
                    if len(alleles) > 2:
                        print('Warning: SNP with more than two nucleotide letters in hapmap file, skipping this SNP')
                        continue
                    if len(alleles) < 2:
                        print('Warning: monomorphic SNP (a single nucleotide letter) in hapmap file, skipping this SNP')
                        continue
                    if 'N' in nlu_num:
                        c = Counter(g_seq)
                        del c['N']
                        n = sorted(c, key=lambda x: c[x])
else:
g_seq_sort = ''.join(sorted(g_seq))
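                        # the middle character of the sorted genotype string is the major allele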
major = g_seq_sort[len(g_seq) // 2]
if g_seq_sort[0] == major:
n = [g_seq_sort[-1], major]
else:
n = [g_seq_sort[0], major]
bim.write('\t'.join([l[2], l[0], '0', l[3]] + n) + '\n')
                    # slicing clamps at the row end, so a final group of fewer
                    # than four samples is packed by the same call
                    for num in range(11, len(l), 4):
                        out.append(convert(l[num:num + 4], n))
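            # the packed bytes for all SNPs are buffered and written in a single call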
b.write(struct.pack('B'*len(out),*out))
except Exception:
return False
else:
return True
def jaccard_tri(u, v):
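    """Modified Jaccard distance between two 0/1/2-coded genotype vectors.

    Positions where the bitwise AND of the codes is nonzero (identical
    nonzero calls) form the intersection; positions where either vector
    is nonzero, counted twice when both are nonzero but differ, form the
    union. Assumes no missing calls. Returns 0 for an empty union.
    """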
u = u.astype(int)
v = v.astype(int)
a = np.logical_and(u, v)
minor_het_u = (np.bitwise_and(u, v)).astype(bool).sum()
minor_het_i = (np.bitwise_or(u, v)).astype(bool).sum() + a.sum() - minor_het_u
    return (1 - np.double(minor_het_u) / minor_het_i) if minor_het_i != 0 else 0
def optimal_cluster(g):
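    """Cluster SNPs (columns of g) with DBSCAN on jaccard_tri distances.

    A strict run (eps=0.2, min_samples=10) is kept when less than 30% of
    SNPs are labelled as noise (-1); otherwise a looser run (eps=0.35,
    min_samples=5) on the same precomputed distance matrix is returned.
    """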
dis = pairwise_distances(g.T, metric=jaccard_tri, n_jobs=-1)
cluster = DBSCAN(eps=0.2, min_samples=10, metric='precomputed').fit(dis)
if sum(cluster.labels_ == -1) < g.shape[1]*0.3:
return cluster
else:
cluster = DBSCAN(eps=0.35, min_samples=5, metric='precomputed').fit(dis)
return cluster
def cluster_PCA(g, index, colname_prefix):
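    """Run PCA on each DBSCAN cluster of SNPs within a genomic window.

    Each cluster with at least 10 SNPs is mean-centered and decomposed;
    only PC1 is kept when it explains at least 60% of the variance,
    otherwise PC1 and PC2 are kept. Returns a sample-indexed DataFrame
    of pseudo-genotype columns named after the window and cluster.
    """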
g = g.values
g_std = g - np.mean(g, axis=0)
cluster = optimal_cluster(g)
pca = PCA(n_components=3)
window_res = pd.DataFrame()
for label in np.unique(cluster.labels_):
if label != -1 and sum(cluster.labels_ == label) >= 10:
g_sub = g_std[:, cluster.labels_ == label]
try:
pca.fit(g_sub)
except np.linalg.LinAlgError:
continue
pc = pd.DataFrame(pca.transform(g_sub), index=index, columns=[colname_prefix+'_cluster'+str(int(label)+1)+'_PC'+str(i) for i in range(1,4)])
if pca.explained_variance_ratio_[0] >= 0.6:
pc = pc.iloc[:,0].to_frame()
else:
pc = pc.iloc[:,:2]
window_res = pd.concat([window_res, pc], axis=1)
return window_res
def chr_cluster_pca(G_chr, chrom, window, step):
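    """Slide a window along one chromosome and collect cluster PCs.

    Windows containing 100 or fewer SNPs are skipped. A step of 0 means
    non-overlapping windows (step = window). Column names encode the
    chromosome and the window boundaries.
    """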
chr_res = pd.DataFrame()
chr_end = G_chr.pos[-1]
start = 0
end = start+window
if step == 0:
step = window
while start < chr_end:
        G_chr_sub = G_chr.where((G_chr.pos >= start) & (G_chr.pos <= end), drop=True)
if G_chr_sub.shape[1] <= 100:
start = start + step
end = start + window
continue
        if end > chr_end:
            colname_prefix = '_'.join(['chr', chrom, str(start), str(int(chr_end))])
        else:
            colname_prefix = '_'.join(['chr', chrom, str(start), str(end)])
        cluster_pc = cluster_PCA(G_chr_sub, G_chr_sub.sample, colname_prefix)
if chr_res.empty:
chr_res = cluster_pc
else:
chr_res = pd.concat([chr_res, cluster_pc], axis=1)
start = start + step
end = start + window
return chr_res
def genome_cluster(G, window, step, threads):
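    """Run chr_cluster_pca on every chromosome in parallel.

    The thread count is capped at the number of chromosomes. The
    concatenated PCs are rescaled to the genotype-like range [0, 2]
    and rounded to three decimals.
    """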
paras = list()
if threads > np.unique(G.chrom).shape[0]:
threads = np.unique(G.chrom).shape[0]
for chrom in np.unique(G.chrom):
        G_chr = G.where(G.chrom == chrom, drop=True)
paras.append((G_chr, chrom, window, step))
res = mp.parallel(chr_cluster_pca, paras, threads)
res_pc = pd.concat(res, axis=1)
res_pc.loc[:,:] = np.around(MinMaxScaler(feature_range=(0, 2)).fit_transform(res_pc.values),decimals=3)
return res_pc
def read_genotype(geno_prefix):
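    """Read PLINK bed/bim/fam files into an xarray genotype matrix.

    Returns None when the files cannot be read.
    """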
try:
        G = read_plink1_bin(geno_prefix + '.bed', geno_prefix + '.bim', geno_prefix + '.fam', ref='a0', verbose=False)
except Exception:
return None
return G
def snp_clumping(bed, r2, out):
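    """LD-based SNP clumping through the R package bigsnpr (via rpy2).

    Installs bigsnpr (plus a bundled Matrix tarball located next to the
    modas/utils directory) on first use, clumps SNPs at the given r2
    threshold and writes the pruned genotypes to out. Returns the SNP
    counts before and after clumping.
    """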
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
import rpy2.robjects as robjects
pandas2ri.activate()
base = importr('base')
utils = importr('utils')
if not base.require('bigsnpr')[0]:
utils_path = subprocess.check_output('locate modas/utils', shell=True, text=True, encoding='utf-8')
utils_path = re.search('\n(.*site-packages.*)\n', utils_path).group(1)
if not utils_path.endswith('utils'):
utils_path = '/'.join(utils_path.split('/')[:-1])
utils.install_packages(utils_path + '/Matrix_1.6-5.tar.gz', repos=robjects.rinterface.NULL, type='source', quiet=True)
        utils.install_packages('bigsnpr', dependencies=True, repos='https://cloud.r-project.org', quiet=True)
robjects.r['options'](warn=-1)
robjects.r('options(datatable.showProgress = FALSE)')
base.sink('/dev/null')
bigsnpr = importr('bigsnpr')
g = bigsnpr.snp_readBed(bed, backingfile=base.tempfile())
g = bigsnpr.snp_attach(g)
snp_keep = bigsnpr.snp_clumping(g.rx2('genotypes'), infos_chr=g.rx2('map').rx2('chromosome'), infos_pos=g.rx2('map').rx2('physical.pos'),
thr_r2=r2, ncores=1)
g_clump = bigsnpr.subset_bigSNP(g, ind_col=snp_keep)
g_clump = bigsnpr.snp_attach(g_clump)
bigsnpr.snp_writeBed(g_clump, out)
base.sink()
return g[2].shape[0], len(snp_keep)
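
# A minimal usage sketch of the pipeline above; the file names and the
# window/step/r2 values are hypothetical examples, not MODAS defaults.
if __name__ == '__main__':
    # HapMap -> PLINK bed/bim/fam
    if hap2plink_bed('example.hmp.txt', 'example'):
        # prune SNPs in high LD (requires R and the bigsnpr package)
        snp_clumping('example.bed', 0.8, 'example_clump.bed')
        G = read_genotype('example_clump')
        if G is not None:
            # 1 Mb windows, 500 kb step, 4 worker processes
            pc = genome_cluster(G, 1000000, 500000, 4)
            pc.to_csv('example_cluster_pc.csv')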