--- a +++ b/modas/coloc.py @@ -0,0 +1,189 @@ +import pandas as pd +import numpy as np +import bioframe as bf +from image_match.goldberg import ImageSignature +from pandas_plink import read_plink1_bin +from sklearn.cluster import DBSCAN +from scipy.spatial.distance import squareform +from scipy.cluster.hierarchy import linkage, leaves_list, cut_tree +from joblib import Parallel, delayed +from sklearn.metrics import silhouette_score, calinski_harabasz_score +import resource +import os + +resource.setrlimit(resource.RLIMIT_NOFILE, (4096, 4096)) + + +def qtl_cluster(qtl): + qtl.CHR = qtl.CHR.astype(str) + qtl = bf.cluster(qtl, cols=['CHR', 'qtl_start', 'qtl_end'], min_dist=0) + return qtl + + +def kin(g, top=2): + g = g - g.mean() + K = np.dot(g, g.T) + d = np.diag(K) + DL = np.min(d) + DU = np.max(d) + floor = np.min(K) + K = top * (K - floor) / (DU - floor) + Dmin = top * (DL - floor) / (DU - floor) + dig_index = np.eye(K.shape[0], dtype=bool) + if Dmin < 1: + K[dig_index] = (np.diag(K)-Dmin+1)/((top+1-Dmin)*0.5) + K[~dig_index] = K[~dig_index] * (1 / Dmin) + Omax = np.max(K[~dig_index]) + if Omax > top: + K[~dig_index] = K[~dig_index] * (top / Omax) + return K + + +def get_kin_info(qtl, gwas_dir, geno, pvalue): + var = pd.Series(geno.variant, index=geno.snp) + kin_info = dict() + for phe_name in qtl.phe_name.unique(): + fn = gwas_dir+'/tmp_' + phe_name + '_plink.assoc.txt' + if not os.path.exists(fn): + print('Warning: ' + fn + 'is not exist.') + continue + gwas = pd.read_csv(fn, sep='\t') + geno_sub = geno.sel(variant=var.reindex(gwas.loc[gwas.p_wald <= pvalue, 'rs']).dropna().values, drop=True) + geno_sub = pd.DataFrame(geno_sub.values, index=geno_sub.fid, columns=geno_sub.snp) + kin_res = kin(geno_sub) + ril_cluster = linkage(kin_res, method='ward') + idx = leaves_list(ril_cluster) + label = cut_tree(ril_cluster, n_clusters=2)[:, 0] + kin_info[phe_name] = dict([['kin', kin_res], ['idx', idx], ['label', label]]) + return kin_info + + +def get_ril_cluster_idx(kin1, kin2, metric): + score1 = calc_cluster_score(kin1['kin'], kin2['kin'], kin1['label'], metric) + score2 = calc_cluster_score(kin1['kin'], kin2['kin'], kin2['label'], metric) + if metric == 'silhouette': + if score1 > score2: + return kin1['idx'] + else: + return kin2['idx'] + if metric == 'calinski_harabasz': + if score1 > score2: + return kin1['idx'] + else: + return kin2['idx'] + + +def calc_cluster_score(kin1, kin2, label, metric): + if metric == 'silhouette': + from sklearn.metrics import silhouette_score + kin1_score = silhouette_score(kin1, label) + kin2_score = silhouette_score(kin2, label) + elif metric == 'calinski_harabasz': + from sklearn.metrics import calinski_harabasz_score + kin1_score = calinski_harabasz_score(kin1, label) + kin2_score = calinski_harabasz_score(kin2, label) + return np.mean([kin1_score, kin2_score]) + + +def get_signature(g, gis): + # image_limits = gis.crop_image(g, + # lower_percentile=gis.lower_percentile, + # upper_percentile=gis.upper_percentile, + # fix_ratio=gis.fix_ratio) + # x_coords, y_coords = gis.compute_grid_points(g, + # n=gis.n, window=image_limits) + x_coords, y_coords = gis.compute_grid_points(g, n=gis.n) + avg_grey = gis.compute_mean_level(g, x_coords, y_coords, P=gis.P) + diff_mat = gis.compute_differentials(avg_grey, + diagonal_neighbors=gis.diagonal_neighbors) + gis.normalize_and_threshold(diff_mat, + identical_tolerance=gis.identical_tolerance, + n_levels=gis.n_levels) + return np.ravel(diff_mat).astype('int8') + + +def calc_image_match_score(kin_info, phe_list, metric): + gis = ImageSignature() + score = list() + for _, phe1 in enumerate(phe_list): + for phe2 in phe_list[_+1:]: + idx = get_ril_cluster_idx(kin_info[phe1], kin_info[phe2], metric) + score.append(gis.normalized_distance(get_signature(kin_info[phe1]['kin'][idx, :][:, idx], gis), get_signature(kin_info[phe2]['kin'][idx, :][:, idx], gis))) + score = squareform(score) + return score + + +def cluster_coloc(kin_info, qtl, c, metric, cls): + qtl_sub = qtl.loc[qtl.cluster==c, :] + if qtl_sub.shape[0] > 1: + phe_name = qtl_sub.phe_name.unique() + trait_dis = calc_image_match_score(kin_info, phe_name, metric) + trait_dis = np.round(trait_dis, 2) + cls.fit(trait_dis) + cls_res = pd.Series(cls.labels_, index=phe_name).to_frame().reset_index() + cls_res.columns = ['phe_name', 'label'] + cls_res = cls_res.loc[cls_res.label != -1, :] + if not cls_res.empty: + cls_res['label'] = str(c) + '_' + cls_res.label.astype(str) + return pd.DataFrame(trait_dis, index=phe_name, columns=phe_name), cls_res + else: + return pd.DataFrame(trait_dis, index=phe_name, columns=phe_name), pd.DataFrame() + else: + return pd.DataFrame(), pd.DataFrame() + + +def trait_coloc(kin_info, qtl, metric, eps, p): + cls = DBSCAN(eps=eps, min_samples=2, metric='precomputed') + # dis = list() + # coloc_res = list() + # coloc_count = 0 + # for c in qtl.cluster.unique(): + # qtl_sub = qtl.loc[qtl.cluster==c, :] + # if qtl_sub.shape[0] > 1: + # phe_name = qtl_sub.phe_name.unique() + # trait_dis = calc_image_match_score(kin_info, phe_name, metric) + # dis.append(pd.DataFrame(trait_dis, index=phe_name, columns=phe_name)) + # cls.fit(trait_dis) + # cls_res = pd.Series(cls.labels_, index=phe_name).to_frame().reset_index() + # cls_res.columns = ['phe_name', 'label'] + # cls_res = cls_res.loc[cls_res.label != -1, :] + # if not cls_res.empty: + # cls_count = cls_res['label'].value_counts() + # cls_res['label'] = cls_res['label'].replace(cls_count.index, np.arange(coloc_count + 1, coloc_count + 1 + cls_count.shape[0])) + # coloc_res.append(cls_res) + # coloc_count = coloc_count + cls_count.shape[0] + res = Parallel(n_jobs=p)(delayed(cluster_coloc)(kin_info, qtl, c, metric, cls) for c in qtl.cluster.unique()) + coloc_res = [i[1] for i in res] + dis = [i[0] for i in res] + coloc_res = pd.concat(coloc_res, axis=0) + if not coloc_res.empty: + coloc_res_count = coloc_res['label'].value_counts() + coloc_res['label'] = coloc_res['label'].replace(coloc_res_count.index, np.arange(1, coloc_res_count.shape[0] + 1)) + coloc_res = pd.merge(qtl.drop(['cluster', 'cluster_start', 'cluster_end'], axis=1), coloc_res, on='phe_name', how='left') + coloc_res = coloc_res.fillna(-1) + coloc_res = coloc_res.loc[coloc_res.label != -1, :] + dis = pd.concat(dis) + dup_index = dis.index[dis.index.duplicated()] + dup_dis = dis[dis.index.duplicated(keep=False)] + dis = dis[~dis.index.duplicated()] + for index in dup_index: + dis.loc[index, :] = dup_dis.loc[index, :].apply(lambda x:pd.Series(x[~pd.isna(x)]).min() if(pd.isna(x).sum()!=x.shape[0]) else x[0], axis=0) + dis = dis.fillna(1) + dis_pairwise = dis.stack().reset_index() + dis_pairwise.columns = ['level_0', 'level_1', 'image_match_score'] + dis_pairwise = dis_pairwise.loc[dis_pairwise.level_0 != dis_pairwise.level_1, :] + dis_pairwise['id'] = dis_pairwise.apply(lambda x: ';'.join(sorted([x['level_0'], x['level_1']])), axis=1) + dis_pairwise = dis_pairwise.drop_duplicates(subset='id') + qtl_overlap = bf.overlap(qtl[['CHR', 'qtl_start', 'qtl_end', 'SNP', 'P', 'phe_name']], qtl[['CHR', 'qtl_start', 'qtl_end', 'SNP', 'P', 'phe_name']], + cols1=['CHR', 'qtl_start', 'qtl_end'], cols2=['CHR', 'qtl_start', 'qtl_end'], how='inner', suffixes=('_1', '_2')) + qtl_overlap = qtl_overlap.loc[qtl_overlap.phe_name_1 != qtl_overlap.phe_name_2, :] + qtl_overlap['id'] = qtl_overlap.apply(lambda x: ';'.join(sorted([x['phe_name_1'], x['phe_name_2']])), axis=1) + qtl_overlap = qtl_overlap.drop_duplicates(subset='id') + coloc_pairwise_res = pd.merge(qtl_overlap, dis_pairwise, on='id') + coloc_pairwise_res = coloc_pairwise_res.drop(['id', 'level_0', 'level_1'], axis=1) + coloc_pairwise_res['coloc'] = 'No' + coloc_pairwise_res.loc[coloc_pairwise_res['image_match_score'] <= 0.2, 'coloc'] = 'Yes' + dis = dis.reindex(qtl.phe_name.unique()).reindex(qtl.phe_name.unique(), axis=1) + dis = dis.fillna(1) + return coloc_res, coloc_pairwise_res, dis +