--- a +++ b/DeepMod_tools/hm_cluster_predict.py @@ -0,0 +1,172 @@ +#!/usr/bin/env python + +import os, sys, time +from collections import defaultdict +import glob + +import numpy as np + +from scipy import stats + +import locale +locale.setlocale(locale.LC_ALL, 'en_US') + +import tensorflow as tf + +batch_size = 4096 + +cov_thrd = 5 + +def readBed(bedfile, t_chr=None, t_start=None, t_end=None): + print('read {}'.format(bedfile)); sys.stdout.flush() + beddict = defaultdict() + with open(bedfile, 'r') as bedreader: + start_time = time.time(); + line = bedreader.readline(); + while True: + line = bedreader.readline(); + if not line: break; + + line = line.strip(); + if len(line)>20: + mchr, start_pos, end_pos, _, _, m_strand, _, _, _, true_cov, meth_perc = line.split() + start_pos = int(start_pos) + true_cov = int(true_cov) + if true_cov < cov_thrd: continue; + meth_perc = round(int(meth_perc)/100.0, 3) + if (t_chr not in [None, mchr]) or (not ((t_start==None or start_pos>=t_start) and (t_end==None or start_pos<=t_end))): + continue; + if true_cov==0: continue + beddict[(mchr, m_strand, start_pos)] = meth_perc + return beddict + +def readpredmod(predmodf, preddict, t_chr=None, t_start=None, t_end=None, cgCposdict=None): + print('read {}'.format(predmodf)); sys.stdout.flush() + with open(predmodf, 'r') as mr: + while True: + line = mr.readline(); + if not line: break; + line = line.strip(); + if len(line)>0: + lsp = line.split(); + cur_chr = lsp[0]; + cur_pos = int(lsp[1]); + cur_strand = lsp[5]; + + if not (cgCposdict==None or (cur_chr, cur_strand, cur_pos) in cgCposdict): continue + + cur_cov = int(lsp[9]); + cur_m_p = int(lsp[10]); + cur_m_c = int(lsp[11]); + if (t_chr not in [None, cur_chr]) or (not ((t_start==None or cur_pos>=t_start) and (t_end==None or cur_pos<=t_end))): + continue; + if cur_cov==0: continue; + + if (cur_chr, cur_strand, cur_pos) not in preddict: + preddict[(cur_chr, cur_strand, cur_pos)] = [cur_cov, round(cur_m_p/100.0, 3), cur_m_c, line] + else: + print("Warning_duplicate {}".format(predmodf)) + preddict[(cur_chr, cur_strand, cur_pos)][0] += cur_cov + preddict[(cur_chr, cur_strand, cur_pos)][2] += cur_m_c + if preddict[(cur_chr, cur_strand, cur_pos)][0]>0: + preddict[(cur_chr, cur_strand, cur_pos)][1] = round(preddict[(cur_chr, cur_strand, cur_pos)][2]/float(preddict[(cur_chr, cur_strand, cur_pos)][0]), 3) + + + +pred_file = sys.argv[1]+'.%s.C.bed' +save_file = sys.argv[1]+'_clusterCpG.%s.C.bed' +gmotfolder = sys.argv[2] + +mpat = 'Cg'; mposinpat=0 +stposofinterest = None; edposofinterest = None; + +nbsize = 25; +train_mod = 'DeepMod/train_mod/na12878_cluster_train_mod-keep_prob0.7-nb25-chr1/{}.cov{}.nb{}'.format(mpat, cov_thrd, nbsize) + +chrkeys = [] +for i in range(1, 23): + chrkeys.append("chr%d" % i) +chrkeys.append("chrX") +chrkeys.append("chrY") +chrkeys.append("chrM") + + +new_saver = tf.train.import_meta_graph(train_mod+'.meta') +print(new_saver); sys.stdout.flush() +with tf.Session() as sess: + print("restore model: {} {}".format(train_mod+'.meta', train_mod[:train_mod.rindex('/')+1])) + print(new_saver.restore(sess,tf.train.latest_checkpoint(train_mod[:train_mod.rindex('/')+1]))); sys.stdout.flush() + + mgraph = tf.get_default_graph() + output = mgraph.get_tensor_by_name('output:0') + X = mgraph.get_tensor_by_name('X:0') + keep_prob = mgraph.get_tensor_by_name('keep_prob:0') + + for chrofinterest in chrkeys: + #read pred + preddict = defaultdict() + + cur_cg_pos = '%s/motif_%s_C.bed' % (gmotfolder, chrofinterest) + if not os.path.isfile(cur_cg_pos): + print("Warning_motif!!! no file {}".format(cur_cg_pos)) + continue; + if not os.path.isfile(pred_file % chrofinterest): + print("Warning_pred!!! no file {}".format(pred_file % chrofinterest)) + continue; + + cgposdict = defaultdict(); + with open(cur_cg_pos, 'r') as mr: + while True: + line = mr.readline(); + if not line: break; + lsp = line.split(); + cgposdict[ (lsp[0], lsp[2], int(lsp[1]) ) ] = True + print("{}: read {} done! ".format(len(cgposdict), cur_cg_pos)); sys.stdout.flush() + readpredmod(pred_file % chrofinterest, preddict, chrofinterest, cgCposdict=cgposdict) + print("size={} vs ".format(len(preddict), len(cgposdict) )); sys.stdout.flush() + + train_data = [] + pdkeys = sorted(list( preddict.keys() )) + for cspk in pdkeys: # preddict: + if cspk not in cgposdict: + print("not in cpg warning!!! {} {}".format(chrofinterest, cspk)) + + partner_pos = (cspk[0], '-' if cspk[1]=='+' else '+', cspk[2]+1 if cspk[1]=='+' else cspk[2]-1) + cur_x = [preddict[cspk][1], preddict[partner_pos][1] if partner_pos in preddict else 0] + for pdis in range(11): + cur_x.append(0) + cur_x.append(0) + if len(train_data)<10: print("test") + for rpos in range(cspk[2]-nbsize, cspk[2]+nbsize+1): + if rpos in [cspk[2], partner_pos[2]]: continue; + + if (cspk[0], '+', rpos) in cgposdict and (cspk[0], '+', rpos) in preddict: + cur_x[int(preddict[(cspk[0], '+', rpos)][1]/0.1+0.5) + 3] += 1 + cur_x[2] += 1 + if len(train_data)<10: print("\t\t{}: {}".format((cspk[0], '+', rpos), preddict[(cspk[0], '+', rpos)])) + elif (cspk[0], '-', rpos) in cgposdict and (cspk[0], '-', rpos) in preddict: + cur_x[int(preddict[(cspk[0], '-', rpos)][1]/0.1+0.5) + 3] += 1 + cur_x[2] += 1 + if len(train_data)<10: print("\t\t{}: {}".format((cspk[0], '-', rpos), preddict[(cspk[0], '-', rpos)])) + for i in range(3, len(cur_x)): + if cur_x[2]>0: cur_x[i] = round(cur_x[i]/float(cur_x[2]), 3) + if len(train_data)<10: print('\t{}'.format(cur_x)); sys.stdout.flush() + train_data.append(cur_x) + + print("format data: data={}; {}".format(len(train_data), len(train_data[0]))); sys.stdout.flush() + + batch_data = np.array_split(train_data, int(len(train_data)/batch_size) if len(train_data)>batch_size else 2) + m_pred_new_per = [] + for i in range(len(batch_data)): + moutp = sess.run([output], feed_dict={X:batch_data[i], keep_prob:1}) + for mpind in moutp: + for curpd in mpind: + m_pred_new_per.append(curpd) + print("new per: {}, {} {} {}".format(len(pdkeys), len(train_data), len(m_pred_new_per), curpd )) + for wind in range(10): + print("'{}' <{}> {}".format(preddict[pdkeys[wind]][-1], m_pred_new_per[wind], train_data[wind])) + with open(save_file % chrofinterest, 'w') as mwriter: + for wind in range(len(pdkeys)): + mwriter.write("{} {}\n".format(preddict[pdkeys[wind]][-1], int(m_pred_new_per[wind]*100))) + +