--- a +++ b/DeepMod_tools/generate_motif_pos.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python + +import os, sys, string, time +from collections import defaultdict; +import multiprocessing + + + +def read_genome(mfafile): + ref_genome = defaultdict(); + with open(mfafile, 'r') as mr: + cur_chr = None; + while True: + line = mr.readline(); + if not line: break; + line = line.strip(); + if len(line)==0: continue; + if line[0]=='>': + if not cur_chr==None: + ref_genome[cur_chr] = ''.join(seqlist); + cur_chr = line[1:].split()[0]; + seqlist = [] + else: + seqlist.append(line.upper()) + ref_genome[cur_chr] = ''.join(seqlist); + print("Total chr: {}".format(len(ref_genome))); sys.stdout.flush() + return ref_genome +#ref_genome = read_genome(ref_fa); + +def handle_motif_pos(run_Q): + handli = 0; + while not run_Q.empty(): + try: + rgkey, ref_genome, res_folder, na_bp, curna, curmotif = run_Q.get(block=False) + #print(rgkey, ref_genome, res_folder, na_bp, curna, curmotif); continue + except: + break; + + #curna_dict = defaultdict(); + #curmotif_dict = defaultdict(); + nafile = '%sna_%s_%s.bed' % (res_folder, rgkey, curna) + motiffile = '%smotif_%s_%s.bed' % (res_folder, rgkey, curna) + mw_na = open(nafile, 'w') + mw_motif = open(motiffile, 'w') + + batchsize = 500000 + print("get motif for {}={}".format(rgkey, len(ref_genome))); sys.stdout.flush() + cur_hi = 0; start_time = time.time(); + for na_ind in range(len(ref_genome)): + cur_hi += 1 + if cur_hi % batchsize == 0: + print('\t time consuming ({})= {} {}'.format( rgkey, cur_hi, time.time() - start_time ) ) + sys.stdout.flush() + start_time = time.time() + + if (ref_genome[na_ind]==curna or na_bp[ref_genome[na_ind]]==curna): + #curna_dict[(rgkey, na_ind, '+' if ref_genome[rgkey][na_ind]==curna else '-')] = True; + mw_na.write('%s\t%s\t%s\n' % (rgkey, na_ind, '+' if ref_genome[na_ind]==curna else '-')) + if ref_genome[na_ind]==curna and (not curmotif==None): + for cur_mot in curmotif: + is_mot = True; mot_ind = 0; + for cur_sub_r_ind in range(na_ind - curmotif[cur_mot], na_ind + len(cur_mot) - curmotif[cur_mot] ): + if cur_sub_r_ind<0 or cur_sub_r_ind>len(ref_genome)-1: + is_mot = False; break; + if not ref_genome[cur_sub_r_ind] == cur_mot[mot_ind]: + is_mot = False; break; + mot_ind += 1 + if is_mot: + mw_motif.write('%s\t%s\t%s\n' % (rgkey, na_ind, '+')) + mw_motif.write('%s\t%s\t%s\n' % (rgkey, na_ind+1, '-')) + break; + mw_na.close(); mw_motif.close() + + + +ref_fa = 'ref/hg38.fa' +ref_fa = sys.argv[1] +res_folder = 'genome.motif/C/' +res_folder = sys.argv[2]+'/' +if not os.path.isdir(res_folder): + os.system('mkdir -p '+res_folder) + +curna='C' +curmotif={'CG':0} +curna = sys.argv[3]; +curmotif = {sys.argv[4]:int(sys.argv[5])} + +if len(sys.argv)>6: + chrkeys = ["chr%s" % cid for cid in sys.argv[6].split(',')] +else: + chrkeys = [] + for i in range(1, 23): + chrkeys.append("chr%d" % i) + chrkeys.append("chrX") + chrkeys.append("chrY") + chrkeys.append("chrM") + +chrkeys = set(chrkeys) + + +na_bp = {"A":"T", \ + "C":"G", \ + "G":"C", \ + "T":"A", \ + "a":"t", \ + "c":"g", \ + "g":"c", \ + "t":"a", \ + "N":"N", \ + "n":"n" \ + } + + +ref_genome = read_genome(ref_fa); + +############################## +pmanager = multiprocessing.Manager(); +run_Q = pmanager.Queue(); +for curk in chrkeys: + run_Q.put((curk, ref_genome[curk], res_folder, na_bp, curna, curmotif)) + +mhandlers = []; +share_var = (run_Q, ) +m_thread_num = len(chrkeys); +for i in range(m_thread_num): + p = multiprocessing.Process(target=handle_motif_pos, args=share_var) + p.start(); + mhandlers.append(p); +while any(p.is_alive() for p in mhandlers): + try: + time.sleep(1); + except: + time.sleep(1); + continue; + + + + + +