a b/DeepMod_tools/generate_motif_pos.py
1
#!/usr/bin/env python
2
3
import os, sys, string, time
4
from collections import defaultdict;
5
import multiprocessing
6
7
8
9
def read_genome(mfafile):
10
   ref_genome = defaultdict();
11
   with open(mfafile, 'r') as mr:
12
      cur_chr = None;
13
      while True:
14
         line = mr.readline();
15
         if not line: break;
16
         line = line.strip();
17
         if len(line)==0: continue;
18
         if line[0]=='>':
19
            if not cur_chr==None:
20
               ref_genome[cur_chr] = ''.join(seqlist);
21
            cur_chr = line[1:].split()[0];
22
            seqlist = []
23
         else:
24
            seqlist.append(line.upper())
25
      ref_genome[cur_chr] = ''.join(seqlist);
26
   print("Total chr: {}".format(len(ref_genome))); sys.stdout.flush()
27
   return ref_genome
28
#ref_genome = read_genome(ref_fa);
29
30
def handle_motif_pos(run_Q):
31
   handli = 0;
32
   while not run_Q.empty():
33
      try:
34
         rgkey, ref_genome, res_folder, na_bp, curna, curmotif = run_Q.get(block=False)
35
         #print(rgkey, ref_genome, res_folder, na_bp, curna, curmotif); continue
36
      except:
37
         break;
38
39
      #curna_dict = defaultdict();
40
      #curmotif_dict = defaultdict();
41
      nafile = '%sna_%s_%s.bed' % (res_folder, rgkey, curna)
42
      motiffile = '%smotif_%s_%s.bed' % (res_folder, rgkey, curna)
43
      mw_na = open(nafile, 'w')
44
      mw_motif = open(motiffile, 'w')
45
46
      batchsize = 500000
47
      print("get motif for {}={}".format(rgkey, len(ref_genome))); sys.stdout.flush()
48
      cur_hi = 0; start_time = time.time();
49
      for na_ind in range(len(ref_genome)):
50
          cur_hi += 1
51
          if cur_hi % batchsize == 0:
52
             print('\t time consuming ({})= {} {}'.format( rgkey, cur_hi, time.time() - start_time ) )
53
             sys.stdout.flush()
54
             start_time = time.time()
55
56
          if (ref_genome[na_ind]==curna or na_bp[ref_genome[na_ind]]==curna):
57
             #curna_dict[(rgkey, na_ind, '+' if ref_genome[rgkey][na_ind]==curna else '-')] = True;
58
             mw_na.write('%s\t%s\t%s\n' % (rgkey, na_ind, '+' if ref_genome[na_ind]==curna else '-'))
59
          if ref_genome[na_ind]==curna and (not curmotif==None):
60
             for cur_mot in curmotif:
61
                 is_mot = True; mot_ind = 0;
62
                 for cur_sub_r_ind in range(na_ind - curmotif[cur_mot], na_ind + len(cur_mot) - curmotif[cur_mot] ):
63
                     if cur_sub_r_ind<0 or cur_sub_r_ind>len(ref_genome)-1: 
64
                        is_mot = False; break;
65
                     if not ref_genome[cur_sub_r_ind] == cur_mot[mot_ind]: 
66
                        is_mot = False; break;
67
                     mot_ind += 1
68
                 if is_mot:
69
                    mw_motif.write('%s\t%s\t%s\n' % (rgkey, na_ind, '+'))
70
                    mw_motif.write('%s\t%s\t%s\n' % (rgkey, na_ind+1, '-'))
71
                    break;
72
      mw_na.close(); mw_motif.close()
73
74
                   
75
76
ref_fa = 'ref/hg38.fa'
77
ref_fa = sys.argv[1]
78
res_folder = 'genome.motif/C/'
79
res_folder = sys.argv[2]+'/'
80
if not os.path.isdir(res_folder):
81
   os.system('mkdir -p '+res_folder)
82
83
curna='C'
84
curmotif={'CG':0}
85
curna = sys.argv[3];
86
curmotif = {sys.argv[4]:int(sys.argv[5])}
87
88
if len(sys.argv)>6:
89
   chrkeys = ["chr%s" % cid for cid in sys.argv[6].split(',')]
90
else:
91
   chrkeys = []
92
   for i in range(1, 23):
93
      chrkeys.append("chr%d" % i)
94
   chrkeys.append("chrX")
95
   chrkeys.append("chrY")
96
   chrkeys.append("chrM")
97
98
chrkeys = set(chrkeys)
99
100
101
na_bp = {"A":"T", \
102
         "C":"G", \
103
         "G":"C", \
104
         "T":"A", \
105
         "a":"t", \
106
         "c":"g", \
107
         "g":"c", \
108
         "t":"a", \
109
         "N":"N", \
110
         "n":"n" \
111
         }
112
113
114
ref_genome = read_genome(ref_fa);
115
116
##############################
117
pmanager = multiprocessing.Manager();
118
run_Q = pmanager.Queue();
119
for curk in chrkeys:
120
   run_Q.put((curk, ref_genome[curk], res_folder, na_bp, curna, curmotif))
121
122
mhandlers = [];
123
share_var = (run_Q, )
124
m_thread_num = len(chrkeys);
125
for i in range(m_thread_num):
126
   p = multiprocessing.Process(target=handle_motif_pos, args=share_var)
127
   p.start();
128
   mhandlers.append(p);
129
while any(p.is_alive() for p in mhandlers):
130
   try:
131
      time.sleep(1);
132
   except:
133
      time.sleep(1);
134
      continue;
135
136
137
138
139
140