a b/bin/DeepMod_scripts/myGetFeatureBasedPos.py
1
2
import os;
3
import sys;
4
import string;
5
import glob;
6
import time
7
import copy
8
9
import h5py
10
import numpy as np
11
import multiprocessing
12
13
from collections import defaultdict
14
from distutils.version import LooseVersion
15
16
import tempfile
17
import subprocess
18
19
import re;
20
21
from . import myCom
22
from . import myDetect
23
24
#
25
# map long reads
26
# then call another function to get feature for each base of interest
27
#
28
def mGetFeature1(moptions, sp_options, f5files):
29
   # associate signals to events
30
   f5data = myDetect.get_Event_Signals(moptions, sp_options, f5files)
31
32
   # save all sequences information
33
   if moptions['outLevel']<=myCom.OUTPUT_DEBUG: start_time = time.time();
34
   temp_fa = tempfile.NamedTemporaryFile(suffix='.fa', mode='w')
35
   f5keys = sorted(f5data.keys()); #f5keys.sort()
36
   for f5k in f5keys:
37
      temp_fa.write(''.join(['>', f5k, '\n', f5data[f5k][0], '\n']))
38
   temp_fa.flush();
39
   if moptions['outLevel']<=myCom.OUTPUT_DEBUG:
40
      end_time = time.time();
41
      print ("Write consuming time %d" % (end_time-start_time))
42
43
   # alignment using bwa-mem or minimap2
44
   temp_sam = tempfile.NamedTemporaryFile()
45
   if moptions['alignStr']=='bwa':
46
      cmd_opt = ['mem', '-x', 'ont2d', '-v', '1', '-t', '1', moptions['Ref'], temp_fa.name]
47
   else:
48
      cmd_opt = ['-ax', 'map-ont', moptions['Ref'], temp_fa.name]
49
   returncode = subprocess.call([moptions['alignStr'],]+cmd_opt, stdout=temp_sam)
50
   if not returncode==0:
51
      print ('Fatal Error!!! returncode is non-zero(%d) for "%s"' % (returncode, curcmd))
52
      errkey = "Cannot running aligment"
53
      for f5k in f5keys:
54
         sp_options["Error"][errkey].append(f5data[f5k][3])
55
      return;
56
57
   temp_fa.close();
58
   temp_sam.seek(0);
59
   # get sam information
60
   align_info = temp_sam.readlines()
61
   align_info = [str(align_info[i], 'utf-8').strip() for i in range(len(align_info))]
62
   temp_sam.close();
63
64
   sp_param = defaultdict();
65
   sp_param['f5data'] = f5data
66
67
   # for alignment
68
   f5align = defaultdict()
69
   f5keydict = defaultdict();
70
   sp_param['ref_info'] = defaultdict()
71
72
   if moptions['outLevel']<=myCom.OUTPUT_DEBUG:start_time = time.time();
73
   ilid = 0;
74
   # for each record in sam, get alignment information
75
   while ilid < len(align_info):
76
      if len(align_info[ilid])==0 or align_info[ilid][0]=='@':
77
         ilid += 1
78
         continue;
79
80
      sp_param['f5status'] = "";
81
      sp_param['line'] = align_info[ilid]
82
      qname = handle_line(moptions, sp_param, f5align)
83
      if sp_param['f5status'] == "":
84
         f5keydict[qname] = True;
85
      ilid += 1
86
87
   # for unmapped reads
88
   for f5k in f5keys:
89
      if f5k not in f5keydict:
90
         sp_options["Error"]["Not in alignment sam"].append(f5data[f5k][3])
91
92
   if moptions['outLevel']<=myCom.OUTPUT_DEBUG:
93
      end_time = time.time();
94
      print ("Get BAM consuming time %d" % (end_time-start_time))
95
96
   sp_param['f5status']= ""
97
   sp_param['line'] = ""
98
   if moptions['outLevel']<=myCom.OUTPUT_DEBUG:start_time = time.time();
99
   # handle each alignment
100
   handle_record(moptions, sp_options, sp_param, f5align, f5data)
101
   if moptions['outLevel']<=myCom.OUTPUT_DEBUG:
102
      end_time = time.time();
103
      print ("Analyze & annotate & save consuming time %d" % (end_time-start_time))
104
105
#
106
# get mapping information
107
# then call another function to get feature of each base in a long read
108
#
109
def handle_record(moptions, sp_options, sp_param, f5align, f5data):
110
   alignkeys = list(f5align.keys());
111
   # alignment detail
112
   numreg = re.compile('\d+')
113
   mdireg = re.compile('[MIDNSHPX=]{1}')
114
115
   feat_file_ind_dict = []
116
   feat_list = None; feat_file_ind = 0
117
   start_c_time = time.time();
118
119
   for readk in alignkeys:
120
     if len(feat_file_ind_dict)>0 and feat_list.nbytes > moptions['size_per_batch']:
121
        # save features when the size is larger than the defined size
122
        cur_feat_file_base = sp_options['ctfolder'] + '/'+str(feat_file_ind)
123
        np.savetxt(cur_feat_file_base+'.xy.gz', feat_list, fmt='%.3f')
124
        with open(cur_feat_file_base+'.xy.ind', 'w') as ind_mw:
125
            for f_ind in feat_file_ind_dict:
126
               ind_mw.write('%d %s\n' % (f_ind[1], f_ind[0]))
127
        print ("\t%s-%d Total consuming time %d" % (sp_options['ctfolder'][sp_options['ctfolder'].rfind('/'):], feat_file_ind, time.time()-start_c_time)); sys.stdout.flush()
128
        feat_file_ind_dict = []
129
        feat_list = None;
130
        feat_file_ind += 1
131
132
     # get alignment detail
133
     mapq, flag, rname, pos, cigar, readseq = f5align[readk]
134
135
     if not ( (rname in moptions['fulmodlist'] and len(moptions['fulmodlist'][rname])>0) or \
136
        ((not moptions['anymodlist']==None) and rname in moptions['anymodlist'] and len(moptions['anymodlist'][rname])>0) or \
137
        ((not moptions['nomodlist']==None) and rname in moptions['nomodlist'] and len(moptions['nomodlist'][rname])>0) ):
138
        continue;
139
140
     # get reference sequece
141
     if rname not in sp_param['ref_info']:
142
        myDetect.getRefSeq(moptions, sp_param, rname)
143
     refseq = sp_param['ref_info'][rname]
144
145
     # mapped starting position and strand
146
     pos = pos - 1
147
     forward_reverse = '-' if flag&0x10 else '+'
148
149
     numinfo = numreg.findall(cigar);
150
     mdiinfo = mdireg.findall(cigar)
151
     numinfo = [int(numinfo[i]) for i in range(len(numinfo))] #map(int, numinfo)
152
153
     # remove clip from both tails
154
     leftclip = 0; rightclip = 0;
155
     while mdiinfo[0] in ['I', 'D', 'N', 'S', 'H', 'P', 'X']:
156
         if mdiinfo[0] in ['I', 'S', 'X']:
157
            leftclip += numinfo[0];  readseq = readseq[numinfo[0]:]
158
         if mdiinfo[0] in ['H']: leftclip += numinfo[0]
159
         if mdiinfo[0] in ['D', 'N', 'X']:
160
            pos += numinfo[0]
161
         numinfo = numinfo[1:];  mdiinfo = mdiinfo[1:]
162
     while mdiinfo[-1] in ['I', 'D', 'N', 'S', 'H', 'P', 'X']:
163
         if mdiinfo[-1] in ['I', 'S', 'X']:
164
            rightclip += numinfo[-1]; readseq = readseq[:-numinfo[-1]]
165
         if mdiinfo[-1] in ['H']: rightclip += numinfo[-1]
166
         numinfo = numinfo[:-1]; mdiinfo = mdiinfo[:-1]
167
     if forward_reverse=='+':
168
        if rightclip>0: m_event = f5data[readk][1][leftclip:-rightclip]
169
        else: m_event = f5data[readk][1][leftclip:]
170
     else:
171
        if leftclip>0: m_event = f5data[readk][1][rightclip:-leftclip]
172
        else: m_event = f5data[readk][1][rightclip:]
173
174
     # is in region of interest if provided
175
     isinreg = False;
176
     if (moptions['region'][0] in ['', None, rname]) and \
177
        (moptions['region'][1] in ['', None] or pos>moptions['region'][1]) and \
178
        (moptions['region'][2] in ['', None] or pos+len(m_event)<moptions['region'][2]):
179
        isinreg = True;
180
     if not isinreg:
181
        continue;
182
183
     # associate mapped reference positions with read positions
184
     lastmatch = None; firstmatch = None;
185
     first_match_pos = None; last_match_pos = None
186
     last_al_match = None;  first_al_match = None
187
     lasmtind = 0;
188
     base_map_info = []; #indel_groups = defaultdict()
189
     nummismatch = 0; numinsert = 0; numdel = 0;
190
     read_ind = 0;
191
     for n1ind in range(len(numinfo)):
192
        mdi = mdiinfo[n1ind];
193
        # for each mapped types
194
        for n1i in range(numinfo[n1ind]):
195
           if mdi=='M':
196
              base_map_info.append((refseq[pos], readseq[read_ind], pos, read_ind))
197
              if refseq[pos]==readseq[read_ind]:
198
                 if firstmatch==None: firstmatch = read_ind
199
                 if lastmatch==None or lastmatch<read_ind: lastmatch = read_ind; lasmtind=n1ind
200
                 if first_al_match==None: first_al_match=len(base_map_info)-1
201
                 if last_al_match==None or last_al_match<len(base_map_info): last_al_match=len(base_map_info)-1
202
                 if first_match_pos==None: first_match_pos = pos
203
                 if last_match_pos==None or last_match_pos<pos: last_match_pos = pos
204
              else: nummismatch += 1
205
              pos += 1; read_ind += 1;
206
           elif mdi =='I':
207
              base_map_info.append(('-', readseq[read_ind], pos, read_ind))
208
              read_ind += 1;
209
              numinsert += 1
210
           elif mdi == 'D':
211
              base_map_info.append((refseq[pos], '-', pos, read_ind))
212
              pos += 1;
213
              numdel += 1
214
           elif mdi == 'N':
215
              base_map_info.append((refseq[pos], '-', pos, read_ind))
216
              pos += 1;
217
              if moptions['outLevel']<=myCom.OUTPUT_WARNING:
218
                 print ('CIGAR-Error N exist', f5data[readk][3])
219
           elif mdi == 'S':
220
              read_ind += 1;
221
              if moptions['outLevel']<=myCom.OUTPUT_WARNING:
222
                 print ('CIGAR-Error!!! S in the middle of the sequence', f5data[readk][3])
223
           elif mdi == 'H':
224
              if moptions['outLevel']<=myCom.OUTPUT_WARNING:
225
                 print ('CIGAR-Error!!! H in the middle of the sequence', f5data[readk][3])
226
           elif mdi == 'P':
227
              if moptions['outLevel']<=myCom.OUTPUT_WARNING:
228
                 print ('CIGAR-Error!!! P exist', f5data[readk][3])
229
           elif mdi == '=':
230
             base_map_info.append((refseq[pos], readseq[read_ind], pos, read_ind))
231
             if first_match_pos==None: first_match_pos  = pos
232
             if last_match_pos==None or last_match_pos<pos: last_match_pos = pos
233
             pos += 1; read_ind += 1;
234
             if firstmatch==None: firstmatch = read_ind - 1
235
             if lastmatch==None or lastmatch<read_ind-1: lastmatch = read_ind - 1; lasmtind=n1ind
236
             if last_al_match==None or last_al_match<len(base_map_info): last_al_match=len(base_map_info)-1
237
             if first_al_match==None: first_al_match=len(base_map_info)-1
238
           elif mdi == 'X':
239
             base_map_info.append((refseq[pos], readseq[read_ind], pos, read_ind))
240
             pos += 1; read_ind += 1;
241
             nummismatch += 1
242
           else:
243
             if moptions['outLevel']<=myCom.OUTPUT_WARNING:
244
                print ('CIGAR-Error!!!', 'Warning unknow CIGAR element ' + str(numinfo[n1ind]) + ' ' + mdi, f5data[readk][3])
245
     if firstmatch==None or lastmatch==None or firstmatch<0 or lastmatch<0:
246
        if moptions['outLevel']<=myCom.OUTPUT_WARNING:
247
           print ("Errorfast5 "+f5data[readk][3])
248
           print('match-Error!!! no first and/or last match',f5data[readk][3],('firstmatch=%d' % firstmatch) if not (firstmatch==None) else "N", ('lastmatch%d' % lastmatch) if not (lastmatch==None) else "N", str(flag), rname, str(pos));
249
           print('\tf=%d, chr=%s, p=%d, c=%s, s=%s' % (flag, rname, pos, cigar, readseq))
250
           continue;
251
252
     # remove unmatch in both tails
253
     if not firstmatch==None: leftclip += firstmatch
254
     if (not lastmatch==None) and len(m_event)-lastmatch>1: rightclip += len(m_event)-lastmatch-1
255
     # remove events whose bases are not mapped.
256
     if forward_reverse=='+':
257
        if len(m_event)-lastmatch>1:
258
           m_event = m_event[firstmatch:(lastmatch+1-len(m_event))]
259
        elif firstmatch>0: m_event = m_event[firstmatch:]
260
     else:
261
        if firstmatch>0: m_event = m_event[(len(m_event)-1-lastmatch):-firstmatch]
262
        elif len(m_event)-lastmatch>1: m_event = m_event[(len(m_event)-1-lastmatch):]
263
     # print detail if unexpected error occurs
264
     if firstmatch>0 or len(base_map_info)-last_al_match>1:
265
        if moptions['outLevel']<=myCom.OUTPUT_WARNING and ((firstmatch>0) or (len(base_map_info)-last_al_match>1 and refseq[last_match_pos+1] not in ['N'])):
266
           print ("Errorfast5"+f5data[readk][3])
267
           print ('Warning!!! first not match', firstmatch, lastmatch, first_al_match, last_al_match, len(base_map_info), numinfo[lasmtind-2:(lasmtind+5)], mdiinfo[lasmtind-2:(lasmtind+5)], lasmtind, len(numinfo))
268
           print('\tref='+refseq[last_match_pos:last_match_pos+20]+"\n\tred="+readseq[lastmatch:lastmatch+20])
269
           if firstmatch>0:
270
              print('\tref='+refseq[(first_match_pos-20 if first_match_pos-20>0 else 0):first_match_pos]+"\n\tred="+readseq[(firstmatch-20 if firstmatch-20>0 else 0):firstmatch])
271
           print('\tf=%d, chr=%s, p=%d, c=%s, s=%s' % (flag, rname, pos, cigar, readseq)) # flag, rname, pos, cigar, readseq
272
273
        if len(base_map_info)-last_al_match>1:
274
           base_map_info = base_map_info[first_al_match:(last_al_match+1-len(base_map_info))]
275
        elif first_al_match>0:
276
           base_map_info = base_map_info[first_al_match:]
277
278
     # post-process mapping information
279
     base_map_info = np.array(base_map_info, dtype=[('refbase', 'U1'), ('readbase', 'U1'), ('refbasei', np.uint64), ('readbasei', np.uint64)])
280
     if forward_reverse=='-':
281
        base_map_info = np.flipud(base_map_info)
282
        for bmii in range(len(base_map_info)):
283
            base_map_info['refbase'][bmii]  = get_complement(base_map_info['refbase'][bmii])
284
            base_map_info['readbase'][bmii] = get_complement(base_map_info['readbase'][bmii])
285
        leftclip, rightclip = rightclip, leftclip
286
     if False: #True: # for test base_map_info  ### for check consistency
287
        ref_align_key = '/Analyses/NanomoCorrected_000/BaseCalled_template/Alignment/genome_alignment'
288
        read_align_key = '/Analyses/NanomoCorrected_000/BaseCalled_template/Alignment/read_alignment'
289
        with h5py.File(f5data[readk][3], 'r') as mf5:
290
           read_align_list = [bt.decode(encoding="utf-8") for bt in mf5[read_align_key]]
291
           ref_align_list = [bt.decode(encoding="utf-8") for bt in mf5[ref_align_key]]
292
           for rali in range(len(read_align_list)):
293
              if not read_align_list[rali]==base_map_info['readbase'][rali]:
294
                 print ("Error not equal1! %s %s %d %s" % (read_align_list[rali], base_map_info['readbase'][rali], rali, f5data[readk][3]))
295
              if not ref_align_list[rali]==base_map_info['refbase'][rali]:
296
                 print ("Error not equal2! %s %s %d %s" % (ref_align_list[rali], base_map_info['refbase'][rali], rali, f5data[readk][3]))
297
     #
298
     # handle map like
299
     # CCG    or CGG
300
     # C-G       C-G
301
     #
302
     if 'motif' in moptions and moptions['motif'][0]=='CG':
303
        for ali in range(len(base_map_info)):
304
           if base_map_info['refbase'][ali]=='C' and base_map_info['readbase'][ali]=='C':
305
              if ali+1<len(base_map_info) and base_map_info['readbase'][ali+1]=='-' and base_map_info['refbase'][ali+1]=='G':
306
                 addali = 2;
307
                 while ali + addali < len(base_map_info):
308
                     if base_map_info['readbase'][ali+addali]=='-' and base_map_info['refbase'][ali+addali]=='G': addali += 1;
309
                     else: break;
310
                 if ali + addali < len(base_map_info) and base_map_info['readbase'][ali+addali]=='G' and base_map_info['refbase'][ali+addali]=='G':
311
                    base_map_info['readbase'][ali+1], base_map_info['readbase'][ali+addali] = base_map_info['readbase'][ali+addali], base_map_info['readbase'][ali+1]
312
           if base_map_info['refbase'][ali]=='G' and base_map_info['readbase'][ali]=='G':
313
              if ali-1>-1 and base_map_info['readbase'][ali-1]=='-' and base_map_info['refbase'][ali-1]=='C':
314
                 addali = 2;
315
                 while ali - addali >-1:
316
                     if base_map_info['readbase'][ali-addali]=='-' and base_map_info['refbase'][ali-addali]=='C': addali += 1;
317
                     else: break;
318
                 if ali - addali>-1 and base_map_info['readbase'][ali-addali]=='C' and base_map_info['refbase'][ali-addali]=='C':
319
                     base_map_info['readbase'][ali-1], base_map_info['readbase'][ali-addali] = base_map_info['readbase'][ali-addali], base_map_info['readbase'][ali-1]
320
     # too short reads
321
     if len(m_event)<500:
322
         sp_options["Error"]["Less(<500) events"].append(f5data[readk][3])
323
         continue;
324
325
     # get feautre
326
     mfeatures,isdif = get_Feature(moptions, sp_options, sp_param, f5align, f5data, readk, leftclip, rightclip, base_map_info, forward_reverse, rname, first_match_pos, numinsert, numdel)
327
     if isdif and moptions['outLevel']<=myCom.OUTPUT_WARNING:
328
        print("Dif is true")
329
        print([lastmatch, firstmatch, first_match_pos, last_match_pos, first_al_match, last_al_match, lasmtind, len(base_map_info), nummismatch, numinsert, numdel, len(base_map_info)-nummismatch-numinsert-numdel])
330
331
     # merge to previously handled features of other fast5 files
332
     if len(mfeatures)>0:
333
        if len(feat_file_ind_dict)==0:
334
           feat_file_ind_dict.append((f5data[readk][3], 0));
335
           feat_list = mfeatures
336
        else:
337
           feat_file_ind_dict.append((f5data[readk][3], len(feat_list)))
338
           feat_list = np.concatenate((feat_list, mfeatures), axis=0)
339
340
   # store the last feature data.
341
   if len(feat_file_ind_dict)>0:
342
      cur_feat_file_base = sp_options['ctfolder'] + '/'+str(feat_file_ind)
343
      np.savetxt(cur_feat_file_base+'.xy.gz', feat_list, fmt='%.3f')
344
      with open(cur_feat_file_base+'.xy.ind', 'w') as ind_mw:
345
          for f_ind in feat_file_ind_dict:
346
             ind_mw.write('%d %s\n' % (f_ind[1], f_ind[0]))
347
      print ("\t%s-%d Total consuming time %d" % (sp_options['ctfolder'][sp_options['ctfolder'].rfind('/'):], feat_file_ind, time.time()-start_c_time)); sys.stdout.flush()
348
      feat_file_ind_dict = []
349
      feat_list = None;
350
      feat_file_ind += 1
351
352
#
353
# get feature for each base of interest in long reads according to raw signals and mapping information
354
#
355
def get_Feature(moptions, sp_options, sp_param, f5align, f5data, readk, start_clip, end_clip, base_map_info, forward_reverse, rname, mapped_start_pos, num_insertions, num_deletions):
356
   # event information
357
   modevents = sp_param['f5data'][readk][1]
358
   # class number, bin num and bin length
359
   clnum = 2; binnum = 50; binlen = 0.2;
360
   if forward_reverse=='+':
361
      align_ref_pos = mapped_start_pos
362
   else:
363
      align_ref_pos = mapped_start_pos + len(base_map_info) - num_insertions - 1
364
365
   # initialize feature matrix for all events.
366
   if moptions['fnum']==57:
367
      #mfeatures = np.zeros((len(modevents)-end_clip+100-(start_clip-100), (binnum+3+3+4)));
368
      mfeatures = np.zeros((len(modevents)-end_clip+100-(start_clip-100), (binnum+3+3+4)));
369
   else: mfeatures = np.zeros((len(modevents)-end_clip+100-(start_clip-100), (3+3+4)));
370
371
   # filter poor alignment
372
   checkneighbornums = [3,6]
373
   checkratios = {3:[6,5,4,2], 6:[11,10,9,3]}
374
   checkratios = {3:[6,5,4,2], 6:[12,10,9,3]}
375
   cgpos = [[], []]
376
   affectneighbor = 1; # 2;
377
   for aligni in range(len(base_map_info)):
378
      # for methylated positions and not-used adjacent positions
379
      if 'motif' in moptions and base_map_info['readbase'][aligni]==moptions['motif'][0][moptions['motif'][1]]:
380
         m_a_st = aligni-moptions['motif'][1]; m_a_end = aligni+len(moptions['motif'][0])-moptions['motif'][1]
381
         if m_a_st>-1 and m_a_end<=len(base_map_info) and ''.join(base_map_info['readbase'][m_a_st:m_a_end])==moptions['motif'][0] and (not ''.join(base_map_info['refbase'][m_a_st:m_a_end])==moptions['motif'][0]):
382
            cgpos[1].extend([(forward_reverse, base_map_info['refbasei'][addi]) for addi in range(aligni-affectneighbor if aligni-affectneighbor>-1 else 0, aligni+affectneighbor+1 if aligni+affectneighbor+1<len(base_map_info) else len(base_map_info))])
383
      if (not base_map_info['refbase'][aligni]=='-') and \
384
         (forward_reverse, base_map_info['refbasei'][aligni]) in moptions['fulmodlist'][rname]:
385
         if not base_map_info['readbase'][aligni]=='-':
386
            nextnogap = aligni + 1;
387
            while nextnogap<len(base_map_info):
388
               if not base_map_info['refbase'][nextnogap]=='-': break;
389
               nextnogap += 1
390
            iscg = False;
391
            # find gaps
392
            for checkneighbornum in checkneighbornums:
393
               if not nextnogap<len(base_map_info): continue;
394
               matchnum = 0; gapnum = 0;
395
               # get gaps for two window sizes
396
               for checki in range(aligni-checkneighbornum, aligni+checkneighbornum+1):
397
                  if checki>-1 and checki<len(base_map_info):
398
                     if base_map_info['refbase'][checki]==base_map_info['readbase'][checki]: matchnum += 1
399
                     if base_map_info['refbase'][checki]=='-' or base_map_info['readbase'][checki]=='-': gapnum += 1
400
               if gapnum<=checkratios[checkneighbornum][3]:
401
                  for addi in range(aligni-affectneighbor if aligni-affectneighbor>-1 else 0, nextnogap+affectneighbor if nextnogap+affectneighbor<len(base_map_info) else len(base_map_info)):
402
                     if addi==aligni: # for methylated positions
403
                        cgpos[0].append((forward_reverse, base_map_info['refbasei'][addi]))
404
                     else: # for non-used positions
405
                        cgpos[1].append((forward_reverse, base_map_info['refbasei'][addi]))
406
                  iscg = True; break;
407
            if iscg: continue;
408
         # add more not-used positions if more gaps exist
409
         if not base_map_info['readbase'][aligni]=='-':
410
            nextnogap = aligni
411
            for _ in range(affectneighbor):
412
               nextnogap += 1;
413
               while nextnogap<len(base_map_info['refbase']):
414
                 if not base_map_info['refbase'][nextnogap]=='-': break;
415
                 nextnogap += 1
416
            prenogap = aligni
417
            for _ in range(affectneighbor):
418
               prenogap -= 1;
419
               while prenogap>-1:
420
                  if not base_map_info['refbase'][prenogap]=='-': break;
421
                  prenogap -= 1
422
423
            read0 = aligni; read1 = aligni
424
            for _ in range(affectneighbor):
425
               read0 -= 1
426
               while read0>-1:
427
                  if base_map_info['readbase'][read0]=='-': read0 -= 1
428
                  else: break;
429
               read1 += 1
430
               while read1<len(base_map_info['readbase']):
431
                  if base_map_info['readbase'][read1]=='-': read1 += 1
432
                  else: break;
433
434
            if read0<prenogap:
435
               if read0>-1: prenogap = read0
436
               else: prenogap = 0
437
            if read1>nextnogap:
438
               if read1<len(base_map_info['readbase']): nextnogap = read1
439
               else: nextnogap = len(base_map_info['readbase'])-1
440
            if prenogap<0: prenogap = 0
441
            if not nextnogap<len(base_map_info['readbase']): nextnogap=len(base_map_info['readbase'])-1
442
            if not prenogap<len(base_map_info['readbase']): prenogap=len(base_map_info['readbase'])-1
443
            for excldi in range(prenogap, nextnogap+1):
444
               cgpos[1].append((forward_reverse, base_map_info['refbasei'][excldi]))
445
446
   print ('%s%s %d, %d >> %d %d, %d-%d=%d' % (forward_reverse, f5data[readk][3], len(cgpos[0]), len(cgpos[1]), len(modevents)-end_clip-start_clip, start_clip, len(modevents), end_clip, len(modevents)-end_clip))
447
448
   aligni = 0; isdif = False;
449
   for ie in range(start_clip-100, len(modevents)-end_clip+100):
450
      cur_row_num = ie - (start_clip-100); cur_base = ''
451
      # for aligned bases
452
      if ie>=start_clip and ie<len(modevents)-end_clip:
453
         if align_ref_pos<mapped_start_pos:
454
            print ('ERRRR align_ref_pos(%d)<mapped_start_pos(%d)' % (align_ref_pos, mapped_start_pos))
455
         while base_map_info['readbase'][aligni]=='-':
456
            if not align_ref_pos==base_map_info['refbasei'][aligni]:
457
               print ('ERRRR align_ref_pos(%d) not equal to %d' % (align_ref_pos, base_map_info['refbasei'][aligni] ))
458
            if not base_map_info['refbase'][aligni]=='-':
459
               if forward_reverse=='+': align_ref_pos += 1
460
               else: align_ref_pos -= 1
461
            aligni += 1
462
         if not base_map_info['readbase'][aligni] == modevents['model_state'][ie][2]:
463
            print ('Error Does not match', base_map_info['readbase'][aligni], modevents['model_state'][ie][2], aligni, ie)
464
            isdif = True;
465
         # the first column is the aligned reference position
466
         mfeatures[cur_row_num][0] = align_ref_pos
467
         cur_base = base_map_info['refbase'][aligni]
468
         # the second/third column is for negative/positive labels of methylation
469
         if moptions['posneg'] == 0: # for a data without any modification
470
            if ( (not moptions['anymodlist']==None) and rname in moptions['nomodlist'] and (forward_reverse, base_map_info['refbasei'][aligni]) in moptions['nomodlist'][rname] ):
471
                mfeatures[cur_row_num][1] = 1; mfeatures[cur_row_num][2] = 0
472
            elif (forward_reverse, base_map_info['refbasei'][aligni]) in moptions['fulmodlist'][rname]:
473
                mfeatures[cur_row_num][1] = 1; mfeatures[cur_row_num][2] = 0
474
            elif ((not moptions['anymodlist']==None) and rname in moptions['anymodlist'] and (forward_reverse, base_map_info['refbasei'][aligni]) in moptions['anymodlist'][rname] ):
475
                mfeatures[cur_row_num][1] = 1; mfeatures[cur_row_num][2] = 0
476
         else: # for a data with both modified and un-modified positions
477
            if (forward_reverse, base_map_info['refbasei'][aligni]) in cgpos[0] and (not base_map_info['refbase'][aligni]=='-'):
478
               mfeatures[cur_row_num][1] = 0; mfeatures[cur_row_num][2] = 1
479
            else:
480
               if (forward_reverse, base_map_info['refbasei'][aligni]) not in cgpos[1]:
481
                  if moptions['anymodlist']==None:
482
                      if moptions['nomodlist']==None or ( rname in moptions['nomodlist'] and (forward_reverse, base_map_info['refbasei'][aligni]) in moptions['nomodlist'][rname] ):
483
                         mfeatures[cur_row_num][1] = 1; mfeatures[cur_row_num][2] = 0
484
                  elif rname in moptions['anymodlist'] and (forward_reverse, base_map_info['refbasei'][aligni]) in moptions['anymodlist'][rname]:
485
                      pass
486
                  else:
487
                      if moptions['nomodlist']==None or ( rname in moptions['nomodlist'] and (forward_reverse, base_map_info['refbasei'][aligni]) in moptions['nomodlist'][rname] ):
488
                         mfeatures[cur_row_num][1] = 1; mfeatures[cur_row_num][2] = 0
489
         if not base_map_info['refbase'][aligni]=='-':
490
            if forward_reverse=='+': align_ref_pos += 1
491
            else: align_ref_pos -= 1
492
         aligni += 1
493
494
      # for bin features
495
      if ie>=0 and ie<len(modevents) and moptions['fnum']==57:
496
         for currs in sp_param['f5data'][readk][2][modevents['start'][ie]:int(modevents['start'][ie]+int(modevents['length'][ie]+0.5))]:
497
             if currs>10 or currs<-10: print ('Error raw signal', currs, ie, modevents['start'][ie], modevents['length'][ie])
498
             curbin = int((currs+5)/binlen)
499
             if curbin<0: curbin = 0
500
             elif not curbin<binnum: curbin = binnum-1
501
             mfeatures[cur_row_num][curbin+3] += 1
502
      if ie>=0 and ie<len(modevents):
503
         # for reference base type feature
504
         if cur_base in myCom.g_ACGT:
505
            mfeatures[cur_row_num][moptions['fnum']-3+3-4+myCom.g_ACGT.index(cur_base)] = 1
506
         cur_index_add = moptions['fnum'] - 3 + 3
507
         # for signal mean std and length.
508
         mfeatures[cur_row_num][cur_index_add + 0] = modevents["mean"][ie]
509
         mfeatures[cur_row_num][cur_index_add + 1] = modevents["stdv"][ie]
510
         mfeatures[cur_row_num][cur_index_add + 2] = modevents["length"][ie]
511
512
   # truncated too much not-used positions
513
   nbkeys = defaultdict();
514
   for mfi in range(len(mfeatures)):
515
      if mfeatures[mfi][1] + mfeatures[mfi][2] > 0.9:
516
         for ini in range(mfi-25, mfi+26):
517
            if ini<0 or ini>len(mfeatures)-1:
518
               print("Warning wrong del mfeatures id %d for %s" % (ini, f5data[readk][3]))
519
            else:
520
               nbkeys[ini] = True;
521
   keepInd = sorted(list(nbkeys.keys()));
522
   if len(keepInd)>0:
523
      if not len(keepInd)>len(mfeatures)*0.9:
524
         mfeatures = mfeatures[np.array(keepInd)]
525
   else:
526
      mfeatures = []
527
528
   return (mfeatures, isdif)
529
530
531
#
532
# get the complementary bases
533
#
534
def get_complement(na):
535
   if na in myCom.acgt: return myCom.na_bp[na]
536
   else: return na;
537
538
#
539
# get required information for reach mapping records.
540
#
541
def handle_line(moptions, sp_param, f5align):
542
   lsp = sp_param['line'].split('\t')
543
   qname, flag, rname, pos, mapq, cigar, _, _, _, seq, _ = lsp[:11]
544
   # checked query name
545
   if qname=='*': sp_param['f5status'] = "qname is *"
546
   # check mapping quality
547
   elif int(mapq)==255: sp_param['f5status'] = "mapq is 255"
548
   # check mapped positions
549
   elif int(pos)==0: sp_param['f5status'] = "pos is 0"
550
   # check mapped string
551
   elif cigar=='*': sp_param['f5status'] = "cigar is *"
552
   # check reference name
553
   elif rname=='*': sp_param['f5status'] = "rname is *"
554
   if not sp_param['f5status']=="": return qname
555
556
   if (qname not in f5align) or f5align[qname][0]<int(mapq):
557
      f5align[qname] = (int(mapq), int(flag), rname, int(pos), cigar, seq)
558
559
   return qname
560
561
#
562
# feature handler/workder for multiprocessing
563
#
564
def getFeature_handler(moptions, h5files_Q, failed_Q, version_Q):
565
   while not h5files_Q.empty():
566
      try:
567
         # get a list of files
568
         f5files, ctfolderid = h5files_Q.get(block=False)
569
      except:
570
         break;
571
572
      sp_options = defaultdict();
573
      sp_options['ctfolder'] = moptions['outFolder']+str(ctfolderid)
574
      if not os.path.isdir(sp_options['ctfolder']):
575
         os.system('mkdir '+sp_options['ctfolder'])
576
      # get features
577
      mGetFeature1(moptions, sp_options, f5files)
578
      # output errors
579
      for errtype, errfiles in sp_options["Error"].items():
580
         failed_Q.put((errtype, errfiles));
581
      # double check albacore version
582
      for vk in sp_options["get_albacore_version"]:
583
         version_Q.put((vk, sp_options["get_albacore_version"][vk]))
584
585
#
586
# read sequence information from a reference genome
587
#
588
def readFA(mfa, t_chr=None):
589
   fadict = defaultdict();
590
   with open(mfa, 'r') as mr:
591
      cur_chr = None;
592
      line = mr.readline();
593
      while line:
594
         # remove empty spaces
595
         line = line.strip();
596
         if len(line)>0:
597
            if line[0]=='>': # for each chromosome line
598
               if (not cur_chr==None) and (t_chr in [None, cur_chr]):
599
                  fadict[cur_chr] = ''.join(fadict[cur_chr])
600
               cur_chr = line[1:].split()[0]
601
               if t_chr in [None, cur_chr]:
602
                  fadict[cur_chr] = []
603
            else: # for sub-sequence line in a reference file
604
               if t_chr in [None, cur_chr]:
605
                  fadict[cur_chr].append(line.upper())
606
         line = mr.readline();
607
      # for the last chromosome in the file
608
      if (not cur_chr==None) and (t_chr in [None, cur_chr]):
609
         fadict[cur_chr] = ''.join(fadict[cur_chr])
610
   return fadict
611
612
#
613
# get reference positions for motif-based modifications
614
#
615
def readMotifMod(fadict, mpat='Cg', mposinpat=0, t_chr=None, t_start=None, t_end=None):
616
   pos_dict = defaultdict(int)
617
618
   # get motif and complementary motif
619
   pat3 = copy.deepcopy(mpat.upper())
620
   comp_pat3 = ''.join([get_complement(curna) for curna in pat3][::-1])
621
   comp_mposinpat = len(comp_pat3)-1-mposinpat
622
623
   fakeys = fadict.keys();
624
   cpgdict = defaultdict(int);
625
   all_a = defaultdict()
626
   for fak in fakeys:
627
       cpgnum = [0, 0]
628
       # motif-based reference positions
629
       cpgdict[fak] = defaultdict()
630
       # position of bases of interest
631
       all_a[fak] = defaultdict()
632
       for i in range(len(fadict[fak])):
633
          if (t_start==None or i>=t_start) and (t_end==None or i<=t_end):
634
             if fadict[fak][i]==mpat[mposinpat]: # for forward strand
635
                all_a[fak][('+', i)] = True;
636
             elif get_complement(fadict[fak][i])==mpat[mposinpat]: # for reverse strand
637
                all_a[fak][('-', i)] = True;
638
639
             # check motif in forward strand
640
             if i-mposinpat>=0 and i+len(comp_pat3)-1-mposinpat<len(fadict[fak]) and ''.join(fadict[fak][i-mposinpat:(i+len(comp_pat3)-1-mposinpat+1)])==pat3:
641
                cpgdict[fak][('+', i)] = [1, fadict[fak][i]]; cpgnum[0] += 1
642
             elif i-comp_mposinpat>=0 and i+len(comp_pat3)-1-comp_mposinpat<len(fadict[fak]) and ''.join(fadict[fak][i-comp_mposinpat:(i+len(comp_pat3)-1-comp_mposinpat+1)])==comp_pat3: # check motif in reverse strand
643
                cpgdict[fak][('-', i)] = [1, fadict[fak][i]]; cpgnum[1] += 1
644
             else:
645
                pass
646
       print('%s%d site: %d(+) %d(-) for %s' % (pat3, mposinpat, cpgnum[0], cpgnum[1], fak))
647
   return (cpgdict, all_a)
648
649
650
#
651
# a multiprocessing manager to get features from all long reads.
652
#
653
def getFeature_manager(moptions):
654
   start_time = time.time();
655
   # multipprocessing manager
656
   pmanager = multiprocessing.Manager();
657
658
   # prepare output folder
659
   if os.path.isdir(moptions['outFolder']):
660
      os.system('rm -dr '+moptions['outFolder'])
661
   if not os.path.isdir(moptions['outFolder']):
662
      os.system('mkdir '+moptions['outFolder'])
663
664
   moptions['size_per_batch'] = moptions['size_per_batch'] * (10**7)
665
666
   # read reference information
667
   fadict = readFA(moptions['Ref'],moptions['region'][0])
668
   if moptions['motifORPos']==1: # get motif-based positions for modifications
669
      moptions['fulmodlist'], moptions['nomodlist'] = readMotifMod(fadict, moptions['motif'][0], moptions['motif'][1], moptions['region'][0], moptions['region'][1], moptions['region'][2])
670
      moptions['anymodlist'] = None
671
      moptions['nomodlist'] = None; # add for simple process
672
   elif moptions['motifORPos']==2: # modification position is specified by the files
673
      fuldfiles = glob.glob(moptions["fulmod"]);
674
      moptions['fulmodlist'] = defaultdict(lambda: defaultdict());
675
      if not moptions["anymod"]==None: # partially modified positions
676
         anydfiles = glob.glob(moptions["anymod"])
677
         moptions['anymodlist'] = defaultdict(lambda: defaultdict());
678
      else:
679
         moptions['anymodlist'] = None
680
      if not moptions["nomod"]==None: # completely un-modified positions
681
         nodfiles = glob.glob(moptions["nomod"])
682
         moptions['nomodlist'] = defaultdict(lambda: defaultdict());
683
      else:
684
         moptions['nomodlist'] = None
685
      mthreadin = [moptions['fulmodlist'], moptions['anymodlist'], moptions['nomodlist']]
686
      mthfiles = [fuldfiles, anydfiles, nodfiles]
687
      # read completely modified positions, partially modified positions, completely un-modified positions from files
688
      for mthi in range(len(mthreadin)):
689
         curmeth = mthreadin[mthi]; curfilelist = mthfiles[mthi]
690
         if curmeth==None or curfilelist==None: continue;
691
         for curmthf in curfilelist:
692
             with open(curmthf, 'r') as mreader:
693
                line = mreader.readline();
694
                while line:
695
                   if len(line)>0:
696
                      tchr, tstrand, tpos = line.split()[:3]
697
                      curmeth[tchr][(tstrand, int(tpos))] = [1-mthi, fadict[tchr][int(tpos)]];
698
                   line = mreader.readline();
699
   for tchr in moptions['fulmodlist'] if moptions['anymodlist']==None else moptions['anymodlist']:
700
      if len(moptions['fulmodlist'][tchr])>0 or ((not moptions['anymodlist']==None) and len(moptions['anymodlist'][tchr])>0):
701
          print ('%s fulmod=%d anymod=%d nomod=%d' % (tchr, len(moptions['fulmodlist'][tchr]), len(moptions['anymodlist'][tchr]) if (not moptions['anymodlist']==None) else -1, len(moptions['nomodlist'][tchr]) if (not moptions['nomodlist']==None) else -1))
702
703
   if True: #False:
704
      # get all input fast5 files
705
      f5files = glob.glob(os.path.join(moptions['wrkBase'],"*.fast5" ))
706
      if moptions['recursive']==1:
707
         f5files.extend(glob.glob(os.path.join(moptions['wrkBase'],"*/*.fast5" )))
708
         f5files.extend(glob.glob(os.path.join(moptions['wrkBase'],"*/*/*.fast5" )))
709
         f5files.extend(glob.glob(os.path.join(moptions['wrkBase'],"*/*/*/*.fast5" )))
710
711
712
   print('Total files=%d' % len(f5files))
713
   h5files_Q = pmanager.Queue();
714
   failed_Q = pmanager.Queue()
715
   version_Q = pmanager.Queue()
716
717
   # split input fast5 files into different batch
718
   h5_batch = []; h5batchind = 0;
719
   for f5f in f5files:
720
      h5_batch.append(f5f);
721
      if len(h5_batch)==moptions['files_per_thread']:
722
         h5files_Q.put((h5_batch, h5batchind))
723
         h5batchind += 1
724
         h5_batch = []; #break; ### feature500
725
   if len(h5_batch)>0:
726
      h5files_Q.put((h5_batch, h5batchind))
727
728
   # each thread handle a batch a time and repeat for all batches.
729
   share_var = (moptions, h5files_Q, failed_Q, version_Q)
730
   handlers = []
731
   for hid in range(moptions['threads']):
732
      p = multiprocessing.Process(target=getFeature_handler, args=share_var);
733
      p.start();
734
      handlers.append(p);
735
736
   # get failed files.
737
   failed_files = defaultdict(list);
738
   version_default = defaultdict(lambda: defaultdict(int));
739
   while any(p.is_alive() for p in handlers):
740
      try:
741
         errk, fns = failed_Q.get(block=False);
742
         failed_files[errk].extend(fns)
743
         curv, curv_num = version_Q.get(block=False);
744
         version_default[curv] += curv_num
745
      except:
746
         time.sleep(1);
747
         continue;
748
749
   # output failure information
750
   if len(failed_files)>0:
751
      print ('Error information for different fast5 files:')
752
      for errtype, errfiles in failed_files.items():
753
         print ('\t%s %d' % (errtype, len(errfiles)))
754
   print("abversion info {}".format(str(version_default)))
755
   sys.stdout.flush()
756
   end_time = time.time();
757
   print ("Total consuming time %d" % (end_time-start_time))
758
759
760
761
# for indepdent testing of code
762
if __name__=='__main__':
763
#   if len(sys.argv)>4:
764
      moptions = {}
765
      moptions['basecall_1d'] = 'Basecall_1D_000'
766
      moptions['basecall_1d'] = ['Basecall_1D_000']
767
      moptions['basecall_2strand'] = 'BaseCalled_template'
768
769
      moptions['outLevel'] = myCom.OUTPUT_WARNING
770
      moptions['outLevel'] = myCom.OUTPUT_INFO
771
772
      moptions['modfile'] = '../../mod_output/train1/2/mod_train'
773
774
      moptions['fnum'] = 53;
775
      moptions['hidden'] = 100;
776
      moptions['windowsize'] = 21;
777
778
      moptions['threads'] = 8
779
      moptions['threads'] = 1
780
      moptions['files_per_thread'] = 500
781
782
      mDetect_manager(moptions)