[c66173]: / bin / DeepMod_scripts / myGetFeatureBasedPos.py

Download this file

783 lines (706 with data), 38.0 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
import os;
import sys;
import string;
import glob;
import time
import copy
import h5py
import numpy as np
import multiprocessing
from collections import defaultdict
from distutils.version import LooseVersion
import tempfile
import subprocess
import re;
from . import myCom
from . import myDetect
#
# map long reads
# then call another function to get feature for each base of interest
#
def mGetFeature1(moptions, sp_options, f5files):
# associate signals to events
f5data = myDetect.get_Event_Signals(moptions, sp_options, f5files)
# save all sequences information
if moptions['outLevel']<=myCom.OUTPUT_DEBUG: start_time = time.time();
temp_fa = tempfile.NamedTemporaryFile(suffix='.fa', mode='w')
f5keys = sorted(f5data.keys()); #f5keys.sort()
for f5k in f5keys:
temp_fa.write(''.join(['>', f5k, '\n', f5data[f5k][0], '\n']))
temp_fa.flush();
if moptions['outLevel']<=myCom.OUTPUT_DEBUG:
end_time = time.time();
print ("Write consuming time %d" % (end_time-start_time))
# alignment using bwa-mem or minimap2
temp_sam = tempfile.NamedTemporaryFile()
if moptions['alignStr']=='bwa':
cmd_opt = ['mem', '-x', 'ont2d', '-v', '1', '-t', '1', moptions['Ref'], temp_fa.name]
else:
cmd_opt = ['-ax', 'map-ont', moptions['Ref'], temp_fa.name]
returncode = subprocess.call([moptions['alignStr'],]+cmd_opt, stdout=temp_sam)
if not returncode==0:
print ('Fatal Error!!! returncode is non-zero(%d) for "%s"' % (returncode, curcmd))
errkey = "Cannot running aligment"
for f5k in f5keys:
sp_options["Error"][errkey].append(f5data[f5k][3])
return;
temp_fa.close();
temp_sam.seek(0);
# get sam information
align_info = temp_sam.readlines()
align_info = [str(align_info[i], 'utf-8').strip() for i in range(len(align_info))]
temp_sam.close();
sp_param = defaultdict();
sp_param['f5data'] = f5data
# for alignment
f5align = defaultdict()
f5keydict = defaultdict();
sp_param['ref_info'] = defaultdict()
if moptions['outLevel']<=myCom.OUTPUT_DEBUG:start_time = time.time();
ilid = 0;
# for each record in sam, get alignment information
while ilid < len(align_info):
if len(align_info[ilid])==0 or align_info[ilid][0]=='@':
ilid += 1
continue;
sp_param['f5status'] = "";
sp_param['line'] = align_info[ilid]
qname = handle_line(moptions, sp_param, f5align)
if sp_param['f5status'] == "":
f5keydict[qname] = True;
ilid += 1
# for unmapped reads
for f5k in f5keys:
if f5k not in f5keydict:
sp_options["Error"]["Not in alignment sam"].append(f5data[f5k][3])
if moptions['outLevel']<=myCom.OUTPUT_DEBUG:
end_time = time.time();
print ("Get BAM consuming time %d" % (end_time-start_time))
sp_param['f5status']= ""
sp_param['line'] = ""
if moptions['outLevel']<=myCom.OUTPUT_DEBUG:start_time = time.time();
# handle each alignment
handle_record(moptions, sp_options, sp_param, f5align, f5data)
if moptions['outLevel']<=myCom.OUTPUT_DEBUG:
end_time = time.time();
print ("Analyze & annotate & save consuming time %d" % (end_time-start_time))
#
# get mapping information
# then call another function to get feature of each base in a long read
#
def handle_record(moptions, sp_options, sp_param, f5align, f5data):
alignkeys = list(f5align.keys());
# alignment detail
numreg = re.compile('\d+')
mdireg = re.compile('[MIDNSHPX=]{1}')
feat_file_ind_dict = []
feat_list = None; feat_file_ind = 0
start_c_time = time.time();
for readk in alignkeys:
if len(feat_file_ind_dict)>0 and feat_list.nbytes > moptions['size_per_batch']:
# save features when the size is larger than the defined size
cur_feat_file_base = sp_options['ctfolder'] + '/'+str(feat_file_ind)
np.savetxt(cur_feat_file_base+'.xy.gz', feat_list, fmt='%.3f')
with open(cur_feat_file_base+'.xy.ind', 'w') as ind_mw:
for f_ind in feat_file_ind_dict:
ind_mw.write('%d %s\n' % (f_ind[1], f_ind[0]))
print ("\t%s-%d Total consuming time %d" % (sp_options['ctfolder'][sp_options['ctfolder'].rfind('/'):], feat_file_ind, time.time()-start_c_time)); sys.stdout.flush()
feat_file_ind_dict = []
feat_list = None;
feat_file_ind += 1
# get alignment detail
mapq, flag, rname, pos, cigar, readseq = f5align[readk]
if not ( (rname in moptions['fulmodlist'] and len(moptions['fulmodlist'][rname])>0) or \
((not moptions['anymodlist']==None) and rname in moptions['anymodlist'] and len(moptions['anymodlist'][rname])>0) or \
((not moptions['nomodlist']==None) and rname in moptions['nomodlist'] and len(moptions['nomodlist'][rname])>0) ):
continue;
# get reference sequece
if rname not in sp_param['ref_info']:
myDetect.getRefSeq(moptions, sp_param, rname)
refseq = sp_param['ref_info'][rname]
# mapped starting position and strand
pos = pos - 1
forward_reverse = '-' if flag&0x10 else '+'
numinfo = numreg.findall(cigar);
mdiinfo = mdireg.findall(cigar)
numinfo = [int(numinfo[i]) for i in range(len(numinfo))] #map(int, numinfo)
# remove clip from both tails
leftclip = 0; rightclip = 0;
while mdiinfo[0] in ['I', 'D', 'N', 'S', 'H', 'P', 'X']:
if mdiinfo[0] in ['I', 'S', 'X']:
leftclip += numinfo[0]; readseq = readseq[numinfo[0]:]
if mdiinfo[0] in ['H']: leftclip += numinfo[0]
if mdiinfo[0] in ['D', 'N', 'X']:
pos += numinfo[0]
numinfo = numinfo[1:]; mdiinfo = mdiinfo[1:]
while mdiinfo[-1] in ['I', 'D', 'N', 'S', 'H', 'P', 'X']:
if mdiinfo[-1] in ['I', 'S', 'X']:
rightclip += numinfo[-1]; readseq = readseq[:-numinfo[-1]]
if mdiinfo[-1] in ['H']: rightclip += numinfo[-1]
numinfo = numinfo[:-1]; mdiinfo = mdiinfo[:-1]
if forward_reverse=='+':
if rightclip>0: m_event = f5data[readk][1][leftclip:-rightclip]
else: m_event = f5data[readk][1][leftclip:]
else:
if leftclip>0: m_event = f5data[readk][1][rightclip:-leftclip]
else: m_event = f5data[readk][1][rightclip:]
# is in region of interest if provided
isinreg = False;
if (moptions['region'][0] in ['', None, rname]) and \
(moptions['region'][1] in ['', None] or pos>moptions['region'][1]) and \
(moptions['region'][2] in ['', None] or pos+len(m_event)<moptions['region'][2]):
isinreg = True;
if not isinreg:
continue;
# associate mapped reference positions with read positions
lastmatch = None; firstmatch = None;
first_match_pos = None; last_match_pos = None
last_al_match = None; first_al_match = None
lasmtind = 0;
base_map_info = []; #indel_groups = defaultdict()
nummismatch = 0; numinsert = 0; numdel = 0;
read_ind = 0;
for n1ind in range(len(numinfo)):
mdi = mdiinfo[n1ind];
# for each mapped types
for n1i in range(numinfo[n1ind]):
if mdi=='M':
base_map_info.append((refseq[pos], readseq[read_ind], pos, read_ind))
if refseq[pos]==readseq[read_ind]:
if firstmatch==None: firstmatch = read_ind
if lastmatch==None or lastmatch<read_ind: lastmatch = read_ind; lasmtind=n1ind
if first_al_match==None: first_al_match=len(base_map_info)-1
if last_al_match==None or last_al_match<len(base_map_info): last_al_match=len(base_map_info)-1
if first_match_pos==None: first_match_pos = pos
if last_match_pos==None or last_match_pos<pos: last_match_pos = pos
else: nummismatch += 1
pos += 1; read_ind += 1;
elif mdi =='I':
base_map_info.append(('-', readseq[read_ind], pos, read_ind))
read_ind += 1;
numinsert += 1
elif mdi == 'D':
base_map_info.append((refseq[pos], '-', pos, read_ind))
pos += 1;
numdel += 1
elif mdi == 'N':
base_map_info.append((refseq[pos], '-', pos, read_ind))
pos += 1;
if moptions['outLevel']<=myCom.OUTPUT_WARNING:
print ('CIGAR-Error N exist', f5data[readk][3])
elif mdi == 'S':
read_ind += 1;
if moptions['outLevel']<=myCom.OUTPUT_WARNING:
print ('CIGAR-Error!!! S in the middle of the sequence', f5data[readk][3])
elif mdi == 'H':
if moptions['outLevel']<=myCom.OUTPUT_WARNING:
print ('CIGAR-Error!!! H in the middle of the sequence', f5data[readk][3])
elif mdi == 'P':
if moptions['outLevel']<=myCom.OUTPUT_WARNING:
print ('CIGAR-Error!!! P exist', f5data[readk][3])
elif mdi == '=':
base_map_info.append((refseq[pos], readseq[read_ind], pos, read_ind))
if first_match_pos==None: first_match_pos = pos
if last_match_pos==None or last_match_pos<pos: last_match_pos = pos
pos += 1; read_ind += 1;
if firstmatch==None: firstmatch = read_ind - 1
if lastmatch==None or lastmatch<read_ind-1: lastmatch = read_ind - 1; lasmtind=n1ind
if last_al_match==None or last_al_match<len(base_map_info): last_al_match=len(base_map_info)-1
if first_al_match==None: first_al_match=len(base_map_info)-1
elif mdi == 'X':
base_map_info.append((refseq[pos], readseq[read_ind], pos, read_ind))
pos += 1; read_ind += 1;
nummismatch += 1
else:
if moptions['outLevel']<=myCom.OUTPUT_WARNING:
print ('CIGAR-Error!!!', 'Warning unknow CIGAR element ' + str(numinfo[n1ind]) + ' ' + mdi, f5data[readk][3])
if firstmatch==None or lastmatch==None or firstmatch<0 or lastmatch<0:
if moptions['outLevel']<=myCom.OUTPUT_WARNING:
print ("Errorfast5 "+f5data[readk][3])
print('match-Error!!! no first and/or last match',f5data[readk][3],('firstmatch=%d' % firstmatch) if not (firstmatch==None) else "N", ('lastmatch%d' % lastmatch) if not (lastmatch==None) else "N", str(flag), rname, str(pos));
print('\tf=%d, chr=%s, p=%d, c=%s, s=%s' % (flag, rname, pos, cigar, readseq))
continue;
# remove unmatch in both tails
if not firstmatch==None: leftclip += firstmatch
if (not lastmatch==None) and len(m_event)-lastmatch>1: rightclip += len(m_event)-lastmatch-1
# remove events whose bases are not mapped.
if forward_reverse=='+':
if len(m_event)-lastmatch>1:
m_event = m_event[firstmatch:(lastmatch+1-len(m_event))]
elif firstmatch>0: m_event = m_event[firstmatch:]
else:
if firstmatch>0: m_event = m_event[(len(m_event)-1-lastmatch):-firstmatch]
elif len(m_event)-lastmatch>1: m_event = m_event[(len(m_event)-1-lastmatch):]
# print detail if unexpected error occurs
if firstmatch>0 or len(base_map_info)-last_al_match>1:
if moptions['outLevel']<=myCom.OUTPUT_WARNING and ((firstmatch>0) or (len(base_map_info)-last_al_match>1 and refseq[last_match_pos+1] not in ['N'])):
print ("Errorfast5"+f5data[readk][3])
print ('Warning!!! first not match', firstmatch, lastmatch, first_al_match, last_al_match, len(base_map_info), numinfo[lasmtind-2:(lasmtind+5)], mdiinfo[lasmtind-2:(lasmtind+5)], lasmtind, len(numinfo))
print('\tref='+refseq[last_match_pos:last_match_pos+20]+"\n\tred="+readseq[lastmatch:lastmatch+20])
if firstmatch>0:
print('\tref='+refseq[(first_match_pos-20 if first_match_pos-20>0 else 0):first_match_pos]+"\n\tred="+readseq[(firstmatch-20 if firstmatch-20>0 else 0):firstmatch])
print('\tf=%d, chr=%s, p=%d, c=%s, s=%s' % (flag, rname, pos, cigar, readseq)) # flag, rname, pos, cigar, readseq
if len(base_map_info)-last_al_match>1:
base_map_info = base_map_info[first_al_match:(last_al_match+1-len(base_map_info))]
elif first_al_match>0:
base_map_info = base_map_info[first_al_match:]
# post-process mapping information
base_map_info = np.array(base_map_info, dtype=[('refbase', 'U1'), ('readbase', 'U1'), ('refbasei', np.uint64), ('readbasei', np.uint64)])
if forward_reverse=='-':
base_map_info = np.flipud(base_map_info)
for bmii in range(len(base_map_info)):
base_map_info['refbase'][bmii] = get_complement(base_map_info['refbase'][bmii])
base_map_info['readbase'][bmii] = get_complement(base_map_info['readbase'][bmii])
leftclip, rightclip = rightclip, leftclip
if False: #True: # for test base_map_info ### for check consistency
ref_align_key = '/Analyses/NanomoCorrected_000/BaseCalled_template/Alignment/genome_alignment'
read_align_key = '/Analyses/NanomoCorrected_000/BaseCalled_template/Alignment/read_alignment'
with h5py.File(f5data[readk][3], 'r') as mf5:
read_align_list = [bt.decode(encoding="utf-8") for bt in mf5[read_align_key]]
ref_align_list = [bt.decode(encoding="utf-8") for bt in mf5[ref_align_key]]
for rali in range(len(read_align_list)):
if not read_align_list[rali]==base_map_info['readbase'][rali]:
print ("Error not equal1! %s %s %d %s" % (read_align_list[rali], base_map_info['readbase'][rali], rali, f5data[readk][3]))
if not ref_align_list[rali]==base_map_info['refbase'][rali]:
print ("Error not equal2! %s %s %d %s" % (ref_align_list[rali], base_map_info['refbase'][rali], rali, f5data[readk][3]))
#
# handle map like
# CCG or CGG
# C-G C-G
#
if 'motif' in moptions and moptions['motif'][0]=='CG':
for ali in range(len(base_map_info)):
if base_map_info['refbase'][ali]=='C' and base_map_info['readbase'][ali]=='C':
if ali+1<len(base_map_info) and base_map_info['readbase'][ali+1]=='-' and base_map_info['refbase'][ali+1]=='G':
addali = 2;
while ali + addali < len(base_map_info):
if base_map_info['readbase'][ali+addali]=='-' and base_map_info['refbase'][ali+addali]=='G': addali += 1;
else: break;
if ali + addali < len(base_map_info) and base_map_info['readbase'][ali+addali]=='G' and base_map_info['refbase'][ali+addali]=='G':
base_map_info['readbase'][ali+1], base_map_info['readbase'][ali+addali] = base_map_info['readbase'][ali+addali], base_map_info['readbase'][ali+1]
if base_map_info['refbase'][ali]=='G' and base_map_info['readbase'][ali]=='G':
if ali-1>-1 and base_map_info['readbase'][ali-1]=='-' and base_map_info['refbase'][ali-1]=='C':
addali = 2;
while ali - addali >-1:
if base_map_info['readbase'][ali-addali]=='-' and base_map_info['refbase'][ali-addali]=='C': addali += 1;
else: break;
if ali - addali>-1 and base_map_info['readbase'][ali-addali]=='C' and base_map_info['refbase'][ali-addali]=='C':
base_map_info['readbase'][ali-1], base_map_info['readbase'][ali-addali] = base_map_info['readbase'][ali-addali], base_map_info['readbase'][ali-1]
# too short reads
if len(m_event)<500:
sp_options["Error"]["Less(<500) events"].append(f5data[readk][3])
continue;
# get feautre
mfeatures,isdif = get_Feature(moptions, sp_options, sp_param, f5align, f5data, readk, leftclip, rightclip, base_map_info, forward_reverse, rname, first_match_pos, numinsert, numdel)
if isdif and moptions['outLevel']<=myCom.OUTPUT_WARNING:
print("Dif is true")
print([lastmatch, firstmatch, first_match_pos, last_match_pos, first_al_match, last_al_match, lasmtind, len(base_map_info), nummismatch, numinsert, numdel, len(base_map_info)-nummismatch-numinsert-numdel])
# merge to previously handled features of other fast5 files
if len(mfeatures)>0:
if len(feat_file_ind_dict)==0:
feat_file_ind_dict.append((f5data[readk][3], 0));
feat_list = mfeatures
else:
feat_file_ind_dict.append((f5data[readk][3], len(feat_list)))
feat_list = np.concatenate((feat_list, mfeatures), axis=0)
# store the last feature data.
if len(feat_file_ind_dict)>0:
cur_feat_file_base = sp_options['ctfolder'] + '/'+str(feat_file_ind)
np.savetxt(cur_feat_file_base+'.xy.gz', feat_list, fmt='%.3f')
with open(cur_feat_file_base+'.xy.ind', 'w') as ind_mw:
for f_ind in feat_file_ind_dict:
ind_mw.write('%d %s\n' % (f_ind[1], f_ind[0]))
print ("\t%s-%d Total consuming time %d" % (sp_options['ctfolder'][sp_options['ctfolder'].rfind('/'):], feat_file_ind, time.time()-start_c_time)); sys.stdout.flush()
feat_file_ind_dict = []
feat_list = None;
feat_file_ind += 1
#
# get feature for each base of interest in long reads according to raw signals and mapping information
#
def get_Feature(moptions, sp_options, sp_param, f5align, f5data, readk, start_clip, end_clip, base_map_info, forward_reverse, rname, mapped_start_pos, num_insertions, num_deletions):
# event information
modevents = sp_param['f5data'][readk][1]
# class number, bin num and bin length
clnum = 2; binnum = 50; binlen = 0.2;
if forward_reverse=='+':
align_ref_pos = mapped_start_pos
else:
align_ref_pos = mapped_start_pos + len(base_map_info) - num_insertions - 1
# initialize feature matrix for all events.
if moptions['fnum']==57:
#mfeatures = np.zeros((len(modevents)-end_clip+100-(start_clip-100), (binnum+3+3+4)));
mfeatures = np.zeros((len(modevents)-end_clip+100-(start_clip-100), (binnum+3+3+4)));
else: mfeatures = np.zeros((len(modevents)-end_clip+100-(start_clip-100), (3+3+4)));
# filter poor alignment
checkneighbornums = [3,6]
checkratios = {3:[6,5,4,2], 6:[11,10,9,3]}
checkratios = {3:[6,5,4,2], 6:[12,10,9,3]}
cgpos = [[], []]
affectneighbor = 1; # 2;
for aligni in range(len(base_map_info)):
# for methylated positions and not-used adjacent positions
if 'motif' in moptions and base_map_info['readbase'][aligni]==moptions['motif'][0][moptions['motif'][1]]:
m_a_st = aligni-moptions['motif'][1]; m_a_end = aligni+len(moptions['motif'][0])-moptions['motif'][1]
if m_a_st>-1 and m_a_end<=len(base_map_info) and ''.join(base_map_info['readbase'][m_a_st:m_a_end])==moptions['motif'][0] and (not ''.join(base_map_info['refbase'][m_a_st:m_a_end])==moptions['motif'][0]):
cgpos[1].extend([(forward_reverse, base_map_info['refbasei'][addi]) for addi in range(aligni-affectneighbor if aligni-affectneighbor>-1 else 0, aligni+affectneighbor+1 if aligni+affectneighbor+1<len(base_map_info) else len(base_map_info))])
if (not base_map_info['refbase'][aligni]=='-') and \
(forward_reverse, base_map_info['refbasei'][aligni]) in moptions['fulmodlist'][rname]:
if not base_map_info['readbase'][aligni]=='-':
nextnogap = aligni + 1;
while nextnogap<len(base_map_info):
if not base_map_info['refbase'][nextnogap]=='-': break;
nextnogap += 1
iscg = False;
# find gaps
for checkneighbornum in checkneighbornums:
if not nextnogap<len(base_map_info): continue;
matchnum = 0; gapnum = 0;
# get gaps for two window sizes
for checki in range(aligni-checkneighbornum, aligni+checkneighbornum+1):
if checki>-1 and checki<len(base_map_info):
if base_map_info['refbase'][checki]==base_map_info['readbase'][checki]: matchnum += 1
if base_map_info['refbase'][checki]=='-' or base_map_info['readbase'][checki]=='-': gapnum += 1
if gapnum<=checkratios[checkneighbornum][3]:
for addi in range(aligni-affectneighbor if aligni-affectneighbor>-1 else 0, nextnogap+affectneighbor if nextnogap+affectneighbor<len(base_map_info) else len(base_map_info)):
if addi==aligni: # for methylated positions
cgpos[0].append((forward_reverse, base_map_info['refbasei'][addi]))
else: # for non-used positions
cgpos[1].append((forward_reverse, base_map_info['refbasei'][addi]))
iscg = True; break;
if iscg: continue;
# add more not-used positions if more gaps exist
if not base_map_info['readbase'][aligni]=='-':
nextnogap = aligni
for _ in range(affectneighbor):
nextnogap += 1;
while nextnogap<len(base_map_info['refbase']):
if not base_map_info['refbase'][nextnogap]=='-': break;
nextnogap += 1
prenogap = aligni
for _ in range(affectneighbor):
prenogap -= 1;
while prenogap>-1:
if not base_map_info['refbase'][prenogap]=='-': break;
prenogap -= 1
read0 = aligni; read1 = aligni
for _ in range(affectneighbor):
read0 -= 1
while read0>-1:
if base_map_info['readbase'][read0]=='-': read0 -= 1
else: break;
read1 += 1
while read1<len(base_map_info['readbase']):
if base_map_info['readbase'][read1]=='-': read1 += 1
else: break;
if read0<prenogap:
if read0>-1: prenogap = read0
else: prenogap = 0
if read1>nextnogap:
if read1<len(base_map_info['readbase']): nextnogap = read1
else: nextnogap = len(base_map_info['readbase'])-1
if prenogap<0: prenogap = 0
if not nextnogap<len(base_map_info['readbase']): nextnogap=len(base_map_info['readbase'])-1
if not prenogap<len(base_map_info['readbase']): prenogap=len(base_map_info['readbase'])-1
for excldi in range(prenogap, nextnogap+1):
cgpos[1].append((forward_reverse, base_map_info['refbasei'][excldi]))
print ('%s%s %d, %d >> %d %d, %d-%d=%d' % (forward_reverse, f5data[readk][3], len(cgpos[0]), len(cgpos[1]), len(modevents)-end_clip-start_clip, start_clip, len(modevents), end_clip, len(modevents)-end_clip))
aligni = 0; isdif = False;
for ie in range(start_clip-100, len(modevents)-end_clip+100):
cur_row_num = ie - (start_clip-100); cur_base = ''
# for aligned bases
if ie>=start_clip and ie<len(modevents)-end_clip:
if align_ref_pos<mapped_start_pos:
print ('ERRRR align_ref_pos(%d)<mapped_start_pos(%d)' % (align_ref_pos, mapped_start_pos))
while base_map_info['readbase'][aligni]=='-':
if not align_ref_pos==base_map_info['refbasei'][aligni]:
print ('ERRRR align_ref_pos(%d) not equal to %d' % (align_ref_pos, base_map_info['refbasei'][aligni] ))
if not base_map_info['refbase'][aligni]=='-':
if forward_reverse=='+': align_ref_pos += 1
else: align_ref_pos -= 1
aligni += 1
if not base_map_info['readbase'][aligni] == modevents['model_state'][ie][2]:
print ('Error Does not match', base_map_info['readbase'][aligni], modevents['model_state'][ie][2], aligni, ie)
isdif = True;
# the first column is the aligned reference position
mfeatures[cur_row_num][0] = align_ref_pos
cur_base = base_map_info['refbase'][aligni]
# the second/third column is for negative/positive labels of methylation
if moptions['posneg'] == 0: # for a data without any modification
if ( (not moptions['anymodlist']==None) and rname in moptions['nomodlist'] and (forward_reverse, base_map_info['refbasei'][aligni]) in moptions['nomodlist'][rname] ):
mfeatures[cur_row_num][1] = 1; mfeatures[cur_row_num][2] = 0
elif (forward_reverse, base_map_info['refbasei'][aligni]) in moptions['fulmodlist'][rname]:
mfeatures[cur_row_num][1] = 1; mfeatures[cur_row_num][2] = 0
elif ((not moptions['anymodlist']==None) and rname in moptions['anymodlist'] and (forward_reverse, base_map_info['refbasei'][aligni]) in moptions['anymodlist'][rname] ):
mfeatures[cur_row_num][1] = 1; mfeatures[cur_row_num][2] = 0
else: # for a data with both modified and un-modified positions
if (forward_reverse, base_map_info['refbasei'][aligni]) in cgpos[0] and (not base_map_info['refbase'][aligni]=='-'):
mfeatures[cur_row_num][1] = 0; mfeatures[cur_row_num][2] = 1
else:
if (forward_reverse, base_map_info['refbasei'][aligni]) not in cgpos[1]:
if moptions['anymodlist']==None:
if moptions['nomodlist']==None or ( rname in moptions['nomodlist'] and (forward_reverse, base_map_info['refbasei'][aligni]) in moptions['nomodlist'][rname] ):
mfeatures[cur_row_num][1] = 1; mfeatures[cur_row_num][2] = 0
elif rname in moptions['anymodlist'] and (forward_reverse, base_map_info['refbasei'][aligni]) in moptions['anymodlist'][rname]:
pass
else:
if moptions['nomodlist']==None or ( rname in moptions['nomodlist'] and (forward_reverse, base_map_info['refbasei'][aligni]) in moptions['nomodlist'][rname] ):
mfeatures[cur_row_num][1] = 1; mfeatures[cur_row_num][2] = 0
if not base_map_info['refbase'][aligni]=='-':
if forward_reverse=='+': align_ref_pos += 1
else: align_ref_pos -= 1
aligni += 1
# for bin features
if ie>=0 and ie<len(modevents) and moptions['fnum']==57:
for currs in sp_param['f5data'][readk][2][modevents['start'][ie]:int(modevents['start'][ie]+int(modevents['length'][ie]+0.5))]:
if currs>10 or currs<-10: print ('Error raw signal', currs, ie, modevents['start'][ie], modevents['length'][ie])
curbin = int((currs+5)/binlen)
if curbin<0: curbin = 0
elif not curbin<binnum: curbin = binnum-1
mfeatures[cur_row_num][curbin+3] += 1
if ie>=0 and ie<len(modevents):
# for reference base type feature
if cur_base in myCom.g_ACGT:
mfeatures[cur_row_num][moptions['fnum']-3+3-4+myCom.g_ACGT.index(cur_base)] = 1
cur_index_add = moptions['fnum'] - 3 + 3
# for signal mean std and length.
mfeatures[cur_row_num][cur_index_add + 0] = modevents["mean"][ie]
mfeatures[cur_row_num][cur_index_add + 1] = modevents["stdv"][ie]
mfeatures[cur_row_num][cur_index_add + 2] = modevents["length"][ie]
# truncated too much not-used positions
nbkeys = defaultdict();
for mfi in range(len(mfeatures)):
if mfeatures[mfi][1] + mfeatures[mfi][2] > 0.9:
for ini in range(mfi-25, mfi+26):
if ini<0 or ini>len(mfeatures)-1:
print("Warning wrong del mfeatures id %d for %s" % (ini, f5data[readk][3]))
else:
nbkeys[ini] = True;
keepInd = sorted(list(nbkeys.keys()));
if len(keepInd)>0:
if not len(keepInd)>len(mfeatures)*0.9:
mfeatures = mfeatures[np.array(keepInd)]
else:
mfeatures = []
return (mfeatures, isdif)
#
# get the complementary bases
#
def get_complement(na):
if na in myCom.acgt: return myCom.na_bp[na]
else: return na;
#
# get required information for reach mapping records.
#
def handle_line(moptions, sp_param, f5align):
lsp = sp_param['line'].split('\t')
qname, flag, rname, pos, mapq, cigar, _, _, _, seq, _ = lsp[:11]
# checked query name
if qname=='*': sp_param['f5status'] = "qname is *"
# check mapping quality
elif int(mapq)==255: sp_param['f5status'] = "mapq is 255"
# check mapped positions
elif int(pos)==0: sp_param['f5status'] = "pos is 0"
# check mapped string
elif cigar=='*': sp_param['f5status'] = "cigar is *"
# check reference name
elif rname=='*': sp_param['f5status'] = "rname is *"
if not sp_param['f5status']=="": return qname
if (qname not in f5align) or f5align[qname][0]<int(mapq):
f5align[qname] = (int(mapq), int(flag), rname, int(pos), cigar, seq)
return qname
#
# feature handler/workder for multiprocessing
#
def getFeature_handler(moptions, h5files_Q, failed_Q, version_Q):
while not h5files_Q.empty():
try:
# get a list of files
f5files, ctfolderid = h5files_Q.get(block=False)
except:
break;
sp_options = defaultdict();
sp_options['ctfolder'] = moptions['outFolder']+str(ctfolderid)
if not os.path.isdir(sp_options['ctfolder']):
os.system('mkdir '+sp_options['ctfolder'])
# get features
mGetFeature1(moptions, sp_options, f5files)
# output errors
for errtype, errfiles in sp_options["Error"].items():
failed_Q.put((errtype, errfiles));
# double check albacore version
for vk in sp_options["get_albacore_version"]:
version_Q.put((vk, sp_options["get_albacore_version"][vk]))
#
# read sequence information from a reference genome
#
def readFA(mfa, t_chr=None):
fadict = defaultdict();
with open(mfa, 'r') as mr:
cur_chr = None;
line = mr.readline();
while line:
# remove empty spaces
line = line.strip();
if len(line)>0:
if line[0]=='>': # for each chromosome line
if (not cur_chr==None) and (t_chr in [None, cur_chr]):
fadict[cur_chr] = ''.join(fadict[cur_chr])
cur_chr = line[1:].split()[0]
if t_chr in [None, cur_chr]:
fadict[cur_chr] = []
else: # for sub-sequence line in a reference file
if t_chr in [None, cur_chr]:
fadict[cur_chr].append(line.upper())
line = mr.readline();
# for the last chromosome in the file
if (not cur_chr==None) and (t_chr in [None, cur_chr]):
fadict[cur_chr] = ''.join(fadict[cur_chr])
return fadict
#
# get reference positions for motif-based modifications
#
def readMotifMod(fadict, mpat='Cg', mposinpat=0, t_chr=None, t_start=None, t_end=None):
pos_dict = defaultdict(int)
# get motif and complementary motif
pat3 = copy.deepcopy(mpat.upper())
comp_pat3 = ''.join([get_complement(curna) for curna in pat3][::-1])
comp_mposinpat = len(comp_pat3)-1-mposinpat
fakeys = fadict.keys();
cpgdict = defaultdict(int);
all_a = defaultdict()
for fak in fakeys:
cpgnum = [0, 0]
# motif-based reference positions
cpgdict[fak] = defaultdict()
# position of bases of interest
all_a[fak] = defaultdict()
for i in range(len(fadict[fak])):
if (t_start==None or i>=t_start) and (t_end==None or i<=t_end):
if fadict[fak][i]==mpat[mposinpat]: # for forward strand
all_a[fak][('+', i)] = True;
elif get_complement(fadict[fak][i])==mpat[mposinpat]: # for reverse strand
all_a[fak][('-', i)] = True;
# check motif in forward strand
if i-mposinpat>=0 and i+len(comp_pat3)-1-mposinpat<len(fadict[fak]) and ''.join(fadict[fak][i-mposinpat:(i+len(comp_pat3)-1-mposinpat+1)])==pat3:
cpgdict[fak][('+', i)] = [1, fadict[fak][i]]; cpgnum[0] += 1
elif i-comp_mposinpat>=0 and i+len(comp_pat3)-1-comp_mposinpat<len(fadict[fak]) and ''.join(fadict[fak][i-comp_mposinpat:(i+len(comp_pat3)-1-comp_mposinpat+1)])==comp_pat3: # check motif in reverse strand
cpgdict[fak][('-', i)] = [1, fadict[fak][i]]; cpgnum[1] += 1
else:
pass
print('%s%d site: %d(+) %d(-) for %s' % (pat3, mposinpat, cpgnum[0], cpgnum[1], fak))
return (cpgdict, all_a)
#
# a multiprocessing manager to get features from all long reads.
#
def getFeature_manager(moptions):
start_time = time.time();
# multipprocessing manager
pmanager = multiprocessing.Manager();
# prepare output folder
if os.path.isdir(moptions['outFolder']):
os.system('rm -dr '+moptions['outFolder'])
if not os.path.isdir(moptions['outFolder']):
os.system('mkdir '+moptions['outFolder'])
moptions['size_per_batch'] = moptions['size_per_batch'] * (10**7)
# read reference information
fadict = readFA(moptions['Ref'],moptions['region'][0])
if moptions['motifORPos']==1: # get motif-based positions for modifications
moptions['fulmodlist'], moptions['nomodlist'] = readMotifMod(fadict, moptions['motif'][0], moptions['motif'][1], moptions['region'][0], moptions['region'][1], moptions['region'][2])
moptions['anymodlist'] = None
moptions['nomodlist'] = None; # add for simple process
elif moptions['motifORPos']==2: # modification position is specified by the files
fuldfiles = glob.glob(moptions["fulmod"]);
moptions['fulmodlist'] = defaultdict(lambda: defaultdict());
if not moptions["anymod"]==None: # partially modified positions
anydfiles = glob.glob(moptions["anymod"])
moptions['anymodlist'] = defaultdict(lambda: defaultdict());
else:
moptions['anymodlist'] = None
if not moptions["nomod"]==None: # completely un-modified positions
nodfiles = glob.glob(moptions["nomod"])
moptions['nomodlist'] = defaultdict(lambda: defaultdict());
else:
moptions['nomodlist'] = None
mthreadin = [moptions['fulmodlist'], moptions['anymodlist'], moptions['nomodlist']]
mthfiles = [fuldfiles, anydfiles, nodfiles]
# read completely modified positions, partially modified positions, completely un-modified positions from files
for mthi in range(len(mthreadin)):
curmeth = mthreadin[mthi]; curfilelist = mthfiles[mthi]
if curmeth==None or curfilelist==None: continue;
for curmthf in curfilelist:
with open(curmthf, 'r') as mreader:
line = mreader.readline();
while line:
if len(line)>0:
tchr, tstrand, tpos = line.split()[:3]
curmeth[tchr][(tstrand, int(tpos))] = [1-mthi, fadict[tchr][int(tpos)]];
line = mreader.readline();
for tchr in moptions['fulmodlist'] if moptions['anymodlist']==None else moptions['anymodlist']:
if len(moptions['fulmodlist'][tchr])>0 or ((not moptions['anymodlist']==None) and len(moptions['anymodlist'][tchr])>0):
print ('%s fulmod=%d anymod=%d nomod=%d' % (tchr, len(moptions['fulmodlist'][tchr]), len(moptions['anymodlist'][tchr]) if (not moptions['anymodlist']==None) else -1, len(moptions['nomodlist'][tchr]) if (not moptions['nomodlist']==None) else -1))
if True: #False:
# get all input fast5 files
f5files = glob.glob(os.path.join(moptions['wrkBase'],"*.fast5" ))
if moptions['recursive']==1:
f5files.extend(glob.glob(os.path.join(moptions['wrkBase'],"*/*.fast5" )))
f5files.extend(glob.glob(os.path.join(moptions['wrkBase'],"*/*/*.fast5" )))
f5files.extend(glob.glob(os.path.join(moptions['wrkBase'],"*/*/*/*.fast5" )))
print('Total files=%d' % len(f5files))
h5files_Q = pmanager.Queue();
failed_Q = pmanager.Queue()
version_Q = pmanager.Queue()
# split input fast5 files into different batch
h5_batch = []; h5batchind = 0;
for f5f in f5files:
h5_batch.append(f5f);
if len(h5_batch)==moptions['files_per_thread']:
h5files_Q.put((h5_batch, h5batchind))
h5batchind += 1
h5_batch = []; #break; ### feature500
if len(h5_batch)>0:
h5files_Q.put((h5_batch, h5batchind))
# each thread handle a batch a time and repeat for all batches.
share_var = (moptions, h5files_Q, failed_Q, version_Q)
handlers = []
for hid in range(moptions['threads']):
p = multiprocessing.Process(target=getFeature_handler, args=share_var);
p.start();
handlers.append(p);
# get failed files.
failed_files = defaultdict(list);
version_default = defaultdict(lambda: defaultdict(int));
while any(p.is_alive() for p in handlers):
try:
errk, fns = failed_Q.get(block=False);
failed_files[errk].extend(fns)
curv, curv_num = version_Q.get(block=False);
version_default[curv] += curv_num
except:
time.sleep(1);
continue;
# output failure information
if len(failed_files)>0:
print ('Error information for different fast5 files:')
for errtype, errfiles in failed_files.items():
print ('\t%s %d' % (errtype, len(errfiles)))
print("abversion info {}".format(str(version_default)))
sys.stdout.flush()
end_time = time.time();
print ("Total consuming time %d" % (end_time-start_time))
# for indepdent testing of code
if __name__=='__main__':
# if len(sys.argv)>4:
moptions = {}
moptions['basecall_1d'] = 'Basecall_1D_000'
moptions['basecall_1d'] = ['Basecall_1D_000']
moptions['basecall_2strand'] = 'BaseCalled_template'
moptions['outLevel'] = myCom.OUTPUT_WARNING
moptions['outLevel'] = myCom.OUTPUT_INFO
moptions['modfile'] = '../../mod_output/train1/2/mod_train'
moptions['fnum'] = 53;
moptions['hidden'] = 100;
moptions['windowsize'] = 21;
moptions['threads'] = 8
moptions['threads'] = 1
moptions['files_per_thread'] = 500
mDetect_manager(moptions)