|
a |
|
b/bin/DeepMod_scripts/myGetFeatureBasedPos.py |
|
|
1 |
|
|
|
2 |
import os; |
|
|
3 |
import sys; |
|
|
4 |
import string; |
|
|
5 |
import glob; |
|
|
6 |
import time |
|
|
7 |
import copy |
|
|
8 |
|
|
|
9 |
import h5py |
|
|
10 |
import numpy as np |
|
|
11 |
import multiprocessing |
|
|
12 |
|
|
|
13 |
from collections import defaultdict |
|
|
14 |
from distutils.version import LooseVersion |
|
|
15 |
|
|
|
16 |
import tempfile |
|
|
17 |
import subprocess |
|
|
18 |
|
|
|
19 |
import re; |
|
|
20 |
|
|
|
21 |
from . import myCom |
|
|
22 |
from . import myDetect |
|
|
23 |
|
|
|
24 |
# |
|
|
25 |
# map long reads |
|
|
26 |
# then call another function to get feature for each base of interest |
|
|
27 |
# |
|
|
28 |
def mGetFeature1(moptions, sp_options, f5files): |
|
|
29 |
# associate signals to events |
|
|
30 |
f5data = myDetect.get_Event_Signals(moptions, sp_options, f5files) |
|
|
31 |
|
|
|
32 |
# save all sequences information |
|
|
33 |
if moptions['outLevel']<=myCom.OUTPUT_DEBUG: start_time = time.time(); |
|
|
34 |
temp_fa = tempfile.NamedTemporaryFile(suffix='.fa', mode='w') |
|
|
35 |
f5keys = sorted(f5data.keys()); #f5keys.sort() |
|
|
36 |
for f5k in f5keys: |
|
|
37 |
temp_fa.write(''.join(['>', f5k, '\n', f5data[f5k][0], '\n'])) |
|
|
38 |
temp_fa.flush(); |
|
|
39 |
if moptions['outLevel']<=myCom.OUTPUT_DEBUG: |
|
|
40 |
end_time = time.time(); |
|
|
41 |
print ("Write consuming time %d" % (end_time-start_time)) |
|
|
42 |
|
|
|
43 |
# alignment using bwa-mem or minimap2 |
|
|
44 |
temp_sam = tempfile.NamedTemporaryFile() |
|
|
45 |
if moptions['alignStr']=='bwa': |
|
|
46 |
cmd_opt = ['mem', '-x', 'ont2d', '-v', '1', '-t', '1', moptions['Ref'], temp_fa.name] |
|
|
47 |
else: |
|
|
48 |
cmd_opt = ['-ax', 'map-ont', moptions['Ref'], temp_fa.name] |
|
|
49 |
returncode = subprocess.call([moptions['alignStr'],]+cmd_opt, stdout=temp_sam) |
|
|
50 |
if not returncode==0: |
|
|
51 |
print ('Fatal Error!!! returncode is non-zero(%d) for "%s"' % (returncode, curcmd)) |
|
|
52 |
errkey = "Cannot running aligment" |
|
|
53 |
for f5k in f5keys: |
|
|
54 |
sp_options["Error"][errkey].append(f5data[f5k][3]) |
|
|
55 |
return; |
|
|
56 |
|
|
|
57 |
temp_fa.close(); |
|
|
58 |
temp_sam.seek(0); |
|
|
59 |
# get sam information |
|
|
60 |
align_info = temp_sam.readlines() |
|
|
61 |
align_info = [str(align_info[i], 'utf-8').strip() for i in range(len(align_info))] |
|
|
62 |
temp_sam.close(); |
|
|
63 |
|
|
|
64 |
sp_param = defaultdict(); |
|
|
65 |
sp_param['f5data'] = f5data |
|
|
66 |
|
|
|
67 |
# for alignment |
|
|
68 |
f5align = defaultdict() |
|
|
69 |
f5keydict = defaultdict(); |
|
|
70 |
sp_param['ref_info'] = defaultdict() |
|
|
71 |
|
|
|
72 |
if moptions['outLevel']<=myCom.OUTPUT_DEBUG:start_time = time.time(); |
|
|
73 |
ilid = 0; |
|
|
74 |
# for each record in sam, get alignment information |
|
|
75 |
while ilid < len(align_info): |
|
|
76 |
if len(align_info[ilid])==0 or align_info[ilid][0]=='@': |
|
|
77 |
ilid += 1 |
|
|
78 |
continue; |
|
|
79 |
|
|
|
80 |
sp_param['f5status'] = ""; |
|
|
81 |
sp_param['line'] = align_info[ilid] |
|
|
82 |
qname = handle_line(moptions, sp_param, f5align) |
|
|
83 |
if sp_param['f5status'] == "": |
|
|
84 |
f5keydict[qname] = True; |
|
|
85 |
ilid += 1 |
|
|
86 |
|
|
|
87 |
# for unmapped reads |
|
|
88 |
for f5k in f5keys: |
|
|
89 |
if f5k not in f5keydict: |
|
|
90 |
sp_options["Error"]["Not in alignment sam"].append(f5data[f5k][3]) |
|
|
91 |
|
|
|
92 |
if moptions['outLevel']<=myCom.OUTPUT_DEBUG: |
|
|
93 |
end_time = time.time(); |
|
|
94 |
print ("Get BAM consuming time %d" % (end_time-start_time)) |
|
|
95 |
|
|
|
96 |
sp_param['f5status']= "" |
|
|
97 |
sp_param['line'] = "" |
|
|
98 |
if moptions['outLevel']<=myCom.OUTPUT_DEBUG:start_time = time.time(); |
|
|
99 |
# handle each alignment |
|
|
100 |
handle_record(moptions, sp_options, sp_param, f5align, f5data) |
|
|
101 |
if moptions['outLevel']<=myCom.OUTPUT_DEBUG: |
|
|
102 |
end_time = time.time(); |
|
|
103 |
print ("Analyze & annotate & save consuming time %d" % (end_time-start_time)) |
|
|
104 |
|
|
|
105 |
# |
|
|
106 |
# get mapping information |
|
|
107 |
# then call another function to get feature of each base in a long read |
|
|
108 |
# |
|
|
109 |
def handle_record(moptions, sp_options, sp_param, f5align, f5data): |
|
|
110 |
alignkeys = list(f5align.keys()); |
|
|
111 |
# alignment detail |
|
|
112 |
numreg = re.compile('\d+') |
|
|
113 |
mdireg = re.compile('[MIDNSHPX=]{1}') |
|
|
114 |
|
|
|
115 |
feat_file_ind_dict = [] |
|
|
116 |
feat_list = None; feat_file_ind = 0 |
|
|
117 |
start_c_time = time.time(); |
|
|
118 |
|
|
|
119 |
for readk in alignkeys: |
|
|
120 |
if len(feat_file_ind_dict)>0 and feat_list.nbytes > moptions['size_per_batch']: |
|
|
121 |
# save features when the size is larger than the defined size |
|
|
122 |
cur_feat_file_base = sp_options['ctfolder'] + '/'+str(feat_file_ind) |
|
|
123 |
np.savetxt(cur_feat_file_base+'.xy.gz', feat_list, fmt='%.3f') |
|
|
124 |
with open(cur_feat_file_base+'.xy.ind', 'w') as ind_mw: |
|
|
125 |
for f_ind in feat_file_ind_dict: |
|
|
126 |
ind_mw.write('%d %s\n' % (f_ind[1], f_ind[0])) |
|
|
127 |
print ("\t%s-%d Total consuming time %d" % (sp_options['ctfolder'][sp_options['ctfolder'].rfind('/'):], feat_file_ind, time.time()-start_c_time)); sys.stdout.flush() |
|
|
128 |
feat_file_ind_dict = [] |
|
|
129 |
feat_list = None; |
|
|
130 |
feat_file_ind += 1 |
|
|
131 |
|
|
|
132 |
# get alignment detail |
|
|
133 |
mapq, flag, rname, pos, cigar, readseq = f5align[readk] |
|
|
134 |
|
|
|
135 |
if not ( (rname in moptions['fulmodlist'] and len(moptions['fulmodlist'][rname])>0) or \ |
|
|
136 |
((not moptions['anymodlist']==None) and rname in moptions['anymodlist'] and len(moptions['anymodlist'][rname])>0) or \ |
|
|
137 |
((not moptions['nomodlist']==None) and rname in moptions['nomodlist'] and len(moptions['nomodlist'][rname])>0) ): |
|
|
138 |
continue; |
|
|
139 |
|
|
|
140 |
# get reference sequece |
|
|
141 |
if rname not in sp_param['ref_info']: |
|
|
142 |
myDetect.getRefSeq(moptions, sp_param, rname) |
|
|
143 |
refseq = sp_param['ref_info'][rname] |
|
|
144 |
|
|
|
145 |
# mapped starting position and strand |
|
|
146 |
pos = pos - 1 |
|
|
147 |
forward_reverse = '-' if flag&0x10 else '+' |
|
|
148 |
|
|
|
149 |
numinfo = numreg.findall(cigar); |
|
|
150 |
mdiinfo = mdireg.findall(cigar) |
|
|
151 |
numinfo = [int(numinfo[i]) for i in range(len(numinfo))] #map(int, numinfo) |
|
|
152 |
|
|
|
153 |
# remove clip from both tails |
|
|
154 |
leftclip = 0; rightclip = 0; |
|
|
155 |
while mdiinfo[0] in ['I', 'D', 'N', 'S', 'H', 'P', 'X']: |
|
|
156 |
if mdiinfo[0] in ['I', 'S', 'X']: |
|
|
157 |
leftclip += numinfo[0]; readseq = readseq[numinfo[0]:] |
|
|
158 |
if mdiinfo[0] in ['H']: leftclip += numinfo[0] |
|
|
159 |
if mdiinfo[0] in ['D', 'N', 'X']: |
|
|
160 |
pos += numinfo[0] |
|
|
161 |
numinfo = numinfo[1:]; mdiinfo = mdiinfo[1:] |
|
|
162 |
while mdiinfo[-1] in ['I', 'D', 'N', 'S', 'H', 'P', 'X']: |
|
|
163 |
if mdiinfo[-1] in ['I', 'S', 'X']: |
|
|
164 |
rightclip += numinfo[-1]; readseq = readseq[:-numinfo[-1]] |
|
|
165 |
if mdiinfo[-1] in ['H']: rightclip += numinfo[-1] |
|
|
166 |
numinfo = numinfo[:-1]; mdiinfo = mdiinfo[:-1] |
|
|
167 |
if forward_reverse=='+': |
|
|
168 |
if rightclip>0: m_event = f5data[readk][1][leftclip:-rightclip] |
|
|
169 |
else: m_event = f5data[readk][1][leftclip:] |
|
|
170 |
else: |
|
|
171 |
if leftclip>0: m_event = f5data[readk][1][rightclip:-leftclip] |
|
|
172 |
else: m_event = f5data[readk][1][rightclip:] |
|
|
173 |
|
|
|
174 |
# is in region of interest if provided |
|
|
175 |
isinreg = False; |
|
|
176 |
if (moptions['region'][0] in ['', None, rname]) and \ |
|
|
177 |
(moptions['region'][1] in ['', None] or pos>moptions['region'][1]) and \ |
|
|
178 |
(moptions['region'][2] in ['', None] or pos+len(m_event)<moptions['region'][2]): |
|
|
179 |
isinreg = True; |
|
|
180 |
if not isinreg: |
|
|
181 |
continue; |
|
|
182 |
|
|
|
183 |
# associate mapped reference positions with read positions |
|
|
184 |
lastmatch = None; firstmatch = None; |
|
|
185 |
first_match_pos = None; last_match_pos = None |
|
|
186 |
last_al_match = None; first_al_match = None |
|
|
187 |
lasmtind = 0; |
|
|
188 |
base_map_info = []; #indel_groups = defaultdict() |
|
|
189 |
nummismatch = 0; numinsert = 0; numdel = 0; |
|
|
190 |
read_ind = 0; |
|
|
191 |
for n1ind in range(len(numinfo)): |
|
|
192 |
mdi = mdiinfo[n1ind]; |
|
|
193 |
# for each mapped types |
|
|
194 |
for n1i in range(numinfo[n1ind]): |
|
|
195 |
if mdi=='M': |
|
|
196 |
base_map_info.append((refseq[pos], readseq[read_ind], pos, read_ind)) |
|
|
197 |
if refseq[pos]==readseq[read_ind]: |
|
|
198 |
if firstmatch==None: firstmatch = read_ind |
|
|
199 |
if lastmatch==None or lastmatch<read_ind: lastmatch = read_ind; lasmtind=n1ind |
|
|
200 |
if first_al_match==None: first_al_match=len(base_map_info)-1 |
|
|
201 |
if last_al_match==None or last_al_match<len(base_map_info): last_al_match=len(base_map_info)-1 |
|
|
202 |
if first_match_pos==None: first_match_pos = pos |
|
|
203 |
if last_match_pos==None or last_match_pos<pos: last_match_pos = pos |
|
|
204 |
else: nummismatch += 1 |
|
|
205 |
pos += 1; read_ind += 1; |
|
|
206 |
elif mdi =='I': |
|
|
207 |
base_map_info.append(('-', readseq[read_ind], pos, read_ind)) |
|
|
208 |
read_ind += 1; |
|
|
209 |
numinsert += 1 |
|
|
210 |
elif mdi == 'D': |
|
|
211 |
base_map_info.append((refseq[pos], '-', pos, read_ind)) |
|
|
212 |
pos += 1; |
|
|
213 |
numdel += 1 |
|
|
214 |
elif mdi == 'N': |
|
|
215 |
base_map_info.append((refseq[pos], '-', pos, read_ind)) |
|
|
216 |
pos += 1; |
|
|
217 |
if moptions['outLevel']<=myCom.OUTPUT_WARNING: |
|
|
218 |
print ('CIGAR-Error N exist', f5data[readk][3]) |
|
|
219 |
elif mdi == 'S': |
|
|
220 |
read_ind += 1; |
|
|
221 |
if moptions['outLevel']<=myCom.OUTPUT_WARNING: |
|
|
222 |
print ('CIGAR-Error!!! S in the middle of the sequence', f5data[readk][3]) |
|
|
223 |
elif mdi == 'H': |
|
|
224 |
if moptions['outLevel']<=myCom.OUTPUT_WARNING: |
|
|
225 |
print ('CIGAR-Error!!! H in the middle of the sequence', f5data[readk][3]) |
|
|
226 |
elif mdi == 'P': |
|
|
227 |
if moptions['outLevel']<=myCom.OUTPUT_WARNING: |
|
|
228 |
print ('CIGAR-Error!!! P exist', f5data[readk][3]) |
|
|
229 |
elif mdi == '=': |
|
|
230 |
base_map_info.append((refseq[pos], readseq[read_ind], pos, read_ind)) |
|
|
231 |
if first_match_pos==None: first_match_pos = pos |
|
|
232 |
if last_match_pos==None or last_match_pos<pos: last_match_pos = pos |
|
|
233 |
pos += 1; read_ind += 1; |
|
|
234 |
if firstmatch==None: firstmatch = read_ind - 1 |
|
|
235 |
if lastmatch==None or lastmatch<read_ind-1: lastmatch = read_ind - 1; lasmtind=n1ind |
|
|
236 |
if last_al_match==None or last_al_match<len(base_map_info): last_al_match=len(base_map_info)-1 |
|
|
237 |
if first_al_match==None: first_al_match=len(base_map_info)-1 |
|
|
238 |
elif mdi == 'X': |
|
|
239 |
base_map_info.append((refseq[pos], readseq[read_ind], pos, read_ind)) |
|
|
240 |
pos += 1; read_ind += 1; |
|
|
241 |
nummismatch += 1 |
|
|
242 |
else: |
|
|
243 |
if moptions['outLevel']<=myCom.OUTPUT_WARNING: |
|
|
244 |
print ('CIGAR-Error!!!', 'Warning unknow CIGAR element ' + str(numinfo[n1ind]) + ' ' + mdi, f5data[readk][3]) |
|
|
245 |
if firstmatch==None or lastmatch==None or firstmatch<0 or lastmatch<0: |
|
|
246 |
if moptions['outLevel']<=myCom.OUTPUT_WARNING: |
|
|
247 |
print ("Errorfast5 "+f5data[readk][3]) |
|
|
248 |
print('match-Error!!! no first and/or last match',f5data[readk][3],('firstmatch=%d' % firstmatch) if not (firstmatch==None) else "N", ('lastmatch%d' % lastmatch) if not (lastmatch==None) else "N", str(flag), rname, str(pos)); |
|
|
249 |
print('\tf=%d, chr=%s, p=%d, c=%s, s=%s' % (flag, rname, pos, cigar, readseq)) |
|
|
250 |
continue; |
|
|
251 |
|
|
|
252 |
# remove unmatch in both tails |
|
|
253 |
if not firstmatch==None: leftclip += firstmatch |
|
|
254 |
if (not lastmatch==None) and len(m_event)-lastmatch>1: rightclip += len(m_event)-lastmatch-1 |
|
|
255 |
# remove events whose bases are not mapped. |
|
|
256 |
if forward_reverse=='+': |
|
|
257 |
if len(m_event)-lastmatch>1: |
|
|
258 |
m_event = m_event[firstmatch:(lastmatch+1-len(m_event))] |
|
|
259 |
elif firstmatch>0: m_event = m_event[firstmatch:] |
|
|
260 |
else: |
|
|
261 |
if firstmatch>0: m_event = m_event[(len(m_event)-1-lastmatch):-firstmatch] |
|
|
262 |
elif len(m_event)-lastmatch>1: m_event = m_event[(len(m_event)-1-lastmatch):] |
|
|
263 |
# print detail if unexpected error occurs |
|
|
264 |
if firstmatch>0 or len(base_map_info)-last_al_match>1: |
|
|
265 |
if moptions['outLevel']<=myCom.OUTPUT_WARNING and ((firstmatch>0) or (len(base_map_info)-last_al_match>1 and refseq[last_match_pos+1] not in ['N'])): |
|
|
266 |
print ("Errorfast5"+f5data[readk][3]) |
|
|
267 |
print ('Warning!!! first not match', firstmatch, lastmatch, first_al_match, last_al_match, len(base_map_info), numinfo[lasmtind-2:(lasmtind+5)], mdiinfo[lasmtind-2:(lasmtind+5)], lasmtind, len(numinfo)) |
|
|
268 |
print('\tref='+refseq[last_match_pos:last_match_pos+20]+"\n\tred="+readseq[lastmatch:lastmatch+20]) |
|
|
269 |
if firstmatch>0: |
|
|
270 |
print('\tref='+refseq[(first_match_pos-20 if first_match_pos-20>0 else 0):first_match_pos]+"\n\tred="+readseq[(firstmatch-20 if firstmatch-20>0 else 0):firstmatch]) |
|
|
271 |
print('\tf=%d, chr=%s, p=%d, c=%s, s=%s' % (flag, rname, pos, cigar, readseq)) # flag, rname, pos, cigar, readseq |
|
|
272 |
|
|
|
273 |
if len(base_map_info)-last_al_match>1: |
|
|
274 |
base_map_info = base_map_info[first_al_match:(last_al_match+1-len(base_map_info))] |
|
|
275 |
elif first_al_match>0: |
|
|
276 |
base_map_info = base_map_info[first_al_match:] |
|
|
277 |
|
|
|
278 |
# post-process mapping information |
|
|
279 |
base_map_info = np.array(base_map_info, dtype=[('refbase', 'U1'), ('readbase', 'U1'), ('refbasei', np.uint64), ('readbasei', np.uint64)]) |
|
|
280 |
if forward_reverse=='-': |
|
|
281 |
base_map_info = np.flipud(base_map_info) |
|
|
282 |
for bmii in range(len(base_map_info)): |
|
|
283 |
base_map_info['refbase'][bmii] = get_complement(base_map_info['refbase'][bmii]) |
|
|
284 |
base_map_info['readbase'][bmii] = get_complement(base_map_info['readbase'][bmii]) |
|
|
285 |
leftclip, rightclip = rightclip, leftclip |
|
|
286 |
if False: #True: # for test base_map_info ### for check consistency |
|
|
287 |
ref_align_key = '/Analyses/NanomoCorrected_000/BaseCalled_template/Alignment/genome_alignment' |
|
|
288 |
read_align_key = '/Analyses/NanomoCorrected_000/BaseCalled_template/Alignment/read_alignment' |
|
|
289 |
with h5py.File(f5data[readk][3], 'r') as mf5: |
|
|
290 |
read_align_list = [bt.decode(encoding="utf-8") for bt in mf5[read_align_key]] |
|
|
291 |
ref_align_list = [bt.decode(encoding="utf-8") for bt in mf5[ref_align_key]] |
|
|
292 |
for rali in range(len(read_align_list)): |
|
|
293 |
if not read_align_list[rali]==base_map_info['readbase'][rali]: |
|
|
294 |
print ("Error not equal1! %s %s %d %s" % (read_align_list[rali], base_map_info['readbase'][rali], rali, f5data[readk][3])) |
|
|
295 |
if not ref_align_list[rali]==base_map_info['refbase'][rali]: |
|
|
296 |
print ("Error not equal2! %s %s %d %s" % (ref_align_list[rali], base_map_info['refbase'][rali], rali, f5data[readk][3])) |
|
|
297 |
# |
|
|
298 |
# handle map like |
|
|
299 |
# CCG or CGG |
|
|
300 |
# C-G C-G |
|
|
301 |
# |
|
|
302 |
if 'motif' in moptions and moptions['motif'][0]=='CG': |
|
|
303 |
for ali in range(len(base_map_info)): |
|
|
304 |
if base_map_info['refbase'][ali]=='C' and base_map_info['readbase'][ali]=='C': |
|
|
305 |
if ali+1<len(base_map_info) and base_map_info['readbase'][ali+1]=='-' and base_map_info['refbase'][ali+1]=='G': |
|
|
306 |
addali = 2; |
|
|
307 |
while ali + addali < len(base_map_info): |
|
|
308 |
if base_map_info['readbase'][ali+addali]=='-' and base_map_info['refbase'][ali+addali]=='G': addali += 1; |
|
|
309 |
else: break; |
|
|
310 |
if ali + addali < len(base_map_info) and base_map_info['readbase'][ali+addali]=='G' and base_map_info['refbase'][ali+addali]=='G': |
|
|
311 |
base_map_info['readbase'][ali+1], base_map_info['readbase'][ali+addali] = base_map_info['readbase'][ali+addali], base_map_info['readbase'][ali+1] |
|
|
312 |
if base_map_info['refbase'][ali]=='G' and base_map_info['readbase'][ali]=='G': |
|
|
313 |
if ali-1>-1 and base_map_info['readbase'][ali-1]=='-' and base_map_info['refbase'][ali-1]=='C': |
|
|
314 |
addali = 2; |
|
|
315 |
while ali - addali >-1: |
|
|
316 |
if base_map_info['readbase'][ali-addali]=='-' and base_map_info['refbase'][ali-addali]=='C': addali += 1; |
|
|
317 |
else: break; |
|
|
318 |
if ali - addali>-1 and base_map_info['readbase'][ali-addali]=='C' and base_map_info['refbase'][ali-addali]=='C': |
|
|
319 |
base_map_info['readbase'][ali-1], base_map_info['readbase'][ali-addali] = base_map_info['readbase'][ali-addali], base_map_info['readbase'][ali-1] |
|
|
320 |
# too short reads |
|
|
321 |
if len(m_event)<500: |
|
|
322 |
sp_options["Error"]["Less(<500) events"].append(f5data[readk][3]) |
|
|
323 |
continue; |
|
|
324 |
|
|
|
325 |
# get feautre |
|
|
326 |
mfeatures,isdif = get_Feature(moptions, sp_options, sp_param, f5align, f5data, readk, leftclip, rightclip, base_map_info, forward_reverse, rname, first_match_pos, numinsert, numdel) |
|
|
327 |
if isdif and moptions['outLevel']<=myCom.OUTPUT_WARNING: |
|
|
328 |
print("Dif is true") |
|
|
329 |
print([lastmatch, firstmatch, first_match_pos, last_match_pos, first_al_match, last_al_match, lasmtind, len(base_map_info), nummismatch, numinsert, numdel, len(base_map_info)-nummismatch-numinsert-numdel]) |
|
|
330 |
|
|
|
331 |
# merge to previously handled features of other fast5 files |
|
|
332 |
if len(mfeatures)>0: |
|
|
333 |
if len(feat_file_ind_dict)==0: |
|
|
334 |
feat_file_ind_dict.append((f5data[readk][3], 0)); |
|
|
335 |
feat_list = mfeatures |
|
|
336 |
else: |
|
|
337 |
feat_file_ind_dict.append((f5data[readk][3], len(feat_list))) |
|
|
338 |
feat_list = np.concatenate((feat_list, mfeatures), axis=0) |
|
|
339 |
|
|
|
340 |
# store the last feature data. |
|
|
341 |
if len(feat_file_ind_dict)>0: |
|
|
342 |
cur_feat_file_base = sp_options['ctfolder'] + '/'+str(feat_file_ind) |
|
|
343 |
np.savetxt(cur_feat_file_base+'.xy.gz', feat_list, fmt='%.3f') |
|
|
344 |
with open(cur_feat_file_base+'.xy.ind', 'w') as ind_mw: |
|
|
345 |
for f_ind in feat_file_ind_dict: |
|
|
346 |
ind_mw.write('%d %s\n' % (f_ind[1], f_ind[0])) |
|
|
347 |
print ("\t%s-%d Total consuming time %d" % (sp_options['ctfolder'][sp_options['ctfolder'].rfind('/'):], feat_file_ind, time.time()-start_c_time)); sys.stdout.flush() |
|
|
348 |
feat_file_ind_dict = [] |
|
|
349 |
feat_list = None; |
|
|
350 |
feat_file_ind += 1 |
|
|
351 |
|
|
|
352 |
# |
|
|
353 |
# get feature for each base of interest in long reads according to raw signals and mapping information |
|
|
354 |
# |
|
|
355 |
def get_Feature(moptions, sp_options, sp_param, f5align, f5data, readk, start_clip, end_clip, base_map_info, forward_reverse, rname, mapped_start_pos, num_insertions, num_deletions): |
|
|
356 |
# event information |
|
|
357 |
modevents = sp_param['f5data'][readk][1] |
|
|
358 |
# class number, bin num and bin length |
|
|
359 |
clnum = 2; binnum = 50; binlen = 0.2; |
|
|
360 |
if forward_reverse=='+': |
|
|
361 |
align_ref_pos = mapped_start_pos |
|
|
362 |
else: |
|
|
363 |
align_ref_pos = mapped_start_pos + len(base_map_info) - num_insertions - 1 |
|
|
364 |
|
|
|
365 |
# initialize feature matrix for all events. |
|
|
366 |
if moptions['fnum']==57: |
|
|
367 |
#mfeatures = np.zeros((len(modevents)-end_clip+100-(start_clip-100), (binnum+3+3+4))); |
|
|
368 |
mfeatures = np.zeros((len(modevents)-end_clip+100-(start_clip-100), (binnum+3+3+4))); |
|
|
369 |
else: mfeatures = np.zeros((len(modevents)-end_clip+100-(start_clip-100), (3+3+4))); |
|
|
370 |
|
|
|
371 |
# filter poor alignment |
|
|
372 |
checkneighbornums = [3,6] |
|
|
373 |
checkratios = {3:[6,5,4,2], 6:[11,10,9,3]} |
|
|
374 |
checkratios = {3:[6,5,4,2], 6:[12,10,9,3]} |
|
|
375 |
cgpos = [[], []] |
|
|
376 |
affectneighbor = 1; # 2; |
|
|
377 |
for aligni in range(len(base_map_info)): |
|
|
378 |
# for methylated positions and not-used adjacent positions |
|
|
379 |
if 'motif' in moptions and base_map_info['readbase'][aligni]==moptions['motif'][0][moptions['motif'][1]]: |
|
|
380 |
m_a_st = aligni-moptions['motif'][1]; m_a_end = aligni+len(moptions['motif'][0])-moptions['motif'][1] |
|
|
381 |
if m_a_st>-1 and m_a_end<=len(base_map_info) and ''.join(base_map_info['readbase'][m_a_st:m_a_end])==moptions['motif'][0] and (not ''.join(base_map_info['refbase'][m_a_st:m_a_end])==moptions['motif'][0]): |
|
|
382 |
cgpos[1].extend([(forward_reverse, base_map_info['refbasei'][addi]) for addi in range(aligni-affectneighbor if aligni-affectneighbor>-1 else 0, aligni+affectneighbor+1 if aligni+affectneighbor+1<len(base_map_info) else len(base_map_info))]) |
|
|
383 |
if (not base_map_info['refbase'][aligni]=='-') and \ |
|
|
384 |
(forward_reverse, base_map_info['refbasei'][aligni]) in moptions['fulmodlist'][rname]: |
|
|
385 |
if not base_map_info['readbase'][aligni]=='-': |
|
|
386 |
nextnogap = aligni + 1; |
|
|
387 |
while nextnogap<len(base_map_info): |
|
|
388 |
if not base_map_info['refbase'][nextnogap]=='-': break; |
|
|
389 |
nextnogap += 1 |
|
|
390 |
iscg = False; |
|
|
391 |
# find gaps |
|
|
392 |
for checkneighbornum in checkneighbornums: |
|
|
393 |
if not nextnogap<len(base_map_info): continue; |
|
|
394 |
matchnum = 0; gapnum = 0; |
|
|
395 |
# get gaps for two window sizes |
|
|
396 |
for checki in range(aligni-checkneighbornum, aligni+checkneighbornum+1): |
|
|
397 |
if checki>-1 and checki<len(base_map_info): |
|
|
398 |
if base_map_info['refbase'][checki]==base_map_info['readbase'][checki]: matchnum += 1 |
|
|
399 |
if base_map_info['refbase'][checki]=='-' or base_map_info['readbase'][checki]=='-': gapnum += 1 |
|
|
400 |
if gapnum<=checkratios[checkneighbornum][3]: |
|
|
401 |
for addi in range(aligni-affectneighbor if aligni-affectneighbor>-1 else 0, nextnogap+affectneighbor if nextnogap+affectneighbor<len(base_map_info) else len(base_map_info)): |
|
|
402 |
if addi==aligni: # for methylated positions |
|
|
403 |
cgpos[0].append((forward_reverse, base_map_info['refbasei'][addi])) |
|
|
404 |
else: # for non-used positions |
|
|
405 |
cgpos[1].append((forward_reverse, base_map_info['refbasei'][addi])) |
|
|
406 |
iscg = True; break; |
|
|
407 |
if iscg: continue; |
|
|
408 |
# add more not-used positions if more gaps exist |
|
|
409 |
if not base_map_info['readbase'][aligni]=='-': |
|
|
410 |
nextnogap = aligni |
|
|
411 |
for _ in range(affectneighbor): |
|
|
412 |
nextnogap += 1; |
|
|
413 |
while nextnogap<len(base_map_info['refbase']): |
|
|
414 |
if not base_map_info['refbase'][nextnogap]=='-': break; |
|
|
415 |
nextnogap += 1 |
|
|
416 |
prenogap = aligni |
|
|
417 |
for _ in range(affectneighbor): |
|
|
418 |
prenogap -= 1; |
|
|
419 |
while prenogap>-1: |
|
|
420 |
if not base_map_info['refbase'][prenogap]=='-': break; |
|
|
421 |
prenogap -= 1 |
|
|
422 |
|
|
|
423 |
read0 = aligni; read1 = aligni |
|
|
424 |
for _ in range(affectneighbor): |
|
|
425 |
read0 -= 1 |
|
|
426 |
while read0>-1: |
|
|
427 |
if base_map_info['readbase'][read0]=='-': read0 -= 1 |
|
|
428 |
else: break; |
|
|
429 |
read1 += 1 |
|
|
430 |
while read1<len(base_map_info['readbase']): |
|
|
431 |
if base_map_info['readbase'][read1]=='-': read1 += 1 |
|
|
432 |
else: break; |
|
|
433 |
|
|
|
434 |
if read0<prenogap: |
|
|
435 |
if read0>-1: prenogap = read0 |
|
|
436 |
else: prenogap = 0 |
|
|
437 |
if read1>nextnogap: |
|
|
438 |
if read1<len(base_map_info['readbase']): nextnogap = read1 |
|
|
439 |
else: nextnogap = len(base_map_info['readbase'])-1 |
|
|
440 |
if prenogap<0: prenogap = 0 |
|
|
441 |
if not nextnogap<len(base_map_info['readbase']): nextnogap=len(base_map_info['readbase'])-1 |
|
|
442 |
if not prenogap<len(base_map_info['readbase']): prenogap=len(base_map_info['readbase'])-1 |
|
|
443 |
for excldi in range(prenogap, nextnogap+1): |
|
|
444 |
cgpos[1].append((forward_reverse, base_map_info['refbasei'][excldi])) |
|
|
445 |
|
|
|
446 |
print ('%s%s %d, %d >> %d %d, %d-%d=%d' % (forward_reverse, f5data[readk][3], len(cgpos[0]), len(cgpos[1]), len(modevents)-end_clip-start_clip, start_clip, len(modevents), end_clip, len(modevents)-end_clip)) |
|
|
447 |
|
|
|
448 |
aligni = 0; isdif = False; |
|
|
449 |
for ie in range(start_clip-100, len(modevents)-end_clip+100): |
|
|
450 |
cur_row_num = ie - (start_clip-100); cur_base = '' |
|
|
451 |
# for aligned bases |
|
|
452 |
if ie>=start_clip and ie<len(modevents)-end_clip: |
|
|
453 |
if align_ref_pos<mapped_start_pos: |
|
|
454 |
print ('ERRRR align_ref_pos(%d)<mapped_start_pos(%d)' % (align_ref_pos, mapped_start_pos)) |
|
|
455 |
while base_map_info['readbase'][aligni]=='-': |
|
|
456 |
if not align_ref_pos==base_map_info['refbasei'][aligni]: |
|
|
457 |
print ('ERRRR align_ref_pos(%d) not equal to %d' % (align_ref_pos, base_map_info['refbasei'][aligni] )) |
|
|
458 |
if not base_map_info['refbase'][aligni]=='-': |
|
|
459 |
if forward_reverse=='+': align_ref_pos += 1 |
|
|
460 |
else: align_ref_pos -= 1 |
|
|
461 |
aligni += 1 |
|
|
462 |
if not base_map_info['readbase'][aligni] == modevents['model_state'][ie][2]: |
|
|
463 |
print ('Error Does not match', base_map_info['readbase'][aligni], modevents['model_state'][ie][2], aligni, ie) |
|
|
464 |
isdif = True; |
|
|
465 |
# the first column is the aligned reference position |
|
|
466 |
mfeatures[cur_row_num][0] = align_ref_pos |
|
|
467 |
cur_base = base_map_info['refbase'][aligni] |
|
|
468 |
# the second/third column is for negative/positive labels of methylation |
|
|
469 |
if moptions['posneg'] == 0: # for a data without any modification |
|
|
470 |
if ( (not moptions['anymodlist']==None) and rname in moptions['nomodlist'] and (forward_reverse, base_map_info['refbasei'][aligni]) in moptions['nomodlist'][rname] ): |
|
|
471 |
mfeatures[cur_row_num][1] = 1; mfeatures[cur_row_num][2] = 0 |
|
|
472 |
elif (forward_reverse, base_map_info['refbasei'][aligni]) in moptions['fulmodlist'][rname]: |
|
|
473 |
mfeatures[cur_row_num][1] = 1; mfeatures[cur_row_num][2] = 0 |
|
|
474 |
elif ((not moptions['anymodlist']==None) and rname in moptions['anymodlist'] and (forward_reverse, base_map_info['refbasei'][aligni]) in moptions['anymodlist'][rname] ): |
|
|
475 |
mfeatures[cur_row_num][1] = 1; mfeatures[cur_row_num][2] = 0 |
|
|
476 |
else: # for a data with both modified and un-modified positions |
|
|
477 |
if (forward_reverse, base_map_info['refbasei'][aligni]) in cgpos[0] and (not base_map_info['refbase'][aligni]=='-'): |
|
|
478 |
mfeatures[cur_row_num][1] = 0; mfeatures[cur_row_num][2] = 1 |
|
|
479 |
else: |
|
|
480 |
if (forward_reverse, base_map_info['refbasei'][aligni]) not in cgpos[1]: |
|
|
481 |
if moptions['anymodlist']==None: |
|
|
482 |
if moptions['nomodlist']==None or ( rname in moptions['nomodlist'] and (forward_reverse, base_map_info['refbasei'][aligni]) in moptions['nomodlist'][rname] ): |
|
|
483 |
mfeatures[cur_row_num][1] = 1; mfeatures[cur_row_num][2] = 0 |
|
|
484 |
elif rname in moptions['anymodlist'] and (forward_reverse, base_map_info['refbasei'][aligni]) in moptions['anymodlist'][rname]: |
|
|
485 |
pass |
|
|
486 |
else: |
|
|
487 |
if moptions['nomodlist']==None or ( rname in moptions['nomodlist'] and (forward_reverse, base_map_info['refbasei'][aligni]) in moptions['nomodlist'][rname] ): |
|
|
488 |
mfeatures[cur_row_num][1] = 1; mfeatures[cur_row_num][2] = 0 |
|
|
489 |
if not base_map_info['refbase'][aligni]=='-': |
|
|
490 |
if forward_reverse=='+': align_ref_pos += 1 |
|
|
491 |
else: align_ref_pos -= 1 |
|
|
492 |
aligni += 1 |
|
|
493 |
|
|
|
494 |
# for bin features |
|
|
495 |
if ie>=0 and ie<len(modevents) and moptions['fnum']==57: |
|
|
496 |
for currs in sp_param['f5data'][readk][2][modevents['start'][ie]:int(modevents['start'][ie]+int(modevents['length'][ie]+0.5))]: |
|
|
497 |
if currs>10 or currs<-10: print ('Error raw signal', currs, ie, modevents['start'][ie], modevents['length'][ie]) |
|
|
498 |
curbin = int((currs+5)/binlen) |
|
|
499 |
if curbin<0: curbin = 0 |
|
|
500 |
elif not curbin<binnum: curbin = binnum-1 |
|
|
501 |
mfeatures[cur_row_num][curbin+3] += 1 |
|
|
502 |
if ie>=0 and ie<len(modevents): |
|
|
503 |
# for reference base type feature |
|
|
504 |
if cur_base in myCom.g_ACGT: |
|
|
505 |
mfeatures[cur_row_num][moptions['fnum']-3+3-4+myCom.g_ACGT.index(cur_base)] = 1 |
|
|
506 |
cur_index_add = moptions['fnum'] - 3 + 3 |
|
|
507 |
# for signal mean std and length. |
|
|
508 |
mfeatures[cur_row_num][cur_index_add + 0] = modevents["mean"][ie] |
|
|
509 |
mfeatures[cur_row_num][cur_index_add + 1] = modevents["stdv"][ie] |
|
|
510 |
mfeatures[cur_row_num][cur_index_add + 2] = modevents["length"][ie] |
|
|
511 |
|
|
|
512 |
# truncated too much not-used positions |
|
|
513 |
nbkeys = defaultdict(); |
|
|
514 |
for mfi in range(len(mfeatures)): |
|
|
515 |
if mfeatures[mfi][1] + mfeatures[mfi][2] > 0.9: |
|
|
516 |
for ini in range(mfi-25, mfi+26): |
|
|
517 |
if ini<0 or ini>len(mfeatures)-1: |
|
|
518 |
print("Warning wrong del mfeatures id %d for %s" % (ini, f5data[readk][3])) |
|
|
519 |
else: |
|
|
520 |
nbkeys[ini] = True; |
|
|
521 |
keepInd = sorted(list(nbkeys.keys())); |
|
|
522 |
if len(keepInd)>0: |
|
|
523 |
if not len(keepInd)>len(mfeatures)*0.9: |
|
|
524 |
mfeatures = mfeatures[np.array(keepInd)] |
|
|
525 |
else: |
|
|
526 |
mfeatures = [] |
|
|
527 |
|
|
|
528 |
return (mfeatures, isdif) |
|
|
529 |
|
|
|
530 |
|
|
|
531 |
# |
|
|
532 |
# get the complementary bases |
|
|
533 |
# |
|
|
534 |
def get_complement(na): |
|
|
535 |
if na in myCom.acgt: return myCom.na_bp[na] |
|
|
536 |
else: return na; |
|
|
537 |
|
|
|
538 |
# |
|
|
539 |
# get required information for reach mapping records. |
|
|
540 |
# |
|
|
541 |
def handle_line(moptions, sp_param, f5align): |
|
|
542 |
lsp = sp_param['line'].split('\t') |
|
|
543 |
qname, flag, rname, pos, mapq, cigar, _, _, _, seq, _ = lsp[:11] |
|
|
544 |
# checked query name |
|
|
545 |
if qname=='*': sp_param['f5status'] = "qname is *" |
|
|
546 |
# check mapping quality |
|
|
547 |
elif int(mapq)==255: sp_param['f5status'] = "mapq is 255" |
|
|
548 |
# check mapped positions |
|
|
549 |
elif int(pos)==0: sp_param['f5status'] = "pos is 0" |
|
|
550 |
# check mapped string |
|
|
551 |
elif cigar=='*': sp_param['f5status'] = "cigar is *" |
|
|
552 |
# check reference name |
|
|
553 |
elif rname=='*': sp_param['f5status'] = "rname is *" |
|
|
554 |
if not sp_param['f5status']=="": return qname |
|
|
555 |
|
|
|
556 |
if (qname not in f5align) or f5align[qname][0]<int(mapq): |
|
|
557 |
f5align[qname] = (int(mapq), int(flag), rname, int(pos), cigar, seq) |
|
|
558 |
|
|
|
559 |
return qname |
|
|
560 |
|
|
|
561 |
# |
|
|
562 |
# feature handler/workder for multiprocessing |
|
|
563 |
# |
|
|
564 |
def getFeature_handler(moptions, h5files_Q, failed_Q, version_Q): |
|
|
565 |
while not h5files_Q.empty(): |
|
|
566 |
try: |
|
|
567 |
# get a list of files |
|
|
568 |
f5files, ctfolderid = h5files_Q.get(block=False) |
|
|
569 |
except: |
|
|
570 |
break; |
|
|
571 |
|
|
|
572 |
sp_options = defaultdict(); |
|
|
573 |
sp_options['ctfolder'] = moptions['outFolder']+str(ctfolderid) |
|
|
574 |
if not os.path.isdir(sp_options['ctfolder']): |
|
|
575 |
os.system('mkdir '+sp_options['ctfolder']) |
|
|
576 |
# get features |
|
|
577 |
mGetFeature1(moptions, sp_options, f5files) |
|
|
578 |
# output errors |
|
|
579 |
for errtype, errfiles in sp_options["Error"].items(): |
|
|
580 |
failed_Q.put((errtype, errfiles)); |
|
|
581 |
# double check albacore version |
|
|
582 |
for vk in sp_options["get_albacore_version"]: |
|
|
583 |
version_Q.put((vk, sp_options["get_albacore_version"][vk])) |
|
|
584 |
|
|
|
585 |
# |
|
|
586 |
# read sequence information from a reference genome |
|
|
587 |
# |
|
|
588 |
def readFA(mfa, t_chr=None): |
|
|
589 |
fadict = defaultdict(); |
|
|
590 |
with open(mfa, 'r') as mr: |
|
|
591 |
cur_chr = None; |
|
|
592 |
line = mr.readline(); |
|
|
593 |
while line: |
|
|
594 |
# remove empty spaces |
|
|
595 |
line = line.strip(); |
|
|
596 |
if len(line)>0: |
|
|
597 |
if line[0]=='>': # for each chromosome line |
|
|
598 |
if (not cur_chr==None) and (t_chr in [None, cur_chr]): |
|
|
599 |
fadict[cur_chr] = ''.join(fadict[cur_chr]) |
|
|
600 |
cur_chr = line[1:].split()[0] |
|
|
601 |
if t_chr in [None, cur_chr]: |
|
|
602 |
fadict[cur_chr] = [] |
|
|
603 |
else: # for sub-sequence line in a reference file |
|
|
604 |
if t_chr in [None, cur_chr]: |
|
|
605 |
fadict[cur_chr].append(line.upper()) |
|
|
606 |
line = mr.readline(); |
|
|
607 |
# for the last chromosome in the file |
|
|
608 |
if (not cur_chr==None) and (t_chr in [None, cur_chr]): |
|
|
609 |
fadict[cur_chr] = ''.join(fadict[cur_chr]) |
|
|
610 |
return fadict |
|
|
611 |
|
|
|
612 |
# |
|
|
613 |
# get reference positions for motif-based modifications |
|
|
614 |
# |
|
|
615 |
def readMotifMod(fadict, mpat='Cg', mposinpat=0, t_chr=None, t_start=None, t_end=None): |
|
|
616 |
pos_dict = defaultdict(int) |
|
|
617 |
|
|
|
618 |
# get motif and complementary motif |
|
|
619 |
pat3 = copy.deepcopy(mpat.upper()) |
|
|
620 |
comp_pat3 = ''.join([get_complement(curna) for curna in pat3][::-1]) |
|
|
621 |
comp_mposinpat = len(comp_pat3)-1-mposinpat |
|
|
622 |
|
|
|
623 |
fakeys = fadict.keys(); |
|
|
624 |
cpgdict = defaultdict(int); |
|
|
625 |
all_a = defaultdict() |
|
|
626 |
for fak in fakeys: |
|
|
627 |
cpgnum = [0, 0] |
|
|
628 |
# motif-based reference positions |
|
|
629 |
cpgdict[fak] = defaultdict() |
|
|
630 |
# position of bases of interest |
|
|
631 |
all_a[fak] = defaultdict() |
|
|
632 |
for i in range(len(fadict[fak])): |
|
|
633 |
if (t_start==None or i>=t_start) and (t_end==None or i<=t_end): |
|
|
634 |
if fadict[fak][i]==mpat[mposinpat]: # for forward strand |
|
|
635 |
all_a[fak][('+', i)] = True; |
|
|
636 |
elif get_complement(fadict[fak][i])==mpat[mposinpat]: # for reverse strand |
|
|
637 |
all_a[fak][('-', i)] = True; |
|
|
638 |
|
|
|
639 |
# check motif in forward strand |
|
|
640 |
if i-mposinpat>=0 and i+len(comp_pat3)-1-mposinpat<len(fadict[fak]) and ''.join(fadict[fak][i-mposinpat:(i+len(comp_pat3)-1-mposinpat+1)])==pat3: |
|
|
641 |
cpgdict[fak][('+', i)] = [1, fadict[fak][i]]; cpgnum[0] += 1 |
|
|
642 |
elif i-comp_mposinpat>=0 and i+len(comp_pat3)-1-comp_mposinpat<len(fadict[fak]) and ''.join(fadict[fak][i-comp_mposinpat:(i+len(comp_pat3)-1-comp_mposinpat+1)])==comp_pat3: # check motif in reverse strand |
|
|
643 |
cpgdict[fak][('-', i)] = [1, fadict[fak][i]]; cpgnum[1] += 1 |
|
|
644 |
else: |
|
|
645 |
pass |
|
|
646 |
print('%s%d site: %d(+) %d(-) for %s' % (pat3, mposinpat, cpgnum[0], cpgnum[1], fak)) |
|
|
647 |
return (cpgdict, all_a) |
|
|
648 |
|
|
|
649 |
|
|
|
650 |
# |
|
|
651 |
# a multiprocessing manager to get features from all long reads. |
|
|
652 |
# |
|
|
653 |
def getFeature_manager(moptions): |
|
|
654 |
start_time = time.time(); |
|
|
655 |
# multipprocessing manager |
|
|
656 |
pmanager = multiprocessing.Manager(); |
|
|
657 |
|
|
|
658 |
# prepare output folder |
|
|
659 |
if os.path.isdir(moptions['outFolder']): |
|
|
660 |
os.system('rm -dr '+moptions['outFolder']) |
|
|
661 |
if not os.path.isdir(moptions['outFolder']): |
|
|
662 |
os.system('mkdir '+moptions['outFolder']) |
|
|
663 |
|
|
|
664 |
moptions['size_per_batch'] = moptions['size_per_batch'] * (10**7) |
|
|
665 |
|
|
|
666 |
# read reference information |
|
|
667 |
fadict = readFA(moptions['Ref'],moptions['region'][0]) |
|
|
668 |
if moptions['motifORPos']==1: # get motif-based positions for modifications |
|
|
669 |
moptions['fulmodlist'], moptions['nomodlist'] = readMotifMod(fadict, moptions['motif'][0], moptions['motif'][1], moptions['region'][0], moptions['region'][1], moptions['region'][2]) |
|
|
670 |
moptions['anymodlist'] = None |
|
|
671 |
moptions['nomodlist'] = None; # add for simple process |
|
|
672 |
elif moptions['motifORPos']==2: # modification position is specified by the files |
|
|
673 |
fuldfiles = glob.glob(moptions["fulmod"]); |
|
|
674 |
moptions['fulmodlist'] = defaultdict(lambda: defaultdict()); |
|
|
675 |
if not moptions["anymod"]==None: # partially modified positions |
|
|
676 |
anydfiles = glob.glob(moptions["anymod"]) |
|
|
677 |
moptions['anymodlist'] = defaultdict(lambda: defaultdict()); |
|
|
678 |
else: |
|
|
679 |
moptions['anymodlist'] = None |
|
|
680 |
if not moptions["nomod"]==None: # completely un-modified positions |
|
|
681 |
nodfiles = glob.glob(moptions["nomod"]) |
|
|
682 |
moptions['nomodlist'] = defaultdict(lambda: defaultdict()); |
|
|
683 |
else: |
|
|
684 |
moptions['nomodlist'] = None |
|
|
685 |
mthreadin = [moptions['fulmodlist'], moptions['anymodlist'], moptions['nomodlist']] |
|
|
686 |
mthfiles = [fuldfiles, anydfiles, nodfiles] |
|
|
687 |
# read completely modified positions, partially modified positions, completely un-modified positions from files |
|
|
688 |
for mthi in range(len(mthreadin)): |
|
|
689 |
curmeth = mthreadin[mthi]; curfilelist = mthfiles[mthi] |
|
|
690 |
if curmeth==None or curfilelist==None: continue; |
|
|
691 |
for curmthf in curfilelist: |
|
|
692 |
with open(curmthf, 'r') as mreader: |
|
|
693 |
line = mreader.readline(); |
|
|
694 |
while line: |
|
|
695 |
if len(line)>0: |
|
|
696 |
tchr, tstrand, tpos = line.split()[:3] |
|
|
697 |
curmeth[tchr][(tstrand, int(tpos))] = [1-mthi, fadict[tchr][int(tpos)]]; |
|
|
698 |
line = mreader.readline(); |
|
|
699 |
for tchr in moptions['fulmodlist'] if moptions['anymodlist']==None else moptions['anymodlist']: |
|
|
700 |
if len(moptions['fulmodlist'][tchr])>0 or ((not moptions['anymodlist']==None) and len(moptions['anymodlist'][tchr])>0): |
|
|
701 |
print ('%s fulmod=%d anymod=%d nomod=%d' % (tchr, len(moptions['fulmodlist'][tchr]), len(moptions['anymodlist'][tchr]) if (not moptions['anymodlist']==None) else -1, len(moptions['nomodlist'][tchr]) if (not moptions['nomodlist']==None) else -1)) |
|
|
702 |
|
|
|
703 |
if True: #False: |
|
|
704 |
# get all input fast5 files |
|
|
705 |
f5files = glob.glob(os.path.join(moptions['wrkBase'],"*.fast5" )) |
|
|
706 |
if moptions['recursive']==1: |
|
|
707 |
f5files.extend(glob.glob(os.path.join(moptions['wrkBase'],"*/*.fast5" ))) |
|
|
708 |
f5files.extend(glob.glob(os.path.join(moptions['wrkBase'],"*/*/*.fast5" ))) |
|
|
709 |
f5files.extend(glob.glob(os.path.join(moptions['wrkBase'],"*/*/*/*.fast5" ))) |
|
|
710 |
|
|
|
711 |
|
|
|
712 |
print('Total files=%d' % len(f5files)) |
|
|
713 |
h5files_Q = pmanager.Queue(); |
|
|
714 |
failed_Q = pmanager.Queue() |
|
|
715 |
version_Q = pmanager.Queue() |
|
|
716 |
|
|
|
717 |
# split input fast5 files into different batch |
|
|
718 |
h5_batch = []; h5batchind = 0; |
|
|
719 |
for f5f in f5files: |
|
|
720 |
h5_batch.append(f5f); |
|
|
721 |
if len(h5_batch)==moptions['files_per_thread']: |
|
|
722 |
h5files_Q.put((h5_batch, h5batchind)) |
|
|
723 |
h5batchind += 1 |
|
|
724 |
h5_batch = []; #break; ### feature500 |
|
|
725 |
if len(h5_batch)>0: |
|
|
726 |
h5files_Q.put((h5_batch, h5batchind)) |
|
|
727 |
|
|
|
728 |
# each thread handle a batch a time and repeat for all batches. |
|
|
729 |
share_var = (moptions, h5files_Q, failed_Q, version_Q) |
|
|
730 |
handlers = [] |
|
|
731 |
for hid in range(moptions['threads']): |
|
|
732 |
p = multiprocessing.Process(target=getFeature_handler, args=share_var); |
|
|
733 |
p.start(); |
|
|
734 |
handlers.append(p); |
|
|
735 |
|
|
|
736 |
# get failed files. |
|
|
737 |
failed_files = defaultdict(list); |
|
|
738 |
version_default = defaultdict(lambda: defaultdict(int)); |
|
|
739 |
while any(p.is_alive() for p in handlers): |
|
|
740 |
try: |
|
|
741 |
errk, fns = failed_Q.get(block=False); |
|
|
742 |
failed_files[errk].extend(fns) |
|
|
743 |
curv, curv_num = version_Q.get(block=False); |
|
|
744 |
version_default[curv] += curv_num |
|
|
745 |
except: |
|
|
746 |
time.sleep(1); |
|
|
747 |
continue; |
|
|
748 |
|
|
|
749 |
# output failure information |
|
|
750 |
if len(failed_files)>0: |
|
|
751 |
print ('Error information for different fast5 files:') |
|
|
752 |
for errtype, errfiles in failed_files.items(): |
|
|
753 |
print ('\t%s %d' % (errtype, len(errfiles))) |
|
|
754 |
print("abversion info {}".format(str(version_default))) |
|
|
755 |
sys.stdout.flush() |
|
|
756 |
end_time = time.time(); |
|
|
757 |
print ("Total consuming time %d" % (end_time-start_time)) |
|
|
758 |
|
|
|
759 |
|
|
|
760 |
|
|
|
761 |
# for indepdent testing of code |
|
|
762 |
if __name__=='__main__': |
|
|
763 |
# if len(sys.argv)>4: |
|
|
764 |
moptions = {} |
|
|
765 |
moptions['basecall_1d'] = 'Basecall_1D_000' |
|
|
766 |
moptions['basecall_1d'] = ['Basecall_1D_000'] |
|
|
767 |
moptions['basecall_2strand'] = 'BaseCalled_template' |
|
|
768 |
|
|
|
769 |
moptions['outLevel'] = myCom.OUTPUT_WARNING |
|
|
770 |
moptions['outLevel'] = myCom.OUTPUT_INFO |
|
|
771 |
|
|
|
772 |
moptions['modfile'] = '../../mod_output/train1/2/mod_train' |
|
|
773 |
|
|
|
774 |
moptions['fnum'] = 53; |
|
|
775 |
moptions['hidden'] = 100; |
|
|
776 |
moptions['windowsize'] = 21; |
|
|
777 |
|
|
|
778 |
moptions['threads'] = 8 |
|
|
779 |
moptions['threads'] = 1 |
|
|
780 |
moptions['files_per_thread'] = 500 |
|
|
781 |
|
|
|
782 |
mDetect_manager(moptions) |