[c66173]: / bin / DeepMod.py

Download this file

384 lines (327 with data), 19.7 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
#!/usr/bin/env python
import os;
import sys;
import string;
from collections import defaultdict
import argparse;
from argparse import RawTextHelpFormatter
from DeepMod_scripts.myCom import *
# three modules in DeepMod
parser = argparse.ArgumentParser(description="Detect nucleotide modification from nanopore signals data.", epilog="For example, \n \
\tpython %(prog)s train: Training a modification classifier.\n \
\tpython %(prog)s detect: Detect modification by integrating all long reads. \n \
\tpython %(prog)s getfeatures: Get features for training a model. \n \
", formatter_class=RawTextHelpFormatter);
#
# Return error message when a value<1
# Return an empty string otherwise
#
def non_negative(i, mstr):
if i<1: return (("\n\tError %d could not be negative(%d)" % (mstr, i)))
else: return ''
#
# Print all parameters in stdout
#
def printParameters(moptions):
mpkeys = moptions.keys(); #mpkeys.sort()
sorted(mpkeys)
print('%30s: %s' % ('Current directory', os.getcwd()))
for mpk in mpkeys:
print ('%30s: %s' % (mpk, str(moptions[mpk])))
sys.stdout.flush()
#
# Got common argument provided by users or default values.
#
#
def mCommonParam(margs):
ErrorMessage = ""
moptions = defaultdict()
# how to output running message: need more control now.
moptions['outLevel'] = margs.outLevel
# the input working base
moptions["wrkBase"] = margs.wrkBase
if moptions["wrkBase"]==None:
ErrorMessage = ErrorMessage + ("\n\tThe input folder is None.")
# An unique ID for output
# Usefull for run the program in parallel
moptions["FileID"] = margs.FileID
# output folder;
# make it if the output folder does not exist
moptions['outFolder'] = margs.outFolder
moptions['outFolder'] = format_last_letter_of_folder(moptions['outFolder'])
if moptions['outFolder']==None or (not os.path.isdir(moptions['outFolder'])):
try:
os.system('mkdir -p '+moptions['outFolder']);
except:
ErrorMessage = ErrorMessage + ("\n\tThe output folder (%s) does not exist and cannot be created." % moptions['outFolder'])
# check all data in a recurive way
moptions['recursive'] = margs.recursive
# the number of threads used and the number of files handled by each thread.
moptions['files_per_thread'] = margs.files_per_thread
if moptions['files_per_thread']<2: moptions['files_per_thread'] = 2
# the number of threads used
moptions['threads'] = margs.threads
if moptions['threads']<1: moptions['threads'] = 1
# windowsize: default=21
moptions['windowsize'] = margs.windowsize
ErrorMessage = ErrorMessage + non_negative(moptions['windowsize'], 'windowsize')
if moptions['windowsize']<1: moptions['windowsize'] = 1
# aligners: bwa-mem or minimap2
moptions['alignStr'] = margs.alignStr;
moptions['SignalGroup'] = margs.SignalGroup;
moptions['move'] = margs.move
return [moptions, ErrorMessage]
#
# detect modification for bases of interests
# input is a list of fast5 files, a reference genome and a well-trained model.
#
def mDetect(margs):
# get common parameters
moptions, ErrorMessage = mCommonParam(margs)
# path for basecall information in fast5 files
moptions['basecall_1d'] = margs.basecall_1d
moptions['basecall_2strand'] = margs.basecall_2strand
# Whether consider those chromosome which contain -_:/
# default: yes;
moptions['ConUnk'] = margs.ConUnk
# output layer information for deep learning
moptions['outputlayer'] = margs.outputlayer
# base of interest
moptions['Base'] = margs.Base
# whether take cluster effect of methylation into consideration
moptions['mod_cluster'] = margs.mod_cluster
# base of interest
if moptions['Base'] in ["", None]:
ErrorMessage = ErrorMessage + ("\n\t Please provide a base of interest.")
# predict medification for bases of interest in long reads first
# only summarize them for each genomic position of interest .
moptions['predDet'] = margs.predDet
if moptions['predDet']:
# path to reference genome
moptions['Ref'] = margs.Ref
if moptions['Ref']==None or (not os.path.isfile(moptions['Ref'])):
ErrorMessage = ErrorMessage + ("\n\t reference file does not exist (%s)" % moptions['Ref'])
# the number of feature for each event
moptions['fnum'] = margs.fnum
ErrorMessage = ErrorMessage + non_negative(moptions['fnum'], 'fnum')
# the number of hidden nodes
moptions['hidden'] = margs.hidden
ErrorMessage = ErrorMessage + non_negative(moptions['hidden'], 'hidden')
# the well-trained model
moptions['modfile'] = margs.modfile
if moptions['modfile']==None:
print("No mod file is provided. The default one is used")
moptions['modfile'] = ('train_deepmod/rnn_P90wd%d_f53/mod_train_P90wd%d_f53' % (moptions['windowsize'], moptions['windowsize']))
if (not os.path.isfile(moptions['modfile']+'.meta')):
moptions['modfile'] = ('{}/lib/python{}.{}/site-packages/DeepMod/train_deepmod/rnn_P90wd{}_f53/mod_train_P90wd{}_f53'.format(sys.prefix,sys.version_info.major,sys.version_info.minor, moptions['windowsize'], moptions['windowsize']))
if (not os.path.isfile(moptions['modfile']+'.meta')):
ErrorMessage = ErrorMessage + ("\n\tThe meta file (%s) does not exist" % (moptions['modfile']+'.meta' if not moptions['modfile']==None else ""))
else:
# already done the prediction process?
# Yes: summarize the results only
moptions['predpath'] = margs.predpath
if moptions['predpath']==None or (not os.path.isdir(moptions['predpath'])):
ErrorMessage = ErrorMessage + ("\n\tThe predpath does not exist")
# specify region of interest
# not consider bases outside regions in a reference genome
# None: all bases of interest
moptions['region'] = [ ]
if margs.region == None or len(margs.region)==0:
moptions['region'].append([None, None, None])
else:
mregionlist = margs.region.split(';')
for mr in mregionlist:
mr_sp = mr.split(':')
moptions['region'].append([mr_sp[0], int(mr_sp[1]) if len(mr_sp)>1 else None, int(mr_sp[2]) if len(mr_sp)>2 else None ])
print("\nNanopore sequencing data analysis is resourece-intensive and time consuming. ")
print("Some potential strong recommendations are below:")
print("\tIf your reference genome is large as human genome and your Nanopore data is huge,")
print("\tIt would be faster to run this program parallelly to speed up.")
print("\tYou might run different input folders of your fast5 files and ")
print("\tgive different output names (--FileID) or folders (--outFolder)")
print("\tA good way for this is to run different chromosome individually.\n")
# print help information if any necessary options are not provided.
printParameters(moptions)
if not ErrorMessage=="":
ErrorMessage = "Please provide correct parameters" + ErrorMessage
print(ErrorMessage)
parser.print_help();
parser.parse_args(['detect', '--help']);
sys.exit(1)
from DeepMod_scripts import myDetect
myDetect.mDetect_manager(moptions)
#
# Train a model
# Need to get features first.
#
def mTrain(margs):
from DeepMod_scripts import myMultiBiRNN
# gent common options
moptions, ErrorMessage = mCommonParam(margs)
# network setting: the number of features and the number of hidden nodes
moptions['fnum'] = margs.fnum
ErrorMessage = ErrorMessage + non_negative(moptions['fnum'], 'fnum')
moptions['hidden'] = margs.hidden
ErrorMessage = ErrorMessage + non_negative(moptions['hidden'], 'hidden')
# the output function of the deep learning model
moptions['outputlayer'] = margs.outputlayer
# whether using different class weights
moptions['unbalanced'] = margs.unbalanced
# re-load trained model and continue to train
moptions['modfile'] = margs.modfile
if moptions['modfile']==None: pass;
elif (not os.path.isfile(moptions['modfile']+'.meta')):
ErrorMessage = ErrorMessage + ("\n\tThe meta file (%s) does not exist" % (moptions['modfile']+'.meta' if not moptions['modfile']==None else ""))
# read-based or region based independent training
# E: region-based
# P: read-based.
if not margs.test==None:
moptions['test'] = margs.test.split(',')
if moptions['test'][0] == 'E': moptions['test'][0] = '-'
elif moptions['test'][0] == 'P': moptions['test'][0] = '0'
else:
ErrorMessage = ErrorMessage + "Unknown option for test: the first character must be E or P "+margs.test
if moptions['test'][0] in ['-']:
moptions['test'][1] = int(moptions['test'][1]) * (10**6)
moptions['test'][2] = int(moptions['test'][2]) * (10**6)
else: moptions['test'][1] = int(moptions['test'][1])/100.0
else: moptions['test'] = ['N', '100']
# print help document if necessary options are not provided.
print("Train")
printParameters(moptions)
if not ErrorMessage=="":
ErrorMessage = "Please provide correct parameters" + ErrorMessage
print(ErrorMessage)
parser.print_help();
parser.parse_args(['train', '--help']);
sys.exit(2)
myMultiBiRNN.mMult_RNN_LSTM_train(moptions)
#
# get features for training
#
#
def mGetFeatures(margs):
from DeepMod_scripts import myGetFeatureBasedPos
# get common options
moptions, ErrorMessage = mCommonParam(margs)
# motif-based data: positive or negative control data
moptions['posneg'] = margs.posneg
# the number of features: 7-description or 57-description
moptions['fnum'] = margs.fnum
ErrorMessage = ErrorMessage + non_negative(moptions['fnum'], 'fnum')
# size of each bacth to store features
moptions['size_per_batch'] = margs.size_per_batch
if moptions['size_per_batch'] < 0.001: moptions['size_per_batch'] = 0.001
# path to basecall inform in fast5 files
moptions['basecall_1d'] = margs.basecall_1d
moptions['basecall_2strand'] = margs.basecall_2strand
# regions of interest
moptions['region'] = [None, None, None]
if not (margs.region==None or margs.region.strip()==''):
rsp = margs.region.split(':')
for rv_ind in range(len(rsp)):
rsp[rv_ind] = rsp[rv_ind].strip();
if not rsp[rv_ind]=='':
moptions['region'][rv_ind] = rsp[rv_ind]
# referene genome
moptions['Ref'] = margs.Ref
if moptions['Ref']==None or (not os.path.isfile(moptions['Ref'])):
ErrorMessage = ErrorMessage + ("\n\t reference file does not exist (%s)" % moptions['Ref'])
# get motif-based modification
# or specify by --fulmod/--anymod/--nomod
moptions['motifORPos'] = margs.motifORPos
if margs.motifORPos==1:
moptions['motif'] = [margs.motif.upper(), margs.ModinMotif]
elif margs.motifORPos==2:
moptions['fulmod'] = margs.fulmod
if moptions['fulmod']==None: # completely modificated positions
ErrorMessage = ErrorMessage + ("\t There is no parameter for --fulmod.")
moptions['anymod'] = margs.anymod
if moptions['anymod'] == None: # patially modificated positions
ErrorMessage = ErrorMessage + ("\t There is no parameter for --anymod.")
moptions['nomod'] = margs.nomod
if moptions['nomod'] == None: # completely unmodified posisionts
ErrorMessage = ErrorMessage + ("\t There is no parameter for --nomod.")
else:
ErrorMessage = ErrorMessage + ("\tmotifORPos value (%d) is not supported." % margs.motifORPos)
# print help document if any required options are not provided.
printParameters(moptions)
if not ErrorMessage=="":
ErrorMessage = "Please provide correct parameters" + ErrorMessage
print(ErrorMessage)
parser.print_help();
parser.parse_args(['getfeatures', '--help']);
sys.exit(1)
myGetFeatureBasedPos.getFeature_manager(moptions)
#####################################################################################
subparsers = parser.add_subparsers()
parent_parser = argparse.ArgumentParser(add_help=False)
# add common options
com_group_for_comparison = parent_parser.add_argument_group('Common options.')
com_group_for_comparison.add_argument("--outLevel", type=int, choices=[OUTPUT_DEBUG, OUTPUT_INFO, OUTPUT_WARNING, OUTPUT_ERROR], default=OUTPUT_WARNING, help=("The level for output: %d for DEBUG, %d for INFO, %d for WARNING, %d for ERROR. Default: %d" % (OUTPUT_DEBUG, OUTPUT_INFO, OUTPUT_WARNING, OUTPUT_ERROR, OUTPUT_WARNING)))
com_group_for_comparison.add_argument("--wrkBase", help="The base folder for FAST5 files.")
com_group_for_comparison.add_argument("--FileID", default="mod", help="The unique string for intermediate files and final output files. Default: 'mod'")
com_group_for_comparison.add_argument("--outFolder", default='./mod_output', help="The default folder for outputing the results. Default: ./mod_output")
com_group_for_comparison.add_argument("--recursive", type=int, default=1, choices=[0,1], help="Recurise to find fast5 files. Default:1")
com_group_for_comparison.add_argument("--threads", type=int, default=4, help="The number of threads used (not for train). Default:4")
com_group_for_comparison.add_argument("--files_per_thread", type=int, default=1000, help="The number of fast5 files for each thread (not for train). Default:500")
com_group_for_comparison.add_argument("--windowsize", type=int, default=21, help="The window size to extract features. Default: 21")
com_group_for_comparison.add_argument("--alignStr", type=str, default='minimap2', choices=["bwa","minimap2"], help="Alignment tools (bwa or minimap2 is supported). Default: minimap2")
com_group_for_comparison.add_argument("--SignalGroup", type=str, default='simple', choices=["simple","rundif"], help="How to associate signals to each called bases. Default: simple")
com_group_for_comparison.add_argument("--move", default=False, action="store_true", help="Whether the basecalled data use move tables instead of event tables. Default: False")
# add detection options
parser_detect = subparsers.add_parser('detect', parents=[parent_parser], help="Detect modifications at a genomic scale", description="Detect modifications by integrating all long reads for a genome", epilog="For example, \n \
python %(prog)s --wrkBase ctrl_oligo_SpeI_cut --FileID mod_det --outFolder ./mod_output/detect3 \n \
", formatter_class=RawTextHelpFormatter)
parser_detect.add_argument("--Ref", help="The reference sequence")
parser_detect.add_argument("--predDet", type=int, default=1, choices=[0,1], help="pred first and then detect (1) or only detect (0). Default: 1")
parser_detect.add_argument("--predpath", default=None, help="The file path of predictions for each fast5 file. The file pattern is *_*.detail. Default: './mod_output/pred2/'")
parser_detect.add_argument("--modfile", type=str, default=None, help="The path to load training model. Default: 'mod_output/'")
parser_detect.add_argument("--fnum", type=int, default=7, help="The number of features. Default: 7")
parser_detect.add_argument("--hidden", type=int, default=100, help="The number of hidden node. Default: 100")
parser_detect.add_argument("--basecall_1d", default="Basecall_1D_000", help="Path for basecall_1d. Default: Basecall_1D_000")
parser_detect.add_argument("--basecall_2strand", default="BaseCalled_template", help="Path for basecall_2strand. Default: BaseCalled_template")
parser_detect.add_argument("--region", default=None, help="The region of interest: for example, chr:1:100000;chr2:10000");
parser_detect.add_argument("--ConUnk", default=True, choices=[False, True], help="Whether contain unknown chromosome");
parser_detect.add_argument("--outputlayer", default="", choices=["", "sigmoid"], help="how to put activation function for output layer")
parser_detect.add_argument("--Base", type=str, default='C', choices=['A', 'C', 'G', 'T'], help="Interest of bases");
parser_detect.add_argument("--mod_cluster", default=0, choices=[0,1], help="1: CpG cluster effect; 0: not");
parser_detect.set_defaults(func=mDetect)
# add training options
parser_training = subparsers.add_parser('train', parents=[parent_parser], help="Training a modification classifier", description="Training a modification classifier", epilog="For example, \n \
python %(prog)s --wrkBase umr --wrkBase2 sss --FileID mod_train --outFolder ./mod_output/train1 \n \
", formatter_class=RawTextHelpFormatter)
parser_training.add_argument("--wrkBase2", help="The base folder for long reads without any modifications.")
parser_training.add_argument("--fnum", type=int, default=7, help="The number of features. Default: 7")
parser_training.add_argument("--hidden", type=int, default=100, help="The number of hidden node. Default: 100")
parser_training.add_argument("--modfile", type=str, default=None, help="The path to load training model. Default: 'mod_output/'")
parser_training.add_argument("--test", help="The number of E Coli genomic position for testing. Default: 'E,1,2'")
parser_training.add_argument("--outputlayer", default="", choices=["", "sigmoid"], help="how to put activation function for output layer")
parser_training.add_argument("--unbalanced", type=int, default=0, choices=[1, 0, None], help="Whether data is unbalanced");
parser_training.set_defaults(func=mTrain)
# add get-feature options
parser_getfeatures = subparsers.add_parser('getfeatures', parents=[parent_parser], help="Get features for all fast5 files", description="Get features for all fast5 files", epilog="For example, \n \
python %(prog)s --wrkBase umr/160617_ecolilowinput_UMR9/called/pass --threads 48 --recursive 0 --posneg 0 --outFolder umr \n \
python %(prog)s --wrkBase sss/160617_ecolilowinput_sssiR9/called/pass --threads 48 --recursive 0 --posneg 1 --outFolder sss \n \
", formatter_class=RawTextHelpFormatter)
parser_getfeatures.add_argument("--posneg", type=int, default=0, choices=[0,1], help="The positive(1) or negative(0) class. Default: 0")
parser_getfeatures.add_argument("--size_per_batch", type=int, default=1, help="The size (unit: 10^7=10M) of a feature file. Default: 1")
parser_getfeatures.add_argument("--fnum", type=int, default=7, help="The number of features. Default: 7")
parser_getfeatures.add_argument("--region", type=str, help="The region of interest. Set to None or empty for all. Format is chr:start_pos:end_pos")
parser_getfeatures.add_argument("--basecall_1d", default="Basecall_1D_000", help="Path for basecall_1d. Default: Basecall_1D_000")
parser_getfeatures.add_argument("--basecall_2strand", default="BaseCalled_template", help="Path for basecall_2strand. Default: BaseCalled_template")
parser_getfeatures.add_argument("--motifORPos", type=int, default=1, help="Use Motif (1) or pos (2) for modified bases. Default: 1")
parser_getfeatures.add_argument("--motif", default='CG', type=str, help="The motif of interest")
parser_getfeatures.add_argument("--ModinMotif", default=0, type=int, help="The position of modified base in the motif of interest")
parser_getfeatures.add_argument("--Ref", help="The reference sequence")
parser_getfeatures.add_argument("--fulmod", type=str, help="The file pattern for full modification: bisultfiteseq/chr20_C*_0.95.txt")
parser_getfeatures.add_argument("--anymod", type=str, help="The file pattern for any modification: bisultfiteseq/chr20_any_0.95.txt")
parser_getfeatures.add_argument("--nomod", type=str, help="The file pattern for any modification: bisultfiteseq/chr20_no1_0.95.txt")
parser_getfeatures.set_defaults(func=mGetFeatures)
# no provided argument
# print help document
if len(sys.argv)<2:
parser.print_help();
else:
args = parser.parse_args()
args.func(args);