Diff of /bin/DeepMod.py [000000] .. [c66173]

Switch to unified view

a b/bin/DeepMod.py
1
#!/usr/bin/env python
2
3
import os;
4
import sys;
5
6
import string;
7
8
from collections import defaultdict
9
10
import argparse;
11
from argparse import RawTextHelpFormatter
12
13
from DeepMod_scripts.myCom import *
14
15
16
17
# three modules in DeepMod
18
parser = argparse.ArgumentParser(description="Detect nucleotide modification from nanopore signals data.", epilog="For example, \n \
19
\tpython %(prog)s train: Training a modification classifier.\n \
20
\tpython %(prog)s detect: Detect modification by integrating all long reads. \n \
21
\tpython %(prog)s getfeatures: Get features for training a model.  \n \
22
", formatter_class=RawTextHelpFormatter);
23
24
25
#
26
# Return error message when a value<1
27
# Return an empty string otherwise
28
#
29
def non_negative(i, mstr):
30
   if i<1: return (("\n\tError %d could not be negative(%d)" % (mstr, i)))
31
   else: return ''
32
33
#
34
# Print all parameters in stdout
35
#
36
def printParameters(moptions):
37
   mpkeys = moptions.keys(); #mpkeys.sort()
38
   sorted(mpkeys)
39
   print('%30s: %s' % ('Current directory', os.getcwd()))
40
   for mpk in mpkeys:
41
      print ('%30s: %s' % (mpk, str(moptions[mpk])))
42
   sys.stdout.flush()
43
44
#
45
# Got common argument provided by users or default values.
46
#
47
#
48
def mCommonParam(margs):
49
50
   ErrorMessage = ""
51
   moptions = defaultdict()
52
   # how to output running message: need more control now.
53
   moptions['outLevel'] = margs.outLevel
54
   # the input working base
55
   moptions["wrkBase"] = margs.wrkBase
56
   if moptions["wrkBase"]==None:
57
      ErrorMessage = ErrorMessage + ("\n\tThe input folder is None.")
58
59
   # An unique ID for output
60
   # Usefull for run the program in parallel
61
   moptions["FileID"] = margs.FileID
62
   # output folder;
63
   # make it if the output folder does not exist
64
   moptions['outFolder'] = margs.outFolder
65
   moptions['outFolder'] = format_last_letter_of_folder(moptions['outFolder'])
66
   if moptions['outFolder']==None or (not os.path.isdir(moptions['outFolder'])):
67
      try:
68
         os.system('mkdir -p '+moptions['outFolder']);
69
      except:
70
         ErrorMessage = ErrorMessage + ("\n\tThe output folder (%s) does not exist and cannot be created." % moptions['outFolder'])
71
72
   # check all data in a recurive way
73
   moptions['recursive'] = margs.recursive
74
   # the number of threads used and the number of files handled by each thread.
75
   moptions['files_per_thread'] = margs.files_per_thread
76
   if moptions['files_per_thread']<2: moptions['files_per_thread'] = 2
77
   # the number of threads used
78
   moptions['threads'] = margs.threads
79
   if moptions['threads']<1: moptions['threads'] = 1
80
81
   # windowsize: default=21
82
   moptions['windowsize'] = margs.windowsize
83
   ErrorMessage = ErrorMessage + non_negative(moptions['windowsize'], 'windowsize')
84
   if moptions['windowsize']<1: moptions['windowsize'] = 1
85
86
   # aligners: bwa-mem or minimap2
87
   moptions['alignStr'] = margs.alignStr;
88
89
   moptions['SignalGroup'] = margs.SignalGroup;
90
91
   moptions['move'] = margs.move
92
93
   return [moptions, ErrorMessage]
94
95
#
96
# detect modification for bases of interests
97
# input is a list of fast5 files, a reference genome and a well-trained model.
98
#
99
def mDetect(margs):
100
   # get common parameters
101
   moptions, ErrorMessage = mCommonParam(margs)
102
103
   # path for basecall information in fast5 files
104
   moptions['basecall_1d'] = margs.basecall_1d
105
   moptions['basecall_2strand'] = margs.basecall_2strand
106
   # Whether consider those chromosome which contain -_:/
107
   # default: yes;
108
   moptions['ConUnk'] = margs.ConUnk
109
   # output layer information for deep learning
110
   moptions['outputlayer'] = margs.outputlayer
111
   # base of interest
112
   moptions['Base'] = margs.Base
113
   # whether take cluster effect of methylation into consideration
114
   moptions['mod_cluster'] = margs.mod_cluster
115
   # base of interest
116
   if moptions['Base'] in ["", None]:
117
      ErrorMessage = ErrorMessage + ("\n\t Please provide a base of interest.")
118
119
   # predict medification for bases of interest in long reads first
120
   # only summarize them for each genomic position of interest .
121
   moptions['predDet'] = margs.predDet
122
   if moptions['predDet']:
123
      # path to reference genome
124
      moptions['Ref'] = margs.Ref
125
      if moptions['Ref']==None or (not os.path.isfile(moptions['Ref'])):
126
         ErrorMessage = ErrorMessage + ("\n\t reference file does not exist (%s)" % moptions['Ref'])
127
128
      # the number of feature for each event
129
      moptions['fnum'] = margs.fnum
130
      ErrorMessage = ErrorMessage + non_negative(moptions['fnum'], 'fnum')
131
      # the number of hidden nodes
132
      moptions['hidden'] = margs.hidden
133
      ErrorMessage = ErrorMessage + non_negative(moptions['hidden'], 'hidden')
134
      # the well-trained model
135
      moptions['modfile'] = margs.modfile
136
      if moptions['modfile']==None:
137
         print("No mod file is provided. The default one is used")
138
         moptions['modfile'] = ('train_deepmod/rnn_P90wd%d_f53/mod_train_P90wd%d_f53' % (moptions['windowsize'], moptions['windowsize']))
139
         if (not os.path.isfile(moptions['modfile']+'.meta')):
140
            moptions['modfile'] = ('{}/lib/python{}.{}/site-packages/DeepMod/train_deepmod/rnn_P90wd{}_f53/mod_train_P90wd{}_f53'.format(sys.prefix,sys.version_info.major,sys.version_info.minor, moptions['windowsize'], moptions['windowsize']))
141
      if (not os.path.isfile(moptions['modfile']+'.meta')):
142
         ErrorMessage = ErrorMessage + ("\n\tThe meta file (%s) does not exist" % (moptions['modfile']+'.meta' if not moptions['modfile']==None else ""))
143
   else:
144
      # already done the prediction process?
145
      # Yes: summarize the results only
146
      moptions['predpath'] = margs.predpath
147
      if moptions['predpath']==None or (not os.path.isdir(moptions['predpath'])):
148
         ErrorMessage = ErrorMessage + ("\n\tThe predpath does not exist")
149
150
   # specify region of interest
151
   # not consider bases outside regions in a reference genome
152
   # None: all bases of interest
153
   moptions['region'] = [ ]
154
   if margs.region == None or len(margs.region)==0:
155
      moptions['region'].append([None, None, None])
156
   else:
157
      mregionlist = margs.region.split(';')
158
      for mr in mregionlist:
159
         mr_sp = mr.split(':')
160
         moptions['region'].append([mr_sp[0], int(mr_sp[1]) if len(mr_sp)>1 else None, int(mr_sp[2]) if len(mr_sp)>2 else None ])
161
162
   print("\nNanopore sequencing data analysis is resourece-intensive and time consuming. ")
163
   print("Some potential strong recommendations are below:")
164
   print("\tIf your reference genome is large as human genome and your Nanopore data is huge,")
165
   print("\tIt would be faster to run this program parallelly to speed up.")
166
   print("\tYou might run different input folders of your fast5 files and ")
167
   print("\tgive different output names (--FileID) or folders (--outFolder)")
168
   print("\tA good way for this is to run different chromosome individually.\n")
169
170
   # print help information if any necessary options are not provided.
171
   printParameters(moptions)
172
   if not ErrorMessage=="":
173
      ErrorMessage = "Please provide correct parameters" + ErrorMessage
174
      print(ErrorMessage)
175
      parser.print_help();
176
      parser.parse_args(['detect', '--help']);
177
      sys.exit(1)
178
179
   from DeepMod_scripts import myDetect
180
   myDetect.mDetect_manager(moptions)
181
182
#
183
# Train a model
184
# Need to get features first.
185
#
186
def mTrain(margs):
187
   from DeepMod_scripts import myMultiBiRNN
188
189
   # gent common options
190
   moptions, ErrorMessage = mCommonParam(margs)
191
192
   # network setting: the number of features and the number of hidden nodes
193
   moptions['fnum'] = margs.fnum
194
   ErrorMessage = ErrorMessage + non_negative(moptions['fnum'], 'fnum')
195
   moptions['hidden'] = margs.hidden
196
   ErrorMessage = ErrorMessage + non_negative(moptions['hidden'], 'hidden')
197
198
   # the output function of the deep learning model
199
   moptions['outputlayer'] = margs.outputlayer
200
   # whether using different class weights
201
   moptions['unbalanced'] = margs.unbalanced
202
203
   # re-load trained model and continue to train
204
   moptions['modfile'] = margs.modfile
205
   if moptions['modfile']==None: pass;
206
   elif (not os.path.isfile(moptions['modfile']+'.meta')):
207
      ErrorMessage = ErrorMessage + ("\n\tThe meta file (%s) does not exist" % (moptions['modfile']+'.meta' if not moptions['modfile']==None else ""))
208
209
   # read-based or region based independent training
210
   # E: region-based
211
   # P: read-based.
212
   if not margs.test==None:
213
      moptions['test'] = margs.test.split(',')
214
      if moptions['test'][0] == 'E': moptions['test'][0] = '-'
215
      elif moptions['test'][0] == 'P': moptions['test'][0] = '0'
216
      else:
217
         ErrorMessage = ErrorMessage + "Unknown option for test: the first character must be E or P "+margs.test
218
      if moptions['test'][0] in ['-']:
219
         moptions['test'][1] = int(moptions['test'][1]) * (10**6)
220
         moptions['test'][2] = int(moptions['test'][2]) * (10**6)
221
      else: moptions['test'][1] = int(moptions['test'][1])/100.0
222
   else: moptions['test'] = ['N', '100']
223
224
   # print help document if necessary options are not provided.
225
   print("Train")
226
   printParameters(moptions)
227
   if not ErrorMessage=="":
228
      ErrorMessage = "Please provide correct parameters" + ErrorMessage
229
      print(ErrorMessage)
230
      parser.print_help();
231
      parser.parse_args(['train', '--help']);
232
      sys.exit(2)
233
234
   myMultiBiRNN.mMult_RNN_LSTM_train(moptions)
235
236
#
237
# get features for training
238
#
239
#
240
def mGetFeatures(margs):
241
   from DeepMod_scripts import myGetFeatureBasedPos
242
243
   # get common options
244
   moptions, ErrorMessage = mCommonParam(margs)
245
   # motif-based data: positive or negative control data
246
   moptions['posneg'] = margs.posneg
247
   # the number of features: 7-description or 57-description
248
   moptions['fnum'] = margs.fnum
249
   ErrorMessage = ErrorMessage + non_negative(moptions['fnum'], 'fnum')
250
   # size of each bacth to store features
251
   moptions['size_per_batch'] = margs.size_per_batch
252
   if moptions['size_per_batch'] < 0.001: moptions['size_per_batch'] = 0.001
253
254
   # path to basecall inform in fast5 files
255
   moptions['basecall_1d'] = margs.basecall_1d
256
   moptions['basecall_2strand'] = margs.basecall_2strand
257
258
   # regions of interest
259
   moptions['region'] = [None, None, None]
260
   if not (margs.region==None or margs.region.strip()==''):
261
      rsp = margs.region.split(':')
262
      for rv_ind in range(len(rsp)):
263
         rsp[rv_ind] = rsp[rv_ind].strip();
264
         if not rsp[rv_ind]=='':
265
            moptions['region'][rv_ind] = rsp[rv_ind]
266
267
   # referene genome
268
   moptions['Ref'] = margs.Ref
269
   if moptions['Ref']==None or (not os.path.isfile(moptions['Ref'])):
270
      ErrorMessage = ErrorMessage + ("\n\t reference file does not exist (%s)" % moptions['Ref'])
271
272
   # get motif-based modification
273
   # or specify by --fulmod/--anymod/--nomod
274
   moptions['motifORPos'] = margs.motifORPos
275
   if margs.motifORPos==1:
276
      moptions['motif'] = [margs.motif.upper(), margs.ModinMotif]
277
   elif margs.motifORPos==2:
278
      moptions['fulmod'] = margs.fulmod
279
      if moptions['fulmod']==None: # completely modificated positions
280
         ErrorMessage = ErrorMessage + ("\t There is no parameter for --fulmod.")
281
      moptions['anymod'] = margs.anymod
282
      if moptions['anymod'] == None: # patially modificated positions
283
         ErrorMessage = ErrorMessage + ("\t There is no parameter for --anymod.")
284
      moptions['nomod'] = margs.nomod
285
      if moptions['nomod'] == None: # completely unmodified posisionts
286
         ErrorMessage = ErrorMessage + ("\t There is no parameter for --nomod.")
287
   else:
288
      ErrorMessage = ErrorMessage + ("\tmotifORPos value (%d) is not supported." % margs.motifORPos)
289
290
   # print help document if any required options are not provided.
291
   printParameters(moptions)
292
   if not ErrorMessage=="":
293
      ErrorMessage = "Please provide correct parameters" + ErrorMessage
294
      print(ErrorMessage)
295
      parser.print_help();
296
      parser.parse_args(['getfeatures', '--help']);
297
      sys.exit(1)
298
299
   myGetFeatureBasedPos.getFeature_manager(moptions)
300
301
302
#####################################################################################
303
304
subparsers = parser.add_subparsers()
305
parent_parser = argparse.ArgumentParser(add_help=False)
306
307
# add common options
308
com_group_for_comparison = parent_parser.add_argument_group('Common options.')
309
com_group_for_comparison.add_argument("--outLevel", type=int, choices=[OUTPUT_DEBUG, OUTPUT_INFO, OUTPUT_WARNING, OUTPUT_ERROR], default=OUTPUT_WARNING, help=("The level for output: %d for DEBUG, %d for INFO, %d for WARNING, %d for ERROR. Default: %d" % (OUTPUT_DEBUG, OUTPUT_INFO, OUTPUT_WARNING, OUTPUT_ERROR, OUTPUT_WARNING)))
310
com_group_for_comparison.add_argument("--wrkBase", help="The base folder for FAST5 files.")
311
com_group_for_comparison.add_argument("--FileID", default="mod", help="The unique string for intermediate files and final output files. Default: 'mod'")
312
com_group_for_comparison.add_argument("--outFolder", default='./mod_output', help="The default folder for outputing the results. Default: ./mod_output")
313
com_group_for_comparison.add_argument("--recursive", type=int, default=1, choices=[0,1], help="Recurise to find fast5 files. Default:1")
314
com_group_for_comparison.add_argument("--threads", type=int, default=4, help="The number of threads used (not for train). Default:4")
315
com_group_for_comparison.add_argument("--files_per_thread", type=int, default=1000, help="The number of fast5 files for each thread (not for train). Default:500")
316
com_group_for_comparison.add_argument("--windowsize", type=int, default=21, help="The window size to extract features. Default: 21")
317
com_group_for_comparison.add_argument("--alignStr", type=str, default='minimap2', choices=["bwa","minimap2"], help="Alignment tools (bwa or minimap2 is supported). Default: minimap2")
318
com_group_for_comparison.add_argument("--SignalGroup", type=str, default='simple', choices=["simple","rundif"], help="How to associate signals to each called bases. Default: simple")
319
com_group_for_comparison.add_argument("--move", default=False, action="store_true", help="Whether the basecalled data use move tables instead of event tables. Default: False")
320
321
# add detection options
322
parser_detect = subparsers.add_parser('detect', parents=[parent_parser], help="Detect modifications at a genomic scale", description="Detect modifications by integrating all long reads for a genome", epilog="For example, \n \
323
python %(prog)s --wrkBase ctrl_oligo_SpeI_cut --FileID mod_det --outFolder ./mod_output/detect3 \n \
324
", formatter_class=RawTextHelpFormatter)
325
parser_detect.add_argument("--Ref", help="The reference sequence")
326
parser_detect.add_argument("--predDet", type=int, default=1, choices=[0,1], help="pred first and then detect (1) or only detect (0). Default: 1")
327
parser_detect.add_argument("--predpath", default=None, help="The file path of predictions for each fast5 file. The file pattern is *_*.detail. Default: './mod_output/pred2/'")
328
parser_detect.add_argument("--modfile", type=str, default=None, help="The path to load training model. Default: 'mod_output/'")
329
parser_detect.add_argument("--fnum", type=int, default=7, help="The number of features. Default: 7")
330
parser_detect.add_argument("--hidden", type=int, default=100, help="The number of hidden node. Default: 100")
331
parser_detect.add_argument("--basecall_1d", default="Basecall_1D_000", help="Path for basecall_1d. Default: Basecall_1D_000")
332
parser_detect.add_argument("--basecall_2strand", default="BaseCalled_template", help="Path for basecall_2strand. Default: BaseCalled_template")
333
parser_detect.add_argument("--region", default=None, help="The region of interest: for example, chr:1:100000;chr2:10000");
334
parser_detect.add_argument("--ConUnk", default=True, choices=[False, True], help="Whether contain unknown chromosome");
335
parser_detect.add_argument("--outputlayer", default="", choices=["", "sigmoid"], help="how to put activation function for output layer")
336
parser_detect.add_argument("--Base", type=str, default='C', choices=['A', 'C', 'G', 'T'], help="Interest of bases");
337
parser_detect.add_argument("--mod_cluster", default=0, choices=[0,1], help="1: CpG cluster effect; 0: not");
338
parser_detect.set_defaults(func=mDetect)
339
340
# add training options
341
parser_training = subparsers.add_parser('train', parents=[parent_parser], help="Training a modification classifier", description="Training a modification classifier", epilog="For example, \n \
342
python %(prog)s --wrkBase umr --wrkBase2 sss --FileID mod_train --outFolder ./mod_output/train1 \n \
343
", formatter_class=RawTextHelpFormatter)
344
parser_training.add_argument("--wrkBase2", help="The base folder for long reads without any modifications.")
345
parser_training.add_argument("--fnum", type=int, default=7, help="The number of features. Default: 7")
346
parser_training.add_argument("--hidden", type=int, default=100, help="The number of hidden node. Default: 100")
347
parser_training.add_argument("--modfile", type=str, default=None, help="The path to load training model. Default: 'mod_output/'")
348
parser_training.add_argument("--test", help="The number of E Coli genomic position for testing. Default: 'E,1,2'")
349
parser_training.add_argument("--outputlayer", default="", choices=["", "sigmoid"], help="how to put activation function for output layer")
350
parser_training.add_argument("--unbalanced", type=int, default=0, choices=[1, 0, None], help="Whether data is unbalanced");
351
parser_training.set_defaults(func=mTrain)
352
353
# add get-feature options
354
parser_getfeatures = subparsers.add_parser('getfeatures', parents=[parent_parser], help="Get features for all fast5 files", description="Get features for all fast5 files", epilog="For example, \n \
355
python %(prog)s --wrkBase umr/160617_ecolilowinput_UMR9/called/pass --threads 48 --recursive 0 --posneg 0 --outFolder umr  \n \
356
python %(prog)s --wrkBase sss/160617_ecolilowinput_sssiR9/called/pass --threads 48 --recursive 0 --posneg 1 --outFolder sss \n \
357
", formatter_class=RawTextHelpFormatter)
358
parser_getfeatures.add_argument("--posneg", type=int, default=0, choices=[0,1], help="The positive(1) or negative(0) class. Default: 0")
359
parser_getfeatures.add_argument("--size_per_batch", type=int, default=1, help="The size (unit: 10^7=10M) of a feature file. Default: 1")
360
parser_getfeatures.add_argument("--fnum", type=int, default=7, help="The number of features. Default: 7")
361
parser_getfeatures.add_argument("--region", type=str, help="The region of interest. Set to None or empty for all. Format is chr:start_pos:end_pos")
362
parser_getfeatures.add_argument("--basecall_1d", default="Basecall_1D_000", help="Path for basecall_1d. Default: Basecall_1D_000")
363
parser_getfeatures.add_argument("--basecall_2strand", default="BaseCalled_template", help="Path for basecall_2strand. Default: BaseCalled_template")
364
365
parser_getfeatures.add_argument("--motifORPos", type=int, default=1, help="Use Motif (1) or pos (2) for modified bases. Default: 1")
366
367
parser_getfeatures.add_argument("--motif", default='CG', type=str, help="The motif of interest")
368
parser_getfeatures.add_argument("--ModinMotif", default=0, type=int, help="The position of modified base in the motif of interest")
369
parser_getfeatures.add_argument("--Ref", help="The reference sequence")
370
371
parser_getfeatures.add_argument("--fulmod", type=str, help="The file pattern for full modification: bisultfiteseq/chr20_C*_0.95.txt")
372
parser_getfeatures.add_argument("--anymod", type=str, help="The file pattern for any modification: bisultfiteseq/chr20_any_0.95.txt")
373
parser_getfeatures.add_argument("--nomod", type=str, help="The file pattern for any modification: bisultfiteseq/chr20_no1_0.95.txt")
374
375
parser_getfeatures.set_defaults(func=mGetFeatures)
376
377
# no provided argument
378
# print help document
379
if len(sys.argv)<2:
380
   parser.print_help();
381
else:
382
   args = parser.parse_args()
383
   args.func(args);