Diff of /bin/DeepMod.py [000000] .. [c66173]

Switch to side-by-side view

--- a
+++ b/bin/DeepMod.py
@@ -0,0 +1,383 @@
+#!/usr/bin/env python
+
+import os;
+import sys;
+
+import string;
+
+from collections import defaultdict
+
+import argparse;
+from argparse import RawTextHelpFormatter
+
+from DeepMod_scripts.myCom import *
+
+
+
+# three modules in DeepMod
+parser = argparse.ArgumentParser(description="Detect nucleotide modification from nanopore signals data.", epilog="For example, \n \
+\tpython %(prog)s train: Training a modification classifier.\n \
+\tpython %(prog)s detect: Detect modification by integrating all long reads. \n \
+\tpython %(prog)s getfeatures: Get features for training a model.  \n \
+", formatter_class=RawTextHelpFormatter);
+
+
+#
+# Return error message when a value<1
+# Return an empty string otherwise
+#
+def non_negative(i, mstr):
+   if i<1: return (("\n\tError %d could not be negative(%d)" % (mstr, i)))
+   else: return ''
+
+#
+# Print all parameters in stdout
+#
+def printParameters(moptions):
+   mpkeys = moptions.keys(); #mpkeys.sort()
+   sorted(mpkeys)
+   print('%30s: %s' % ('Current directory', os.getcwd()))
+   for mpk in mpkeys:
+      print ('%30s: %s' % (mpk, str(moptions[mpk])))
+   sys.stdout.flush()
+
+#
+# Got common argument provided by users or default values.
+#
+#
+def mCommonParam(margs):
+
+   ErrorMessage = ""
+   moptions = defaultdict()
+   # how to output running message: need more control now.
+   moptions['outLevel'] = margs.outLevel
+   # the input working base
+   moptions["wrkBase"] = margs.wrkBase
+   if moptions["wrkBase"]==None:
+      ErrorMessage = ErrorMessage + ("\n\tThe input folder is None.")
+
+   # An unique ID for output
+   # Usefull for run the program in parallel
+   moptions["FileID"] = margs.FileID
+   # output folder;
+   # make it if the output folder does not exist
+   moptions['outFolder'] = margs.outFolder
+   moptions['outFolder'] = format_last_letter_of_folder(moptions['outFolder'])
+   if moptions['outFolder']==None or (not os.path.isdir(moptions['outFolder'])):
+      try:
+         os.system('mkdir -p '+moptions['outFolder']);
+      except:
+         ErrorMessage = ErrorMessage + ("\n\tThe output folder (%s) does not exist and cannot be created." % moptions['outFolder'])
+
+   # check all data in a recurive way
+   moptions['recursive'] = margs.recursive
+   # the number of threads used and the number of files handled by each thread.
+   moptions['files_per_thread'] = margs.files_per_thread
+   if moptions['files_per_thread']<2: moptions['files_per_thread'] = 2
+   # the number of threads used
+   moptions['threads'] = margs.threads
+   if moptions['threads']<1: moptions['threads'] = 1
+
+   # windowsize: default=21
+   moptions['windowsize'] = margs.windowsize
+   ErrorMessage = ErrorMessage + non_negative(moptions['windowsize'], 'windowsize')
+   if moptions['windowsize']<1: moptions['windowsize'] = 1
+
+   # aligners: bwa-mem or minimap2
+   moptions['alignStr'] = margs.alignStr;
+
+   moptions['SignalGroup'] = margs.SignalGroup;
+
+   moptions['move'] = margs.move
+
+   return [moptions, ErrorMessage]
+
+#
+# detect modification for bases of interests
+# input is a list of fast5 files, a reference genome and a well-trained model.
+#
+def mDetect(margs):
+   # get common parameters
+   moptions, ErrorMessage = mCommonParam(margs)
+
+   # path for basecall information in fast5 files
+   moptions['basecall_1d'] = margs.basecall_1d
+   moptions['basecall_2strand'] = margs.basecall_2strand
+   # Whether consider those chromosome which contain -_:/
+   # default: yes;
+   moptions['ConUnk'] = margs.ConUnk
+   # output layer information for deep learning
+   moptions['outputlayer'] = margs.outputlayer
+   # base of interest
+   moptions['Base'] = margs.Base
+   # whether take cluster effect of methylation into consideration
+   moptions['mod_cluster'] = margs.mod_cluster
+   # base of interest
+   if moptions['Base'] in ["", None]:
+      ErrorMessage = ErrorMessage + ("\n\t Please provide a base of interest.")
+
+   # predict medification for bases of interest in long reads first
+   # only summarize them for each genomic position of interest .
+   moptions['predDet'] = margs.predDet
+   if moptions['predDet']:
+      # path to reference genome
+      moptions['Ref'] = margs.Ref
+      if moptions['Ref']==None or (not os.path.isfile(moptions['Ref'])):
+         ErrorMessage = ErrorMessage + ("\n\t reference file does not exist (%s)" % moptions['Ref'])
+
+      # the number of feature for each event
+      moptions['fnum'] = margs.fnum
+      ErrorMessage = ErrorMessage + non_negative(moptions['fnum'], 'fnum')
+      # the number of hidden nodes
+      moptions['hidden'] = margs.hidden
+      ErrorMessage = ErrorMessage + non_negative(moptions['hidden'], 'hidden')
+      # the well-trained model
+      moptions['modfile'] = margs.modfile
+      if moptions['modfile']==None:
+         print("No mod file is provided. The default one is used")
+         moptions['modfile'] = ('train_deepmod/rnn_P90wd%d_f53/mod_train_P90wd%d_f53' % (moptions['windowsize'], moptions['windowsize']))
+         if (not os.path.isfile(moptions['modfile']+'.meta')):
+            moptions['modfile'] = ('{}/lib/python{}.{}/site-packages/DeepMod/train_deepmod/rnn_P90wd{}_f53/mod_train_P90wd{}_f53'.format(sys.prefix,sys.version_info.major,sys.version_info.minor, moptions['windowsize'], moptions['windowsize']))
+      if (not os.path.isfile(moptions['modfile']+'.meta')):
+         ErrorMessage = ErrorMessage + ("\n\tThe meta file (%s) does not exist" % (moptions['modfile']+'.meta' if not moptions['modfile']==None else ""))
+   else:
+      # already done the prediction process?
+      # Yes: summarize the results only
+      moptions['predpath'] = margs.predpath
+      if moptions['predpath']==None or (not os.path.isdir(moptions['predpath'])):
+         ErrorMessage = ErrorMessage + ("\n\tThe predpath does not exist")
+
+   # specify region of interest
+   # not consider bases outside regions in a reference genome
+   # None: all bases of interest
+   moptions['region'] = [ ]
+   if margs.region == None or len(margs.region)==0:
+      moptions['region'].append([None, None, None])
+   else:
+      mregionlist = margs.region.split(';')
+      for mr in mregionlist:
+         mr_sp = mr.split(':')
+         moptions['region'].append([mr_sp[0], int(mr_sp[1]) if len(mr_sp)>1 else None, int(mr_sp[2]) if len(mr_sp)>2 else None ])
+
+   print("\nNanopore sequencing data analysis is resourece-intensive and time consuming. ")
+   print("Some potential strong recommendations are below:")
+   print("\tIf your reference genome is large as human genome and your Nanopore data is huge,")
+   print("\tIt would be faster to run this program parallelly to speed up.")
+   print("\tYou might run different input folders of your fast5 files and ")
+   print("\tgive different output names (--FileID) or folders (--outFolder)")
+   print("\tA good way for this is to run different chromosome individually.\n")
+
+   # print help information if any necessary options are not provided.
+   printParameters(moptions)
+   if not ErrorMessage=="":
+      ErrorMessage = "Please provide correct parameters" + ErrorMessage
+      print(ErrorMessage)
+      parser.print_help();
+      parser.parse_args(['detect', '--help']);
+      sys.exit(1)
+
+   from DeepMod_scripts import myDetect
+   myDetect.mDetect_manager(moptions)
+
+#
+# Train a model
+# Need to get features first.
+#
+def mTrain(margs):
+   from DeepMod_scripts import myMultiBiRNN
+
+   # gent common options
+   moptions, ErrorMessage = mCommonParam(margs)
+
+   # network setting: the number of features and the number of hidden nodes
+   moptions['fnum'] = margs.fnum
+   ErrorMessage = ErrorMessage + non_negative(moptions['fnum'], 'fnum')
+   moptions['hidden'] = margs.hidden
+   ErrorMessage = ErrorMessage + non_negative(moptions['hidden'], 'hidden')
+
+   # the output function of the deep learning model
+   moptions['outputlayer'] = margs.outputlayer
+   # whether using different class weights
+   moptions['unbalanced'] = margs.unbalanced
+
+   # re-load trained model and continue to train
+   moptions['modfile'] = margs.modfile
+   if moptions['modfile']==None: pass;
+   elif (not os.path.isfile(moptions['modfile']+'.meta')):
+      ErrorMessage = ErrorMessage + ("\n\tThe meta file (%s) does not exist" % (moptions['modfile']+'.meta' if not moptions['modfile']==None else ""))
+
+   # read-based or region based independent training
+   # E: region-based
+   # P: read-based.
+   if not margs.test==None:
+      moptions['test'] = margs.test.split(',')
+      if moptions['test'][0] == 'E': moptions['test'][0] = '-'
+      elif moptions['test'][0] == 'P': moptions['test'][0] = '0'
+      else:
+         ErrorMessage = ErrorMessage + "Unknown option for test: the first character must be E or P "+margs.test
+      if moptions['test'][0] in ['-']:
+         moptions['test'][1] = int(moptions['test'][1]) * (10**6)
+         moptions['test'][2] = int(moptions['test'][2]) * (10**6)
+      else: moptions['test'][1] = int(moptions['test'][1])/100.0
+   else: moptions['test'] = ['N', '100']
+
+   # print help document if necessary options are not provided.
+   print("Train")
+   printParameters(moptions)
+   if not ErrorMessage=="":
+      ErrorMessage = "Please provide correct parameters" + ErrorMessage
+      print(ErrorMessage)
+      parser.print_help();
+      parser.parse_args(['train', '--help']);
+      sys.exit(2)
+
+   myMultiBiRNN.mMult_RNN_LSTM_train(moptions)
+
+#
+# get features for training
+#
+#
+def mGetFeatures(margs):
+   from DeepMod_scripts import myGetFeatureBasedPos
+
+   # get common options
+   moptions, ErrorMessage = mCommonParam(margs)
+   # motif-based data: positive or negative control data
+   moptions['posneg'] = margs.posneg
+   # the number of features: 7-description or 57-description
+   moptions['fnum'] = margs.fnum
+   ErrorMessage = ErrorMessage + non_negative(moptions['fnum'], 'fnum')
+   # size of each bacth to store features
+   moptions['size_per_batch'] = margs.size_per_batch
+   if moptions['size_per_batch'] < 0.001: moptions['size_per_batch'] = 0.001
+
+   # path to basecall inform in fast5 files
+   moptions['basecall_1d'] = margs.basecall_1d
+   moptions['basecall_2strand'] = margs.basecall_2strand
+
+   # regions of interest
+   moptions['region'] = [None, None, None]
+   if not (margs.region==None or margs.region.strip()==''):
+      rsp = margs.region.split(':')
+      for rv_ind in range(len(rsp)):
+         rsp[rv_ind] = rsp[rv_ind].strip();
+         if not rsp[rv_ind]=='':
+            moptions['region'][rv_ind] = rsp[rv_ind]
+
+   # referene genome
+   moptions['Ref'] = margs.Ref
+   if moptions['Ref']==None or (not os.path.isfile(moptions['Ref'])):
+      ErrorMessage = ErrorMessage + ("\n\t reference file does not exist (%s)" % moptions['Ref'])
+
+   # get motif-based modification
+   # or specify by --fulmod/--anymod/--nomod
+   moptions['motifORPos'] = margs.motifORPos
+   if margs.motifORPos==1:
+      moptions['motif'] = [margs.motif.upper(), margs.ModinMotif]
+   elif margs.motifORPos==2:
+      moptions['fulmod'] = margs.fulmod
+      if moptions['fulmod']==None: # completely modificated positions
+         ErrorMessage = ErrorMessage + ("\t There is no parameter for --fulmod.")
+      moptions['anymod'] = margs.anymod
+      if moptions['anymod'] == None: # patially modificated positions
+         ErrorMessage = ErrorMessage + ("\t There is no parameter for --anymod.")
+      moptions['nomod'] = margs.nomod
+      if moptions['nomod'] == None: # completely unmodified posisionts
+         ErrorMessage = ErrorMessage + ("\t There is no parameter for --nomod.")
+   else:
+      ErrorMessage = ErrorMessage + ("\tmotifORPos value (%d) is not supported." % margs.motifORPos)
+
+   # print help document if any required options are not provided.
+   printParameters(moptions)
+   if not ErrorMessage=="":
+      ErrorMessage = "Please provide correct parameters" + ErrorMessage
+      print(ErrorMessage)
+      parser.print_help();
+      parser.parse_args(['getfeatures', '--help']);
+      sys.exit(1)
+
+   myGetFeatureBasedPos.getFeature_manager(moptions)
+
+
+#####################################################################################
+
+subparsers = parser.add_subparsers()
+parent_parser = argparse.ArgumentParser(add_help=False)
+
+# add common options
+com_group_for_comparison = parent_parser.add_argument_group('Common options.')
+com_group_for_comparison.add_argument("--outLevel", type=int, choices=[OUTPUT_DEBUG, OUTPUT_INFO, OUTPUT_WARNING, OUTPUT_ERROR], default=OUTPUT_WARNING, help=("The level for output: %d for DEBUG, %d for INFO, %d for WARNING, %d for ERROR. Default: %d" % (OUTPUT_DEBUG, OUTPUT_INFO, OUTPUT_WARNING, OUTPUT_ERROR, OUTPUT_WARNING)))
+com_group_for_comparison.add_argument("--wrkBase", help="The base folder for FAST5 files.")
+com_group_for_comparison.add_argument("--FileID", default="mod", help="The unique string for intermediate files and final output files. Default: 'mod'")
+com_group_for_comparison.add_argument("--outFolder", default='./mod_output', help="The default folder for outputing the results. Default: ./mod_output")
+com_group_for_comparison.add_argument("--recursive", type=int, default=1, choices=[0,1], help="Recurise to find fast5 files. Default:1")
+com_group_for_comparison.add_argument("--threads", type=int, default=4, help="The number of threads used (not for train). Default:4")
+com_group_for_comparison.add_argument("--files_per_thread", type=int, default=1000, help="The number of fast5 files for each thread (not for train). Default:500")
+com_group_for_comparison.add_argument("--windowsize", type=int, default=21, help="The window size to extract features. Default: 21")
+com_group_for_comparison.add_argument("--alignStr", type=str, default='minimap2', choices=["bwa","minimap2"], help="Alignment tools (bwa or minimap2 is supported). Default: minimap2")
+com_group_for_comparison.add_argument("--SignalGroup", type=str, default='simple', choices=["simple","rundif"], help="How to associate signals to each called bases. Default: simple")
+com_group_for_comparison.add_argument("--move", default=False, action="store_true", help="Whether the basecalled data use move tables instead of event tables. Default: False")
+
+# add detection options
+parser_detect = subparsers.add_parser('detect', parents=[parent_parser], help="Detect modifications at a genomic scale", description="Detect modifications by integrating all long reads for a genome", epilog="For example, \n \
+python %(prog)s --wrkBase ctrl_oligo_SpeI_cut --FileID mod_det --outFolder ./mod_output/detect3 \n \
+", formatter_class=RawTextHelpFormatter)
+parser_detect.add_argument("--Ref", help="The reference sequence")
+parser_detect.add_argument("--predDet", type=int, default=1, choices=[0,1], help="pred first and then detect (1) or only detect (0). Default: 1")
+parser_detect.add_argument("--predpath", default=None, help="The file path of predictions for each fast5 file. The file pattern is *_*.detail. Default: './mod_output/pred2/'")
+parser_detect.add_argument("--modfile", type=str, default=None, help="The path to load training model. Default: 'mod_output/'")
+parser_detect.add_argument("--fnum", type=int, default=7, help="The number of features. Default: 7")
+parser_detect.add_argument("--hidden", type=int, default=100, help="The number of hidden node. Default: 100")
+parser_detect.add_argument("--basecall_1d", default="Basecall_1D_000", help="Path for basecall_1d. Default: Basecall_1D_000")
+parser_detect.add_argument("--basecall_2strand", default="BaseCalled_template", help="Path for basecall_2strand. Default: BaseCalled_template")
+parser_detect.add_argument("--region", default=None, help="The region of interest: for example, chr:1:100000;chr2:10000");
+parser_detect.add_argument("--ConUnk", default=True, choices=[False, True], help="Whether contain unknown chromosome");
+parser_detect.add_argument("--outputlayer", default="", choices=["", "sigmoid"], help="how to put activation function for output layer")
+parser_detect.add_argument("--Base", type=str, default='C', choices=['A', 'C', 'G', 'T'], help="Interest of bases");
+parser_detect.add_argument("--mod_cluster", default=0, choices=[0,1], help="1: CpG cluster effect; 0: not");
+parser_detect.set_defaults(func=mDetect)
+
+# add training options
+parser_training = subparsers.add_parser('train', parents=[parent_parser], help="Training a modification classifier", description="Training a modification classifier", epilog="For example, \n \
+python %(prog)s --wrkBase umr --wrkBase2 sss --FileID mod_train --outFolder ./mod_output/train1 \n \
+", formatter_class=RawTextHelpFormatter)
+parser_training.add_argument("--wrkBase2", help="The base folder for long reads without any modifications.")
+parser_training.add_argument("--fnum", type=int, default=7, help="The number of features. Default: 7")
+parser_training.add_argument("--hidden", type=int, default=100, help="The number of hidden node. Default: 100")
+parser_training.add_argument("--modfile", type=str, default=None, help="The path to load training model. Default: 'mod_output/'")
+parser_training.add_argument("--test", help="The number of E Coli genomic position for testing. Default: 'E,1,2'")
+parser_training.add_argument("--outputlayer", default="", choices=["", "sigmoid"], help="how to put activation function for output layer")
+parser_training.add_argument("--unbalanced", type=int, default=0, choices=[1, 0, None], help="Whether data is unbalanced");
+parser_training.set_defaults(func=mTrain)
+
+# add get-feature options
+parser_getfeatures = subparsers.add_parser('getfeatures', parents=[parent_parser], help="Get features for all fast5 files", description="Get features for all fast5 files", epilog="For example, \n \
+python %(prog)s --wrkBase umr/160617_ecolilowinput_UMR9/called/pass --threads 48 --recursive 0 --posneg 0 --outFolder umr  \n \
+python %(prog)s --wrkBase sss/160617_ecolilowinput_sssiR9/called/pass --threads 48 --recursive 0 --posneg 1 --outFolder sss \n \
+", formatter_class=RawTextHelpFormatter)
+parser_getfeatures.add_argument("--posneg", type=int, default=0, choices=[0,1], help="The positive(1) or negative(0) class. Default: 0")
+parser_getfeatures.add_argument("--size_per_batch", type=int, default=1, help="The size (unit: 10^7=10M) of a feature file. Default: 1")
+parser_getfeatures.add_argument("--fnum", type=int, default=7, help="The number of features. Default: 7")
+parser_getfeatures.add_argument("--region", type=str, help="The region of interest. Set to None or empty for all. Format is chr:start_pos:end_pos")
+parser_getfeatures.add_argument("--basecall_1d", default="Basecall_1D_000", help="Path for basecall_1d. Default: Basecall_1D_000")
+parser_getfeatures.add_argument("--basecall_2strand", default="BaseCalled_template", help="Path for basecall_2strand. Default: BaseCalled_template")
+
+parser_getfeatures.add_argument("--motifORPos", type=int, default=1, help="Use Motif (1) or pos (2) for modified bases. Default: 1")
+
+parser_getfeatures.add_argument("--motif", default='CG', type=str, help="The motif of interest")
+parser_getfeatures.add_argument("--ModinMotif", default=0, type=int, help="The position of modified base in the motif of interest")
+parser_getfeatures.add_argument("--Ref", help="The reference sequence")
+
+parser_getfeatures.add_argument("--fulmod", type=str, help="The file pattern for full modification: bisultfiteseq/chr20_C*_0.95.txt")
+parser_getfeatures.add_argument("--anymod", type=str, help="The file pattern for any modification: bisultfiteseq/chr20_any_0.95.txt")
+parser_getfeatures.add_argument("--nomod", type=str, help="The file pattern for any modification: bisultfiteseq/chr20_no1_0.95.txt")
+
+parser_getfeatures.set_defaults(func=mGetFeatures)
+
+# no provided argument
+# print help document
+if len(sys.argv)<2:
+   parser.print_help();
+else:
+   args = parser.parse_args()
+   args.func(args);