--- a +++ b/man/predict_model.Rd @@ -0,0 +1,139 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/predict.R +\name{predict_model} +\alias{predict_model} +\title{Make prediction for nucleotide sequence or entries in fasta/fastq file} +\usage{ +predict_model( + model, + output_format = "one_seq", + layer_name = NULL, + sequence = NULL, + path_input = NULL, + round_digits = NULL, + filename = "states.h5", + step = 1, + vocabulary = c("a", "c", "g", "t"), + batch_size = 256, + verbose = TRUE, + return_states = FALSE, + output_type = "h5", + padding = "none", + use_quality = FALSE, + quality_string = NULL, + mode = "label", + lm_format = "target_right", + output_dir = NULL, + format = "fasta", + include_seq = FALSE, + reverse_complement_encoding = FALSE, + ambiguous_nuc = "zero", + ... +) +} +\arguments{ +\item{model}{A keras model.} + +\item{output_format}{Either \code{"one_seq"}, \code{"by_entry"}, \code{"by_entry_one_file"}, \code{"one_pred_per_entry"}.} + +\item{layer_name}{Name of layer to get output from. If \code{NULL}, will use the last layer.} + +\item{sequence}{Character string, ignores path_input if argument given.} + +\item{path_input}{Path to fasta file.} + +\item{round_digits}{Number of decimal places.} + +\item{filename}{Filename to store states in. No file output if argument is \code{NULL}. +If \code{output_format = "by_entry"}, adds \verb{_nr_} + i after name, where i is entry number.} + +\item{step}{Frequency of sampling steps.} + +\item{vocabulary}{Vector of allowed characters. Characters outside vocabulary get encoded as specified in \code{ambiguous_nuc}.} + +\item{batch_size}{Number of samples used for one network update.} + +\item{verbose}{Boolean.} + +\item{return_states}{Return predictions as data frame. Only supported for output_format \code{"one_seq"}.} + +\item{output_type}{\code{"h5"} or \code{"csv"}. 
If \code{output_format} is \code{"by_entry_one_file"} or \code{"one_pred_per_entry"}, can only be \code{"h5"}.} + +\item{padding}{Either \code{"none"}, \code{"maxlen"}, \code{"standard"} or \code{"self"}. +\itemize{ +\item If \code{"none"}, apply no padding and skip sequences that are too short. +\item If \code{"maxlen"}, pad with maxlen number of zero vectors. +\item If \code{"standard"}, pad with zero vectors only if sequence is shorter than maxlen. Pads to minimum size required for one prediction. +\item If \code{"self"}, concatenate sequence with itself until sequence is long enough for one prediction. +Example: if sequence is "ACGT" and maxlen is 10, make prediction for "ACGTACGTAC". +Only applied if sequence is shorter than maxlen. +}} + +\item{use_quality}{Whether to use quality scores.} + +\item{quality_string}{String for encoding with quality scores (as used in fastq format).} + +\item{mode}{Either \code{"lm"} for language model or \code{"label"} for label classification.} + +\item{lm_format}{Either \code{"target_right"}, \code{"target_middle_lstm"}, \code{"target_middle_cnn"} or \code{"wavenet"}.} + +\item{output_dir}{Directory for file output.} + +\item{format}{File format, \code{"fasta"}, \code{"fastq"}, \code{"rds"} or \code{"fasta.tar.gz"}, \code{"fastq.tar.gz"} for \code{tar.gz} files.} + +\item{include_seq}{Whether to include input sequence in h5 file.} + +\item{reverse_complement_encoding}{Whether to use both original sequence and reverse complement as two input sequences.} + +\item{ambiguous_nuc}{How to handle nucleotides outside vocabulary, either \code{"zero"}, \code{"discard"}, \code{"empirical"} or \code{"equal"}. +\itemize{ +\item If \code{"zero"}, input gets encoded as zero vector. +\item If \code{"equal"}, input is repetition of \code{1/length(vocabulary)}. +\item If \code{"discard"}, samples containing nucleotides outside vocabulary get discarded. +\item If \code{"empirical"}, use nucleotide distribution of current file. 
+}} + +\item{...}{Further arguments for sequence encoding with \code{\link{seq_encoding_label}}.} +} +\value{ +If \code{return_states = TRUE}, returns a list of model predictions and position of corresponding sequences. +If additionally \code{include_seq = TRUE}, list contains sequence strings. +If \code{return_states = FALSE}, returns nothing, just writes output to file(s). +} +\description{ +Removes layers (optional) from pretrained model and calculates states of fasta/fastq file or nucleotide sequence. +Writes states to h5 or csv file (access content of h5 output with \code{\link{load_prediction}} function). +There are several options on how to process an input file, selected with the \code{output_format} argument: +\itemize{ +\item If \code{"one_seq"}, computes prediction for sequence argument or fasta/fastq file. +Combines fasta entries in file to one sequence. This means predictor sequences can contain elements from more than one fasta entry. +\item If \code{"by_entry"}, will output a separate file for each fasta/fastq entry. +Names of output files are: \code{output_dir} + "Nr" + i + \code{filename} + \code{output_type}, where i is the number of the fasta entry. +\item If \code{"by_entry_one_file"}, will store prediction for all fasta entries in one h5 file. +\item If \code{"one_pred_per_entry"}, will make one prediction for each entry by either picking a random sample for long sequences +or padding the sequence for short sequences. 
+} +} +\examples{ +\dontshow{if (reticulate::py_module_available("tensorflow")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +# make prediction for single sequence and write to h5 file +model <- create_model_lstm_cnn(maxlen = 20, layer_lstm = 8, layer_dense = 2, verbose = FALSE) +vocabulary <- c("a", "c", "g", "t") +sequence <- paste(sample(vocabulary, 200, replace = TRUE), collapse = "") +output_file <- tempfile(fileext = ".h5") +predict_model(output_format = "one_seq", model = model, step = 10, + sequence = sequence, filename = output_file, mode = "label") + +# make prediction for fasta file with multiple entries, write output to separate h5 files +fasta_path <- tempfile(fileext = ".fasta") +create_dummy_data(file_path = fasta_path, num_files = 1, + num_seq = 5, seq_length = 100, + write_to_file_path = TRUE) +model <- create_model_lstm_cnn(maxlen = 20, layer_lstm = 8, layer_dense = 2, verbose = FALSE) +output_dir <- tempfile() +dir.create(output_dir) +predict_model(output_format = "by_entry", model = model, step = 10, verbose = FALSE, + output_dir = output_dir, mode = "label", path_input = fasta_path) +list.files(output_dir) +\dontshow{\}) # examplesIf} +}