[1c0e03]: / man / predict_model.Rd

Download this file

140 lines (115 with data), 6.4 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/predict.R
\name{predict_model}
\alias{predict_model}
\title{Make prediction for nucleotide sequence or entries in fasta/fastq file}
\usage{
predict_model(
model,
output_format = "one_seq",
layer_name = NULL,
sequence = NULL,
path_input = NULL,
round_digits = NULL,
filename = "states.h5",
step = 1,
vocabulary = c("a", "c", "g", "t"),
batch_size = 256,
verbose = TRUE,
return_states = FALSE,
output_type = "h5",
padding = "none",
use_quality = FALSE,
quality_string = NULL,
mode = "label",
lm_format = "target_right",
output_dir = NULL,
format = "fasta",
include_seq = FALSE,
reverse_complement_encoding = FALSE,
ambiguous_nuc = "zero",
...
)
}
\arguments{
\item{model}{A keras model.}
\item{output_format}{Either \code{"one_seq"}, \code{"by_entry"}, \code{"by_entry_one_file"}, \code{"one_pred_per_entry"}.}
\item{layer_name}{Name of layer to get output from. If \code{NULL}, will use the last layer.}
\item{sequence}{Character string containing a nucleotide sequence. If given, \code{path_input} is ignored.}
\item{path_input}{Path to fasta/fastq file.}
\item{round_digits}{Number of decimal places.}
\item{filename}{Filename to store states in. No file output if argument is \code{NULL}.
If \code{output_format = "by_entry"}, adds \code{"_nr_"} + \code{i} after name, where \code{i} is the entry number.}
\item{step}{Frequency of sampling steps.}
\item{vocabulary}{Vector of allowed characters. Characters outside vocabulary get encoded as specified in \code{ambiguous_nuc}.}
\item{batch_size}{Number of samples processed in one prediction batch.}
\item{verbose}{Boolean.}
\item{return_states}{Return predictions as data frame. Only supported for output_format \code{"one_seq"}.}
\item{output_type}{\code{"h5"} or \code{"csv"}. If \code{output_format} is \code{"by_entry_one_file"} or \code{"one_pred_per_entry"}, can only be \code{"h5"}.}
\item{padding}{Either \code{"none"}, \code{"maxlen"}, \code{"standard"} or \code{"self"}.
\itemize{
\item If \code{"none"}, apply no padding and skip sequences that are too short.
\item If \code{"maxlen"}, pad with maxlen number of zero vectors.
\item If \code{"standard"}, pad with zero vectors only if sequence is shorter than maxlen. Pads to minimum size required for one prediction.
\item If \code{"self"}, concatenate sequence with itself until sequence is long enough for one prediction.
Example: if sequence is "ACGT" and maxlen is 10, make prediction for "ACGTACGTAC".
Only applied if sequence is shorter than maxlen.
}}
\item{use_quality}{Whether to use quality scores.}
\item{quality_string}{String for encoding with quality scores (as used in fastq format).}
\item{mode}{Either \code{"lm"} for language model or \code{"label"} for label classification.}
\item{lm_format}{Either \code{"target_right"}, \code{"target_middle_lstm"}, \code{"target_middle_cnn"} or \code{"wavenet"}.}
\item{output_dir}{Directory for file output.}
\item{format}{File format, \code{"fasta"}, \code{"fastq"}, \code{"rds"} or \code{"fasta.tar.gz"}, \code{"fastq.tar.gz"} for \code{tar.gz} files.}
\item{include_seq}{Whether to include input sequence in h5 file.}
\item{reverse_complement_encoding}{Whether to use both original sequence and reverse complement as two input sequences.}
\item{ambiguous_nuc}{How to handle nucleotides outside vocabulary, either \code{"zero"}, \code{"discard"}, \code{"empirical"} or \code{"equal"}.
\itemize{
\item If \code{"zero"}, input gets encoded as zero vector.
\item If \code{"equal"}, input is repetition of \code{1/length(vocabulary)}.
\item If \code{"discard"}, samples containing nucleotides outside vocabulary get discarded.
\item If \code{"empirical"}, use nucleotide distribution of current file.
}}
\item{...}{Further arguments for sequence encoding with \code{\link{seq_encoding_label}}.}
}
\value{
If \code{return_states = TRUE} returns a list of model predictions and position of corresponding sequences.
If additionally \code{include_seq = TRUE}, list contains sequence strings.
If \code{return_states = FALSE} returns nothing, just writes output to file(s).
}
\description{
Removes layers (optional) from pretrained model and calculates states of fasta/fastq file or nucleotide sequence.
Writes states to h5 or csv file (access content of h5 output with \code{\link{load_prediction}} function).
There are several options on how to process an input file, selected via the \code{output_format} argument:
\itemize{
\item If \code{"one_seq"}, computes prediction for sequence argument or fasta/fastq file.
Combines fasta entries in file to one sequence. This means predictor sequences can contain elements from more than one fasta entry.
\item If \code{"by_entry"}, will output a separate file for each fasta/fastq entry.
Names of output files are: \code{output_dir} + "Nr" + i + \code{filename} + \code{output_type}, where i is the number of the fasta entry.
\item If \code{"by_entry_one_file"}, will store prediction for all fasta entries in one h5 file.
\item If \code{"one_pred_per_entry"}, will make one prediction for each entry by either picking a random sample for long sequences
or padding the sequence for short sequences.
}
}
\examples{
\dontshow{if (reticulate::py_module_available("tensorflow")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
# make prediction for single sequence and write to h5 file
model <- create_model_lstm_cnn(maxlen = 20, layer_lstm = 8, layer_dense = 2, verbose = FALSE)
vocabulary <- c("a", "c", "g", "t")
sequence <- paste(sample(vocabulary, 200, replace = TRUE), collapse = "")
output_file <- tempfile(fileext = ".h5")
predict_model(output_format = "one_seq", model = model, step = 10,
sequence = sequence, filename = output_file, mode = "label")
# make prediction for fasta file with multiple entries, write output to separate h5 files
fasta_path <- tempfile(fileext = ".fasta")
create_dummy_data(file_path = fasta_path, num_files = 1,
num_seq = 5, seq_length = 100,
write_to_file_path = TRUE)
model <- create_model_lstm_cnn(maxlen = 20, layer_lstm = 8, layer_dense = 2, verbose = FALSE)
output_dir <- tempfile()
dir.create(output_dir)
predict_model(output_format = "by_entry", model = model, step = 10, verbose = FALSE,
output_dir = output_dir, mode = "label", path_input = fasta_path)
list.files(output_dir)
\dontshow{\}) # examplesIf}
}