Diff of /man/predict_model.Rd [000000] .. [409433]

Switch to unified view

a b/man/predict_model.Rd
1
% Generated by roxygen2: do not edit by hand
2
% Please edit documentation in R/predict.R
3
\name{predict_model}
4
\alias{predict_model}
5
\title{Make prediction for nucleotide sequence or entries in fasta/fastq file}
6
\usage{
7
predict_model(
8
  model,
9
  output_format = "one_seq",
10
  layer_name = NULL,
11
  sequence = NULL,
12
  path_input = NULL,
13
  round_digits = NULL,
14
  filename = "states.h5",
15
  step = 1,
16
  vocabulary = c("a", "c", "g", "t"),
17
  batch_size = 256,
18
  verbose = TRUE,
19
  return_states = FALSE,
20
  output_type = "h5",
21
  padding = "none",
22
  use_quality = FALSE,
23
  quality_string = NULL,
24
  mode = "label",
25
  lm_format = "target_right",
26
  output_dir = NULL,
27
  format = "fasta",
28
  include_seq = FALSE,
29
  reverse_complement_encoding = FALSE,
30
  ambiguous_nuc = "zero",
31
  ...
32
)
33
}
34
\arguments{
35
\item{model}{A keras model.}
36
37
\item{output_format}{Either \code{"one_seq"}, \code{"by_entry"}, \code{"by_entry_one_file"}, \code{"one_pred_per_entry"}.}
38
39
\item{layer_name}{Name of layer to get output from. If \code{NULL}, will use the last layer.}
40
41
\item{sequence}{Character string. If given, \code{path_input} is ignored.}
42
43
\item{path_input}{Path to fasta file.}
44
45
\item{round_digits}{Number of decimal places.}
46
47
\item{filename}{Filename to store states in. No file output if argument is \code{NULL}.
48
If \code{output_format = "by_entry"}, adds \code{"_nr_"} + i after name, where i is entry number.}
49
50
\item{step}{Frequency of sampling steps.}
51
52
\item{vocabulary}{Vector of allowed characters. Characters outside vocabulary get encoded as specified in \code{ambiguous_nuc}.}
53
54
\item{batch_size}{Number of samples processed together in one prediction batch.}
55
56
\item{verbose}{Boolean.}
57
58
\item{return_states}{Return predictions as data frame. Only supported for output_format \code{"one_seq"}.}
59
60
\item{output_type}{\code{"h5"} or \code{"csv"}. If \code{output_format} is \code{"by_entry_one_file"} or \code{"one_pred_per_entry"}, can only be \code{"h5"}.}
61
62
\item{padding}{Either \code{"none"}, \code{"maxlen"}, \code{"standard"} or \code{"self"}.
63
\itemize{
64
\item If \code{"none"}, apply no padding and skip sequences that are too short.
65
\item If \code{"maxlen"}, pad with \code{maxlen} zero vectors.
66
\item If \code{"standard"}, pad with zero vectors only if sequence is shorter than maxlen. Pads to minimum size required for one prediction.
67
\item If \code{"self"}, concatenate sequence with itself until sequence is long enough for one prediction.
68
Example: if sequence is "ACGT" and maxlen is 10, make prediction for "ACGTACGTAC".
69
Only applied if sequence is shorter than maxlen.
70
}}
71
72
\item{use_quality}{Whether to use quality scores.}
73
74
\item{quality_string}{String for encoding with quality scores (as used in fastq format).}
75
76
\item{mode}{Either \code{"lm"} for language model or \code{"label"} for label classification.}
77
78
\item{lm_format}{Either \code{"target_right"}, \code{"target_middle_lstm"}, \code{"target_middle_cnn"} or \code{"wavenet"}.}
79
80
\item{output_dir}{Directory for file output.}
81
82
\item{format}{File format, \code{"fasta"}, \code{"fastq"}, \code{"rds"} or \code{"fasta.tar.gz"}, \code{"fastq.tar.gz"} for \code{tar.gz} files.}
83
84
\item{include_seq}{Whether to include input sequence in h5 file.}
85
86
\item{reverse_complement_encoding}{Whether to use both original sequence and reverse complement as two input sequences.}
87
88
\item{ambiguous_nuc}{How to handle nucleotides outside vocabulary, either \code{"zero"}, \code{"discard"}, \code{"empirical"} or \code{"equal"}.
89
\itemize{
90
\item If \code{"zero"}, input gets encoded as zero vector.
91
\item If \code{"equal"}, input is repetition of \code{1/length(vocabulary)}.
92
\item If \code{"discard"}, samples containing nucleotides outside vocabulary get discarded.
93
\item If \code{"empirical"}, use nucleotide distribution of current file.
94
}}
95
96
\item{...}{Further arguments for sequence encoding with \code{\link{seq_encoding_label}}.}
97
}
98
\value{
99
If \code{return_states = TRUE}, returns a list of model predictions and positions of the corresponding sequences.
100
If additionally \code{include_seq = TRUE}, list contains sequence strings.
101
If \code{return_states = FALSE} returns nothing, just writes output to file(s).
102
}
103
\description{
104
Removes layers (optional) from pretrained model and calculates states of fasta/fastq file or nucleotide sequence.
105
Writes states to h5 or csv file (access content of h5 output with \code{\link{load_prediction}} function).
106
There are several options on how to process an input file, selected with the \code{output_format} argument:
107
\itemize{
108
\item If \code{"one_seq"}, computes prediction for sequence argument or fasta/fastq file.
109
Combines fasta entries in file to one sequence. This means predictor sequences can contain elements from more than one fasta entry.
110
\item If \code{"by_entry"}, will output a separate file for each fasta/fastq entry.
111
Names of output files are: \code{output_dir} + "Nr" + i + \code{filename} + \code{output_type}, where i is the number of the fasta entry.
112
\item If \code{"by_entry_one_file"}, will store prediction for all fasta entries in one h5 file.
113
\item If \code{"one_pred_per_entry"}, will make one prediction for each entry by either picking a random sample for long sequences
114
or padding the sequence for short sequences.
115
}
116
}
117
\examples{
118
\dontshow{if (reticulate::py_module_available("tensorflow")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
119
# make prediction for single sequence and write to h5 file
120
model <- create_model_lstm_cnn(maxlen = 20, layer_lstm = 8, layer_dense = 2, verbose = FALSE)
121
vocabulary <- c("a", "c", "g", "t")
122
sequence <- paste(sample(vocabulary, 200, replace = TRUE), collapse = "")
123
output_file <- tempfile(fileext = ".h5")
124
predict_model(output_format = "one_seq", model = model, step = 10,
125
             sequence = sequence, filename = output_file, mode = "label")
126
127
# make prediction for fasta file with multiple entries, write output to separate h5 files
128
fasta_path <- tempfile(fileext = ".fasta")
129
create_dummy_data(file_path = fasta_path, num_files = 1,
130
                 num_seq = 5, seq_length = 100,
131
                 write_to_file_path = TRUE)
132
model <- create_model_lstm_cnn(maxlen = 20, layer_lstm = 8, layer_dense = 2, verbose = FALSE)
133
output_dir <- tempfile()
134
dir.create(output_dir)
135
predict_model(output_format = "by_entry", model = model, step = 10, verbose = FALSE,
136
               output_dir = output_dir, mode = "label", path_input = fasta_path)
137
list.files(output_dir)
138
\dontshow{\}) # examplesIf}
139
}