|
a |
|
b/man/predict_model.Rd |
|
|
1 |
% Generated by roxygen2: do not edit by hand |
|
|
2 |
% Please edit documentation in R/predict.R |
|
|
3 |
\name{predict_model} |
|
|
4 |
\alias{predict_model} |
|
|
5 |
\title{Make prediction for nucleotide sequence or entries in fasta/fastq file} |
|
|
6 |
\usage{ |
|
|
7 |
predict_model( |
|
|
8 |
model, |
|
|
9 |
output_format = "one_seq", |
|
|
10 |
layer_name = NULL, |
|
|
11 |
sequence = NULL, |
|
|
12 |
path_input = NULL, |
|
|
13 |
round_digits = NULL, |
|
|
14 |
filename = "states.h5", |
|
|
15 |
step = 1, |
|
|
16 |
vocabulary = c("a", "c", "g", "t"), |
|
|
17 |
batch_size = 256, |
|
|
18 |
verbose = TRUE, |
|
|
19 |
return_states = FALSE, |
|
|
20 |
output_type = "h5", |
|
|
21 |
padding = "none", |
|
|
22 |
use_quality = FALSE, |
|
|
23 |
quality_string = NULL, |
|
|
24 |
mode = "label", |
|
|
25 |
lm_format = "target_right", |
|
|
26 |
output_dir = NULL, |
|
|
27 |
format = "fasta", |
|
|
28 |
include_seq = FALSE, |
|
|
29 |
reverse_complement_encoding = FALSE, |
|
|
30 |
ambiguous_nuc = "zero", |
|
|
31 |
... |
|
|
32 |
) |
|
|
33 |
} |
|
|
34 |
\arguments{ |
|
|
35 |
\item{model}{A keras model.} |
|
|
36 |
|
|
|
37 |
\item{output_format}{Either \code{"one_seq"}, \code{"by_entry"}, \code{"by_entry_one_file"} or \code{"one_pred_per_entry"}.} |
|
|
38 |
|
|
|
39 |
\item{layer_name}{Name of layer to get output from. If \code{NULL}, will use the last layer.} |
|
|
40 |
|
|
|
41 |
\item{sequence}{Character string. If given, \code{path_input} is ignored.} |
|
|
42 |
|
|
|
43 |
\item{path_input}{Path to input file; the file type is given by \code{format} (e.g. fasta or fastq).} |
|
|
44 |
|
|
|
45 |
\item{round_digits}{Number of decimal places.} |
|
|
46 |
|
|
|
47 |
\item{filename}{Filename to store states in. No file output if argument is \code{NULL}. |
|
|
48 |
If \code{output_format = "by_entry"}, adds \code{"_nr_"} + i after name, where i is the entry number.} |
|
|
49 |
|
|
|
50 |
\item{step}{Frequency of sampling steps.} |
|
|
51 |
|
|
|
52 |
\item{vocabulary}{Vector of allowed characters. Characters outside vocabulary get encoded as specified in \code{ambiguous_nuc}.} |
|
|
53 |
|
|
|
54 |
\item{batch_size}{Number of samples processed in one batch when computing predictions.} |
|
|
55 |
|
|
|
56 |
\item{verbose}{Boolean.} |
|
|
57 |
|
|
|
58 |
\item{return_states}{Return predictions as data frame. Only supported for output_format \code{"one_seq"}.} |
|
|
59 |
|
|
|
60 |
\item{output_type}{\code{"h5"} or \code{"csv"}. If \code{output_format} is \code{"by_entry_one_file"} or \code{"one_pred_per_entry"}, can only be \code{"h5"}.} |
|
|
61 |
|
|
|
62 |
\item{padding}{Either \code{"none"}, \code{"maxlen"}, \code{"standard"} or \code{"self"}. |
|
|
63 |
\itemize{ |
|
|
64 |
\item If \code{"none"}, apply no padding and skip sequences that are too short. |
|
|
65 |
\item If \code{"maxlen"}, pad with maxlen number of zero vectors. |
|
|
66 |
\item If \code{"standard"}, pad with zero vectors only if sequence is shorter than maxlen. Pads to minimum size required for one prediction. |
|
|
67 |
\item If \code{"self"}, concatenate sequence with itself until sequence is long enough for one prediction. |
|
|
68 |
Example: if sequence is "ACGT" and maxlen is 10, make prediction for "ACGTACGTAC". |
|
|
69 |
Only applied if sequence is shorter than maxlen. |
|
|
70 |
}} |
|
|
71 |
|
|
|
72 |
\item{use_quality}{Whether to use quality scores.} |
|
|
73 |
|
|
|
74 |
\item{quality_string}{String for encoding with quality scores (as used in fastq format).} |
|
|
75 |
|
|
|
76 |
\item{mode}{Either \code{"lm"} for language model or \code{"label"} for label classification.} |
|
|
77 |
|
|
|
78 |
\item{lm_format}{Either \code{"target_right"}, \code{"target_middle_lstm"}, \code{"target_middle_cnn"} or \code{"wavenet"}.} |
|
|
79 |
|
|
|
80 |
\item{output_dir}{Directory for file output.} |
|
|
81 |
|
|
|
82 |
\item{format}{File format, \code{"fasta"}, \code{"fastq"}, \code{"rds"} or \code{"fasta.tar.gz"}, \code{"fastq.tar.gz"} for \code{tar.gz} files.} |
|
|
83 |
|
|
|
84 |
\item{include_seq}{Whether to include input sequence in h5 file.} |
|
|
85 |
|
|
|
86 |
\item{reverse_complement_encoding}{Whether to use both original sequence and reverse complement as two input sequences.} |
|
|
87 |
|
|
|
88 |
\item{ambiguous_nuc}{How to handle nucleotides outside vocabulary, either \code{"zero"}, \code{"discard"}, \code{"empirical"} or \code{"equal"}. |
|
|
89 |
\itemize{ |
|
|
90 |
\item If \code{"zero"}, input gets encoded as zero vector. |
|
|
91 |
\item If \code{"equal"}, input is repetition of \code{1/length(vocabulary)}. |
|
|
92 |
\item If \code{"discard"}, samples containing nucleotides outside vocabulary get discarded. |
|
|
93 |
\item If \code{"empirical"}, use nucleotide distribution of current file. |
|
|
94 |
}} |
|
|
95 |
|
|
|
96 |
\item{...}{Further arguments for sequence encoding with \code{\link{seq_encoding_label}}.} |
|
|
97 |
} |
|
|
98 |
\value{ |
|
|
99 |
If \code{return_states = TRUE} returns a list of model predictions and position of corresponding sequences. |
|
|
100 |
If additionally \code{include_seq = TRUE}, list contains sequence strings. |
|
|
101 |
If \code{return_states = FALSE} returns nothing, just writes output to file(s). |
|
|
102 |
} |
|
|
103 |
\description{ |
|
|
104 |
Removes layers (optional) from pretrained model and calculates states of fasta/fastq file or nucleotide sequence. |
|
|
105 |
Writes states to h5 or csv file (access content of h5 output with \code{\link{load_prediction}} function). |
|
|
106 |
There are several options on how to process an input file, selected with the \code{output_format} argument: |
|
|
107 |
\itemize{ |
|
|
108 |
\item If \code{"one_seq"}, computes prediction for sequence argument or fasta/fastq file. |
|
|
109 |
Combines fasta entries in file to one sequence. This means predictor sequences can contain elements from more than one fasta entry. |
|
|
110 |
\item If \code{"by_entry"}, will output a separate file for each fasta/fastq entry. |
|
|
111 |
Names of output files are: \code{output_dir} + "Nr" + i + \code{filename} + \code{output_type}, where i is the number of the fasta entry. |
|
|
112 |
\item If \code{"by_entry_one_file"}, will store prediction for all fasta entries in one h5 file. |
|
|
113 |
\item If \code{"one_pred_per_entry"}, will make one prediction for each entry by either picking random sample for long sequences |
|
|
114 |
or padding the sequence for short sequences. |
|
|
115 |
} |
|
|
116 |
} |
|
|
117 |
\examples{ |
|
|
118 |
\dontshow{if (reticulate::py_module_available("tensorflow")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} |
|
|
119 |
# make prediction for single sequence and write to h5 file |
|
|
120 |
model <- create_model_lstm_cnn(maxlen = 20, layer_lstm = 8, layer_dense = 2, verbose = FALSE) |
|
|
121 |
vocabulary <- c("a", "c", "g", "t") |
|
|
122 |
sequence <- paste(sample(vocabulary, 200, replace = TRUE), collapse = "") |
|
|
123 |
output_file <- tempfile(fileext = ".h5") |
|
|
124 |
predict_model(output_format = "one_seq", model = model, step = 10, |
|
|
125 |
sequence = sequence, filename = output_file, mode = "label") |
|
|
126 |
|
|
|
127 |
# make prediction for fasta file with multiple entries, write output to separate h5 files |
|
|
128 |
fasta_path <- tempfile(fileext = ".fasta") |
|
|
129 |
create_dummy_data(file_path = fasta_path, num_files = 1, |
|
|
130 |
num_seq = 5, seq_length = 100, |
|
|
131 |
write_to_file_path = TRUE) |
|
|
132 |
model <- create_model_lstm_cnn(maxlen = 20, layer_lstm = 8, layer_dense = 2, verbose = FALSE) |
|
|
133 |
output_dir <- tempfile() |
|
|
134 |
dir.create(output_dir) |
|
|
135 |
predict_model(output_format = "by_entry", model = model, step = 10, verbose = FALSE, |
|
|
136 |
output_dir = output_dir, mode = "label", path_input = fasta_path) |
|
|
137 |
list.files(output_dir) |
|
|
138 |
\dontshow{\}) # examplesIf} |
|
|
139 |
} |