|
a |
|
b/man/evaluate_model.Rd |
|
|
1 |
% Generated by roxygen2: do not edit by hand |
|
|
2 |
% Please edit documentation in R/evaluation.R |
|
|
3 |
\name{evaluate_model} |
|
|
4 |
\alias{evaluate_model} |
|
|
5 |
\title{Evaluates a trained model on fasta, fastq or rds files} |
|
|
6 |
\usage{ |
|
|
7 |
evaluate_model( |
|
|
8 |
path_input, |
|
|
9 |
model = NULL, |
|
|
10 |
batch_size = 100, |
|
|
11 |
step = 1, |
|
|
12 |
padding = FALSE, |
|
|
13 |
vocabulary = c("a", "c", "g", "t"), |
|
|
14 |
vocabulary_label = list(c("a", "c", "g", "t")), |
|
|
15 |
number_batches = 10, |
|
|
16 |
format = "fasta", |
|
|
17 |
target_middle = FALSE, |
|
|
18 |
mode = "lm", |
|
|
19 |
output_format = "target_right", |
|
|
20 |
ambiguous_nuc = "zero", |
|
|
21 |
evaluate_all_files = FALSE, |
|
|
22 |
verbose = TRUE, |
|
|
23 |
max_iter = 20000, |
|
|
24 |
target_from_csv = NULL, |
|
|
25 |
max_samples = NULL, |
|
|
26 |
proportion_per_seq = NULL, |
|
|
27 |
concat_seq = NULL, |
|
|
28 |
seed = 1234, |
|
|
29 |
auc = FALSE, |
|
|
30 |
auprc = FALSE, |
|
|
31 |
path_pred_list = NULL, |
|
|
32 |
exact_num_samples = NULL, |
|
|
33 |
activations = NULL, |
|
|
34 |
shuffle_file_order = FALSE, |
|
|
35 |
include_seq = FALSE, |
|
|
36 |
... |
|
|
37 |
) |
|
|
38 |
} |
|
|
39 |
\arguments{ |
|
|
40 |
\item{path_input}{Input directory where fasta, fastq or rds files are located.} |
|
|
41 |
|
|
|
42 |
\item{model}{A keras model.} |
|
|
43 |
|
|
|
44 |
\item{batch_size}{Number of samples per batch.} |
|
|
45 |
|
|
|
46 |
\item{step}{How often to take a sample.} |
|
|
47 |
|
|
|
48 |
\item{padding}{Whether to pad sequences too short for one sample with zeros.} |
|
|
49 |
|
|
|
50 |
\item{vocabulary}{Vector of allowed characters. Characters outside vocabulary get encoded as specified in \code{ambiguous_nuc}.} |
|
|
51 |
|
|
|
52 |
\item{vocabulary_label}{List of labels for targets of each output layer.} |
|
|
53 |
|
|
|
54 |
\item{number_batches}{How many batches to evaluate.} |
|
|
55 |
|
|
|
56 |
\item{format}{File format, \code{"fasta"}, \code{"fastq"} or \code{"rds"}.} |
|
|
57 |
|
|
|
58 |
\item{target_middle}{Whether model is language model with separate input layers.} |
|
|
59 |
|
|
|
60 |
\item{mode}{Either \code{"lm"} for language model or \code{"label_header"}, \code{"label_csv"} or \code{"label_folder"} for label classification.} |
|
|
61 |
|
|
|
62 |
\item{output_format}{Determines shape of output tensor for language model. |
|
|
63 |
Either \code{"target_right"}, \code{"target_middle_lstm"}, \code{"target_middle_cnn"} or \code{"wavenet"}. |
|
|
64 |
Assume a sequence \code{"AACCGTA"}. Outputs correspond as follows: |
|
|
65 |
\itemize{ |
|
|
66 |
\item \verb{"target_right": X = "AACCGT", Y = "A"} |
|
|
67 |
\item \verb{"target_middle_lstm": X = (X_1 = "AAC", X_2 = "ATG"), Y = "C"} (note reversed order of X_2) |
|
|
68 |
\item \verb{"target_middle_cnn": X = "AACGTA", Y = "C"} |
|
|
69 |
\item \verb{"wavenet": X = "AACCGT", Y = "ACCGTA"} |
|
|
70 |
}} |
|
|
71 |
|
|
|
72 |
\item{ambiguous_nuc}{How to handle nucleotides outside vocabulary, either \code{"zero"}, \code{"discard"}, \code{"empirical"} or \code{"equal"}. |
|
|
73 |
\itemize{ |
|
|
74 |
\item If \code{"zero"}, input gets encoded as zero vector. |
|
|
75 |
\item If \code{"equal"}, input is repetition of \code{1/length(vocabulary)}. |
|
|
76 |
\item If \code{"discard"}, samples containing nucleotides outside vocabulary get discarded. |
|
|
77 |
\item If \code{"empirical"}, use nucleotide distribution of current file. |
|
|
78 |
}} |
|
|
79 |
|
|
|
80 |
\item{evaluate_all_files}{Boolean, if \code{TRUE} will iterate over all files in \code{path_input} once. \code{number_batches} will be overwritten.} |
|
|
81 |
|
|
|
82 |
\item{verbose}{Boolean.} |
|
|
83 |
|
|
|
84 |
\item{max_iter}{Stop after \code{max_iter} iterations have failed to produce a new batch.} |
|
|
85 |
|
|
|
86 |
\item{target_from_csv}{Path to csv file with target mapping. One column should be called "file" and other entries in row are the targets.} |
|
|
87 |
|
|
|
88 |
\item{max_samples}{Maximum number of samples to use from one file. If not \code{NULL} and file has more than \code{max_samples} samples, will randomly choose a |
|
|
89 |
subset of \code{max_samples} samples.} |
|
|
90 |
|
|
|
91 |
\item{proportion_per_seq}{Numerical value between 0 and 1. Proportion of sequence to take samples from (use random subsequence).} |
|
|
92 |
|
|
|
93 |
\item{concat_seq}{Character string or \code{NULL}. If not \code{NULL} all entries from file get concatenated to one sequence with \code{concat_seq} string between them. |
|
|
94 |
Example: If 1. entry is AACC, 2. entry is TTTG and \code{concat_seq = "ZZZ"}, this becomes AACCZZZTTTG.} |
|
|
95 |
|
|
|
96 |
\item{seed}{Sets seed for \code{set.seed} function for reproducible results.} |
|
|
97 |
|
|
|
98 |
\item{auc}{Whether to include AUC metric. If output layer activation is \code{"softmax"}, only possible for 2 targets. Computes the average if output layer has sigmoid |
|
|
99 |
activation and multiple targets.} |
|
|
100 |
|
|
|
101 |
\item{auprc}{Whether to include AUPRC metric. If output layer activation is \code{"softmax"}, only possible for 2 targets. Computes the average if output layer has sigmoid |
|
|
102 |
activation and multiple targets.} |
|
|
103 |
|
|
|
104 |
\item{path_pred_list}{Path to store list of predictions (output of output layers) and corresponding true labels as rds file.} |
|
|
105 |
|
|
|
106 |
\item{exact_num_samples}{Exact number of samples to evaluate. Use this if you want to evaluate a number of samples not divisible by \code{batch_size}. Useful if you want |
|
|
107 |
to evaluate a data set exactly once and know the number of samples already. Should be a vector if \code{mode = "label_folder"} (with same length as \code{vocabulary_label}) |
|
|
108 |
and else an integer.} |
|
|
109 |
|
|
|
110 |
\item{activations}{List containing output formats for output layers (\verb{softmax, sigmoid} or \code{linear}). If \code{NULL}, will be estimated from model.} |
|
|
111 |
|
|
|
112 |
\item{shuffle_file_order}{Logical, whether to go through files randomly or sequentially.} |
|
|
113 |
|
|
|
114 |
\item{include_seq}{Whether to store input. Only applies if \code{path_pred_list} is not \code{NULL}.} |
|
|
115 |
|
|
|
116 |
\item{...}{Further generator options. See \code{\link{get_generator}}.} |
|
|
117 |
} |
|
|
118 |
\value{ |
|
|
119 |
A list of evaluation results. Each list element corresponds to an output layer of the model. |
|
|
120 |
} |
|
|
121 |
\description{ |
|
|
122 |
Returns evaluation metrics such as confusion matrix, loss, AUC, AUPRC, MAE, MSE (depending on output layer). |
|
|
123 |
} |
|
|
124 |
\examples{ |
|
|
125 |
\dontshow{if (reticulate::py_module_available("tensorflow")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} |
|
|
126 |
# create dummy data |
|
|
127 |
path_input <- tempfile() |
|
|
128 |
dir.create(path_input) |
|
|
129 |
create_dummy_data(file_path = path_input, |
|
|
130 |
num_files = 3, |
|
|
131 |
seq_length = 11, |
|
|
132 |
num_seq = 5, |
|
|
133 |
vocabulary = c("a", "c", "g", "t")) |
|
|
134 |
# create model |
|
|
135 |
model <- create_model_lstm_cnn(layer_lstm = 8, layer_dense = 4, maxlen = 10, verbose = FALSE) |
|
|
136 |
# evaluate |
|
|
137 |
evaluate_model(path_input = path_input, |
|
|
138 |
model = model, |
|
|
139 |
step = 11, |
|
|
140 |
vocabulary = c("a", "c", "g", "t"), |
|
|
141 |
vocabulary_label = list(c("a", "c", "g", "t")), |
|
|
142 |
mode = "lm", |
|
|
143 |
output_format = "target_right", |
|
|
144 |
evaluate_all_files = TRUE, |
|
|
145 |
verbose = FALSE) |
|
|
146 |
|
|
|
147 |
\dontshow{\}) # examplesIf} |
|
|
148 |
} |