Diff of /man/evaluate_model.Rd [000000] .. [409433]

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/evaluation.R
\name{evaluate_model}
\alias{evaluate_model}
\title{Evaluates a trained model on fasta, fastq or rds files}
\usage{
evaluate_model(
  path_input,
  model = NULL,
  batch_size = 100,
  step = 1,
  padding = FALSE,
  vocabulary = c("a", "c", "g", "t"),
  vocabulary_label = list(c("a", "c", "g", "t")),
  number_batches = 10,
  format = "fasta",
  target_middle = FALSE,
  mode = "lm",
  output_format = "target_right",
  ambiguous_nuc = "zero",
  evaluate_all_files = FALSE,
  verbose = TRUE,
  max_iter = 20000,
  target_from_csv = NULL,
  max_samples = NULL,
  proportion_per_seq = NULL,
  concat_seq = NULL,
  seed = 1234,
  auc = FALSE,
  auprc = FALSE,
  path_pred_list = NULL,
  exact_num_samples = NULL,
  activations = NULL,
  shuffle_file_order = FALSE,
  include_seq = FALSE,
  ...
)
}
\arguments{
\item{path_input}{Input directory where fasta, fastq or rds files are located.}

\item{model}{A keras model.}

\item{batch_size}{Number of samples per batch.}

\item{step}{How often to take a sample.}

\item{padding}{Whether to pad sequences too short for one sample with zeros.}

\item{vocabulary}{Vector of allowed characters. Characters outside the vocabulary get encoded as specified in \code{ambiguous_nuc}.}

\item{vocabulary_label}{List of labels for targets of each output layer.}

\item{number_batches}{How many batches to evaluate.}

\item{format}{File format, \code{"fasta"}, \code{"fastq"} or \code{"rds"}.}

\item{target_middle}{Whether the model is a language model with separate input layers.}

\item{mode}{Either \code{"lm"} for a language model or \code{"label_header"}, \code{"label_csv"} or \code{"label_folder"} for label classification.}

\item{output_format}{Determines the shape of the output tensor for a language model.
Either \code{"target_right"}, \code{"target_middle_lstm"}, \code{"target_middle_cnn"} or \code{"wavenet"}.
Assume a sequence \code{"AACCGTA"}. Outputs correspond as follows (see the sketch after this list):
\itemize{
\item \verb{"target_right": X = "AACCGT", Y = "A"}
\item \verb{"target_middle_lstm": X = (X_1 = "AAC", X_2 = "ATG"), Y = "C"} (note reversed order of X_2)
\item \verb{"target_middle_cnn": X = "AACGTA", Y = "C"}
\item \verb{"wavenet": X = "AACCGT", Y = "ACCGTA"}
}}
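As a plain base R illustration of the splits above (not the package's internal tensor encoding), the X and Y strings for \code{"AACCGTA"} can be reproduced like this:

\preformatted{
seq <- "AACCGTA"
# "target_right": predict the last character from the preceding ones
x <- substr(seq, 1, 6); y <- substr(seq, 7, 7)                        # "AACCGT", "A"
# "target_middle_lstm": left part and reversed right part predict the middle
x1 <- substr(seq, 1, 3)                                               # "AAC"
x2 <- paste(rev(strsplit(substr(seq, 5, 7), "")[[1]]), collapse = "") # "ATG"
y  <- substr(seq, 4, 4)                                               # "C"
# "target_middle_cnn": sequence without its middle character predicts it
x <- paste0(substr(seq, 1, 3), substr(seq, 5, 7)); y <- substr(seq, 4, 4) # "AACGTA", "C"
# "wavenet": target is the input shifted by one position
x <- substr(seq, 1, 6); y <- substr(seq, 2, 7)                        # "AACCGT", "ACCGTA"
}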

\item{ambiguous_nuc}{How to handle nucleotides outside the vocabulary, either \code{"zero"}, \code{"discard"}, \code{"empirical"} or \code{"equal"} (see the sketch after this list).
\itemize{
\item If \code{"zero"}, input gets encoded as a zero vector.
\item If \code{"equal"}, input is a repetition of \code{1/length(vocabulary)}.
\item If \code{"discard"}, samples containing nucleotides outside the vocabulary get discarded.
\item If \code{"empirical"}, use the nucleotide distribution of the current file.
}}
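A minimal base R sketch of how the \code{"zero"} and \code{"equal"} options could encode a single character (a hypothetical helper for illustration, not the encoder used by the package):

\preformatted{
vocabulary <- c("a", "c", "g", "t")
# one-hot encode one character; characters outside the vocabulary are
# handled according to ambiguous_nuc ("discard" and "empirical" need
# sample or file level context and are omitted here)
encode_nuc <- function(nuc, ambiguous_nuc = "zero") {
  if (nuc \%in\% vocabulary) return(as.numeric(vocabulary == nuc))
  if (ambiguous_nuc == "zero") return(rep(0, length(vocabulary)))
  if (ambiguous_nuc == "equal") return(rep(1 / length(vocabulary), length(vocabulary)))
  stop("option not covered in this sketch")
}
encode_nuc("a")          # 1 0 0 0
encode_nuc("n")          # 0 0 0 0
encode_nuc("n", "equal") # 0.25 0.25 0.25 0.25
}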

\item{evaluate_all_files}{Boolean, if \code{TRUE} will iterate over all files in \code{path_input} once. \code{number_batches} will be overwritten.}

\item{verbose}{Boolean.}

\item{max_iter}{Stop if \code{max_iter} consecutive iterations fail to produce a new batch.}

\item{target_from_csv}{Path to csv file with target mapping. One column should be called "file"; the other entries in a row are the targets.}
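
For illustration only, a csv file with the layout described above could look like the following sketch (the label column names are hypothetical placeholders):

\preformatted{
targets <- data.frame(
  file    = c("seq1.fasta", "seq2.fasta"), # file names in path_input
  label_a = c(1, 0),                       # remaining columns hold the targets
  label_b = c(0, 1)
)
write.csv(targets, "targets.csv", row.names = FALSE)
}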

\item{max_samples}{Maximum number of samples to use from one file. If not \code{NULL} and file has more than \code{max_samples} samples, will randomly choose a
subset of \code{max_samples} samples.}

\item{proportion_per_seq}{Numerical value between 0 and 1. Proportion of sequence to take samples from (use random subsequence).}

\item{concat_seq}{Character string or \code{NULL}. If not \code{NULL}, all entries from a file get concatenated into one sequence with the \code{concat_seq} string between them.
Example: if the first entry is AACC, the second entry is TTTG and \code{concat_seq = "ZZZ"}, this becomes AACCZZZTTTG.}
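
The resulting string can be reproduced with base R:

\preformatted{
entries <- c("AACC", "TTTG")
paste(entries, collapse = "ZZZ")  # "AACCZZZTTTG"
}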

\item{seed}{Sets seed for \code{set.seed} function for reproducible results.}

\item{auc}{Whether to include AUC metric. If output layer activation is \code{"softmax"}, only possible for 2 targets. Computes the average if output layer has sigmoid
activation and multiple targets.}

\item{auprc}{Whether to include AUPRC metric. If output layer activation is \code{"softmax"}, only possible for 2 targets. Computes the average if output layer has sigmoid
activation and multiple targets.}

\item{path_pred_list}{Path to store list of predictions (output of output layers) and corresponding true labels as rds file.}

\item{exact_num_samples}{Exact number of samples to evaluate. Use this if you want to evaluate a number of samples that is not divisible by \code{batch_size}, for example because you want
to evaluate a data set exactly once and already know the number of samples. Should be a vector if \code{mode = "label_folder"} (with same length as \code{vocabulary_label})
and an integer otherwise.}

\item{activations}{List containing output formats for output layers (\verb{softmax, sigmoid} or \code{linear}). If \code{NULL}, will be estimated from model.}

\item{shuffle_file_order}{Logical, whether to go through files randomly or sequentially.}

\item{include_seq}{Whether to store input. Only applies if \code{path_pred_list} is not \code{NULL}.}

\item{...}{Further generator options. See \code{\link{get_generator}}.}
}
\value{
A list of evaluation results. Each list element corresponds to an output layer of the model.
}
\description{
Returns evaluation metrics such as confusion matrix, loss, AUC, AUPRC, MAE or MSE (depending on the output layer).
}
\examples{
\dontshow{if (reticulate::py_module_available("tensorflow")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
# create dummy data: 3 files, each with 5 sequences of length 11
path_input <- tempfile()
dir.create(path_input)
create_dummy_data(file_path = path_input,
                  num_files = 3,
                  seq_length = 11,
                  num_seq = 5,
                  vocabulary = c("a", "c", "g", "t"))
# create a language model with input length (maxlen) 10 and 4 output units
model <- create_model_lstm_cnn(layer_lstm = 8, layer_dense = 4, maxlen = 10, verbose = FALSE)
# evaluate the model on all files once
evaluate_model(path_input = path_input,
  model = model,
  step = 11,
  vocabulary = c("a", "c", "g", "t"),
  vocabulary_label = list(c("a", "c", "g", "t")),
  mode = "lm",
  output_format = "target_right",
  evaluate_all_files = TRUE,
  verbose = FALSE)

\dontshow{\}) # examplesIf}
}