--- a +++ b/man/evaluate_model.Rd @@ -0,0 +1,148 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/evaluation.R +\name{evaluate_model} +\alias{evaluate_model} +\title{Evaluates a trained model on fasta, fastq or rds files} +\usage{ +evaluate_model( + path_input, + model = NULL, + batch_size = 100, + step = 1, + padding = FALSE, + vocabulary = c("a", "c", "g", "t"), + vocabulary_label = list(c("a", "c", "g", "t")), + number_batches = 10, + format = "fasta", + target_middle = FALSE, + mode = "lm", + output_format = "target_right", + ambiguous_nuc = "zero", + evaluate_all_files = FALSE, + verbose = TRUE, + max_iter = 20000, + target_from_csv = NULL, + max_samples = NULL, + proportion_per_seq = NULL, + concat_seq = NULL, + seed = 1234, + auc = FALSE, + auprc = FALSE, + path_pred_list = NULL, + exact_num_samples = NULL, + activations = NULL, + shuffle_file_order = FALSE, + include_seq = FALSE, + ... +) +} +\arguments{ +\item{path_input}{Input directory where fasta, fastq or rds files are located.} + +\item{model}{A keras model.} + +\item{batch_size}{Number of samples per batch.} + +\item{step}{How often to take a sample.} + +\item{padding}{Whether to pad sequences too short for one sample with zeros.} + +\item{vocabulary}{Vector of allowed characters. Characters outside the vocabulary get encoded as specified in \code{ambiguous_nuc}.} + +\item{vocabulary_label}{List of labels for targets of each output layer.} + +\item{number_batches}{How many batches to evaluate.} + +\item{format}{File format, \code{"fasta"}, \code{"fastq"} or \code{"rds"}.} + +\item{target_middle}{Whether model is language model with separate input layers.} + +\item{mode}{Either \code{"lm"} for language model or \code{"label_header"}, \code{"label_csv"} or \code{"label_folder"} for label classification.} + +\item{output_format}{Determines shape of output tensor for language model. 
+Either \code{"target_right"}, \code{"target_middle_lstm"}, \code{"target_middle_cnn"} or \code{"wavenet"}. +Assume a sequence \code{"AACCGTA"}. The outputs correspond as follows: +\itemize{ +\item \verb{"target_right": X = "AACCGT", Y = "A"} +\item \verb{"target_middle_lstm": X = (X_1 = "AAC", X_2 = "ATG"), Y = "C"} (note reversed order of X_2) +\item \verb{"target_middle_cnn": X = "AACGTA", Y = "C"} +\item \verb{"wavenet": X = "AACCGT", Y = "ACCGTA"} +}} + +\item{ambiguous_nuc}{How to handle nucleotides outside vocabulary, either \code{"zero"}, \code{"discard"}, \code{"empirical"} or \code{"equal"}. +\itemize{ +\item If \code{"zero"}, input gets encoded as zero vector. +\item If \code{"equal"}, input is repetition of \code{1/length(vocabulary)}. +\item If \code{"discard"}, samples containing nucleotides outside vocabulary get discarded. +\item If \code{"empirical"}, use nucleotide distribution of current file. +}} + +\item{evaluate_all_files}{Boolean, if \code{TRUE} will iterate over all files in \code{path_input} once. \code{number_batches} will be overwritten.} + +\item{verbose}{Boolean.} + +\item{max_iter}{Stop after \code{max_iter} iterations have failed to produce a new batch.} + +\item{target_from_csv}{Path to csv file with target mapping. One column should be called "file" and other entries in row are the targets.} + +\item{max_samples}{Maximum number of samples to use from one file. If not \code{NULL} and file has more than \code{max_samples} samples, will randomly choose a +subset of \code{max_samples} samples.} + +\item{proportion_per_seq}{Numerical value between 0 and 1. Proportion of sequence to take samples from (use random subsequence).} + +\item{concat_seq}{Character string or \code{NULL}. If not \code{NULL} all entries from file get concatenated to one sequence with \code{concat_seq} string between them. +Example: If 1. entry AACC, 2. 
entry TTTG and \code{concat_seq = "ZZZ"} this becomes AACCZZZTTTG.} + +\item{seed}{Sets seed for \code{set.seed} function for reproducible results.} + +\item{auc}{Whether to include AUC metric. If output layer activation is \code{"softmax"}, only possible for 2 targets. Computes the average if output layer has sigmoid +activation and multiple targets.} + +\item{auprc}{Whether to include AUPRC metric. If output layer activation is \code{"softmax"}, only possible for 2 targets. Computes the average if output layer has sigmoid +activation and multiple targets.} + +\item{path_pred_list}{Path to store list of predictions (output of output layers) and corresponding true labels as rds file.} + +\item{exact_num_samples}{Exact number of samples to evaluate. Use this if you want to evaluate a number of samples not divisible by \code{batch_size}. Useful if you want +to evaluate a data set exactly once and know the number of samples already. Should be a vector if \code{mode = "label_folder"} (with same length as \code{vocabulary_label}) +and else an integer.} + +\item{activations}{List containing output formats for output layers (\verb{softmax, sigmoid} or \code{linear}). If \code{NULL}, will be estimated from model.} + +\item{shuffle_file_order}{Logical, whether to go through files randomly or sequentially.} + +\item{include_seq}{Whether to store input. Only applies if \code{path_pred_list} is not \code{NULL}.} + +\item{...}{Further generator options. See \code{\link{get_generator}}.} +} +\value{ +A list of evaluation results. Each list element corresponds to an output layer of the model. +} +\description{ +Returns evaluation metrics such as confusion matrix, loss, AUC, AUPRC, MAE, MSE (depending on output layer). 
+} +\examples{ +\dontshow{if (reticulate::py_module_available("tensorflow")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +# create dummy data +path_input <- tempfile() +dir.create(path_input) +create_dummy_data(file_path = path_input, + num_files = 3, + seq_length = 11, + num_seq = 5, + vocabulary = c("a", "c", "g", "t")) +# create model +model <- create_model_lstm_cnn(layer_lstm = 8, layer_dense = 4, maxlen = 10, verbose = FALSE) +# evaluate +evaluate_model(path_input = path_input, + model = model, + step = 11, + vocabulary = c("a", "c", "g", "t"), + vocabulary_label = list(c("a", "c", "g", "t")), + mode = "lm", + output_format = "target_right", + evaluate_all_files = TRUE, + verbose = FALSE) + +\dontshow{\}) # examplesIf} +}