--- a +++ b/man/generator_rds.Rd @@ -0,0 +1,100 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/generator_rds.R +\name{generator_rds} +\alias{generator_rds} +\title{Rds data generator} +\usage{ +generator_rds( + rds_folder, + batch_size, + path_file_log = NULL, + max_samples = NULL, + proportion_per_seq = NULL, + target_len = NULL, + seed = NULL, + delete_used_files = FALSE, + reverse_complement = FALSE, + sample_by_file_size = FALSE, + n_gram = NULL, + n_gram_stride = 1, + reverse_complement_encoding = FALSE, + add_noise = NULL, + reshape_xy = NULL +) +} +\arguments{ +\item{rds_folder}{Path to input files.} + +\item{batch_size}{Number of samples in one batch.} + +\item{path_file_log}{Write name of files to csv file if path is specified.} + +\item{max_samples}{Maximum number of samples to use from one file. If not \code{NULL} and file has more than \code{max_samples} samples, will randomly choose a +subset of \code{max_samples} samples.} + +\item{proportion_per_seq}{Numerical value between 0 and 1. Proportion of sequence to take samples from (use random subsequence).} + +\item{target_len}{Number of target nucleotides for language model.} + +\item{seed}{Sets seed for \code{set.seed} function for reproducible results.} + +\item{delete_used_files}{Whether to delete file once used. Only applies for rds files.} + +\item{reverse_complement}{Boolean, for every new file decide randomly to use original data or its reverse complement.} + +\item{sample_by_file_size}{Sample new file weighted by file size (bigger files more likely).} + +\item{n_gram}{Integer, encode target not nucleotide wise but combine n nucleotides at once. For example for \verb{n=2, "AA" -> (1, 0,..., 0),} +\verb{"AC" -> (0, 1, 0,..., 0), "TT" -> (0,..., 0, 1)}, where the one-hot vectors have length \code{length(vocabulary)^n}.} + +\item{n_gram_stride}{Step size for n-gram encoding. 
For AACCGGTT with \code{n_gram = 4} and \code{n_gram_stride = 2}, generator encodes +\verb{(AACC), (CCGG), (GGTT)}; for \code{n_gram_stride = 4} generator encodes \verb{(AACC), (GGTT)}.} + +\item{reverse_complement_encoding}{Whether to use both original sequence and reverse complement as two input sequences.} + +\item{add_noise}{\code{NULL} or list of arguments. If not \code{NULL}, list must contain the following arguments: \code{noise_type} can be \code{"normal"} or \code{"uniform"}; +optional arguments \code{sd} or \code{mean} if \code{noise_type} is \code{"normal"} (default is \code{sd=1} and \code{mean=0}) or \verb{min, max} if \code{noise_type} is \code{"uniform"} +(default is \verb{min=0, max=1}).} + +\item{reshape_xy}{Can be a list of functions to apply to input and/or target. List elements (containing the reshape functions) +must be called x for input or y for target and each have arguments called x and y. For example: +\code{reshape_xy = list(x = function(x, y) {return(x+1)}, y = function(x, y) {return(x+y)})}. +For the rds generator, the functions need to have an additional argument called \code{sw}.} +} +\value{ +A generator function. +} +\description{ +Creates training batches from rds files. Rds files must contain a +list of length 2 (input/target) or of length 1 (for language model). +If \code{target_len} is not \code{NULL}, the generator takes the last \code{target_len} entries of +the first list element as targets and the rest as input. 
+} +\examples{ +\dontshow{if (reticulate::py_module_available("tensorflow")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +# create 3 rds files +rds_folder <- tempfile() +dir.create(rds_folder) +batch_size <- 7 +maxlen <- 11 +voc_len <- 4 +for (i in 1:3) { + x <- sample(0:(voc_len-1), maxlen*batch_size, replace = TRUE) + x <- keras::to_categorical(x, num_classes = voc_len) + x <- array(x, dim = c(batch_size, maxlen, voc_len)) + y <- sample(0:2, batch_size ,replace = TRUE) + y <- keras::to_categorical(y, num_classes = 3) + xy_list <- list(x, y) + file_name <- paste0(rds_folder, "/file_", i, ".rds") + saveRDS(xy_list, file_name) +} + +# create generator +gen <- generator_rds(rds_folder, batch_size = 2) +z <- gen() +x <- z[[1]] +y <- z[[2]] +x[1, , ] +y[1, ] +\dontshow{\}) # examplesIf} +}