Diff of /man/train_model_cpc.Rd [000000] .. [409433]

Switch to side-by-side view

--- a
+++ b/man/train_model_cpc.Rd
@@ -0,0 +1,220 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/train_cpc.R
+\name{train_model_cpc}
+\alias{train_model_cpc}
+\title{Train CPC inspired model}
+\usage{
+train_model_cpc(
+  train_type = "CPC",
+  encoder = NULL,
+  context = NULL,
+  path,
+  path_val = NULL,
+  path_checkpoint = NULL,
+  path_tensorboard = NULL,
+  train_val_ratio = 0.2,
+  run_name,
+  batch_size = 32,
+  epochs = 100,
+  steps_per_epoch = 2000,
+  shuffle_file_order = FALSE,
+  initial_epoch = 1,
+  seed = 1234,
+  path_file_log = TRUE,
+  train_val_split_csv = NULL,
+  file_limit = NULL,
+  proportion_per_seq = NULL,
+  max_samples = NULL,
+  maxlen = NULL,
+  patchlen = NULL,
+  nopatches = NULL,
+  step = NULL,
+  file_filter = NULL,
+  stride = 0.4,
+  pretrained_model = NULL,
+  learningrate = 0.001,
+  learningrate_schedule = NULL,
+  k = 5,
+  stepsmin = 2,
+  stepsmax = 3,
+  emb_scale = 0.1
+)
+}
+\arguments{
+\item{train_type}{Either \code{"CPC"} or \code{"Self-GenomeNet"}.}
+
+\item{encoder}{A keras encoder for the cpc function.}
+
+\item{context}{A keras context model for the cpc function.}
+
+\item{path}{Path to training data. If \code{train_type} is \code{label_folder}, should be a vector or list
+where each entry corresponds to a class (list elements can be directories and/or individual files). If \code{train_type} is not \code{label_folder},
+can be a single directory or file or a list of directories and/or files.}
+
+\item{path_val}{Path to validation data. See \code{path} argument for details.}
+
+\item{path_checkpoint}{Path to checkpoints folder or \code{NULL}. If \code{NULL}, checkpoints don't get stored.}
+
+\item{path_tensorboard}{Path to tensorboard directory or \code{NULL}. If \code{NULL}, training is not tracked on tensorboard.}
+
+\item{train_val_ratio}{For generator defines the fraction of batches that will be used for validation (compared to size of training data), i.e. one validation iteration
+processes \code{batch_size} \eqn{*} \code{steps_per_epoch} \eqn{*} \code{train_val_ratio} samples. If you use dataset instead of generator and \code{dataset_val} is \code{NULL}, splits \code{dataset}
+into train/validation data.}
+
+\item{run_name}{Name of the run. Name will be used to identify output from callbacks.}
+
+\item{batch_size}{Number of samples used for one network update.}
+
+\item{epochs}{Number of iterations.}
+
+\item{steps_per_epoch}{Number of training batches per epoch.}
+
+\item{shuffle_file_order}{Boolean, whether to go through files sequentially or shuffle beforehand.}
+
+\item{initial_epoch}{Epoch at which to start training. Note that network
+will run for (\code{epochs} - \code{initial_epoch}) rounds and not \code{epochs} rounds.}
+
+\item{seed}{Sets seed for reproducible results.}
+
+\item{path_file_log}{Write name of files to csv file if path is specified.}
+
+\item{train_val_split_csv}{A csv file specifying train/validation split. csv file should contain one column named \code{"file"} and one column named
+\code{"type"}. The \code{"file"} column contains names of fasta/fastq files and \code{"type"} column specifies if file is used for training or validation.
+Entries in \code{"type"} must be named \code{"train"} or \code{"val"}, otherwise file will not be used for either. \code{path} and \code{path_val} arguments should be the same.
+Not implemented for \code{train_type = "label_folder"}.}
+
+\item{file_limit}{Integer or \code{NULL}. If integer, use only specified number of randomly sampled files for training. Ignored if greater than number of files in \code{path}.}
+
+\item{proportion_per_seq}{Numerical value between 0 and 1. Proportion of sequence to take samples from (use random subsequence).}
+
+\item{max_samples}{Maximum number of samples to use from one file. If not \code{NULL} and file has more than \code{max_samples} samples, will randomly choose a
+subset of \code{max_samples} samples.}
+
+\item{maxlen}{Length of predictor sequence.}
+
+\item{patchlen}{The length of a patch when splitting the input sequence.}
+
+\item{nopatches}{The number of patches when splitting the input sequence.}
+
+\item{step}{Frequency of sampling steps.}
+
+\item{file_filter}{Vector of file names to use from \code{path}.}
+
+\item{stride}{The overlap between two patches when splitting the input sequence.}
+
+\item{pretrained_model}{A pretrained keras model, for which training will be continued.}
+
+\item{learningrate}{A Tensor, floating point value. If a schedule is defined, this value gives the initial learning rate. Defaults to 0.001.}
+
+\item{learningrate_schedule}{A schedule for a non-constant learning rate over the training. Either "cosine_annealing", "step_decay", or "exp_decay".}
+
+\item{k}{Value of k for sparse top k categorical accuracy. Defaults to 5.}
+
+\item{stepsmin}{In CPC, a patch is predicted given another patch. stepsmin defines how many patches between these two should be ignored during prediction.}
+
+\item{stepsmax}{The maximum distance between the predicted patch and the given patch.}
+
+\item{emb_scale}{Scales the impact of a patch's context.}
+}
+\value{
+A list of training metrics.
+}
+\description{
+Train a CPC (Oord et al.) inspired neural network on genomic data.
+}
+\examples{
+\dontshow{if (reticulate::py_module_available("tensorflow")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+
+# Set up two training and two validation folders holding dummy data
+path_train_1 <- tempfile()
+path_train_2 <- tempfile()
+path_val_1 <- tempfile()
+path_val_2 <- tempfile()
+data_dirs <- c(path_train_1, path_train_2, path_val_1, path_val_2)
+
+# each folder is created first and then filled by create_dummy_data
+for (data_dir in data_dirs) {
+  dir.create(data_dir)
+  deepG::create_dummy_data(
+    file_path = data_dir,
+    num_files = 3,
+    seq_length = 10,
+    num_seq = 5,
+    vocabulary = c("a", "c", "g", "t")
+  )
+}
+
+# create model
+# Encoder: cuts an input of shape (maxlen, 4) into overlapping patches and
+# maps every patch to a 925-dimensional vector. Returns a keras model.
+encoder <- function(maxlen = NULL,
+                    patchlen = NULL,
+                    nopatches = NULL,
+                    eval = FALSE) {
+  # derive the number of patches when the caller did not supply it
+  if (is.null(nopatches)) {
+    nopatches <- nopatchescalc(patchlen, maxlen, patchlen * 0.4)
+  }
+  model_input <- keras::layer_input(shape = c(maxlen, 4))
+  # consecutive patches start 0.4 * patchlen positions apart
+  stridelen <- as.integer(0.4 * patchlen)
+
+  # append a channel axis so that extract_patches can slide over the input
+  input_as_image <- model_input \%>\%
+    keras::layer_reshape(list(maxlen, 4L, 1L),
+                         name = "prep_reshape1",
+                         dtype = "float32")
+  raw_patches <- input_as_image \%>\%
+    tensorflow::tf$image$extract_patches(
+      sizes = list(1L, patchlen, 4L, 1L),
+      strides = list(1L, stridelen, 4L, 1L),
+      rates = list(1L, 1L, 1L, 1L),
+      padding = "VALID",
+      name = "prep_patches"
+    )
+  # fold the patch axis into the leading axis: one row per patch
+  patch_batch <- raw_patches \%>\%
+    keras::layer_reshape(list(nopatches, patchlen, 4L),
+                         name = "prep_reshape2") \%>\%
+    tensorflow::tf$reshape(list(-1L, patchlen, 4L),
+                           name = "prep_reshape3")
+
+  # per-patch network: convolution, pooling, LSTM and a dense projection
+  conv_features <- patch_batch \%>\%
+    keras::layer_conv_1d(
+      input_shape = c(maxlen, 4L),
+      filters = 320L,
+      kernel_size = 26L,
+      activation = "relu"
+    ) \%>\%
+    keras::layer_max_pooling_1d(pool_size = 13L, strides = 13L) \%>\%
+    keras::layer_dropout(0.2)
+  patch_embedding <- conv_features \%>\%
+    keras::layer_lstm(units = 320, return_sequences = TRUE) \%>\%
+    keras::layer_dropout(0.5) \%>\%
+    keras::layer_flatten() \%>\%
+    keras::layer_dense(925, activation = "relu")
+
+  # restore the patch axis: (-1, nopatches, 925)
+  n_patches <- tensorflow::tf$cast(nopatches, tensorflow::tf$int16)
+  latents <- patch_embedding \%>\%
+    tensorflow::tf$reshape(list(-1L, n_patches, 925L))
+  keras::keras_model(model_input, latents)
+}
+
+# Context network: passes the encoder output through a single LSTM layer
+# and returns the full output sequence (return_sequences = TRUE).
+context <- function(latents) {
+  latents \%>\%
+    keras::layer_lstm(
+      units = 256,  # was 2048
+      return_sequences = TRUE,
+      name = "context_LSTM_1",
+      activation = "relu"
+    )
+}
+
+# train model; the session temporary directory is used as checkpoint folder
+temp_dir <- tempdir()
+hist <- train_model_cpc(
+  train_type = "CPC",
+  # model building functions
+  encoder = encoder,
+  context = context,
+  # data and checkpoint locations
+  path = c(path_train_1, path_train_2),
+  path_val = c(path_val_1, path_val_2),
+  path_checkpoint = temp_dir,
+  # training settings
+  run_name = "TEST",
+  batch_size = 8,
+  epochs = 3,
+  steps_per_epoch = 6,
+  patchlen = 100,
+  nopatches = 8
+)
+                
+ 
+\dontshow{\}) # examplesIf}
+}