% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/train_cpc.R
\name{train_model_cpc}
\alias{train_model_cpc}
\title{Train CPC-inspired model}
\usage{
train_model_cpc(
train_type = "CPC",
encoder = NULL,
context = NULL,
path,
path_val = NULL,
path_checkpoint = NULL,
path_tensorboard = NULL,
train_val_ratio = 0.2,
run_name,
batch_size = 32,
epochs = 100,
steps_per_epoch = 2000,
shuffle_file_order = FALSE,
initial_epoch = 1,
seed = 1234,
path_file_log = TRUE,
train_val_split_csv = NULL,
file_limit = NULL,
proportion_per_seq = NULL,
max_samples = NULL,
maxlen = NULL,
patchlen = NULL,
nopatches = NULL,
step = NULL,
file_filter = NULL,
stride = 0.4,
pretrained_model = NULL,
learningrate = 0.001,
learningrate_schedule = NULL,
k = 5,
stepsmin = 2,
stepsmax = 3,
emb_scale = 0.1
)
}
\arguments{
\item{train_type}{Either \code{"CPC"} or \code{"Self-GenomeNet"}.}
\item{encoder}{A keras encoder model for CPC.}
\item{context}{A keras context model for CPC.}
\item{path}{Path to training data. If \code{train_type} is \code{label_folder}, should be a vector or list
where each entry corresponds to a class (list elements can be directories and/or individual files). If \code{train_type} is not \code{label_folder},
can be a single directory or file or a list of directories and/or files.}
\item{path_val}{Path to validation data. See \code{path} argument for details.}
\item{path_checkpoint}{Path to checkpoints folder or \code{NULL}. If \code{NULL}, checkpoints don't get stored.}
\item{path_tensorboard}{Path to tensorboard directory or \code{NULL}. If \code{NULL}, training not tracked on tensorboard.}
\item{train_val_ratio}{For the generator, defines the fraction of batches that will be used for validation (relative to the size of the training data), i.e. one validation iteration
processes \code{batch_size} \eqn{*} \code{steps_per_epoch} \eqn{*} \code{train_val_ratio} samples. If you use a dataset instead of a generator and \code{dataset_val} is \code{NULL}, splits \code{dataset}
into train/validation data.}
\item{run_name}{Name of the run. Name will be used to identify output from callbacks.}
\item{batch_size}{Number of samples used for one network update.}
\item{epochs}{Number of training epochs.}
\item{steps_per_epoch}{Number of training batches per epoch.}
\item{shuffle_file_order}{Boolean, whether to go through files sequentially or shuffle beforehand.}
\item{initial_epoch}{Epoch at which to start training. Note that the network
will run for (\code{epochs} - \code{initial_epoch}) rounds and not \code{epochs} rounds.}
\item{seed}{Sets seed for reproducible results.}
\item{path_file_log}{Write the names of used files to a csv file if a path is specified.}
\item{train_val_split_csv}{A csv file specifying the train/validation split. The csv file should contain one column named \code{"file"} and one column named
\code{"type"}. The \code{"file"} column contains the names of fasta/fastq files and the \code{"type"} column specifies whether a file is used for training or validation.
Entries in \code{"type"} must be \code{"train"} or \code{"val"}; otherwise the file will not be used for either. The \code{path} and \code{path_val} arguments should be the same.
Not implemented for \code{train_type = "label_folder"}. See the 'Details' section for a sketch of the expected file.}
\item{file_limit}{Integer or \code{NULL}. If an integer, use only that number of randomly sampled files for training. Ignored if greater than the number of files in \code{path}.}
\item{proportion_per_seq}{Numerical value between 0 and 1. Proportion of each sequence to take samples from (uses a random subsequence).}
\item{max_samples}{Maximum number of samples to use from one file. If not \code{NULL} and file has more than \code{max_samples} samples, will randomly choose a
subset of \code{max_samples} samples.}
\item{maxlen}{Length of predictor sequence.}
\item{patchlen}{The length of a patch when splitting the input sequence.}
\item{nopatches}{The number of patches when splitting the input sequence (see the 'Details' section for how this relates to \code{maxlen}, \code{patchlen} and \code{stride}).}
\item{step}{Distance between the start positions of two successive samples.}
\item{file_filter}{Vector of file names to use from \code{path}.}
\item{stride}{The step size between two successive patches when splitting the input sequence, given as a fraction of \code{patchlen}; values below 1 produce overlapping patches.}
\item{pretrained_model}{A pretrained keras model whose training will be continued.}
\item{learningrate}{A tensor or floating point value. If a schedule is defined, this value gives the initial learning rate. Defaults to 0.001.}
\item{learningrate_schedule}{A schedule for a non-constant learning rate over the training. Either \code{"cosine_annealing"}, \code{"step_decay"} or \code{"exp_decay"}.}
\item{k}{Value of k for sparse top k categorical accuracy. Defaults to 5.}
\item{stepsmin}{In CPC, a patch is predicted given another patch; \code{stepsmin} defines how many patches between these two should be ignored during prediction.}
\item{stepsmax}{The maximum distance between the predicted patch and the given patch.}
\item{emb_scale}{Scales the impact of a patch's context.}
}
\value{
A list of training metrics.
}
\description{
Train a CPC-inspired neural network ('Representation Learning with Contrastive Predictive Coding', Oord et al.) on genomic data.
}
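\details{
If \code{nopatches} is \code{NULL}, the number of patches follows from \code{maxlen},
\code{patchlen} and \code{stride}. A minimal sketch of the assumed sliding-window
arithmetic (computed internally, e.g. via \code{nopatchescalc} as in the example below):

\preformatted{
# stride is the fraction of patchlen by which the patch window advances
stridelen <- as.integer(stride * patchlen)
nopatches <- floor((maxlen - patchlen) / stridelen) + 1
}

For example, \code{patchlen = 100} and \code{stride = 0.4} give a step of 40 bases,
so \code{maxlen = 380} yields \code{nopatches = 8}.

A \code{train_val_split_csv} file is expected to look like this (hypothetical file names):

\preformatted{
file,type
sample_1.fasta,train
sample_2.fasta,val
}
}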
\examples{
\dontshow{if (reticulate::py_module_available("tensorflow")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
# create dummy data
path_train_1 <- tempfile()
path_train_2 <- tempfile()
path_val_1 <- tempfile()
path_val_2 <- tempfile()
for (current_path in c(path_train_1, path_train_2,
                       path_val_1, path_val_2)) {
  dir.create(current_path)
  deepG::create_dummy_data(file_path = current_path,
                           num_files = 3,
                           seq_length = 10,
                           num_seq = 5,
                           vocabulary = c("a", "c", "g", "t"))
}
# create model
encoder <- function(maxlen = NULL,
                    patchlen = NULL,
                    nopatches = NULL,
                    eval = FALSE) {
  if (is.null(nopatches)) {
    nopatches <- nopatchescalc(patchlen, maxlen, patchlen * 0.4)
  }
  inp <- keras::layer_input(shape = c(maxlen, 4))
  stridelen <- as.integer(0.4 * patchlen)
  # split the one-hot encoded sequence into overlapping patches
  createpatches <- inp \%>\%
    keras::layer_reshape(list(maxlen, 4L, 1L), name = "prep_reshape1", dtype = "float32") \%>\%
    tensorflow::tf$image$extract_patches(
      sizes = list(1L, patchlen, 4L, 1L),
      strides = list(1L, stridelen, 4L, 1L),
      rates = list(1L, 1L, 1L, 1L),
      padding = "VALID",
      name = "prep_patches"
    ) \%>\%
    keras::layer_reshape(list(nopatches, patchlen, 4L),
                         name = "prep_reshape2") \%>\%
    tensorflow::tf$reshape(list(-1L, patchlen, 4L),
                           name = "prep_reshape3")
  # DanQ-style CNN + LSTM encoder, applied to each patch
  danQ <- createpatches \%>\%
    keras::layer_conv_1d(
      filters = 320L,
      kernel_size = 26L,
      activation = "relu"
    ) \%>\%
    keras::layer_max_pooling_1d(pool_size = 13L, strides = 13L) \%>\%
    keras::layer_dropout(0.2) \%>\%
    keras::layer_lstm(units = 320, return_sequences = TRUE) \%>\%
    keras::layer_dropout(0.5) \%>\%
    keras::layer_flatten() \%>\%
    keras::layer_dense(925, activation = "relu")
  # reshape latents back to (batch, nopatches, 925)
  patchesback <- danQ \%>\%
    tensorflow::tf$reshape(list(-1L, as.integer(nopatches), 925L))
  keras::keras_model(inp, patchesback)
}
context <- function(latents) {
  cres <- latents
  # autoregressive context model over the patch latents
  predictions <-
    cres \%>\%
    keras::layer_lstm(
      return_sequences = TRUE,
      units = 256,
      name = "context_LSTM_1",
      activation = "relu"
    )
  return(predictions)
}
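# note: encoder() maps an input of shape (batch, maxlen, 4) to latents of
# shape (batch, nopatches, 925); context() consumes these latents and
# returns LSTM outputs of shape (batch, nopatches, 256)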
# train model
temp_dir <- tempdir()
hist <- train_model_cpc(train_type = "CPC",
                        ### cpc functions ###
                        encoder = encoder,
                        context = context,
                        #### Generator settings ####
                        path_checkpoint = temp_dir,
                        path = c(path_train_1, path_train_2),
                        path_val = c(path_val_1, path_val_2),
                        run_name = "TEST",
                        batch_size = 8,
                        epochs = 3,
                        steps_per_epoch = 6,
                        patchlen = 100,
                        nopatches = 8)
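# inspect the returned training metrics (a sketch; the exact structure
# of hist may vary between deepG versions)
str(hist)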
\dontshow{\}) # examplesIf}
}