--- a +++ b/man/train_model_cpc.Rd @@ -0,0 +1,220 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/train_cpc.R +\name{train_model_cpc} +\alias{train_model_cpc} +\title{Train CPC inspired model} +\usage{ +train_model_cpc( + train_type = "CPC", + encoder = NULL, + context = NULL, + path, + path_val = NULL, + path_checkpoint = NULL, + path_tensorboard = NULL, + train_val_ratio = 0.2, + run_name, + batch_size = 32, + epochs = 100, + steps_per_epoch = 2000, + shuffle_file_order = FALSE, + initial_epoch = 1, + seed = 1234, + path_file_log = TRUE, + train_val_split_csv = NULL, + file_limit = NULL, + proportion_per_seq = NULL, + max_samples = NULL, + maxlen = NULL, + patchlen = NULL, + nopatches = NULL, + step = NULL, + file_filter = NULL, + stride = 0.4, + pretrained_model = NULL, + learningrate = 0.001, + learningrate_schedule = NULL, + k = 5, + stepsmin = 2, + stepsmax = 3, + emb_scale = 0.1 +) +} +\arguments{ +\item{train_type}{Either \code{"CPC"} or \code{"Self-GenomeNet"}.} + +\item{encoder}{A keras encoder for the cpc function.} + +\item{context}{A keras context model for the cpc function.} + +\item{path}{Path to training data. If \code{train_type} is \code{label_folder}, should be a vector or list +where each entry corresponds to a class (list elements can be directories and/or individual files). If \code{train_type} is not \code{label_folder}, +can be a single directory or file or a list of directories and/or files.} + +\item{path_val}{Path to validation data. See \code{path} argument for details.} + +\item{path_checkpoint}{Path to checkpoints folder or \code{NULL}. If \code{NULL}, checkpoints don't get stored.} + +\item{path_tensorboard}{Path to tensorboard directory or \code{NULL}. If \code{NULL}, training not tracked on tensorboard.} + +\item{train_val_ratio}{For generator defines the fraction of batches that will be used for validation (compared to size of training data), i.e. 
one validation iteration +processes \code{batch_size} \eqn{*} \code{steps_per_epoch} \eqn{*} \code{train_val_ratio} samples. If you use dataset instead of generator and \code{dataset_val} is \code{NULL}, splits \code{dataset} +into train/validation data.} + +\item{run_name}{Name of the run. Name will be used to identify output from callbacks.} + +\item{batch_size}{Number of samples used for one network update.} + +\item{epochs}{Number of iterations.} + +\item{steps_per_epoch}{Number of training batches per epoch.} + +\item{shuffle_file_order}{Boolean, whether to go through files sequentially or shuffle beforehand.} + +\item{initial_epoch}{Epoch at which to start training. Note that network +will run for (\code{epochs} - \code{initial_epoch}) rounds and not \code{epochs} rounds.} + +\item{seed}{Sets seed for reproducible results.} + +\item{path_file_log}{Write name of files to csv file if path is specified.} + +\item{train_val_split_csv}{A csv file specifying train/validation split. csv file should contain one column named \code{"file"} and one column named +\code{"type"}. The \code{"file"} column contains names of fasta/fastq files and \code{"type"} column specifies if file is used for training or validation. +Entries in \code{"type"} must be named \code{"train"} or \code{"val"}, otherwise file will not be used for either. \code{path} and \code{path_val} arguments should be the same. +Not implemented for \code{train_type = "label_folder"}.} + +\item{file_limit}{Integer or \code{NULL}. If integer, use only specified number of randomly sampled files for training. Ignored if greater than number of files in \code{path}.} + +\item{proportion_per_seq}{Numerical value between 0 and 1. Proportion of sequence to take samples from (use random subsequence).} + +\item{max_samples}{Maximum number of samples to use from one file. 
If not \code{NULL} and file has more than \code{max_samples} samples, will randomly choose a +subset of \code{max_samples} samples.} + +\item{maxlen}{Length of predictor sequence.} + +\item{patchlen}{The length of a patch when splitting the input sequence.} + +\item{nopatches}{The number of patches when splitting the input sequence.} + +\item{step}{Frequency of sampling steps.} + +\item{file_filter}{Vector of file names to use from \code{path}.} + +\item{stride}{The overlap between two patches when splitting the input sequence.} + +\item{pretrained_model}{A pretrained keras model, for which training will be continued.} + +\item{learningrate}{A Tensor, floating point value. If a schedule is defined, this value gives the initial learning rate. Defaults to 0.001.} + +\item{learningrate_schedule}{A schedule for a non-constant learning rate over the training. Either "cosine_annealing", "step_decay", or "exp_decay".} + +\item{k}{Value of k for sparse top k categorical accuracy. Defaults to 5.} + +\item{stepsmin}{In CPC, a patch is predicted given another patch. stepsmin defines how many patches between these two should be ignored during prediction.} + +\item{stepsmax}{The maximum distance between the predicted patch and the given patch.} + +\item{emb_scale}{Scales the impact of a patch's context.} +} +\value{ +A list of training metrics. +} +\description{ +Train a CPC (Oord et al.) inspired neural network on genomic data. 
+} +\examples{ +\dontshow{if (reticulate::py_module_available("tensorflow")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} + +#create dummy data +path_train_1 <- tempfile() +path_train_2 <- tempfile() +path_val_1 <- tempfile() +path_val_2 <- tempfile() + +for (current_path in c(path_train_1, path_train_2, + path_val_1, path_val_2)) { + dir.create(current_path) + deepG::create_dummy_data(file_path = current_path, + num_files = 3, + seq_length = 10, + num_seq = 5, + vocabulary = c("a", "c", "g", "t")) +} + +# create model +encoder <- function(maxlen = NULL, + patchlen = NULL, + nopatches = NULL, + eval = FALSE) { + if (is.null(nopatches)) { + nopatches <- nopatchescalc(patchlen, maxlen, patchlen * 0.4) + } + inp <- keras::layer_input(shape = c(maxlen, 4)) + stridelen <- as.integer(0.4 * patchlen) + createpatches <- inp \%>\% + keras::layer_reshape(list(maxlen, 4L, 1L), name = "prep_reshape1", dtype = "float32") \%>\% + tensorflow::tf$image$extract_patches( + sizes = list(1L, patchlen, 4L, 1L), + strides = list(1L, stridelen, 4L, 1L), + rates = list(1L, 1L, 1L, 1L), + padding = "VALID", + name = "prep_patches" + ) \%>\% + keras::layer_reshape(list(nopatches, patchlen, 4L), + name = "prep_reshape2") \%>\% + tensorflow::tf$reshape(list(-1L, patchlen, 4L), + name = "prep_reshape3") + + danQ <- createpatches \%>\% + keras::layer_conv_1d( + input_shape = c(maxlen, 4L), + filters = 320L, + kernel_size = 26L, + activation = "relu" + ) \%>\% + keras::layer_max_pooling_1d(pool_size = 13L, strides = 13L) \%>\% + keras::layer_dropout(0.2) \%>\% + keras::layer_lstm(units = 320, return_sequences = TRUE) \%>\% + keras::layer_dropout(0.5) \%>\% + keras::layer_flatten() \%>\% + keras::layer_dense(925, activation = "relu") + patchesback <- danQ \%>\% + tensorflow::tf$reshape(list(-1L, tensorflow::tf$cast(nopatches, tensorflow::tf$int16), 925L)) + keras::keras_model(inp, patchesback) +} + +context <- function(latents) { + cres <- latents + cres_dim = 
cres$shape + predictions <- + cres \%>\% + keras::layer_lstm( + return_sequences = TRUE, + units = 256, # WAS: 2048, + name = paste("context_LSTM_1", + sep = ""), + activation = "relu" + ) + return(predictions) +} + +# train model +temp_dir <- tempdir() +hist <- train_model_cpc(train_type = "CPC", + ### cpc functions ### + encoder = encoder, + context = context, + #### Generator settings #### + path_checkpoint = temp_dir, + path = c(path_train_1, path_train_2), + path_val = c(path_val_1, path_val_2), + run_name = "TEST", + batch_size = 8, + epochs = 3, + steps_per_epoch = 6, + patchlen = 100, + nopatches = 8) + + +\dontshow{\}) # examplesIf} +}