% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/train_cpc.R
\name{train_model_cpc}
\alias{train_model_cpc}
\title{Train CPC-inspired model}
\usage{
train_model_cpc(
train_type = "CPC",
encoder = NULL,
context = NULL,
path,
path_val = NULL,
path_checkpoint = NULL,
path_tensorboard = NULL,
train_val_ratio = 0.2,
run_name,
batch_size = 32,
epochs = 100,
steps_per_epoch = 2000,
shuffle_file_order = FALSE,
initial_epoch = 1,
seed = 1234,
path_file_log = TRUE,
train_val_split_csv = NULL,
file_limit = NULL,
proportion_per_seq = NULL,
max_samples = NULL,
maxlen = NULL,
patchlen = NULL,
nopatches = NULL,
step = NULL,
file_filter = NULL,
stride = 0.4,
pretrained_model = NULL,
learningrate = 0.001,
learningrate_schedule = NULL,
k = 5,
stepsmin = 2,
stepsmax = 3,
emb_scale = 0.1
)
}
\arguments{
\item{train_type}{Either \code{"CPC"} or \code{"Self-GenomeNet"}.}
\item{encoder}{A keras encoder model for CPC.}
\item{context}{A keras context model for CPC.}
\item{path}{Path to training data. If \code{train_type} is \code{label_folder}, should be a vector or list
where each entry corresponds to a class (list elements can be directories and/or individual files). If \code{train_type} is not \code{label_folder},
can be a single directory or file or a list of directories and/or files.}
\item{path_val}{Path to validation data. See \code{path} argument for details.}
\item{path_checkpoint}{Path to checkpoints folder or \code{NULL}. If \code{NULL}, checkpoints don't get stored.}
\item{path_tensorboard}{Path to tensorboard directory or \code{NULL}. If \code{NULL}, training not tracked on tensorboard.}
\item{train_val_ratio}{For the generator, defines the fraction of batches that will be used for validation (relative to the size of the training data), i.e. one validation iteration
processes \code{batch_size} \eqn{*} \code{steps_per_epoch} \eqn{*} \code{train_val_ratio} samples. If you use a dataset instead of a generator and \code{dataset_val} is \code{NULL}, splits \code{dataset}
into train/validation data.}
\item{run_name}{Name of the run. Name will be used to identify output from callbacks.}
\item{batch_size}{Number of samples used for one network update.}
\item{epochs}{Number of training epochs.}
\item{steps_per_epoch}{Number of training batches per epoch.}
\item{shuffle_file_order}{Boolean, whether to go through files sequentially or shuffle beforehand.}
\item{initial_epoch}{Epoch at which to start training. Note that the network
will run for (\code{epochs} - \code{initial_epoch}) rounds and not \code{epochs} rounds.}
\item{seed}{Sets seed for reproducible results.}
\item{path_file_log}{Write the names of used files to a csv file if a path is specified.}
\item{train_val_split_csv}{A csv file specifying the train/validation split. The csv file should contain one column named \code{"file"} and one column named
\code{"type"}. The \code{"file"} column contains the names of fasta/fastq files and the \code{"type"} column specifies whether a file is used for training or validation.
Entries in \code{"type"} must be \code{"train"} or \code{"val"}; otherwise the file will not be used for either. The \code{path} and \code{path_val} arguments should be the same.
Not implemented for \code{train_type = "label_folder"}. See the 'Details' section for a sketch of the expected file.}
\item{file_limit}{Integer or \code{NULL}. If an integer, use only that number of randomly sampled files for training. Ignored if greater than the number of files in \code{path}.}
\item{proportion_per_seq}{Numerical value between 0 and 1. Proportion of each sequence to take samples from (uses a random subsequence).}
\item{max_samples}{Maximum number of samples to use from one file. If not \code{NULL} and file has more than \code{max_samples} samples, will randomly choose a
subset of \code{max_samples} samples.}
\item{maxlen}{Length of predictor sequence.}
\item{patchlen}{The length of a patch when splitting the input sequence.}
\item{nopatches}{The number of patches when splitting the input sequence (see the 'Details' section for how this relates to \code{maxlen}, \code{patchlen} and \code{stride}).}
\item{step}{Distance between the start positions of two successive samples.}
\item{file_filter}{Vector of file names to use from \code{path}.}
\item{stride}{The step size between two successive patches when splitting the input sequence, given as a fraction of \code{patchlen}; values below 1 produce overlapping patches.}
\item{pretrained_model}{A pretrained keras model whose training will be continued.}
\item{learningrate}{A tensor or floating point value. If a schedule is defined, this value gives the initial learning rate. Defaults to 0.001.}
\item{learningrate_schedule}{A schedule for a non-constant learning rate over the training. Either \code{"cosine_annealing"}, \code{"step_decay"} or \code{"exp_decay"}.}
\item{k}{Value of k for sparse top k categorical accuracy. Defaults to 5.}
\item{stepsmin}{In CPC, a patch is predicted given another patch; \code{stepsmin} defines how many patches between these two should be ignored during prediction.}
\item{stepsmax}{The maximum distance between the predicted patch and the given patch.}
\item{emb_scale}{Scales the impact of a patch's context.}
}
\value{
A list of training metrics.
}
\description{
Train a CPC-inspired neural network ('Representation Learning with Contrastive Predictive Coding', Oord et al.) on genomic data.
}
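\details{
If \code{nopatches} is \code{NULL}, the number of patches follows from \code{maxlen},
\code{patchlen} and \code{stride}. A minimal sketch of the assumed sliding-window
arithmetic (computed internally, e.g. via \code{nopatchescalc} as in the example below):

\preformatted{
# stride is the fraction of patchlen by which the patch window advances
stridelen <- as.integer(stride * patchlen)
nopatches <- floor((maxlen - patchlen) / stridelen) + 1
}

For example, \code{patchlen = 100} and \code{stride = 0.4} give a step of 40 bases,
so \code{maxlen = 380} yields \code{nopatches = 8}.

A \code{train_val_split_csv} file is expected to look like this (hypothetical file names):

\preformatted{
file,type
sample_1.fasta,train
sample_2.fasta,val
}
}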
\examples{
\dontshow{if (reticulate::py_module_available("tensorflow")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
# create dummy data
path_train_1 <- tempfile()
path_train_2 <- tempfile()
path_val_1 <- tempfile()
path_val_2 <- tempfile()
for (current_path in c(path_train_1, path_train_2,
                       path_val_1, path_val_2)) {
  dir.create(current_path)
  deepG::create_dummy_data(file_path = current_path,
                           num_files = 3,
                           seq_length = 10,
                           num_seq = 5,
                           vocabulary = c("a", "c", "g", "t"))
}
# create model
encoder <- function(maxlen = NULL,
                    patchlen = NULL,
                    nopatches = NULL,
                    eval = FALSE) {
  if (is.null(nopatches)) {
    nopatches <- nopatchescalc(patchlen, maxlen, patchlen * 0.4)
  }
  inp <- keras::layer_input(shape = c(maxlen, 4))
  stridelen <- as.integer(0.4 * patchlen)
  # split the one-hot encoded sequence into overlapping patches
  createpatches <- inp \%>\%
    keras::layer_reshape(list(maxlen, 4L, 1L), name = "prep_reshape1", dtype = "float32") \%>\%
    tensorflow::tf$image$extract_patches(
      sizes = list(1L, patchlen, 4L, 1L),
      strides = list(1L, stridelen, 4L, 1L),
      rates = list(1L, 1L, 1L, 1L),
      padding = "VALID",
      name = "prep_patches"
    ) \%>\%
    keras::layer_reshape(list(nopatches, patchlen, 4L),
                         name = "prep_reshape2") \%>\%
    tensorflow::tf$reshape(list(-1L, patchlen, 4L),
                           name = "prep_reshape3")
  # DanQ-style CNN + LSTM encoder, applied to each patch
  danQ <- createpatches \%>\%
    keras::layer_conv_1d(
      filters = 320L,
      kernel_size = 26L,
      activation = "relu"
    ) \%>\%
    keras::layer_max_pooling_1d(pool_size = 13L, strides = 13L) \%>\%
    keras::layer_dropout(0.2) \%>\%
    keras::layer_lstm(units = 320, return_sequences = TRUE) \%>\%
    keras::layer_dropout(0.5) \%>\%
    keras::layer_flatten() \%>\%
    keras::layer_dense(925, activation = "relu")
  # reshape latents back to (batch, nopatches, 925)
  patchesback <- danQ \%>\%
    tensorflow::tf$reshape(list(-1L, as.integer(nopatches), 925L))
  keras::keras_model(inp, patchesback)
}
context <- function(latents) {
  cres <- latents
  # autoregressive context model over the patch latents
  predictions <-
    cres \%>\%
    keras::layer_lstm(
      return_sequences = TRUE,
      units = 256,
      name = "context_LSTM_1",
      activation = "relu"
    )
  return(predictions)
}
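# note: encoder() maps an input of shape (batch, maxlen, 4) to latents of
# shape (batch, nopatches, 925); context() consumes these latents and
# returns LSTM outputs of shape (batch, nopatches, 256)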
# train model
temp_dir <- tempdir()
hist <- train_model_cpc(train_type = "CPC",
                        ### cpc functions ###
                        encoder = encoder,
                        context = context,
                        #### Generator settings ####
                        path_checkpoint = temp_dir,
                        path = c(path_train_1, path_train_2),
                        path_val = c(path_val_1, path_val_2),
                        run_name = "TEST",
                        batch_size = 8,
                        epochs = 3,
                        steps_per_epoch = 6,
                        patchlen = 100,
                        nopatches = 8)
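# inspect the returned training metrics (a sketch; the exact structure
# of hist may vary between deepG versions)
str(hist)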
\dontshow{\}) # examplesIf}
}