
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/train_cpc.R
\name{train_model_cpc}
\alias{train_model_cpc}
\title{Train CPC-inspired model}
\usage{
train_model_cpc(
train_type = "CPC",
encoder = NULL,
context = NULL,
path,
path_val = NULL,
path_checkpoint = NULL,
path_tensorboard = NULL,
train_val_ratio = 0.2,
run_name,
batch_size = 32,
epochs = 100,
steps_per_epoch = 2000,
shuffle_file_order = FALSE,
initial_epoch = 1,
seed = 1234,
path_file_log = TRUE,
train_val_split_csv = NULL,
file_limit = NULL,
proportion_per_seq = NULL,
max_samples = NULL,
maxlen = NULL,
patchlen = NULL,
nopatches = NULL,
step = NULL,
file_filter = NULL,
stride = 0.4,
pretrained_model = NULL,
learningrate = 0.001,
learningrate_schedule = NULL,
k = 5,
stepsmin = 2,
stepsmax = 3,
emb_scale = 0.1
)
}
\arguments{
\item{train_type}{Either \code{"cpc"} or \code{"Self-GenomeNet"}.}
\item{encoder}{A keras encoder for the cpc function.}
\item{context}{A keras context model for the cpc function.}
\item{path}{Path to training data. If \code{train_type} is \code{label_folder}, should be a vector or list
where each entry corresponds to a class (list elements can be directories and/or individual files). If \code{train_type} is not \code{label_folder},
can be a single directory or file or a list of directories and/or files.}
\item{path_val}{Path to validation data. See \code{path} argument for details.}
\item{path_checkpoint}{Path to checkpoints folder or \code{NULL}. If \code{NULL}, checkpoints don't get stored.}
\item{path_tensorboard}{Path to tensorboard directory or \code{NULL}. If \code{NULL}, training not tracked on tensorboard.}
\item{train_val_ratio}{For the generator, defines the fraction of batches used for validation (relative to the size of the training data), i.e. one validation iteration
processes \code{batch_size} \eqn{*} \code{steps_per_epoch} \eqn{*} \code{train_val_ratio} samples. If a dataset is used instead of a generator and \code{dataset_val} is \code{NULL}, \code{dataset}
is split into train/validation data.}
\item{run_name}{Name of the run. Name will be used to identify output from callbacks.}
\item{batch_size}{Number of samples used for one network update.}
\item{epochs}{Number of training epochs.}
\item{steps_per_epoch}{Number of training batches per epoch.}
\item{shuffle_file_order}{Boolean, whether to go through files sequentially or shuffle beforehand.}
\item{initial_epoch}{Epoch at which to start training. Note that the network
will run for (\code{epochs} - \code{initial_epoch}) rounds and not \code{epochs} rounds.}
\item{seed}{Sets seed for reproducible results.}
\item{path_file_log}{Write the names of the files used for training to a csv file if a path is specified.}
\item{train_val_split_csv}{A csv file specifying the train/validation split. The csv file should contain one column named \code{"file"} and one column named
\code{"type"}. The \code{"file"} column contains the names of fasta/fastq files and the \code{"type"} column specifies whether a file is used for training or validation.
Entries in \code{"type"} must be \code{"train"} or \code{"val"}, otherwise the file is not used for either (see the layout sketch in the Details section). The \code{path} and \code{path_val} arguments should be the same.
Not implemented for \code{train_type = "label_folder"}.}
\item{file_limit}{Integer or \code{NULL}. If an integer, use only the specified number of randomly sampled files for training. Ignored if greater than the number of files in \code{path}.}
\item{proportion_per_seq}{Numerical value between 0 and 1. Proportion of the sequence to take samples from (uses a random subsequence).}
\item{max_samples}{Maximum number of samples to use from one file. If not \code{NULL} and a file has more than \code{max_samples} samples, a random
subset of \code{max_samples} samples will be chosen.}
\item{maxlen}{Length of predictor sequence.}
\item{patchlen}{The length of a patch when splitting the input sequence.}
\item{nopatches}{The number of patches when splitting the input sequence.}
\item{step}{Frequency of sampling steps, i.e. the distance between the start positions of two successive samples.}
\item{file_filter}{Vector of file names to use from \code{path}.}
\item{stride}{The overlap between two patches when splitting the input sequence.}
\item{pretrained_model}{A pretrained keras model for which training will be continued.}
\item{learningrate}{A tensor or floating point value. If a schedule is defined, this value gives the initial learning rate. Defaults to 0.001.}
\item{learningrate_schedule}{A schedule for a non-constant learning rate over the training. Either \code{"cosine_annealing"}, \code{"step_decay"} or \code{"exp_decay"}.}
\item{k}{Value of k for sparse top-k categorical accuracy. Defaults to 5.}
\item{stepsmin}{In CPC, a patch is predicted given another patch. \code{stepsmin} defines how many patches between these two should be ignored during prediction.}
\item{stepsmax}{The maximum distance between the predicted patch and the given patch.}
\item{emb_scale}{Scales the impact of a patch's context.}
}
\value{
A list of training metrics.
}
\description{
Train a CPC-inspired (Oord et al.) neural network on genomic data.
}
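\details{
A sketch of the expected \code{train_val_split_csv} layout (the file names below are
placeholders; the actual names must match the fasta/fastq files found via \code{path}):
\preformatted{file,type
seq_01.fasta,train
seq_02.fasta,train
seq_03.fasta,val
}
When the input is split into patches as in the example encoder below (patch length
\code{patchlen}, stride \code{stride * patchlen}, \code{"VALID"} padding), the number of
patches is roughly \code{floor((maxlen - patchlen) / (stride * patchlen)) + 1}, which is
what \code{nopatches} should correspond to.
}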
\examples{
\dontshow{if (reticulate::py_module_available("tensorflow")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
# create dummy data
path_train_1 <- tempfile()
path_train_2 <- tempfile()
path_val_1 <- tempfile()
path_val_2 <- tempfile()
for (current_path in c(path_train_1, path_train_2,
                       path_val_1, path_val_2)) {
  dir.create(current_path)
  deepG::create_dummy_data(file_path = current_path,
                           num_files = 3,
                           seq_length = 10,
                           num_seq = 5,
                           vocabulary = c("a", "c", "g", "t"))
}
# create model
encoder <- function(maxlen = NULL,
                    patchlen = NULL,
                    nopatches = NULL,
                    eval = FALSE) {
  if (is.null(nopatches)) {
    nopatches <- nopatchescalc(patchlen, maxlen, patchlen * 0.4)
  }
  inp <- keras::layer_input(shape = c(maxlen, 4))
  stridelen <- as.integer(0.4 * patchlen)
  createpatches <- inp \%>\%
    keras::layer_reshape(list(maxlen, 4L, 1L), name = "prep_reshape1", dtype = "float32") \%>\%
    tensorflow::tf$image$extract_patches(
      sizes = list(1L, patchlen, 4L, 1L),
      strides = list(1L, stridelen, 4L, 1L),
      rates = list(1L, 1L, 1L, 1L),
      padding = "VALID",
      name = "prep_patches"
    ) \%>\%
    keras::layer_reshape(list(nopatches, patchlen, 4L),
                         name = "prep_reshape2") \%>\%
    tensorflow::tf$reshape(list(-1L, patchlen, 4L),
                           name = "prep_reshape3")
  danQ <- createpatches \%>\%
    keras::layer_conv_1d(
      input_shape = c(maxlen, 4L),
      filters = 320L,
      kernel_size = 26L,
      activation = "relu"
    ) \%>\%
    keras::layer_max_pooling_1d(pool_size = 13L, strides = 13L) \%>\%
    keras::layer_dropout(0.2) \%>\%
    keras::layer_lstm(units = 320, return_sequences = TRUE) \%>\%
    keras::layer_dropout(0.5) \%>\%
    keras::layer_flatten() \%>\%
    keras::layer_dense(925, activation = "relu")
  patchesback <- danQ \%>\%
    tensorflow::tf$reshape(list(-1L, tensorflow::tf$cast(nopatches, tensorflow::tf$int16), 925L))
  keras::keras_model(inp, patchesback)
}
context <- function(latents) {
  cres <- latents
  cres_dim <- cres$shape
  predictions <-
    cres \%>\%
    keras::layer_lstm(
      return_sequences = TRUE,
      units = 256, # WAS: 2048,
      name = paste("context_LSTM_1",
                   sep = ""),
      activation = "relu"
    )
  return(predictions)
}
# train model
temp_dir <- tempdir()
hist <- train_model_cpc(train_type = "CPC",
                        ### cpc functions ###
                        encoder = encoder,
                        context = context,
                        #### Generator settings ####
                        path_checkpoint = temp_dir,
                        path = c(path_train_1, path_train_2),
                        path_val = c(path_val_1, path_val_2),
                        run_name = "TEST",
                        batch_size = 8,
                        epochs = 3,
                        steps_per_epoch = 6,
                        patchlen = 100,
                        nopatches = 8)
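
# Optional follow-up (a sketch only, kept as comments so it is not run; the
# checkpoint file name below is a placeholder): training could be resumed
# from a stored checkpoint via 'pretrained_model', optionally combined with a
# learning rate schedule, e.g.
# model <- keras::load_model_hdf5(file.path(temp_dir, "checkpoint.h5"))
# hist2 <- train_model_cpc(train_type = "CPC",
#                          pretrained_model = model,
#                          learningrate_schedule = "cosine_annealing",
#                          path = c(path_train_1, path_train_2),
#                          path_val = c(path_val_1, path_val_2),
#                          run_name = "TEST_continued",
#                          batch_size = 8, epochs = 3, steps_per_epoch = 6,
#                          patchlen = 100, nopatches = 8)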
\dontshow{\}) # examplesIf}
}