% man/train_model_cpc.Rd
1
% Generated by roxygen2: do not edit by hand
2
% Please edit documentation in R/train_cpc.R
3
\name{train_model_cpc}
4
\alias{train_model_cpc}
5
\title{Train CPC inspired model}
6
\usage{
7
train_model_cpc(
8
  train_type = "CPC",
9
  encoder = NULL,
10
  context = NULL,
11
  path,
12
  path_val = NULL,
13
  path_checkpoint = NULL,
14
  path_tensorboard = NULL,
15
  train_val_ratio = 0.2,
16
  run_name,
17
  batch_size = 32,
18
  epochs = 100,
19
  steps_per_epoch = 2000,
20
  shuffle_file_order = FALSE,
21
  initial_epoch = 1,
22
  seed = 1234,
23
  path_file_log = TRUE,
24
  train_val_split_csv = NULL,
25
  file_limit = NULL,
26
  proportion_per_seq = NULL,
27
  max_samples = NULL,
28
  maxlen = NULL,
29
  patchlen = NULL,
30
  nopatches = NULL,
31
  step = NULL,
32
  file_filter = NULL,
33
  stride = 0.4,
34
  pretrained_model = NULL,
35
  learningrate = 0.001,
36
  learningrate_schedule = NULL,
37
  k = 5,
38
  stepsmin = 2,
39
  stepsmax = 3,
40
  emb_scale = 0.1
41
)
42
}
43
\arguments{
44
\item{train_type}{Either \code{"CPC"} or \code{"Self-GenomeNet"}.}
45
46
\item{encoder}{A keras encoder for the cpc function.}
47
48
\item{context}{A keras context model for the cpc function.}
49
50
\item{path}{Path to training data. If \code{train_type} is \code{label_folder}, should be a vector or list
51
where each entry corresponds to a class (list elements can be directories and/or individual files). If \code{train_type} is not \code{label_folder},
52
can be a single directory or file or a list of directories and/or files.}
53
54
\item{path_val}{Path to validation data. See \code{path} argument for details.}
55
56
\item{path_checkpoint}{Path to checkpoints folder or \code{NULL}. If \code{NULL}, checkpoints don't get stored.}
57
58
\item{path_tensorboard}{Path to tensorboard directory or \code{NULL}. If \code{NULL}, training not tracked on tensorboard.}
59
60
\item{train_val_ratio}{For generator defines the fraction of batches that will be used for validation (compared to size of training data), i.e. one validation iteration
61
processes \code{batch_size} \eqn{*} \code{steps_per_epoch} \eqn{*} \code{train_val_ratio} samples. If you use dataset instead of generator and \code{dataset_val} is \code{NULL}, splits \code{dataset}
62
into train/validation data.}
63
64
\item{run_name}{Name of the run. Name will be used to identify output from callbacks.}
65
66
\item{batch_size}{Number of samples used for one network update.}
67
68
\item{epochs}{Number of iterations.}
69
70
\item{steps_per_epoch}{Number of training batches per epoch.}
71
72
\item{shuffle_file_order}{Boolean, whether to go through files sequentially or shuffle beforehand.}
73
74
\item{initial_epoch}{Epoch at which to start training. Note that network
75
will run for (\code{epochs} - \code{initial_epoch}) rounds and not \code{epochs} rounds.}
76
77
\item{seed}{Sets seed for reproducible results.}
78
79
\item{path_file_log}{Write name of files to csv file if path is specified.}
80
81
\item{train_val_split_csv}{A csv file specifying train/validation split. csv file should contain one column named \code{"file"} and one column named
82
\code{"type"}. The \code{"file"} column contains names of fasta/fastq files and \code{"type"} column specifies if file is used for training or validation.
83
Entries in \code{"type"} must be named \code{"train"} or \code{"val"}, otherwise file will not be used for either. \code{path} and \code{path_val} arguments should be the same.
84
Not implemented for \code{train_type = "label_folder"}.}
85
86
\item{file_limit}{Integer or \code{NULL}. If integer, use only specified number of randomly sampled files for training. Ignored if greater than number of files in \code{path}.}
87
88
\item{proportion_per_seq}{Numerical value between 0 and 1. Proportion of sequence to take samples from (use random subsequence).}
89
90
\item{max_samples}{Maximum number of samples to use from one file. If not \code{NULL} and file has more than \code{max_samples} samples, will randomly choose a
91
subset of \code{max_samples} samples.}
92
93
\item{maxlen}{Length of predictor sequence.}
94
95
\item{patchlen}{The length of a patch when splitting the input sequence.}
96
97
\item{nopatches}{The number of patches when splitting the input sequence.}
98
99
\item{step}{Frequency of sampling steps.}
100
101
\item{file_filter}{Vector of file names to use from \code{path}.}
102
103
\item{stride}{The overlap between two patches when splitting the input sequence.}
104
105
\item{pretrained_model}{A pretrained keras model, for which training will be continued.}
106
107
\item{learningrate}{A Tensor, floating point value. If a schedule is defined, this value gives the initial learning rate. Defaults to 0.001.}
108
109
\item{learningrate_schedule}{A schedule for a non-constant learning rate over the training. Either \code{"cosine_annealing"}, \code{"step_decay"}, or \code{"exp_decay"}.}
110
111
\item{k}{Value of k for sparse top k categorical accuracy. Defaults to 5.}
112
113
\item{stepsmin}{In CPC, a patch is predicted given another patch. \code{stepsmin} defines how many patches between these two should be ignored during prediction.}
114
115
\item{stepsmax}{The maximum distance between the predicted patch and the given patch.}
116
117
\item{emb_scale}{Scales the impact of a patch's context.}
118
}
119
\value{
120
A list of training metrics.
121
}
122
\description{
123
Train a CPC (Oord et al.) inspired neural network on genomic data.
124
}
125
\examples{
126
\dontshow{if (reticulate::py_module_available("tensorflow")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
127
128
#create dummy data
129
path_train_1 <- tempfile()
130
path_train_2 <- tempfile()
131
path_val_1 <- tempfile()
132
path_val_2 <- tempfile()
133
134
for (current_path in c(path_train_1, path_train_2,
135
                       path_val_1, path_val_2)) {
136
  dir.create(current_path)
137
  deepG::create_dummy_data(file_path = current_path,
138
                           num_files = 3,
139
                           seq_length = 10,
140
                           num_seq = 5,
141
                           vocabulary = c("a", "c", "g", "t"))
142
}
143
144
# create model
145
encoder <- function(maxlen = NULL,
146
                    patchlen = NULL,
147
                    nopatches = NULL,
148
                    eval = FALSE) {
149
  if (is.null(nopatches)) {
150
    nopatches <- nopatchescalc(patchlen, maxlen, patchlen * 0.4)
151
  }
152
  inp <- keras::layer_input(shape = c(maxlen, 4))
153
  stridelen <- as.integer(0.4 * patchlen)
154
  createpatches <- inp \%>\%
155
    keras::layer_reshape(list(maxlen, 4L, 1L), name = "prep_reshape1", dtype = "float32") \%>\%
156
    tensorflow::tf$image$extract_patches(
157
      sizes = list(1L, patchlen, 4L, 1L),
158
      strides = list(1L, stridelen, 4L, 1L),
159
      rates = list(1L, 1L, 1L, 1L),
160
      padding = "VALID",
161
      name = "prep_patches"
162
    ) \%>\%
163
    keras::layer_reshape(list(nopatches, patchlen, 4L),
164
                         name = "prep_reshape2") \%>\%
165
    tensorflow::tf$reshape(list(-1L, patchlen, 4L),
166
                           name = "prep_reshape3")
167
168
  danQ <- createpatches \%>\%
169
    keras::layer_conv_1d(
170
      input_shape = c(maxlen, 4L),
171
      filters = 320L,
172
      kernel_size = 26L,
173
      activation = "relu"
174
    ) \%>\%
175
    keras::layer_max_pooling_1d(pool_size = 13L, strides = 13L) \%>\%
176
    keras::layer_dropout(0.2) \%>\%
177
    keras::layer_lstm(units = 320, return_sequences = TRUE) \%>\%
178
    keras::layer_dropout(0.5) \%>\%
179
    keras::layer_flatten() \%>\%
180
    keras::layer_dense(925, activation = "relu")
181
  patchesback <- danQ \%>\%
182
    tensorflow::tf$reshape(list(-1L, tensorflow::tf$cast(nopatches, tensorflow::tf$int16), 925L))
183
  keras::keras_model(inp, patchesback)
184
}
185
186
context <- function(latents) {
187
  cres <- latents
188
  cres_dim = cres$shape
189
  predictions <-
190
    cres \%>\%
191
    keras::layer_lstm(
192
      return_sequences = TRUE,
193
      units = 256,  # WAS: 2048,
194
      name = paste("context_LSTM_1",
195
                   sep = ""),
196
      activation = "relu"
197
    )
198
  return(predictions)
199
}
200
201
# train model
202
temp_dir <- tempdir()
203
hist <- train_model_cpc(train_type = "CPC",
204
                        ### cpc functions ###
205
                        encoder = encoder,
206
                        context = context,
207
                        #### Generator settings ####
208
                        path_checkpoint = temp_dir,
209
                        path = c(path_train_1, path_train_2),
210
                        path_val = c(path_val_1, path_val_2),
211
                        run_name = "TEST",
212
                        batch_size = 8,
213
                        epochs = 3,
214
                        steps_per_epoch = 6,
215
                        patchlen = 100,
216
                        nopatches = 8)
217
                
218
 
219
\dontshow{\}) # examplesIf}
220
}