|
a |
|
b/man/train_model_cpc.Rd |
|
|
1 |
% Generated by roxygen2: do not edit by hand |
|
|
2 |
% Please edit documentation in R/train_cpc.R |
|
|
3 |
\name{train_model_cpc} |
|
|
4 |
\alias{train_model_cpc} |
|
|
5 |
\title{Train CPC inspired model} |
|
|
6 |
\usage{ |
|
|
7 |
train_model_cpc( |
|
|
8 |
train_type = "CPC", |
|
|
9 |
encoder = NULL, |
|
|
10 |
context = NULL, |
|
|
11 |
path, |
|
|
12 |
path_val = NULL, |
|
|
13 |
path_checkpoint = NULL, |
|
|
14 |
path_tensorboard = NULL, |
|
|
15 |
train_val_ratio = 0.2, |
|
|
16 |
run_name, |
|
|
17 |
batch_size = 32, |
|
|
18 |
epochs = 100, |
|
|
19 |
steps_per_epoch = 2000, |
|
|
20 |
shuffle_file_order = FALSE, |
|
|
21 |
initial_epoch = 1, |
|
|
22 |
seed = 1234, |
|
|
23 |
path_file_log = TRUE, |
|
|
24 |
train_val_split_csv = NULL, |
|
|
25 |
file_limit = NULL, |
|
|
26 |
proportion_per_seq = NULL, |
|
|
27 |
max_samples = NULL, |
|
|
28 |
maxlen = NULL, |
|
|
29 |
patchlen = NULL, |
|
|
30 |
nopatches = NULL, |
|
|
31 |
step = NULL, |
|
|
32 |
file_filter = NULL, |
|
|
33 |
stride = 0.4, |
|
|
34 |
pretrained_model = NULL, |
|
|
35 |
learningrate = 0.001, |
|
|
36 |
learningrate_schedule = NULL, |
|
|
37 |
k = 5, |
|
|
38 |
stepsmin = 2, |
|
|
39 |
stepsmax = 3, |
|
|
40 |
emb_scale = 0.1 |
|
|
41 |
) |
|
|
42 |
} |
|
|
43 |
\arguments{ |
|
|
44 |
\item{train_type}{Either \code{"CPC"} or \code{"Self-GenomeNet"}.} |
|
|
45 |
|
|
|
46 |
\item{encoder}{A keras encoder for the cpc function.} |
|
|
47 |
|
|
|
48 |
\item{context}{A keras context model for the cpc function.} |
|
|
49 |
|
|
|
50 |
\item{path}{Path to training data. If \code{train_type} is \code{label_folder}, should be a vector or list |
|
|
51 |
where each entry corresponds to a class (list elements can be directories and/or individual files). If \code{train_type} is not \code{label_folder}, |
|
|
52 |
can be a single directory or file or a list of directories and/or files.} |
|
|
53 |
|
|
|
54 |
\item{path_val}{Path to validation data. See \code{path} argument for details.} |
|
|
55 |
|
|
|
56 |
\item{path_checkpoint}{Path to checkpoints folder or \code{NULL}. If \code{NULL}, checkpoints don't get stored.} |
|
|
57 |
|
|
|
58 |
\item{path_tensorboard}{Path to tensorboard directory or \code{NULL}. If \code{NULL}, training not tracked on tensorboard.} |
|
|
59 |
|
|
|
60 |
\item{train_val_ratio}{For the generator, defines the fraction of batches that will be used for validation (compared to size of training data), i.e. one validation iteration |
|
|
61 |
processes \code{batch_size} \eqn{*} \code{steps_per_epoch} \eqn{*} \code{train_val_ratio} samples. If you use dataset instead of generator and \code{dataset_val} is \code{NULL}, splits \code{dataset} |
|
|
62 |
into train/validation data.} |
|
|
63 |
|
|
|
64 |
\item{run_name}{Name of the run. Name will be used to identify output from callbacks.} |
|
|
65 |
|
|
|
66 |
\item{batch_size}{Number of samples used for one network update.} |
|
|
67 |
|
|
|
68 |
\item{epochs}{Number of iterations.} |
|
|
69 |
|
|
|
70 |
\item{steps_per_epoch}{Number of training batches per epoch.} |
|
|
71 |
|
|
|
72 |
\item{shuffle_file_order}{Boolean, whether to go through files sequentially or shuffle beforehand.} |
|
|
73 |
|
|
|
74 |
\item{initial_epoch}{Epoch at which to start training. Note that network |
|
|
75 |
will run for (\code{epochs} - \code{initial_epoch}) rounds and not \code{epochs} rounds.} |
|
|
76 |
|
|
|
77 |
\item{seed}{Sets seed for reproducible results.} |
|
|
78 |
|
|
|
79 |
\item{path_file_log}{Write the names of used files to a csv file if a path is specified.} |
|
|
80 |
|
|
|
81 |
\item{train_val_split_csv}{A csv file specifying train/validation split. csv file should contain one column named \code{"file"} and one column named |
|
|
82 |
\code{"type"}. The \code{"file"} column contains names of fasta/fastq files and \code{"type"} column specifies if file is used for training or validation. |
|
|
83 |
Entries in \code{"type"} must be named \code{"train"} or \code{"val"}, otherwise file will not be used for either. \code{path} and \code{path_val} arguments should be the same. |
|
|
84 |
Not implemented for \code{train_type = "label_folder"}.} |
|
|
85 |
|
|
|
86 |
\item{file_limit}{Integer or \code{NULL}. If integer, use only specified number of randomly sampled files for training. Ignored if greater than number of files in \code{path}.} |
|
|
87 |
|
|
|
88 |
\item{proportion_per_seq}{Numerical value between 0 and 1. Proportion of sequence to take samples from (use random subsequence).} |
|
|
89 |
|
|
|
90 |
\item{max_samples}{Maximum number of samples to use from one file. If not \code{NULL} and file has more than \code{max_samples} samples, will randomly choose a |
|
|
91 |
subset of \code{max_samples} samples.} |
|
|
92 |
|
|
|
93 |
\item{maxlen}{Length of predictor sequence.} |
|
|
94 |
|
|
|
95 |
\item{patchlen}{The length of a patch when splitting the input sequence.} |
|
|
96 |
|
|
|
97 |
\item{nopatches}{The number of patches when splitting the input sequence.} |
|
|
98 |
|
|
|
99 |
\item{step}{Frequency of sampling steps.} |
|
|
100 |
|
|
|
101 |
\item{file_filter}{Vector of file names to use from \code{path}.} |
|
|
102 |
|
|
|
103 |
\item{stride}{The overlap between two patches when splitting the input sequence.} |
|
|
104 |
|
|
|
105 |
\item{pretrained_model}{A pretrained keras model for which training will be continued.} |
|
|
106 |
|
|
|
107 |
\item{learningrate}{A Tensor, floating point value. If a schedule is defined, this value gives the initial learning rate. Defaults to 0.001.} |
|
|
108 |
|
|
|
109 |
\item{learningrate_schedule}{A schedule for a non-constant learning rate over the training. Either "cosine_annealing", "step_decay", or "exp_decay".} |
|
|
110 |
|
|
|
111 |
\item{k}{Value of k for sparse top k categorical accuracy. Defaults to 5.} |
|
|
112 |
|
|
|
113 |
\item{stepsmin}{In CPC, a patch is predicted given another patch. stepsmin defines how many patches between these two should be ignored during prediction.} |
|
|
114 |
|
|
|
115 |
\item{stepsmax}{The maximum distance between the predicted patch and the given patch.} |
|
|
116 |
|
|
|
117 |
\item{emb_scale}{Scales the impact of a patch's context.} |
|
|
118 |
} |
|
|
119 |
\value{ |
|
|
120 |
A list of training metrics. |
|
|
121 |
} |
|
|
122 |
\description{ |
|
|
123 |
Train a CPC (Oord et al.) inspired neural network on genomic data. |
|
|
124 |
} |
|
|
125 |
\examples{ |
|
|
126 |
\dontshow{if (reticulate::py_module_available("tensorflow")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} |
|
|
127 |
|
|
|
128 |
#create dummy data |
|
|
129 |
path_train_1 <- tempfile() |
|
|
130 |
path_train_2 <- tempfile() |
|
|
131 |
path_val_1 <- tempfile() |
|
|
132 |
path_val_2 <- tempfile() |
|
|
133 |
|
|
|
134 |
for (current_path in c(path_train_1, path_train_2, |
|
|
135 |
path_val_1, path_val_2)) { |
|
|
136 |
dir.create(current_path) |
|
|
137 |
deepG::create_dummy_data(file_path = current_path, |
|
|
138 |
num_files = 3, |
|
|
139 |
seq_length = 10, |
|
|
140 |
num_seq = 5, |
|
|
141 |
vocabulary = c("a", "c", "g", "t")) |
|
|
142 |
} |
|
|
143 |
|
|
|
144 |
# create model |
|
|
145 |
encoder <- function(maxlen = NULL, |
|
|
146 |
patchlen = NULL, |
|
|
147 |
nopatches = NULL, |
|
|
148 |
eval = FALSE) { |
|
|
149 |
if (is.null(nopatches)) { |
|
|
150 |
nopatches <- nopatchescalc(patchlen, maxlen, patchlen * 0.4) |
|
|
151 |
} |
|
|
152 |
inp <- keras::layer_input(shape = c(maxlen, 4)) |
|
|
153 |
stridelen <- as.integer(0.4 * patchlen) |
|
|
154 |
createpatches <- inp \%>\% |
|
|
155 |
keras::layer_reshape(list(maxlen, 4L, 1L), name = "prep_reshape1", dtype = "float32") \%>\% |
|
|
156 |
tensorflow::tf$image$extract_patches( |
|
|
157 |
sizes = list(1L, patchlen, 4L, 1L), |
|
|
158 |
strides = list(1L, stridelen, 4L, 1L), |
|
|
159 |
rates = list(1L, 1L, 1L, 1L), |
|
|
160 |
padding = "VALID", |
|
|
161 |
name = "prep_patches" |
|
|
162 |
) \%>\% |
|
|
163 |
keras::layer_reshape(list(nopatches, patchlen, 4L), |
|
|
164 |
name = "prep_reshape2") \%>\% |
|
|
165 |
tensorflow::tf$reshape(list(-1L, patchlen, 4L), |
|
|
166 |
name = "prep_reshape3") |
|
|
167 |
|
|
|
168 |
danQ <- createpatches \%>\% |
|
|
169 |
keras::layer_conv_1d( |
|
|
170 |
input_shape = c(maxlen, 4L), |
|
|
171 |
filters = 320L, |
|
|
172 |
kernel_size = 26L, |
|
|
173 |
activation = "relu" |
|
|
174 |
) \%>\% |
|
|
175 |
keras::layer_max_pooling_1d(pool_size = 13L, strides = 13L) \%>\% |
|
|
176 |
keras::layer_dropout(0.2) \%>\% |
|
|
177 |
keras::layer_lstm(units = 320, return_sequences = TRUE) \%>\% |
|
|
178 |
keras::layer_dropout(0.5) \%>\% |
|
|
179 |
keras::layer_flatten() \%>\% |
|
|
180 |
keras::layer_dense(925, activation = "relu") |
|
|
181 |
patchesback <- danQ \%>\% |
|
|
182 |
tensorflow::tf$reshape(list(-1L, tensorflow::tf$cast(nopatches, tensorflow::tf$int16), 925L)) |
|
|
183 |
keras::keras_model(inp, patchesback) |
|
|
184 |
} |
|
|
185 |
|
|
|
186 |
context <- function(latents) { |
|
|
187 |
cres <- latents |
|
|
188 |
cres_dim = cres$shape |
|
|
189 |
predictions <- |
|
|
190 |
cres \%>\% |
|
|
191 |
keras::layer_lstm( |
|
|
192 |
return_sequences = TRUE, |
|
|
193 |
units = 256, # WAS: 2048, |
|
|
194 |
name = paste("context_LSTM_1", |
|
|
195 |
sep = ""), |
|
|
196 |
activation = "relu" |
|
|
197 |
) |
|
|
198 |
return(predictions) |
|
|
199 |
} |
|
|
200 |
|
|
|
201 |
# train model |
|
|
202 |
temp_dir <- tempdir() |
|
|
203 |
hist <- train_model_cpc(train_type = "CPC", |
|
|
204 |
### cpc functions ### |
|
|
205 |
encoder = encoder, |
|
|
206 |
context = context, |
|
|
207 |
#### Generator settings #### |
|
|
208 |
path_checkpoint = temp_dir, |
|
|
209 |
path = c(path_train_1, path_train_2), |
|
|
210 |
path_val = c(path_val_1, path_val_2), |
|
|
211 |
run_name = "TEST", |
|
|
212 |
batch_size = 8, |
|
|
213 |
epochs = 3, |
|
|
214 |
steps_per_epoch = 6, |
|
|
215 |
patchlen = 100, |
|
|
216 |
nopatches = 8) |
|
|
217 |
|
|
|
218 |
|
|
|
219 |
\dontshow{\}) # examplesIf} |
|
|
220 |
} |