
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/generators.R
\name{get_generator}
\alias{get_generator}
\title{Wrapper for generator functions}
\usage{
get_generator(
  path = NULL,
  train_type,
  batch_size,
  maxlen,
  step = NULL,
  shuffle_file_order = FALSE,
  vocabulary = c("A", "C", "G", "T"),
  seed = 1,
  proportion_entries = NULL,
  shuffle_input = FALSE,
  format = "fasta",
  path_file_log = NULL,
  reverse_complement = FALSE,
  n_gram = NULL,
  n_gram_stride = NULL,
  output_format = "target_right",
  ambiguous_nuc = "zero",
  proportion_per_seq = NULL,
  skip_amb_nuc = NULL,
  use_quality_score = FALSE,
  padding = FALSE,
  added_label_path = NULL,
  target_from_csv = NULL,
  add_input_as_seq = NULL,
  max_samples = NULL,
  concat_seq = NULL,
  target_len = 1,
  file_filter = NULL,
  use_coverage = NULL,
  sample_by_file_size = FALSE,
  add_noise = NULL,
  random_sampling = FALSE,
  set_learning = NULL,
  file_limit = NULL,
  reverse_complement_encoding = FALSE,
  read_data = FALSE,
  target_split = NULL,
  path_file_logVal = NULL,
  model = NULL,
  vocabulary_label = NULL,
  masked_lm = NULL,
  val = FALSE,
  return_int = FALSE,
  verbose = TRUE,
  delete_used_files = FALSE,
  reshape_xy = NULL
)
}
\arguments{
\item{path}{Path to training data. If \code{train_type} is \code{label_folder}, should be a vector or list
where each entry corresponds to a class (list elements can be directories and/or individual files). If \code{train_type} is not \code{label_folder},
can be a single directory or file or a list of directories and/or files.}
\item{train_type}{Either \code{"lm"}, \code{"lm_rds"}, \code{"masked_lm"} for language model; \code{"label_header"}, \code{"label_folder"}, \code{"label_csv"}, \code{"label_rds"} for classification or \code{"dummy_gen"}.
\itemize{
\item A language model is trained to predict character(s) in a sequence. \cr
\item \code{"label_header"}/\code{"label_folder"}/\code{"label_csv"} models are trained to predict a corresponding class given a sequence as input.
\item If \code{"label_header"}, the class will be read from the fasta headers.
\item If \code{"label_folder"}, the class will be read from the folder, i.e. all files in one folder must belong to the same class.
\item If \code{"label_csv"}, targets are read from a csv file. This file should have one column named "file"; the targets correspond to the remaining entries
in that row. Example: if we are currently working with a file called "a.fasta" whose label is "label_1", there should be a row in our csv file (see also the sketch below)\tabular{lll}{
file \tab label_1 \tab label_2 \cr
"a.fasta" \tab 1 \tab 0 \cr
}
\item If \code{"label_rds"}, generator will iterate over set of .rds files containing each a list of input and target tensors. Not implemented for model
with multiple inputs.
\item If \code{"lm_rds"}, generator will iterate over set of .rds files and will split tensor according to \code{target_len} argument
(targets are last \code{target_len} nucleotides of each sequence).
\item If \code{"dummy_gen"}, generator creates random data once and repeatedly feeds these to model.
\item If \code{"masked_lm"}, generator maskes some parts of the input. See \code{masked_lm} argument for details.
}}
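A minimal sketch of how such a target csv could be created (the file and label names are illustrative, not part of the package):
\preformatted{
# hypothetical target file for train_type = "label_csv";
# pass its path via the target_from_csv argument
targets <- data.frame(file = c("a.fasta", "b.fasta"),
                      label_1 = c(1, 0),
                      label_2 = c(0, 1))
write.csv(targets, "targets.csv", row.names = FALSE)
}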
\item{batch_size}{Number of samples used for one network update.}
\item{maxlen}{Length of predictor sequence.}
\item{step}{Distance between the start positions of two successive samples.}
\item{shuffle_file_order}{Boolean, whether to go through files sequentially or shuffle beforehand.}
\item{vocabulary}{Vector of allowed characters. Characters outside vocabulary get encoded as specified in \code{ambiguous_nuc}.}
\item{seed}{Sets seed for reproducible results.}
\item{proportion_entries}{Proportion of fasta entries to keep. For example, if fasta file has 50 entries and \code{proportion_entries = 0.1},
will randomly select 5 entries.}
\item{shuffle_input}{Whether to shuffle entries in file.}
\item{format}{File format, \code{"fasta"}, \code{"fastq"}, \code{"rds"} or \code{"fasta.tar.gz"}, \code{"fastq.tar.gz"} for \code{tar.gz} files.}
\item{path_file_log}{If a path is specified, the names of the files used for training get written to a csv file there.}
\item{reverse_complement}{Boolean; for every new file, randomly decide whether to use the original data or its reverse complement.}
\item{n_gram}{Integer, encode target not nucleotide wise but combine n nucleotides at once. For example for \verb{n=2, "AA" -> (1, 0,..., 0),}
\verb{"AC" -> (0, 1, 0,..., 0), "TT" -> (0,..., 0, 1)}, where the one-hot vectors have length \code{length(vocabulary)^n}.}
\item{n_gram_stride}{Step size for n-gram encoding. For AACCGGTT with \code{n_gram = 4} and \code{n_gram_stride = 2}, generator encodes
\verb{(AACC), (CCGG), (GGTT)}; for \code{n_gram_stride = 4} generator encodes \verb{(AACC), (GGTT)}.}
\item{output_format}{Determines shape of output tensor for language model.
Either \code{"target_right"}, \code{"target_middle_lstm"}, \code{"target_middle_cnn"} or \code{"wavenet"}.
Assume a sequence \code{"AACCGTA"}. Outputs correspond as follows:
\itemize{
\item \verb{"target_right": X = "AACCGT", Y = "A"}
\item \verb{"target_middle_lstm": X = (X_1 = "AAC", X_2 = "ATG"), Y = "C"} (note reversed order of X_2)
\item \verb{"target_middle_cnn": X = "AACGTA", Y = "C"}
\item \verb{"wavenet": X = "AACCGT", Y = "ACCGTA"}
}}
\item{ambiguous_nuc}{How to handle nucleotides outside vocabulary, either \code{"zero"}, \code{"discard"}, \code{"empirical"} or \code{"equal"}.
\itemize{
\item If \code{"zero"}, input gets encoded as zero vector.
\item If \code{"equal"}, input is repetition of \code{1/length(vocabulary)}.
\item If \code{"discard"}, samples containing nucleotides outside vocabulary get discarded.
\item If \code{"empirical"}, use nucleotide distribution of current file.
}}
\item{proportion_per_seq}{Numerical value between 0 and 1. Proportion of sequence to take samples from (use random subsequence).}
\item{skip_amb_nuc}{Threshold of ambiguous nucleotides to accept in a fasta entry; entries exceeding the threshold get discarded completely.}
\item{use_quality_score}{Whether to use fastq quality scores. If \code{TRUE}, the input is not a one-hot encoding but corresponds to probabilities,
for example (0.97, 0.01, 0.01, 0.01) instead of (1, 0, 0, 0).}
\item{padding}{Whether to pad sequences too short for one sample with zeros.}
\item{added_label_path}{Path to file with additional input labels. Should be a csv file with one column named "file". Other columns should correspond to labels.}
\item{target_from_csv}{Path to csv file with target mapping. One column should be called "file" and other entries in row are the targets.}
\item{add_input_as_seq}{Boolean vector specifying, for each entry in \code{added_label_path}, whether rows from the csv should be encoded as a sequence or used directly.
If a row in your csv file is a sequence, this should be \code{TRUE}. For example, you may want to add another sequence, say ACCGT; this would correspond to 1,2,2,3,4 in the
csv file (if \code{vocabulary = c("A", "C", "G", "T")}). If \code{add_input_as_seq} is \code{TRUE}, 12234 gets one-hot encoded, so the added input is a 3D tensor. If \code{add_input_as_seq} is
\code{FALSE}, the network is fed just the raw data (a 2D tensor).}
\item{max_samples}{Maximum number of samples to use from one file. If not \code{NULL} and file has more than \code{max_samples} samples, will randomly choose a
subset of \code{max_samples} samples.}
\item{concat_seq}{Character string or \code{NULL}. If not \code{NULL}, all entries from a file get concatenated into one sequence with the \code{concat_seq} string between them.
Example: if the first entry is AACC, the second entry is TTTG and \code{concat_seq = "ZZZ"}, this becomes AACCZZZTTTG.}
\item{target_len}{Number of nucleotides to predict at once for language model.}
\item{file_filter}{Vector of file names to use from \code{path}.}
\item{use_coverage}{Integer or \code{NULL}. If not \code{NULL}, use coverage as encoding rather than one-hot encoding and normalize.
Coverage information must be contained in fasta header: there must be a string \code{"cov_n"} in the header, where \code{n} is some integer.}
\item{sample_by_file_size}{Sample new file weighted by file size (bigger files more likely).}
\item{add_noise}{\code{NULL} or list of arguments. If not \code{NULL}, the list must contain the following arguments: \code{noise_type} can be \code{"normal"} or \code{"uniform"};
optional arguments \code{sd} or \code{mean} if \code{noise_type} is \code{"normal"} (default is \code{sd = 1} and \code{mean = 0}) or \verb{min, max} if \code{noise_type} is \code{"uniform"}
(default is \verb{min = 0, max = 1}).}
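For example, a minimal sketch of an \code{add_noise} configuration (the values are illustrative):
\preformatted{
# add Gaussian noise with standard deviation 0.1 to the input
add_noise <- list(noise_type = "normal", mean = 0, sd = 0.1)
}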
\item{random_sampling}{Whether samples should be taken from random positions when using the \code{max_samples} argument. If \code{FALSE},
samples are taken from a consecutive subsequence.}
\item{set_learning}{Use this if you want to assign one label to a set of samples. Only implemented for \code{train_type = "label_folder"}.
Input is a list with the following parameters (a configuration sketch follows below)
\itemize{
\item \code{samples_per_target}: how many samples to use for one target.
\item \code{maxlen}: length of one sample.
\item \code{reshape_mode}: \verb{"time_dist", "multi_input"} or \code{"concat"}.
\itemize{
\item If \code{reshape_mode} is \code{"multi_input"}, the generator will produce \code{samples_per_target} separate inputs, each of length \code{maxlen} (the model should have
\code{samples_per_target} input layers).
\item If \code{reshape_mode} is \code{"time_dist"}, the generator will produce a 4D input array. The dimensions correspond to
\verb{(batch_size, samples_per_target, maxlen, length(vocabulary))}.
\item If \code{reshape_mode} is \code{"concat"}, generator will concatenate \code{samples_per_target} sequences
of length \code{maxlen} to one long sequence.
}
\item If \code{reshape_mode} is \code{"concat"}, there is an additional \code{buffer_len}
argument. If \code{buffer_len} is an integer, the subsequences are interspaced with \code{buffer_len} rows. The input length is
(\code{maxlen} \eqn{*} \code{samples_per_target}) + \code{buffer_len} \eqn{*} (\code{samples_per_target} - 1).
}}
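A minimal sketch of a \code{set_learning} configuration (the values are illustrative):
\preformatted{
set_learning <- list(samples_per_target = 4,
                     maxlen = 100,
                     reshape_mode = "concat",
                     buffer_len = 1)
# resulting input length: (100 * 4) + 1 * (4 - 1) = 403
}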
\item{file_limit}{Integer or \code{NULL}. If integer, use only specified number of randomly sampled files for training. Ignored if greater than number of files in \code{path}.}
\item{reverse_complement_encoding}{Whether to use both original sequence and reverse complement as two input sequences.}
\item{read_data}{If \code{TRUE}, the first element of the output is a list of length 2, each entry containing one part of a paired read. \code{maxlen} should be twice the length of one read.}
\item{target_split}{If target gets read from csv file, list of names to divide target tensor into list of tensors.
Example: if csv file has header names \verb{"file", "label_1", "label_2", "label_3"} and \code{target_split = list(c("label_1", "label_2"), "label_3")},
this will divide the target matrix into a list of length 2, where the first element contains the columns named \code{"label_1"} and \code{"label_2"} and the
second element contains the column named \code{"label_3"}.}
\item{path_file_logVal}{Path to csv file logging used validation files.}
\item{model}{A keras model.}
\item{vocabulary_label}{Character vector of possible targets. Targets outside \code{vocabulary_label} will get discarded if
\code{train_type = "label_header"}.}
\item{masked_lm}{If not \code{NULL}, input and target are equal except that some parts of the input are masked or random.
Must be a list with the following arguments (a configuration sketch follows below):
\itemize{
\item \code{mask_rate}: Rate of input to mask (rate of input to replace with mask token).
\item \code{random_rate}: Rate of input to set to random token.
\item \code{identity_rate}: Rate of input where sample weights are applied but input and output are identical.
\item \code{include_sw}: Whether to include sample weights.
\item \code{block_len} (optional): Masked/random/identity regions appear in blocks of size \code{block_len}.
}}
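A minimal sketch of a \code{masked_lm} configuration (the rates are illustrative):
\preformatted{
masked_lm <- list(mask_rate = 0.15,     # 15\% of input replaced with mask token
                  random_rate = 0.05,   # 5\% replaced with a random token
                  identity_rate = 0.05, # 5\% unchanged but sample-weighted
                  include_sw = TRUE,
                  block_len = 3)        # mask in blocks of 3
}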
\item{val}{Logical; the initialized generator gets called \code{"genY"} or \code{"genValY"}, where Y is an integer between 1 and the number of directories.}
\item{return_int}{Whether to return integer encoding or one-hot encoding.}
\item{verbose}{Whether to show messages.}
\item{delete_used_files}{Whether to delete a file once it has been used. Only applies to rds files.}
\item{reshape_xy}{Can be a list of functions to apply to the input and/or target. List elements (containing the reshape functions)
must be called x for the input and y for the target, and each must have arguments called x and y. For example:
\code{reshape_xy = list(x = function(x, y) {return(x+1)}, y = function(x, y) {return(x+y)})}.
For the rds generator, the functions need an additional argument called sw.}
}
\value{
A generator function.
}
\description{
For a detailed description see the data generator \href{https://deepg.de/articles/data_generator.html}{tutorial}.
Will choose one of the generators from \code{\link{generator_fasta_lm}},
\code{\link{generator_fasta_label_folder}}, \code{\link{generator_fasta_label_header_csv}},
\code{\link{generator_rds}}, \code{\link{generator_random}} or \code{\link{generator_dummy}}
according to the \code{train_type} and \code{random_sampling} arguments.
}
\examples{
\dontshow{if (reticulate::py_module_available("tensorflow")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
# create dummy fasta files
fasta_path <- tempfile()
dir.create(fasta_path)
create_dummy_data(file_path = fasta_path,
                  num_files = 3,
                  seq_length = 10,
                  num_seq = 5,
                  vocabulary = c("a", "c", "g", "t"))
gen <- get_generator(path = fasta_path,
                     maxlen = 5, train_type = "lm",
                     output_format = "target_right",
                     step = 3, batch_size = 7)
z <- gen()
x <- z[[1]]
y <- z[[2]]
dim(x)
dim(y)
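
# a minimal follow-up sketch: the same setup with integer instead of
# one-hot encoding (argument values are illustrative)
gen_int <- get_generator(path = fasta_path,
                         maxlen = 5, train_type = "lm",
                         output_format = "target_right",
                         step = 3, batch_size = 7,
                         return_int = TRUE)
z_int <- gen_int()
dim(z_int[[1]])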
\dontshow{\}) # examplesIf}
}