% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/generator_utils.R
\name{dataset_from_gen}
\alias{dataset_from_gen}
\title{Collect samples from a data generator and store them in rds or pickle files.}
\usage{
dataset_from_gen(
  output_path,
  iterations = 10,
  train_type = "lm",
  output_format = "target_right",
  path_corpus,
  batch_size = 32,
  maxlen = 250,
  step = NULL,
  vocabulary = c("a", "c", "g", "t"),
  shuffle = FALSE,
  set_learning = NULL,
  seed = NULL,
  random_sampling = FALSE,
  store_format = "rds",
  file_name_start = "batch_",
  masked_lm = NULL,
  ...
)
}
\arguments{
\item{output_path}{Output directory. Output files will be named \code{output_path} + \code{file_name_start} + x + ".rds" or ".pickle", where x is an index (from 1 to
\code{iterations}) and the file ending depends on the \code{store_format} argument.}
\item{iterations}{Number of batches (output files) to create.}
\item{train_type}{Either \code{"lm"}, \code{"lm_rds"}, \code{"masked_lm"} for a language model; \code{"label_header"}, \code{"label_folder"}, \code{"label_csv"}, \code{"label_rds"} for classification; or \code{"dummy_gen"}.
\itemize{
\item A language model is trained to predict character(s) in a sequence. \cr
\item \code{"label_header"}/\code{"label_folder"}/\code{"label_csv"} are trained to predict a corresponding class given a sequence as input.
\item If \code{"label_header"}, the class will be read from the fasta headers.
\item If \code{"label_folder"}, the class will be read from the folder, i.e. all files in one folder must belong to the same class.
\item If \code{"label_csv"}, targets are read from a csv file. This file should have one column named "file". The targets then correspond to the other entries in that row (all
columns except "file"). Example: if we are currently working with a file called "a.fasta" and the corresponding label is "label_1", there should be a row in our csv file\tabular{lll}{
file \tab label_1 \tab label_2 \cr
"a.fasta" \tab 1 \tab 0 \cr
}
A sketch of building such a file appears at the end of the examples.
\item If \code{"label_rds"}, the generator will iterate over a set of .rds files, each containing a list of input and target tensors. Not implemented for models
with multiple inputs.
\item If \code{"lm_rds"}, the generator will iterate over a set of .rds files and split each tensor according to the \code{target_len} argument
(targets are the last \code{target_len} nucleotides of each sequence).
\item If \code{"dummy_gen"}, the generator creates random data once and repeatedly feeds it to the model.
\item If \code{"masked_lm"}, the generator masks some parts of the input. See the \code{masked_lm} argument for details.
}}
\item{output_format}{Determines the shape of the output tensor for a language model.
Either \code{"target_right"}, \code{"target_middle_lstm"}, \code{"target_middle_cnn"} or \code{"wavenet"}.
Assume a sequence \code{"AACCGTA"}. Outputs correspond as follows:
\itemize{
\item \verb{"target_right": X = "AACCGT", Y = "A"}
\item \verb{"target_middle_lstm": X = (X_1 = "AAC", X_2 = "ATG"), Y = "C"} (note the reversed order of X_2)
\item \verb{"target_middle_cnn": X = "AACGTA", Y = "C"}
\item \verb{"wavenet": X = "AACCGT", Y = "ACCGTA"}
}}
\item{path_corpus}{Input directory where fasta files are located or a path to a single file ending in fasta or fastq
(as specified in the \code{format} argument). Can also be a list of directories and/or files.}
\item{batch_size}{Number of samples in one batch.}
\item{maxlen}{Length of predictor sequence.}
\item{step}{How often to take a sample, i.e. the distance between the start positions of two successive samples.}
\item{vocabulary}{Vector of allowed characters. Characters outside the vocabulary get encoded as specified in the \code{ambiguous_nuc} argument.}
\item{shuffle}{Whether to shuffle samples within each batch.}
\item{set_learning}{Use when you want to assign one label to a set of samples. Only implemented for \code{train_type = "label_folder"}.
Input is a list with the following parameters:
\itemize{
\item \code{samples_per_target}: how many samples to use for one target.
\item \code{maxlen}: length of one sample.
\item \code{reshape_mode}: \code{"time_dist"}, \code{"multi_input"} or \code{"concat"}.
\itemize{
\item If \code{reshape_mode} is \code{"multi_input"}, the generator will produce \code{samples_per_target} separate inputs, each of length \code{maxlen} (the model should have
\code{samples_per_target} input layers).
\item If \code{reshape_mode} is \code{"time_dist"}, the generator will produce a 4D input array. The dimensions correspond to
\verb{(batch_size, samples_per_target, maxlen, length(vocabulary))}.
\item If \code{reshape_mode} is \code{"concat"}, the generator will concatenate \code{samples_per_target} sequences
of length \code{maxlen} into one long sequence.
}
\item If \code{reshape_mode} is \code{"concat"}, there is an additional \code{buffer_len}
argument. If \code{buffer_len} is an integer, the subsequences are interspaced with \code{buffer_len} rows. The input length is
(\code{maxlen} \eqn{*} \code{samples_per_target}) + \code{buffer_len} \eqn{*} (\code{samples_per_target} - 1); for example, \code{maxlen = 100}, \code{samples_per_target = 3} and \code{buffer_len = 5} give an input length of 100 \eqn{*} 3 + 5 \eqn{*} 2 = 310.
}}
\item{seed}{Sets the seed for the \code{set.seed} function to get reproducible results.}
\item{random_sampling}{Whether samples should be taken from random positions when using the \code{max_samples} argument. If \code{FALSE}, samples
are taken from a consecutive subsequence.}
\item{store_format}{Either \code{"rds"} or \code{"pickle"}.}
\item{file_name_start}{Start of output file names.}
\item{masked_lm}{If not \code{NULL}, input and target are equal except that some parts of the input are masked or randomized (a sketch appears in the examples).
Must be a list with the following arguments:
\itemize{
\item \code{mask_rate}: Rate of input to mask (rate of input to replace with the mask token).
\item \code{random_rate}: Rate of input to set to a random token.
\item \code{identity_rate}: Rate of input where sample weights are applied but input and output are identical.
\item \code{include_sw}: Whether to include sample weights.
\item \code{block_len} (optional): Masked/random/identity regions appear in blocks of size \code{block_len}.
}}
\item{...}{Further generator options. See \code{\link{get_generator}}.}
}
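\details{
Each stored file holds one batch as returned by the generator, typically a list
containing an input array and a target array. A minimal sketch of reading a stored
batch back into R, assuming \code{store_format = "rds"}, the default
\code{file_name_start} and a hypothetical output directory \code{out_dir}:
\preformatted{batch <- readRDS(file.path(out_dir, "batch_1.rds"))
str(batch)
}
}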
\value{
None. Function writes data to files and does not return a value.
}
\description{
Repeatedly generate samples with a data generator and store the output. Creates a separate rds or pickle file in \code{output_path} for each
batch.
}
\examples{
\dontshow{if (reticulate::py_module_available("tensorflow")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
# create dummy fasta files
temp_dir <- tempfile()
dir.create(temp_dir)
create_dummy_data(file_path = temp_dir,
                  num_files = 3,
                  seq_length = 8,
                  num_seq = 2)
# extract samples
out_dir <- tempfile()
dir.create(out_dir)
dataset_from_gen(output_path = out_dir,
                 iterations = 10,
                 train_type = "lm",
                 output_format = "target_right",
                 path_corpus = temp_dir,
                 batch_size = 32,
                 maxlen = 5,
                 step = 1,
                 file_name_start = "batch_")
list.files(out_dir)
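
# A second, hedged sketch: extract batches with train_type = "masked_lm",
# where input and target are equal except for masked/randomized positions
# (see the masked_lm argument). The rates below are illustrative values,
# not recommendations.
mlm_dir <- tempfile()
dir.create(mlm_dir)
dataset_from_gen(output_path = mlm_dir,
                 iterations = 2,
                 train_type = "masked_lm",
                 path_corpus = temp_dir,
                 batch_size = 8,
                 maxlen = 5,
                 masked_lm = list(mask_rate = 0.15,
                                  random_rate = 0.05,
                                  identity_rate = 0.05,
                                  include_sw = TRUE),
                 file_name_start = "mlm_batch_")
list.files(mlm_dir)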
\dontshow{\}) # examplesIf}
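
# Hedged sketch of the csv layout expected by train_type = "label_csv":
# one "file" column plus one column per label. File names here are
# hypothetical; see get_generator() for how to pass the csv path on to
# the generator.
df <- data.frame(file = c("a.fasta", "b.fasta"),
                 label_1 = c(1, 0),
                 label_2 = c(0, 1))
write.csv(df, file.path(tempdir(), "targets.csv"), row.names = FALSE)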
}