|
a |
|
b/man/dataset_from_gen.Rd |
|
|
1 |
% Generated by roxygen2: do not edit by hand |
|
|
2 |
% Please edit documentation in R/generator_utils.R |
|
|
3 |
\name{dataset_from_gen} |
|
|
4 |
\alias{dataset_from_gen} |
|
|
5 |
\title{Collect samples from generator and store in rds or pickle file.} |
|
|
6 |
\usage{ |
|
|
7 |
dataset_from_gen( |
|
|
8 |
output_path, |
|
|
9 |
iterations = 10, |
|
|
10 |
train_type = "lm", |
|
|
11 |
output_format = "target_right", |
|
|
12 |
path_corpus, |
|
|
13 |
batch_size = 32, |
|
|
14 |
maxlen = 250, |
|
|
15 |
step = NULL, |
|
|
16 |
vocabulary = c("a", "c", "g", "t"), |
|
|
17 |
shuffle = FALSE, |
|
|
18 |
set_learning = NULL, |
|
|
19 |
seed = NULL, |
|
|
20 |
random_sampling = FALSE, |
|
|
21 |
store_format = "rds", |
|
|
22 |
file_name_start = "batch_", |
|
|
23 |
masked_lm = NULL, |
|
|
24 |
... |
|
|
25 |
) |
|
|
26 |
} |
|
|
27 |
\arguments{ |
|
|
28 |
\item{output_path}{Output directory. Output files will be named \code{output_path} + \code{file_name_start} + x + ".rds" or ".pickle", where x is an index (from 1 to |
|
|
29 |
\code{iterations}) and file ending depends on \code{store_format} argument.} |
|
|
30 |
|
|
|
31 |
\item{iterations}{Number of batches (output files) to create.} |
|
|
32 |
|
|
|
33 |
\item{train_type}{Either \code{"lm"}, \code{"lm_rds"}, \code{"masked_lm"} for language model; \code{"label_header"}, \code{"label_folder"}, \code{"label_csv"}, \code{"label_rds"} for classification or \code{"dummy_gen"}. |
|
|
34 |
\itemize{ |
|
|
35 |
\item Language model is trained to predict character(s) in a sequence. \cr |
|
|
36 |
\item \code{"label_header"}/\code{"label_folder"}/\code{"label_csv"} are trained to predict a corresponding class given a sequence as input. |
|
|
37 |
\item If \code{"label_header"}, class will be read from fasta headers. |
|
|
38 |
\item If \code{"label_folder"}, class will be read from folder, i.e. all files in one folder must belong to the same class. |
|
|
39 |
\item If \code{"label_csv"}, targets are read from a csv file. This file should have one column named "file". The targets then correspond to entries in that row (except "file" |
|
|
40 |
column). Example: if we are currently working with a file called "a.fasta" and corresponding label is "label_1", there should be a row in our csv file\tabular{lll}{ |
|
|
41 |
file \tab label_1 \tab label_2 \cr |
|
|
42 |
"a.fasta" \tab 1 \tab 0 \cr |
|
|
43 |
} |
|
|
44 |
|
|
|
45 |
|
|
|
46 |
\item If \code{"label_rds"}, generator will iterate over a set of .rds files, each containing a list of input and target tensors. Not implemented for model |
|
|
47 |
with multiple inputs. |
|
|
48 |
\item If \code{"lm_rds"}, generator will iterate over a set of .rds files and will split tensor according to \code{target_len} argument |
|
|
49 |
(targets are last \code{target_len} nucleotides of each sequence). |
|
|
50 |
\item If \code{"dummy_gen"}, generator creates random data once and repeatedly feeds these to the model. |
|
|
51 |
\item If \code{"masked_lm"}, generator masks some parts of the input. See \code{masked_lm} argument for details. |
|
|
52 |
}} |
|
|
53 |
|
|
|
54 |
\item{output_format}{Determines shape of output tensor for language model. |
|
|
55 |
Either \code{"target_right"}, \code{"target_middle_lstm"}, \code{"target_middle_cnn"} or \code{"wavenet"}. |
|
|
56 |
Assume a sequence \code{"AACCGTA"}. Outputs correspond as follows: |
|
|
57 |
\itemize{ |
|
|
58 |
\item \verb{"target_right": X = "AACCGT", Y = "A"} |
|
|
59 |
\item \verb{"target_middle_lstm": X = (X_1 = "AAC", X_2 = "ATG"), Y = "C"} (note reversed order of X_2) |
|
|
60 |
\item \verb{"target_middle_cnn": X = "AACGTA", Y = "C"} |
|
|
61 |
\item \verb{"wavenet": X = "AACCGT", Y = "ACCGTA"} |
|
|
62 |
}} |
|
|
63 |
|
|
|
64 |
\item{path_corpus}{Input directory where fasta files are located or path to single file ending with fasta or fastq |
|
|
65 |
(as specified in format argument). Can also be a list of directories and/or files.} |
|
|
66 |
|
|
|
67 |
\item{batch_size}{Number of samples in one batch.} |
|
|
68 |
|
|
|
69 |
\item{maxlen}{Length of predictor sequence.} |
|
|
70 |
|
|
|
71 |
\item{step}{How often to take a sample.} |
|
|
72 |
|
|
|
73 |
\item{vocabulary}{Vector of allowed characters. Characters outside vocabulary get encoded as specified in \code{ambiguous_nuc}.} |
|
|
74 |
|
|
|
75 |
\item{shuffle}{Whether to shuffle samples within each batch.} |
|
|
76 |
|
|
|
77 |
\item{set_learning}{When you want to assign one label to a set of samples. Only implemented for \code{train_type = "label_folder"}. |
|
|
78 |
Input is a list with the following parameters |
|
|
79 |
\itemize{ |
|
|
80 |
\item \code{samples_per_target}: how many samples to use for one target. |
|
|
81 |
\item \code{maxlen}: length of one sample. |
|
|
82 |
\item \code{reshape_mode}: \verb{"time_dist", "multi_input"} or \code{"concat"}. |
|
|
83 |
\itemize{ |
|
|
84 |
\item |
|
|
85 |
If \code{reshape_mode} is \code{"multi_input"}, generator will produce \code{samples_per_target} separate inputs, each of length \code{maxlen} (model should have |
|
|
86 |
\code{samples_per_target} input layers). |
|
|
87 |
\item If \code{reshape_mode} is \code{"time_dist"}, generator will produce a 4D input array. The dimensions correspond to |
|
|
88 |
\verb{(batch_size, samples_per_target, maxlen, length(vocabulary))}. |
|
|
89 |
\item If \code{reshape_mode} is \code{"concat"}, generator will concatenate \code{samples_per_target} sequences |
|
|
90 |
of length \code{maxlen} to one long sequence. |
|
|
91 |
} |
|
|
92 |
\item If \code{reshape_mode} is \code{"concat"}, there is an additional \code{buffer_len} |
|
|
93 |
argument. If \code{buffer_len} is an integer, the subsequences are interspaced with \code{buffer_len} rows. The input length is |
|
|
94 |
(\code{maxlen} \eqn{*} \code{samples_per_target}) + \code{buffer_len} \eqn{*} (\code{samples_per_target} - 1). |
|
|
95 |
}} |
|
|
96 |
|
|
|
97 |
\item{seed}{Sets seed for \code{set.seed} function for reproducible results.} |
|
|
98 |
|
|
|
99 |
\item{random_sampling}{Whether samples should be taken from random positions when using \code{max_samples} argument. If \code{FALSE}, random |
|
|
100 |
samples are taken from a consecutive subsequence.} |
|
|
101 |
|
|
|
102 |
\item{store_format}{Either "rds" or "pickle".} |
|
|
103 |
|
|
|
104 |
\item{file_name_start}{Start of output file names.} |
|
|
105 |
|
|
|
106 |
\item{masked_lm}{If not \code{NULL}, input and target are equal except some parts of the input are masked or random. |
|
|
107 |
Must be list with the following arguments: |
|
|
108 |
\itemize{ |
|
|
109 |
\item \code{mask_rate}: Rate of input to mask (rate of input to replace with mask token). |
|
|
110 |
\item \code{random_rate}: Rate of input to set to random token. |
|
|
111 |
\item \code{identity_rate}: Rate of input where sample weights are applied but input and output are identical. |
|
|
112 |
\item \code{include_sw}: Whether to include sample weights. |
|
|
113 |
\item \code{block_len} (optional): Masked/random/identity regions appear in blocks of size \code{block_len}. |
|
|
114 |
}} |
|
|
115 |
|
|
|
116 |
\item{...}{further generator options. See \code{\link{get_generator}}.} |
|
|
117 |
} |
|
|
118 |
\value{ |
|
|
119 |
None. Function writes data to files and does not return a value. |
|
|
120 |
} |
|
|
121 |
\description{ |
|
|
122 |
Repeatedly generate samples with data generator and store output. Creates a separate rds or pickle file in \code{output_path} for each |
|
|
123 |
batch. |
|
|
124 |
} |
|
|
125 |
\examples{ |
|
|
126 |
\dontshow{if (reticulate::py_module_available("tensorflow")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} |
|
|
127 |
# create dummy fasta files |
|
|
128 |
temp_dir <- tempfile() |
|
|
129 |
dir.create(temp_dir) |
|
|
130 |
create_dummy_data(file_path = temp_dir, |
|
|
131 |
num_files = 3, |
|
|
132 |
seq_length = 8, |
|
|
133 |
num_seq = 2) |
|
|
134 |
|
|
|
135 |
# extract samples |
|
|
136 |
out_dir <- tempfile() |
|
|
137 |
dir.create(out_dir) |
|
|
138 |
dataset_from_gen(output_path = out_dir, |
|
|
139 |
iterations = 10, |
|
|
140 |
train_type = "lm", |
|
|
141 |
output_format = "target_right", |
|
|
142 |
path_corpus = temp_dir, |
|
|
143 |
batch_size = 32, |
|
|
144 |
maxlen = 5, |
|
|
145 |
step = 1, |
|
|
146 |
file_name_start = "batch_") |
|
|
147 |
|
|
|
148 |
list.files(out_dir) |
|
|
149 |
\dontshow{\}) # examplesIf} |
|
|
150 |
} |