|
a |
|
b/man/generator_fasta_lm.Rd |
|
|
1 |
% Generated by roxygen2: do not edit by hand |
|
|
2 |
% Please edit documentation in R/generator_lm.R |
|
|
3 |
\name{generator_fasta_lm} |
|
|
4 |
\alias{generator_fasta_lm} |
|
|
5 |
\title{Language model generator for fasta/fastq files} |
|
|
6 |
\usage{ |
|
|
7 |
generator_fasta_lm( |
|
|
8 |
path_corpus, |
|
|
9 |
format = "fasta", |
|
|
10 |
batch_size = 256, |
|
|
11 |
maxlen = 250, |
|
|
12 |
max_iter = 10000, |
|
|
13 |
vocabulary = c("a", "c", "g", "t"), |
|
|
14 |
verbose = FALSE, |
|
|
15 |
shuffle_file_order = FALSE, |
|
|
16 |
step = 1, |
|
|
17 |
seed = 1234, |
|
|
18 |
shuffle_input = FALSE, |
|
|
19 |
file_limit = NULL, |
|
|
20 |
path_file_log = NULL, |
|
|
21 |
reverse_complement = FALSE, |
|
|
22 |
output_format = "target_right", |
|
|
23 |
ambiguous_nuc = "zeros", |
|
|
24 |
use_quality_score = FALSE, |
|
|
25 |
proportion_per_seq = NULL, |
|
|
26 |
padding = TRUE, |
|
|
27 |
added_label_path = NULL, |
|
|
28 |
add_input_as_seq = NULL, |
|
|
29 |
skip_amb_nuc = NULL, |
|
|
30 |
max_samples = NULL, |
|
|
31 |
concat_seq = NULL, |
|
|
32 |
target_len = 1, |
|
|
33 |
file_filter = NULL, |
|
|
34 |
use_coverage = NULL, |
|
|
35 |
proportion_entries = NULL, |
|
|
36 |
sample_by_file_size = FALSE, |
|
|
37 |
n_gram = NULL, |
|
|
38 |
n_gram_stride = 1, |
|
|
39 |
add_noise = NULL, |
|
|
40 |
return_int = FALSE, |
|
|
41 |
reshape_xy = NULL |
|
|
42 |
) |
|
|
43 |
} |
|
|
44 |
\arguments{ |
|
|
45 |
\item{path_corpus}{Input directory where fasta files are located or path to single file ending with fasta or fastq |
|
|
46 |
(as specified in format argument). Can also be a list of directories and/or files.} |
|
|
47 |
|
|
|
48 |
\item{format}{File format, either \code{"fasta"} or \code{"fastq"}.} |
|
|
49 |
|
|
|
50 |
\item{batch_size}{Number of samples in one batch.} |
|
|
51 |
|
|
|
52 |
\item{maxlen}{Length of predictor sequence.} |
|
|
53 |
|
|
|
54 |
\item{max_iter}{Stop after \code{max_iter} number of iterations failed to produce a new batch.} |
|
|
55 |
|
|
|
56 |
\item{vocabulary}{Vector of allowed characters. Characters outside vocabulary get encoded as specified in \code{ambiguous_nuc}.} |
|
|
57 |
|
|
|
58 |
\item{verbose}{Whether to show messages.} |
|
|
59 |
|
|
|
60 |
\item{shuffle_file_order}{Logical, whether to go through files randomly or sequentially.} |
|
|
61 |
|
|
|
62 |
\item{step}{How often to take a sample.} |
|
|
63 |
|
|
|
64 |
\item{seed}{Sets seed for \code{set.seed} function for reproducible results.} |
|
|
65 |
|
|
|
66 |
\item{shuffle_input}{Whether to shuffle entries in every fasta/fastq file before extracting samples.} |
|
|
67 |
|
|
|
68 |
\item{file_limit}{Integer or \code{NULL}. If integer, use only the specified number of randomly sampled files for training. Ignored if greater than the number of files in \code{path_corpus}.}
|
|
69 |
|
|
|
70 |
\item{path_file_log}{Write name of files to csv file if path is specified.} |
|
|
71 |
|
|
|
72 |
\item{reverse_complement}{Boolean, for every new file decide randomly to use original data or its reverse complement.} |
|
|
73 |
|
|
|
74 |
\item{output_format}{Determines shape of output tensor for language model. |
|
|
75 |
Either \code{"target_right"}, \code{"target_middle_lstm"}, \code{"target_middle_cnn"} or \code{"wavenet"}. |
|
|
76 |
Assume a sequence \code{"AACCGTA"}. The outputs correspond as follows
|
|
77 |
\itemize{ |
|
|
78 |
\item \verb{"target_right": X = "AACCGT", Y = "A"} |
|
|
79 |
\item \verb{"target_middle_lstm": X = (X_1 = "AAC", X_2 = "ATG"), Y = "C"} (note reversed order of X_2) |
|
|
80 |
\item \verb{"target_middle_cnn": X = "AACGTA", Y = "C"} |
|
|
81 |
\item \verb{"wavenet": X = "AACCGT", Y = "ACCGTA"} |
|
|
82 |
}} |
|
|
83 |
|
|
|
84 |
\item{ambiguous_nuc}{How to handle nucleotides outside vocabulary, either \code{"zeros"}, \code{"discard"}, \code{"empirical"} or \code{"equal"}.
|
|
85 |
\itemize{ |
|
|
86 |
\item If \code{"zeros"}, input gets encoded as zero vector.
|
|
87 |
\item If \code{"equal"}, input is repetition of \code{1/length(vocabulary)}. |
|
|
88 |
\item If \code{"discard"}, samples containing nucleotides outside vocabulary get discarded. |
|
|
89 |
\item If \code{"empirical"}, use nucleotide distribution of current file. |
|
|
90 |
}} |
|
|
91 |
|
|
|
92 |
\item{use_quality_score}{Whether to use fastq quality scores. If \code{TRUE}, input is not a one-hot encoding but corresponds to probabilities.
|
|
93 |
For example (0.97, 0.01, 0.01, 0.01) instead of (1, 0, 0, 0).} |
|
|
94 |
|
|
|
95 |
\item{proportion_per_seq}{Numerical value between 0 and 1. Proportion of sequence to take samples from (use random subsequence).} |
|
|
96 |
|
|
|
97 |
\item{padding}{Whether to pad sequences too short for one sample with zeros.} |
|
|
98 |
|
|
|
99 |
\item{added_label_path}{Path to file with additional input labels. Should be a csv file with one column named "file". Other columns should correspond to labels.} |
|
|
100 |
|
|
|
101 |
\item{add_input_as_seq}{Boolean vector specifying for each entry in \code{added_label_path} if rows from csv should be encoded as a sequence or used directly. |
|
|
102 |
If a row in your csv file is a sequence this should be \code{TRUE}. For example you may want to add another sequence, say ACCGT. Then this would correspond to 1,2,2,3,4 in |
|
|
103 |
csv file (if vocabulary = c("A", "C", "G", "T")). If \code{add_input_as_seq} is \code{TRUE}, 12234 gets one-hot encoded, so added input is a 3D tensor. If \code{add_input_as_seq} is |
|
|
104 |
\code{FALSE} this will feed network just raw data (a 2D tensor).} |
|
|
105 |
|
|
|
106 |
\item{skip_amb_nuc}{Threshold of ambiguous nucleotides to accept in fasta entry. Complete entry will get discarded otherwise.} |
|
|
107 |
|
|
|
108 |
\item{max_samples}{Maximum number of samples to use from one file. If not \code{NULL} and file has more than \code{max_samples} samples, will randomly choose a |
|
|
109 |
subset of \code{max_samples} samples.} |
|
|
110 |
|
|
|
111 |
\item{concat_seq}{Character string or \code{NULL}. If not \code{NULL} all entries from file get concatenated to one sequence with \code{concat_seq} string between them. |
|
|
112 |
Example: if the first entry is AACC, the second entry is TTTG and \code{concat_seq = "ZZZ"}, the concatenated sequence is AACCZZZTTTG.}
|
|
113 |
|
|
|
114 |
\item{target_len}{Number of nucleotides to predict at once for language model.} |
|
|
115 |
|
|
|
116 |
\item{file_filter}{Vector of file names to use from path_corpus.} |
|
|
117 |
|
|
|
118 |
\item{use_coverage}{Integer or \code{NULL}. If not \code{NULL}, use coverage as encoding rather than one-hot encoding and normalize. |
|
|
119 |
Coverage information must be contained in fasta header: there must be a string \code{"cov_n"} in the header, where \code{n} is some integer.} |
|
|
120 |
|
|
|
121 |
\item{proportion_entries}{Proportion of fasta entries to keep. For example, if fasta file has 50 entries and \code{proportion_entries = 0.1}, |
|
|
122 |
will randomly select 5 entries.} |
|
|
123 |
|
|
|
124 |
\item{sample_by_file_size}{Sample new file weighted by file size (bigger files more likely).} |
|
|
125 |
|
|
|
126 |
\item{n_gram}{Integer, encode target not nucleotide wise but combine n nucleotides at once. For example for \verb{n=2, "AA" -> (1, 0,..., 0),} |
|
|
127 |
\verb{"AC" -> (0, 1, 0,..., 0), "TT" -> (0,..., 0, 1)}, where the one-hot vectors have length \code{length(vocabulary)^n}.} |
|
|
128 |
|
|
|
129 |
\item{n_gram_stride}{Step size for n-gram encoding. For AACCGGTT with \code{n_gram = 4} and \code{n_gram_stride = 2}, generator encodes |
|
|
130 |
\verb{(AACC), (CCGG), (GGTT)}; for \code{n_gram_stride = 4} generator encodes \verb{(AACC), (GGTT)}.} |
|
|
131 |
|
|
|
132 |
\item{add_noise}{\code{NULL} or list of arguments. If not \code{NULL}, list must contain the following arguments: \code{noise_type} can be \code{"normal"} or \code{"uniform"}; |
|
|
133 |
optional arguments \code{sd} or \code{mean} if noise_type is \code{"normal"} (default is \code{sd=1} and \code{mean=0}) or \verb{min, max} if \code{noise_type} is \code{"uniform"} |
|
|
134 |
(default is \verb{min=0, max=1}).} |
|
|
135 |
|
|
|
136 |
\item{return_int}{Whether to return integer encoding or one-hot encoding.} |
|
|
137 |
|
|
|
138 |
\item{reshape_xy}{Can be a list of functions to apply to input and/or target. List elements (containing the reshape functions) |
|
|
139 |
must be named x (for the input function) or y (for the target function), and each function must have arguments called x and y. For example:
|
|
140 |
\code{reshape_xy = list(x = function(x, y) {return(x+1)}, y = function(x, y) {return(x+y)})} . |
|
|
141 |
For the rds generator, the functions need to have an additional argument called sw.}
|
|
142 |
} |
|
|
143 |
\value{ |
|
|
144 |
A generator function. |
|
|
145 |
} |
|
|
146 |
\description{ |
|
|
147 |
Iterates over a folder containing fasta/fastq files and produces encodings of predictor sequences
|
|
148 |
and target variables. Will take a sequence of fixed size and use some part of sequence as input and other part as target. |
|
|
149 |
} |
|
|
150 |
\examples{ |
|
|
151 |
\dontshow{if (reticulate::py_module_available("tensorflow")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} |
|
|
152 |
# create dummy fasta files |
|
|
153 |
path_input_1 <- tempfile() |
|
|
154 |
dir.create(path_input_1) |
|
|
155 |
create_dummy_data(file_path = path_input_1, |
|
|
156 |
num_files = 2, |
|
|
157 |
seq_length = 8, |
|
|
158 |
num_seq = 1, |
|
|
159 |
vocabulary = c("a", "c", "g", "t")) |
|
|
160 |
|
|
|
161 |
gen <- generator_fasta_lm(path_corpus = path_input_1, batch_size = 2, |
|
|
162 |
maxlen = 7) |
|
|
163 |
z <- gen() |
|
|
164 |
dim(z[[1]]) |
|
|
165 |
z[[2]] |
|
|
166 |
\dontshow{\}) # examplesIf} |
|
|
167 |
} |