|
a |
|
b/man/seq_encoding_label.Rd |
|
|
1 |
% Generated by roxygen2: do not edit by hand |
|
|
2 |
% Please edit documentation in R/preprocess.R |
|
|
3 |
\name{seq_encoding_label} |
|
|
4 |
\alias{seq_encoding_label} |
|
|
5 |
\title{Encodes integer sequence for label classification.} |
|
|
6 |
\usage{ |
|
|
7 |
seq_encoding_label( |
|
|
8 |
sequence = NULL, |
|
|
9 |
maxlen, |
|
|
10 |
vocabulary, |
|
|
11 |
start_ind, |
|
|
12 |
ambiguous_nuc = "zero", |
|
|
13 |
nuc_dist = NULL, |
|
|
14 |
quality_vector = NULL, |
|
|
15 |
use_coverage = FALSE, |
|
|
16 |
max_cov = NULL, |
|
|
17 |
cov_vector = NULL, |
|
|
18 |
n_gram = NULL, |
|
|
19 |
n_gram_stride = 1, |
|
|
20 |
masked_lm = NULL, |
|
|
21 |
char_sequence = NULL, |
|
|
22 |
tokenizer = NULL, |
|
|
23 |
adjust_start_ind = FALSE, |
|
|
24 |
return_int = FALSE |
|
|
25 |
) |
|
|
26 |
} |
|
|
27 |
\arguments{ |
|
|
28 |
\item{sequence}{Sequence of integers.} |
|
|
29 |
|
|
|
30 |
\item{maxlen}{Length of predictor sequence.} |
|
|
31 |
|
|
|
32 |
\item{vocabulary}{Vector of allowed characters. Characters outside vocabulary get encoded as specified in \code{ambiguous_nuc}.} |
|
|
33 |
|
|
|
34 |
\item{start_ind}{Start positions of samples in \code{sequence}.} |
|
|
35 |
|
|
|
36 |
\item{ambiguous_nuc}{How to handle nucleotides outside vocabulary, either \code{"zero"}, \code{"empirical"} or \code{"equal"}. |
|
|
37 |
See \code{\link{train_model}}. Note that \code{"discard"} option is not available for this function.} |
|
|
38 |
|
|
|
39 |
\item{nuc_dist}{Nucleotide distribution.} |
|
|
40 |
|
|
|
41 |
\item{quality_vector}{Vector of quality probabilities.} |
|
|
42 |
|
|
|
43 |
\item{use_coverage}{Logical. If \code{TRUE}, use coverage as encoding rather than one-hot encoding and normalize. |
|
|
44 |
Coverage information must be contained in fasta header: there must be a string \code{"cov_n"} in the header, where \code{n} is some integer.} |
|
|
45 |
|
|
|
46 |
\item{max_cov}{Biggest coverage value. Only applies if \code{use_coverage = TRUE}.} |
|
|
47 |
|
|
|
48 |
\item{cov_vector}{Vector of coverage values associated to the input.} |
|
|
49 |
|
|
|
50 |
\item{n_gram}{Integer. Encode the target not nucleotide-wise but by combining n nucleotides at once. For example, for \verb{n=2, "AA" -> (1, 0,..., 0),} |
|
|
51 |
\verb{"AC" -> (0, 1, 0,..., 0), "TT" -> (0,..., 0, 1)}, where the one-hot vectors have length \code{length(vocabulary)^n}.} |
|
|
52 |
|
|
|
53 |
\item{n_gram_stride}{Step size for n-gram encoding. For AACCGGTT with \code{n_gram = 4} and \code{n_gram_stride = 2}, generator encodes |
|
|
54 |
\verb{(AACC), (CCGG), (GGTT)}; for \code{n_gram_stride = 4} generator encodes \verb{(AACC), (GGTT)}.} |
|
|
55 |
|
|
|
56 |
\item{masked_lm}{If not \code{NULL}, input and target are equal except some parts of the input are masked or random. |
|
|
57 |
Must be list with the following arguments: |
|
|
58 |
\itemize{ |
|
|
59 |
\item \code{mask_rate}: Rate of input to mask (rate of input to replace with mask token). |
|
|
60 |
\item \code{random_rate}: Rate of input to set to random token. |
|
|
61 |
\item \code{identity_rate}: Rate of input where sample weights are applied but input and output are identical. |
|
|
62 |
\item \code{include_sw}: Whether to include sample weights. |
|
|
63 |
\item \code{block_len} (optional): Masked/random/identity regions appear in blocks of size \code{block_len}. |
|
|
64 |
}} |
|
|
65 |
|
|
|
66 |
\item{char_sequence}{A character string.} |
|
|
67 |
|
|
|
68 |
\item{tokenizer}{A keras tokenizer.} |
|
|
69 |
|
|
|
70 |
\item{adjust_start_ind}{Whether to shift values in \code{start_ind} to start at 1: for example (5,11,25) becomes (1,7,21).} |
|
|
71 |
|
|
|
72 |
\item{return_int}{Whether to return integer encoding or one-hot encoding.} |
|
|
73 |
} |
|
|
74 |
\value{ |
|
|
75 |
A list of 2 tensors. |
|
|
76 |
} |
|
|
77 |
\description{ |
|
|
78 |
Returns encoding for integer or character sequence. |
|
|
79 |
} |
|
|
80 |
\examples{ |
|
|
81 |
\dontshow{if (reticulate::py_module_available("tensorflow")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} |
|
|
82 |
# use integer sequence as input |
|
|
83 |
x <- seq_encoding_label(sequence = c(1,0,5,1,3,4,3,1,4,1,2), |
|
|
84 |
maxlen = 5, |
|
|
85 |
vocabulary = c("a", "c", "g", "t"), |
|
|
86 |
start_ind = c(1,3), |
|
|
87 |
ambiguous_nuc = "equal") |
|
|
88 |
|
|
|
89 |
x[1,,] # 1,0,5,1,3 |
|
|
90 |
|
|
|
91 |
x[2,,] # 5,1,3,4,3 |
|
|
92 |
|
|
|
93 |
# use character string as input |
|
|
94 |
x <- seq_encoding_label(maxlen = 5, |
|
|
95 |
vocabulary = c("a", "c", "g", "t"), |
|
|
96 |
start_ind = c(1,3), |
|
|
97 |
ambiguous_nuc = "equal", |
|
|
98 |
char_sequence = "ACTaaTNTNaZ") |
|
|
99 |
|
|
|
100 |
x[1,,] # actaa |
|
|
101 |
|
|
|
102 |
x[2,,] # taatn |
|
|
103 |
\dontshow{\}) # examplesIf} |
|
|
104 |
} |