Diff of /man/seq_encoding_label.Rd [000000] .. [409433]

Switch to unified view

a b/man/seq_encoding_label.Rd
1
% Generated by roxygen2: do not edit by hand
2
% Please edit documentation in R/preprocess.R
3
\name{seq_encoding_label}
4
\alias{seq_encoding_label}
5
\title{Encodes integer sequence for label classification.}
6
\usage{
7
seq_encoding_label(
8
  sequence = NULL,
9
  maxlen,
10
  vocabulary,
11
  start_ind,
12
  ambiguous_nuc = "zero",
13
  nuc_dist = NULL,
14
  quality_vector = NULL,
15
  use_coverage = FALSE,
16
  max_cov = NULL,
17
  cov_vector = NULL,
18
  n_gram = NULL,
19
  n_gram_stride = 1,
20
  masked_lm = NULL,
21
  char_sequence = NULL,
22
  tokenizer = NULL,
23
  adjust_start_ind = FALSE,
24
  return_int = FALSE
25
)
26
}
27
\arguments{
28
\item{sequence}{Sequence of integers.}
29
30
\item{maxlen}{Length of predictor sequence.}
31
32
\item{vocabulary}{Vector of allowed characters. Characters outside vocabulary get encoded as specified in \code{ambiguous_nuc}.}
33
34
\item{start_ind}{Start positions of samples in \code{sequence}.}
35
36
\item{ambiguous_nuc}{How to handle nucleotides outside vocabulary, either \code{"zero"}, \code{"empirical"} or \code{"equal"}.
37
See \code{\link{train_model}}. Note that \code{"discard"} option is not available for this function.}
38
39
\item{nuc_dist}{Nucleotide distribution.}
40
41
\item{quality_vector}{Vector of quality probabilities.}
42
43
\item{use_coverage}{Logical. If \code{TRUE}, use coverage as encoding rather than one-hot encoding and normalize.
44
Coverage information must be contained in fasta header: there must be a string \code{"cov_n"} in the header, where \code{n} is some integer.}
45
46
\item{max_cov}{Biggest coverage value. Only applies if \code{use_coverage = TRUE}.}
47
48
\item{cov_vector}{Vector of coverage values associated with the input.}
49
50
\item{n_gram}{Integer, encode target not nucleotide-wise but by combining n nucleotides at once. For example, for \verb{n=2, "AA" ->  (1, 0,..., 0),}
51
\verb{"AC" ->  (0, 1, 0,..., 0), "TT" -> (0,..., 0, 1)}, where the one-hot vectors have length \code{length(vocabulary)^n}.}
52
53
\item{n_gram_stride}{Step size for n-gram encoding. For AACCGGTT with \code{n_gram = 4} and \code{n_gram_stride = 2}, generator encodes
54
\verb{(AACC), (CCGG), (GGTT)}; for \code{n_gram_stride = 4} generator encodes \verb{(AACC), (GGTT)}.}
55
56
\item{masked_lm}{If not \code{NULL}, input and target are equal except some parts of the input are masked or random.
57
Must be a list with the following arguments:
58
\itemize{
59
\item \code{mask_rate}: Rate of input to mask (rate of input to replace with mask token).
60
\item \code{random_rate}: Rate of input to set to random token.
61
\item \code{identity_rate}: Rate of input where sample weights are applied but input and output are identical.
62
\item \code{include_sw}: Whether to include sample weights.
63
\item \code{block_len} (optional): Masked/random/identity regions appear in blocks of size \code{block_len}.
64
}}
65
66
\item{char_sequence}{A character string.}
67
68
\item{tokenizer}{A keras tokenizer.}
69
70
\item{adjust_start_ind}{Whether to shift values in \code{start_ind} to start at 1: for example (5,11,25) becomes (1,7,21).}
71
72
\item{return_int}{Whether to return integer encoding (\code{TRUE}) rather than one-hot encoding (\code{FALSE}).}
73
}
74
\value{
75
A list of 2 tensors.
76
}
77
\description{
78
Returns encoding for integer or character sequence.
79
}
80
\examples{
81
\dontshow{if (reticulate::py_module_available("tensorflow")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
82
# use integer sequence as input
83
x <- seq_encoding_label(sequence = c(1,0,5,1,3,4,3,1,4,1,2),
84
                        maxlen = 5,
85
                        vocabulary = c("a", "c", "g", "t"),
86
                        start_ind = c(1,3),
87
                        ambiguous_nuc = "equal")
88
89
x[1,,] # 1,0,5,1,3
90
91
x[2,,] # 5,1,3,4,3
92
93
# use character string as input
94
x <- seq_encoding_label(maxlen = 5,
95
                        vocabulary = c("a", "c", "g", "t"),
96
                        start_ind = c(1,3),
97
                        ambiguous_nuc = "equal",
98
                        char_sequence = "ACTaaTNTNaZ")
99
100
x[1,,] # actaa
101
102
x[2,,] # taatn
103
\dontshow{\}) # examplesIf}
104
}