man/seq_encoding_lm.Rd
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/preprocess.R
\name{seq_encoding_lm}
\alias{seq_encoding_lm}
\title{Encodes integer sequence for language model}
\usage{
seq_encoding_lm(
  sequence = NULL,
  maxlen,
  vocabulary,
  start_ind,
  ambiguous_nuc = "zero",
  nuc_dist = NULL,
  quality_vector = NULL,
  return_int = FALSE,
  target_len = 1,
  use_coverage = FALSE,
  max_cov = NULL,
  cov_vector = NULL,
  n_gram = NULL,
  n_gram_stride = 1,
  output_format = "target_right",
  char_sequence = NULL,
  adjust_start_ind = FALSE,
  tokenizer = NULL
)
}
\arguments{
\item{sequence}{Sequence of integers.}

\item{maxlen}{Length of predictor sequence.}

\item{vocabulary}{Vector of allowed characters. Characters outside vocabulary get encoded as specified in \code{ambiguous_nuc}.}

\item{start_ind}{Start positions of samples in \code{sequence}.}

\item{ambiguous_nuc}{How to handle nucleotides outside the vocabulary, either \code{"zero"}, \code{"empirical"} or \code{"equal"}.
See \code{\link{train_model}}. Note that the \code{"discard"} option is not available for this function.}

\item{nuc_dist}{Nucleotide distribution; only used if \code{ambiguous_nuc = "empirical"}.}

\item{quality_vector}{Vector of quality probabilities.}

\item{return_int}{Whether to return integer encoding or one-hot encoding (see the integer-encoding sketch in the examples).}

\item{target_len}{Number of nucleotides to predict at once for the language model.}

\item{use_coverage}{Integer or \code{NULL}. If not \code{NULL}, use coverage as encoding rather than one-hot encoding and normalize.
Coverage information must be contained in the fasta header: there must be a string \code{"cov_n"} in the header, where \code{n} is some integer.}

\item{max_cov}{Maximum coverage value. Only applies if \code{use_coverage} is not \code{NULL}.}

\item{cov_vector}{Vector of coverage values associated with the input.}

\item{n_gram}{Integer; if not \code{NULL}, encode the target not nucleotide-wise but as n-grams of \code{n} nucleotides at once. For example, for \verb{n=2, "AA" ->  (1, 0,..., 0),}
\verb{"AC" ->  (0, 1, 0,..., 0), "TT" -> (0,..., 0, 1)}, where the one-hot vectors have length \code{length(vocabulary)^n}.}

\item{n_gram_stride}{Step size for n-gram encoding. For AACCGGTT with \code{n_gram = 4} and \code{n_gram_stride = 2}, the generator encodes
\verb{(AACC), (CCGG), (GGTT)}; for \code{n_gram_stride = 4} the generator encodes \verb{(AACC), (GGTT)}.}

\item{output_format}{Determines the shape of the output tensor for the language model.
Either \code{"target_right"}, \code{"target_middle_lstm"}, \code{"target_middle_cnn"} or \code{"wavenet"}.
Assume a sequence \code{"AACCGTA"}. Outputs correspond as follows (a \code{"wavenet"} sketch is included in the examples):
\itemize{
\item \verb{"target_right": X = "AACCGT", Y = "A"}
\item \verb{"target_middle_lstm": X = (X_1 = "AAC", X_2 = "ATG"), Y = "C"} (note the reversed order of X_2)
\item \verb{"target_middle_cnn": X = "AACGTA", Y = "C"}
\item \verb{"wavenet": X = "AACCGT", Y = "ACCGTA"}
}}

\item{char_sequence}{A character string.}

\item{adjust_start_ind}{Whether to shift values in \code{start_ind} so that they start at 1: for example, (5,11,25) becomes (1,7,21) (see the examples).}

\item{tokenizer}{A keras tokenizer.}
}
\value{
A list of two tensors, the input \code{X} and the target \code{Y}.
}
\description{
Helper function for \code{\link{generator_fasta_lm}}.
Encodes an integer sequence into an input/target list according to the \code{output_format} argument.
}
\examples{
\dontshow{if (reticulate::py_module_available("tensorflow")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
# use integer sequence as input

z <- seq_encoding_lm(sequence = c(1,0,5,1,3,4,3,1,4,1,2),
                     maxlen = 5,
                     vocabulary = c("a", "c", "g", "t"),
                     start_ind = c(1,3),
                     ambiguous_nuc = "equal",
                     target_len = 1,
                     output_format = "target_right")

x <- z[[1]]
y <- z[[2]]

x[1,,] # 1,0,5,1,3
y[1,] # 4

x[2,,] # 5,1,3,4,3
y[2,] # 1
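
# a minimal sketch of the same call returning integer encoding instead of
# one-hot (return_int is a documented argument above; the exact tensor
# shapes are not shown here)
z_int <- seq_encoding_lm(sequence = c(1,0,5,1,3,4,3,1,4,1,2),
                         maxlen = 5,
                         vocabulary = c("a", "c", "g", "t"),
                         start_ind = c(1,3),
                         ambiguous_nuc = "equal",
                         target_len = 1,
                         return_int = TRUE,
                         output_format = "target_right")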

# use character string as input
z <- seq_encoding_lm(sequence = NULL,
                     maxlen = 5,
                     vocabulary = c("a", "c", "g", "t"),
                     start_ind = c(1,3),
                     ambiguous_nuc = "zero",
                     target_len = 1,
                     output_format = "target_right",
                     char_sequence = "ACTaaTNTNaZ")

x <- z[[1]]
y <- z[[2]]

x[1,,] # actaa
y[1,] # t

x[2,,] # taatn
y[2,] # t
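
# a sketch of the "wavenet" output format described under output_format:
# X is the window starting at start_ind and Y is assumed to be the same
# window shifted one position to the right
z_wave <- seq_encoding_lm(sequence = c(1,0,5,1,3,4,3,1,4,1,2),
                          maxlen = 5,
                          vocabulary = c("a", "c", "g", "t"),
                          start_ind = c(1),
                          ambiguous_nuc = "equal",
                          target_len = 1,
                          output_format = "wavenet")

# a sketch of adjust_start_ind = TRUE: start positions (5,7) are shifted
# to (1,3), so this call is assumed to reproduce the first example above
z_adj <- seq_encoding_lm(sequence = c(1,0,5,1,3,4,3,1,4,1,2),
                         maxlen = 5,
                         vocabulary = c("a", "c", "g", "t"),
                         start_ind = c(5,7),
                         adjust_start_ind = TRUE,
                         ambiguous_nuc = "equal",
                         target_len = 1,
                         output_format = "target_right")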
\dontshow{\}) # examplesIf}
}