--- a +++ b/man/seq_encoding_lm.Rd @@ -0,0 +1,125 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/preprocess.R +\name{seq_encoding_lm} +\alias{seq_encoding_lm} +\title{Encodes integer sequence for language model} +\usage{ +seq_encoding_lm( + sequence = NULL, + maxlen, + vocabulary, + start_ind, + ambiguous_nuc = "zero", + nuc_dist = NULL, + quality_vector = NULL, + return_int = FALSE, + target_len = 1, + use_coverage = FALSE, + max_cov = NULL, + cov_vector = NULL, + n_gram = NULL, + n_gram_stride = 1, + output_format = "target_right", + char_sequence = NULL, + adjust_start_ind = FALSE, + tokenizer = NULL +) +} +\arguments{ +\item{sequence}{Sequence of integers.} + +\item{maxlen}{Length of predictor sequence.} + +\item{vocabulary}{Vector of allowed characters. Characters outside vocabulary get encoded as specified in \code{ambiguous_nuc}.} + +\item{start_ind}{Start positions of samples in \code{sequence}.} + +\item{ambiguous_nuc}{How to handle nucleotides outside vocabulary, either \code{"zero"}, \code{"empirical"} or \code{"equal"}. +See \code{\link{train_model}}. Note that \code{"discard"} option is not available for this function.} + +\item{nuc_dist}{Nucleotide distribution.} + +\item{quality_vector}{Vector of quality probabilities.} + +\item{return_int}{Whether to return integer encoding or one-hot encoding.} + +\item{target_len}{Number of nucleotides to predict at once for language model.} + +\item{use_coverage}{Integer or \code{NULL}. If not \code{NULL}, use coverage as encoding rather than one-hot encoding and normalize. +Coverage information must be contained in fasta header: there must be a string \code{"cov_n"} in the header, where \code{n} is some integer.} + +\item{max_cov}{Biggest coverage value. Only applies if \code{use_coverage = TRUE}.} + +\item{cov_vector}{Vector of coverage values associated to the input.} + +\item{n_gram}{Integer, encode target not nucleotide wise but combine n nucleotides at once. 
For example, for \verb{n=2, "AA" -> (1, 0,..., 0),} +\verb{"AC" -> (0, 1, 0,..., 0), "TT" -> (0,..., 0, 1)}, where the one-hot vectors have length \code{length(vocabulary)^n}.} + +\item{n_gram_stride}{Step size for n-gram encoding. For AACCGGTT with \code{n_gram = 4} and \code{n_gram_stride = 2}, generator encodes +\verb{(AACC), (CCGG), (GGTT)}; for \code{n_gram_stride = 4} generator encodes \verb{(AACC), (GGTT)}.} + +\item{output_format}{Determines shape of output tensor for language model. +Either \code{"target_right"}, \code{"target_middle_lstm"}, \code{"target_middle_cnn"} or \code{"wavenet"}. +Assume a sequence \code{"AACCGTA"}. The outputs correspond as follows: +\itemize{ +\item \verb{"target_right": X = "AACCGT", Y = "A"} +\item \verb{"target_middle_lstm": X = (X_1 = "AAC", X_2 = "ATG"), Y = "C"} (note reversed order of X_2) +\item \verb{"target_middle_cnn": X = "AACGTA", Y = "C"} +\item \verb{"wavenet": X = "AACCGT", Y = "ACCGTA"} +}} + +\item{char_sequence}{A character string.} + +\item{adjust_start_ind}{Whether to shift values in \code{start_ind} to start at 1: for example (5,11,25) becomes (1,7,21).} + +\item{tokenizer}{A keras tokenizer.} +} +\value{ +A list of 2 tensors. +} +\description{ +Helper function for \code{\link{generator_fasta_lm}}. +Encodes integer sequence to input/target list according to \code{output_format} argument. 
+} +\examples{ +\dontshow{if (reticulate::py_module_available("tensorflow")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +# use integer sequence as input + +z <- seq_encoding_lm(sequence = c(1,0,5,1,3,4,3,1,4,1,2), +maxlen = 5, +vocabulary = c("a", "c", "g", "t"), +start_ind = c(1,3), +ambiguous_nuc = "equal", +target_len = 1, +output_format = "target_right") + +x <- z[[1]] +y <- z[[2]] + +x[1,,] # 1,0,5,1,3 +y[1,] # 4 + +x[2,,] # 5,1,3,4,3 +y[2,] # 1 + +# use character string as input +z <- seq_encoding_lm(sequence = NULL, +maxlen = 5, +vocabulary = c("a", "c", "g", "t"), +start_ind = c(1,3), +ambiguous_nuc = "zero", +target_len = 1, +output_format = "target_right", +char_sequence = "ACTaaTNTNaZ") + + +x <- z[[1]] +y <- z[[2]] + +x[1,,] # actaa +y[1,] # t + +x[2,,] # taatn +y[2,] # t +\dontshow{\}) # examplesIf} +}