[02ea2d]: / man / get_start_ind.Rd

Download this file

51 lines (43 with data), 1.4 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/preprocess.R
\name{get_start_ind}
\alias{get_start_ind}
\title{Computes start position of samples}
\usage{
get_start_ind(
seq_vector,
length_vector,
maxlen,
step,
train_mode = "label",
discard_amb_nuc = FALSE,
vocabulary = c("A", "C", "G", "T")
)
}
\arguments{
\item{seq_vector}{Vector of character sequences.}
\item{length_vector}{Length of sequences in \code{seq_vector}.}
\item{maxlen}{Length of one predictor sequence.}
\item{step}{Distance between samples from one entry in \code{seq_vector}.}
\item{train_mode}{Either \code{"lm"} for language model or \code{"label"} for label classification.}
\item{discard_amb_nuc}{Whether to discard all samples that contain characters outside the vocabulary.}
\item{vocabulary}{Vector of allowed characters. Characters outside the vocabulary get encoded as specified in \code{ambiguous_nuc}. Note that \code{ambiguous_nuc} is not an argument of this function.}
}
\value{
A numeric vector of start positions at which samples can be extracted.
}
\description{
Helper function for data generators.
Computes the start positions in the sequences at which samples can be extracted, given \code{maxlen}, the \code{step} size and the ambiguous nucleotide constraints (\code{discard_amb_nuc} and \code{vocabulary}).
}
\examples{
seq_vector <- c("AAACCCNNNGGGTTT")
get_start_ind(
seq_vector = seq_vector,
length_vector = nchar(seq_vector),
maxlen = 4,
step = 2,
train_mode = "label",
discard_amb_nuc = TRUE,
vocabulary = c("A", "C", "G", "T"))
}