--- a +++ b/man/get_start_ind.Rd @@ -0,0 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/preprocess.R +\name{get_start_ind} +\alias{get_start_ind} +\title{Computes start position of samples} +\usage{ +get_start_ind( + seq_vector, + length_vector, + maxlen, + step, + train_mode = "label", + discard_amb_nuc = FALSE, + vocabulary = c("A", "C", "G", "T") +) +} +\arguments{ +\item{seq_vector}{Vector of character sequences.} + +\item{length_vector}{Length of sequences in \code{seq_vector}.} + +\item{maxlen}{Length of one predictor sequence.} + +\item{step}{Distance between samples from one entry in \code{seq_vector}.} + +\item{train_mode}{Either \code{"lm"} for language model or \code{"label"} for label classification.} + +\item{discard_amb_nuc}{Whether to discard all samples that contain characters outside vocabulary.} + +\item{vocabulary}{Vector of allowed characters. Characters outside vocabulary get encoded as specified in \code{ambiguous_nuc}.} +} +\value{ +A numeric vector. +} +\description{ +Helper function for data generators. +Computes start positions in sequence where samples can be extracted, given maxlen, step size and ambiguous nucleotide constraints. +} +\examples{ +seq_vector <- c("AAACCCNNNGGGTTT") +get_start_ind( + seq_vector = seq_vector, + length_vector = nchar(seq_vector), + maxlen = 4, + step = 2, + train_mode = "label", + discard_amb_nuc = TRUE, + vocabulary = c("A", "C", "G", "T")) + +}