deepG / Git / [1c0e03] /man/predict_with_n

Models:
MarcoTheBlack/
deepG
Downloads: 2
[1c0e03]: / man / predict_with_n_gram.Rd
History
Download this file
65 lines (53 with data), 2.0 kB

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/n_gram.R
\name{predict_with_n_gram}
\alias{predict_with_n_gram}
\title{Predict the next nucleotide using n-gram}
\usage{
predict_with_n_gram(
  path_input,
  distribution_matrix,
  default_pred = "random",
  vocabulary = c("A", "C", "G", "T"),
  file_sample = NULL,
  format = "fasta",
  return_data_frames = FALSE,
  step = 1
)
}
\arguments{
\item{path_input}{Path to a folder containing fasta files or to a single fasta file.}

\item{distribution_matrix}{A data frame containing the frequency of the next nucleotide given the previous n nucleotides (output of the \code{\link{n_gram_dist}} function).}

\item{default_pred}{Either a character from the vocabulary or \code{"random"}. Used as the prediction if a certain n-gram did not appear before.
If \code{"random"}, a random prediction is assigned.}

\item{vocabulary}{Vector of allowed characters; samples outside the vocabulary get discarded.}

\item{file_sample}{If integer, size of random sample of files in \code{path_input}.}

\item{format}{File format, either \code{"fasta"} or \code{"fastq"}.}

\item{return_data_frames}{Boolean, whether to return a data frame with input, predictions, target position and true target.}

\item{step}{How often to take a sample.}
}
\value{
List of prediction evaluations.
}
\description{
Predict the next nucleotide using n-gram.
}
\examples{
# create dummy fasta files
temp_dir <- tempfile()
dir.create(temp_dir)
create_dummy_data(file_path = temp_dir,
                  num_files = 3,
                  seq_length = 8,
                  vocabulary = c("A", "C", "G", "T"),
                  num_seq = 2)

m <- n_gram_dist(path_input = temp_dir,
                 n = 3,
                 step = 1,
                 nuc_dist = FALSE)

# use distribution matrix to make predictions for one file
predictions <- predict_with_n_gram(path_input = list.files(temp_dir, full.names = TRUE)[1], 
                                   distribution_matrix = m)

# show accuracy
predictions[[1]]

}