b/R/n_gram.R
+#' Get distribution of n-grams
+#'
+#' Get distribution of next character given previous n nucleotides.
+#'
+#' @inheritParams generator_fasta_lm
+#' @param path_input Path to folder containing fasta files or single fasta file.
+#' @param n Size of n gram.
+#' @param vocabulary Vector of allowed characters, samples outside vocabulary get discarded.
+#' @param file_sample If integer, size of random sample of files in \code{path_input}.
+#' @param nuc_dist Nucleotide distribution.
+#' @return Returns a matrix with distributions of nucleotides given the previous n nucleotides.
+#' @examples
+#' temp_dir <- tempfile()
+#' dir.create(temp_dir)
+#' create_dummy_data(file_path = temp_dir,
+#'                   num_files = 3,
+#'                   seq_length = 80,
+#'                   vocabulary = c("A", "C", "G", "T"),
+#'                   num_seq = 2)
+#'
+#' m <- n_gram_dist(path_input = temp_dir,
+#'                  n = 3,
+#'                  step = 1,
+#'                  nuc_dist = FALSE)
+#' head(round(m, 2))
+#' @returns A data frame of n-gram predictions.
+#' @export
+n_gram_dist <- function(path_input,
+                        n = 2,
+                        vocabulary = c("A", "C", "G", "T"),
+                        format = "fasta",
+                        file_sample = NULL,
+                        step = 1,
+                        nuc_dist = FALSE) {
+  if (endsWith(path_input, paste0(".", format))) {
+    num_files <- 1
+    fasta_files <- path_input
+  } else {
+    fasta_files <- list.files(
+      path = path_input,
+      pattern = paste0("\\.", format, "$"),
+      full.names = TRUE)
+    num_files <- length(fasta_files)
+  }
+  # take random subset of files
+  if (!is.null(file_sample)){
+    fasta_files <- sample(fasta_files)[1:min(file_sample, length(fasta_files))]
+    num_files <- length(fasta_files)
+  }
+  l <- vector("list")
+  for (i in 1:n){
+    l[[i]] <-  vocabulary
+  }
+  label_df <- apply(expand.grid(l), 2, as.character)
+  labels <- vector("character")
+  for (i in 1:nrow(label_df)){
+    labels[i] <- paste(label_df[i, ], collapse = "")
+  }
+  #labels
+  targets <- vector("character")
+  for (i in 1:length(vocabulary)){
+    targets <- c(targets, rep(vocabulary[i], length(labels)))
+  }
+  gram <- rep(labels, length(vocabulary))
+  freq <- rep(0, length(labels) * length(vocabulary))
+  freq_df <- data.frame(gram, targets, freq)
+  nuc_table <- vector("list")
+  for (i in 1:num_files) {
+    if (format == "fasta") {
+      fasta_file <-  microseq::readFasta(fasta_files[i])
+    }
+    if (format == "fastq") {
+      fasta_file <-  microseq::readFastq(fasta_files[i])
+    }
+    seq_vector <- fasta_file$Sequence
+    start_ind <- get_start_ind(seq_vector = seq_vector,
+                               length_vector = nchar(seq_vector),
+                               maxlen = n, step = step, train_mode = "lm")
+    nuc_seq <- paste(seq_vector, collapse = "")
+    split_seq <- strsplit(nuc_seq, "")[[1]]
+    nuc_seq_length <- nchar(nuc_seq)
+    gram <- split_seq[1 : (nuc_seq_length - n)]
+    if (n > 1){
+      for (j in 2:n){
+        gram <- paste0(gram, split_seq[j : (nuc_seq_length - n + j - 1)])
+      }
+    }
+    targets <- split_seq[(n + 1) : nuc_seq_length]
+    # remove sequences with overlapping fasta entries
+    gram <- gram[start_ind]
+    targets <- targets[start_ind]
+    # remove sequences with ambiguous nucleotides
+    amb_pos_gram <- c(1:(length(gram)))[stringr::str_detect(gram, paste0("[^", paste0(vocabulary, collapse = ""), "]"))]
+    amb_pos_targets <- c(1:(length(gram)))[stringr::str_detect(targets, paste0("[^", paste0(vocabulary, collapse = ""), "]"))]
+    amb_pos <- union(amb_pos_gram, amb_pos_targets)
+    if (length(amb_pos) > 0){
+      gram <- gram[-amb_pos]
+      targets <- targets[-amb_pos]
+    }
+    gram_df <- data.frame(gram = factor(gram, levels = labels),
+                          targets = factor(targets, levels = vocabulary))
+    table_df <- as.data.frame(table(gram_df))
+    stopifnot(all(freq_df$gram == table_df$gram) & all(freq_df$targets == table_df$targets))
+    freq_df$freq <- freq_df$freq + table_df$Freq
+  }
+  dist_matrix <- df_to_distribution_matrix(freq_df, vocabulary = vocabulary)
+  dist_matrix
+}
+df_to_distribution_matrix <- function(freq_df, vocabulary = c("A", "C", "G", "T")) {
+  stopifnot(names(freq_df) == c("gram", "targets", "freq"))
+  gram_levels <- levels(factor(freq_df$gram))
+  num_levels <- length(gram_levels)
+  dist_matrix <- matrix(0, nrow = num_levels, ncol = length(vocabulary))
+  dist_matrix <- as.data.frame(dist_matrix)
+  rownames(dist_matrix) <- as.character(freq_df$gram[1:nrow(dist_matrix)])
+  colnames(dist_matrix) <- vocabulary
+  for (nuc in vocabulary){
+    nuc_column <- freq_df %>% dplyr::filter(targets == nuc) %>% dplyr::select(gram, freq)
+    stopifnot(nuc_column$gram == rownames(dist_matrix))
+    dist_matrix[ , nuc] <- nuc_column$freq
+  }
+  dist_matrix$sum <- apply(dist_matrix, 1, sum)
+  non_zero <- dist_matrix$sum != 0
+  for (nuc in vocabulary) {
+    dist_matrix[non_zero, nuc] <- dist_matrix[non_zero, nuc]/dist_matrix$sum[non_zero]
+  }
+  dist_matrix[ , vocabulary]
+}
+#' Predict the next nucleotide using n-gram
+#'
+#' Predict the next nucleotide using n-gram.
+#'
+#' @inheritParams generator_fasta_lm
+#' @param path_input Path to folder containing fasta files or single fasta file.
+#' @param distribution_matrix A data frame containing frequency of next nucleotide given the previous n nucleotides (output of \code{\link{n_gram_dist}} function).
+#' @param default_pred Either character from vocabulary or `"random"`. Will be used as prediction if certain n-gram did not appear before.
+#' If `"random"` assign random prediction.
+#' @param vocabulary Vector of allowed characters, samples outside vocabulary get discarded.
+#' @param file_sample If integer, size of random sample of files in \code{path_input}.
+#' @param return_data_frames Boolean, whether to return data frame with input, predictions, target position and true target.
+#'
+#' @examples
+#' # create dummy fasta files
+#' temp_dir <- tempfile()
+#' dir.create(temp_dir)
+#' create_dummy_data(file_path = temp_dir,
+#'                   num_files = 3,
+#'                   seq_length = 8,
+#'                   vocabulary = c("A", "C", "G", "T"),
+#'                   num_seq = 2)
+#'
+#' m <- n_gram_dist(path_input = temp_dir,
+#'                  n = 3,
+#'                  step = 1,
+#'                  nuc_dist = FALSE)
+#'
+#' # use distribution matrix to make predictions for one file
+#' predictions <- predict_with_n_gram(path_input = list.files(temp_dir, full.names = TRUE)[1],
+#'                                    distribution_matrix = m)
+#'
+#' # show accuracy
+#' predictions[[1]]
+#'
+#' @returns List of prediction evaluations.
+#' @export
+predict_with_n_gram <- function(path_input, distribution_matrix, default_pred = "random", vocabulary = c("A", "C", "G", "T"),
+                                file_sample = NULL, format = "fasta", return_data_frames = FALSE, step = 1) {
+  n <- nchar(rownames(distribution_matrix)[1])
+  pred_int <- apply(distribution_matrix, 1, which.max)
+  # predict most common nucleotide if gram did not appear before
+  sum_columns <- apply(distribution_matrix, 2, sum)
+  zero_rows <- which(sum_columns == 0)
+  if (default_pred == "random") {
+    random_pred <- sample(1:length(vocabulary), length(zero_rows), replace = TRUE)
+    pred_int[zero_rows] <- random_pred
+  } else {
+    pred_int[zero_rows] <- which(vocabulary == default_pred)
+  }
+  # integer to nucleotide
+  pred <- vector("character")
+  for (i in 1:length(pred_int)){
+    pred[i] <- vocabulary[pred_int[i]]
+  }
+  model <- data.frame(gram = rownames(distribution_matrix), pred = pred)
+  if (endsWith(path_input, paste0(".", format))) {
+    num_files <- 1
+    fasta_files <- path_input
+  } else {
+    fasta_files <- list.files(
+      path = path_input,
+      pattern = paste0("\\.", format, "$"),
+      full.names = TRUE)
+    num_files <- length(fasta_files)
+  }
+  # take random subset of files
+  if (!is.null(file_sample)){
+    fasta_files <- sample(fasta_files)[1 : min(file_sample, length(fasta_files))]
+    num_files <- length(fasta_files)
+  }
+  labels <- rownames(distribution_matrix)
+  pred_df_list <- vector("list")
+  for (i in 1:num_files) {
+    if (format == "fasta") {
+      fasta_file <-  microseq::readFasta(fasta_files[i])
+    }
+    if (format == "fastq") {
+      fasta_file <-  microseq::readFastq(fasta_files[i])
+    }
+    seq_vector <- fasta_file$Sequence
+    start_ind <- get_start_ind(seq_vector = seq_vector,
+                               length_vector = nchar(seq_vector),
+                               maxlen = n, step = step, train_mode = "lm")
+    nuc_seq <- paste(seq_vector, collapse = "")
+    split_seq <- strsplit(nuc_seq, "")[[1]]
+    nuc_seq_length <- nchar(nuc_seq)
+    gram <- split_seq[1 : (nuc_seq_length - n)]
+    if (n > 1){
+      for (j in 2:n){
+        gram <- paste0(gram, split_seq[j : (nuc_seq_length - n + j - 1)])
+      }
+    }
+    targets <- split_seq[(n + 1) : nuc_seq_length]
+    # remove sequences with overlapping fasta entries
+    gram <- gram[start_ind]
+    targets <- targets[start_ind]
+    gram_df <- data.frame(gram = factor(gram, levels = labels),
+                          targets = factor(targets, levels = vocabulary),
+                          target_pos = start_ind + n)
+    # remove sequences with ambiguous nucleotides
+    gram_df <- gram_df[stats::complete.cases(gram_df), ]
+    pred_df <- dplyr::left_join(gram_df, model, by = "gram")
+    names(pred_df)[2] <- "true"
+    if (return_data_frames) {
+      pred_df_list[[i]] <- list(pred_df, accuracy = sum(pred_df$true == pred_df$pred)/nrow(pred_df))
+    } else {
+      pred_df_list[[i]] <- list(accuracy = sum(pred_df$true == pred_df$pred)/nrow(pred_df))
+    }
+  }
+  return(pred_df_list)
+}