--- a/R/preprocess.R
+++ b/R/preprocess.R
@@ -0,0 +1,1616 @@
+#' Encodes integer sequence for language model
+#'
+#' Helper function for \code{\link{generator_fasta_lm}}.
+#' Encodes integer sequence to input/target list according to \code{output_format} argument.
+#'
+#' @inheritParams generator_fasta_lm
+#' @param sequence Sequence of integers.
+#' @param start_ind Start positions of samples in \code{sequence}.
+#' @param ambiguous_nuc How to handle nucleotides outside vocabulary, either `"zero"`, `"empirical"` or `"equal"`.
+#' See \code{\link{train_model}}. Note that the `"discard"` option is not available for this function.
+#' @param nuc_dist Nucleotide distribution.
+#' @param max_cov Largest coverage value. Only applies if `use_coverage = TRUE`.
+#' @param cov_vector Vector of coverage values associated with the input.
+#' @param adjust_start_ind Whether to shift values in \code{start_ind} so they start at 1; for example, (5,11,25) becomes (1,7,21).
+#' @param quality_vector Vector of quality probabilities.
+#' @param tokenizer A keras tokenizer.
+#' @param char_sequence A character string.
+#' @examplesIf reticulate::py_module_available("tensorflow")
+#' # use integer sequence as input
+#'
+#' z <- seq_encoding_lm(sequence = c(1,0,5,1,3,4,3,1,4,1,2),
+#'                      maxlen = 5,
+#'                      vocabulary = c("a", "c", "g", "t"),
+#'                      start_ind = c(1,3),
+#'                      ambiguous_nuc = "equal",
+#'                      target_len = 1,
+#'                      output_format = "target_right")
+#'
+#' x <- z[[1]]
+#' y <- z[[2]]
+#'
+#' x[1,,] # 1,0,5,1,3
+#' y[1,] # 4
+#'
+#' x[2,,] # 5,1,3,4,3
+#' y[2,] # 1
+#'
+#' # use character string as input
+#' z <- seq_encoding_lm(sequence = NULL,
+#'                      maxlen = 5,
+#'                      vocabulary = c("a", "c", "g", "t"),
+#'                      start_ind = c(1,3),
+#'                      ambiguous_nuc = "zero",
+#'                      target_len = 1,
+#'                      output_format = "target_right",
+#'                      char_sequence = "ACTaaTNTNaZ")
+#'
+#' x <- z[[1]]
+#' y <- z[[2]]
+#'
+#' x[1,,] # actaa
+#' y[1,] # t
+#'
+#' x[2,,] # taatn
+#' y[2,] # t
+#'
+#' @returns A list of 2 tensors.
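+#' @details
+#' Sketch of the remaining `output_format` options, as implemented in the function
+#' body below: `"wavenet"` returns x and y of identical shape, with y shifted one
+#' position to the right; `"target_middle_lstm"` returns a list of two input tensors
+#' that flank the target position, the second one read in reversed order;
+#' `"target_middle_cnn"` returns the window with the target column removed as x and
+#' the removed position as y.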
+#' @export +seq_encoding_lm <- function(sequence = NULL, maxlen, vocabulary, start_ind, ambiguous_nuc = "zero", + nuc_dist = NULL, quality_vector = NULL, return_int = FALSE, + target_len = 1, use_coverage = FALSE, max_cov = NULL, cov_vector = NULL, + n_gram = NULL, n_gram_stride = 1, output_format = "target_right", + char_sequence = NULL, adjust_start_ind = FALSE, + tokenizer = NULL) { + + use_quality <- ifelse(is.null(quality_vector), FALSE, TRUE) + discard_amb_nt <- FALSE + ## TODO: add discard_amb_nt + if (!is.null(char_sequence)) { + + vocabulary <- stringr::str_to_lower(vocabulary) + pattern <- paste0("[^", paste0(vocabulary, collapse = ""), "]") + + + # token for ambiguous nucleotides + for (i in letters) { + if (!(i %in% stringr::str_to_lower(vocabulary))) { + amb_nuc_token <- i + break + } + } + + if (is.null(tokenizer)) { + tokenizer <- keras::fit_text_tokenizer(keras::text_tokenizer(char_level = TRUE, lower = TRUE, oov_token = "0"), c(vocabulary, amb_nuc_token)) + } + + sequence <- stringr::str_to_lower(char_sequence) + sequence <- stringr::str_replace_all(string = sequence, pattern = pattern, amb_nuc_token) + sequence <- keras::texts_to_sequences(tokenizer, sequence)[[1]] - 1 + } + + voc_len <- length(vocabulary) + if (target_len == 1) { + n_gram <- NULL + } + if (!is.null(n_gram)) { + if (target_len < n_gram) stop("target_len needs to be at least as big as n_gram") + } + + if (adjust_start_ind) start_ind <- start_ind - start_ind[1] + 1 + numberOfSamples <- length(start_ind) + + # every row in z one-hot encodes one character in sequence, oov is zero-vector + num_classes <- voc_len + 2 + z <- keras::to_categorical(sequence, num_classes = num_classes)[ , -c(1, num_classes)] + + if (use_quality) { + ones_pos <- apply(z, 1, which.max) + is_zero_row <- apply(z == 0, 1, all) + z <- purrr::map(1:length(quality_vector), ~create_quality_vector(pos = ones_pos[.x], prob = quality_vector[.x], + voc_length = length(vocabulary))) %>% unlist() %>% + matrix(ncol = length(vocabulary), byrow = TRUE) + z[is_zero_row, ] <- 0 + } + + if (ambiguous_nuc == "equal") { + amb_nuc_pos <- which(sequence == (voc_len + 1)) + z[amb_nuc_pos, ] <- matrix(rep(1/voc_len, ncol(z) * length(amb_nuc_pos)), ncol = ncol(z)) + } + + if (ambiguous_nuc == "empirical") { + if (!is.null(n_gram)) stop("Can only use equal, zero or discard option for ambiguous_nuc when using n_gram encoding") + amb_nuc_pos <- which(sequence == (voc_len + 1)) + z[amb_nuc_pos, ] <- matrix(rep(nuc_dist, length(amb_nuc_pos)), nrow = length(amb_nuc_pos), byrow = TRUE) + } + + if (use_coverage) { + z <- z * (cov_vector/max_cov) + } + + if (target_len == 1) { + + if (output_format == "target_right") { + x <- array(0, dim = c(numberOfSamples, maxlen, voc_len)) + for (i in 1:numberOfSamples) { + start <- start_ind[i] + x[i, , ] <- z[start : (start + maxlen - 1), ] + } + y <- z[start_ind + maxlen, ] + } + + if (output_format == "wavenet") { + if (!is.null(n_gram)) stop("Wavenet format not implemented for n_gram.") + x <- array(0, dim = c(numberOfSamples, maxlen, voc_len)) + y <- array(0, dim = c(numberOfSamples, maxlen, voc_len)) + for (i in 1:numberOfSamples) { + start <- start_ind[i] + x[i, , ] <- z[start : (start + maxlen - 1), ] + y[i, , ] <- z[(start + 1) : (start + maxlen), ] + } + } + + if (output_format == "target_middle_cnn") { + x <- array(0, dim = c(numberOfSamples, maxlen + 1, voc_len)) + for (i in 1:numberOfSamples) { + start <- start_ind[i] + x[i, , ] <- z[start : (start + maxlen), ] + } + missing_val <- ceiling(maxlen/2) + y <- 
z[start_ind + missing_val, ]
+      x <- x[ , -(missing_val + 1), ]
+    }
+
+    if (output_format == "target_middle_lstm") {
+      len_input_1 <- ceiling(maxlen/2)
+      len_input_2 <- floor(maxlen/2)
+      input_tensor_1 <- array(0, dim = c(numberOfSamples, len_input_1, voc_len))
+      input_tensor_2 <- array(0, dim = c(numberOfSamples, len_input_2, voc_len))
+      for (i in 1:numberOfSamples) {
+        start <- start_ind[i]
+        input_tensor_1[i, , ] <- z[start : (start + len_input_1 - 1), ]
+        input_tensor_2[i, , ] <- z[(start + maxlen) : (start + len_input_1 + 1), ]
+      }
+      if (!is.null(n_gram)) {
+        # subset along the second axis, not the whole dim vector
+        input_tensor_1 <- input_tensor_1[ , 1:(dim(input_tensor_1)[2] - n_gram + 1), ]
+        input_tensor_2 <- input_tensor_2[ , 1:(dim(input_tensor_2)[2] - n_gram + 1), ]
+      }
+      x <- list(input_tensor_1, input_tensor_2)
+      y <- z[start_ind + len_input_1, ]
+    }
+
+  }
+
+  if (target_len > 1) {
+
+    if (output_format == "target_right") {
+      x <- array(0, dim = c(numberOfSamples, maxlen - target_len + 1, voc_len))
+      for (i in 1:numberOfSamples) {
+        start <- start_ind[i]
+        x[i, , ] <- z[start : (start + maxlen - target_len), ]
+      }
+      y <- list()
+      for (i in 1:target_len) {
+        y[[i]] <- z[start_ind + maxlen - target_len + i, ]
+      }
+    }
+
+    if (output_format == "target_middle_cnn") {
+      x <- array(0, dim = c(numberOfSamples, maxlen + 1, voc_len))
+      for (i in 1:numberOfSamples) {
+        start <- start_ind[i]
+        x[i, , ] <- z[start : (start + maxlen), ]
+      }
+      missing_val <- ceiling((maxlen - target_len)/2)
+      y <- list()
+      for (i in 1:target_len) {
+        y[[i]] <- z[start_ind + missing_val + i - 1, ]
+      }
+      x <- x[ , -((missing_val + 1):(missing_val + target_len)), ]
+    }
+
+    if (output_format == "target_middle_lstm") {
+      len_input_1 <- ceiling((maxlen - target_len + 1)/2)
+      len_input_2 <- maxlen + 1 - len_input_1 - target_len
+      input_tensor_1 <- array(0, dim = c(numberOfSamples, len_input_1, voc_len))
+      input_tensor_2 <- array(0, dim = c(numberOfSamples, len_input_2, voc_len))
+      for (i in 1:numberOfSamples) {
+        start <- start_ind[i]
+        input_tensor_1[i, , ] <- z[start : (start + len_input_1 - 1), ]
+        input_tensor_2[i, , ] <- z[(start + maxlen) : (start + maxlen - len_input_2 + 1), ]
+      }
+
+      x <- list(input_tensor_1, input_tensor_2)
+      y <- list()
+      for (i in 1:target_len) {
+        y[[i]] <- z[start_ind + len_input_1 - 1 + i, ]
+      }
+    }
+
+    if (output_format == "wavenet") {
+      stop("Multi target not implemented for wavenet format.")
+    }
+  }
+
+  if (is.matrix(x)) {
+    x <- array(x, dim = c(1, dim(x)))
+  }
+
+  if (!is.null(n_gram)) {
+    if (is.list(y)) y <- do.call(rbind, y)
+    y_list <- list()
+    for (i in 1:numberOfSamples) {
+      index <- (i-1) + (1 + (0:(target_len-1))*numberOfSamples)
+      input_matrix <- y[index, ]
+      if (length(index) == 1) input_matrix <- matrix(input_matrix, nrow = 1)
+      n_gram_matrix <- n_gram_of_matrix(input_matrix = input_matrix, n = n_gram)
+      y_list[[i]] <- n_gram_matrix # tensorflow::tf$expand_dims(n_gram_matrix, axis = 0L)
+    }
+    y_tensor <- keras::k_stack(y_list, axis = 1L) %>% keras::k_eval()
+    y <- vector("list", dim(y_tensor)[2])
+
+    for (i in 1:dim(y_tensor)[2]) {
+      y_subset <- y_tensor[ , i, ]
+      if (numberOfSamples == 1) y_subset <- matrix(y_subset, nrow = 1)
+      y[[i]] <- y_subset
+    }
+
+    if (is.list(y) & length(y) == 1) {
+      y <- y[[1]]
+    }
+
+    if (n_gram_stride > 1 & is.list(y)) {
+      stride_index <- 0:(length(y)-1) %% n_gram_stride == 0
+      y <- y[stride_index]
+    }
+  }
+
+  return(list(x, y))
+}
+
+#' Encodes integer sequence for label classification
+#'
+#' Returns encoding for integer or character sequence.
+#'
+#' @inheritParams seq_encoding_lm
+#' @inheritParams generator_fasta_lm
+#' @inheritParams train_model
+#' @param return_int Whether to return integer encoding or one-hot encoding.
+#' @examplesIf reticulate::py_module_available("tensorflow")
+#' # use integer sequence as input
+#' x <- seq_encoding_label(sequence = c(1,0,5,1,3,4,3,1,4,1,2),
+#'                         maxlen = 5,
+#'                         vocabulary = c("a", "c", "g", "t"),
+#'                         start_ind = c(1,3),
+#'                         ambiguous_nuc = "equal")
+#'
+#' x[1,,] # 1,0,5,1,3
+#'
+#' x[2,,] # 5,1,3,4,3
+#'
+#' # use character string as input
+#' x <- seq_encoding_label(maxlen = 5,
+#'                         vocabulary = c("a", "c", "g", "t"),
+#'                         start_ind = c(1,3),
+#'                         ambiguous_nuc = "equal",
+#'                         char_sequence = "ACTaaTNTNaZ")
+#'
+#' x[1,,] # actaa
+#'
+#' x[2,,] # taatn
+#'
+#' @returns A tensor of encodings or, if \code{masked_lm} is used, a list of input tensor,
+#' target tensor and sample weights.
+#' @export
+seq_encoding_label <- function(sequence = NULL, maxlen, vocabulary, start_ind, ambiguous_nuc = "zero", nuc_dist = NULL,
+                               quality_vector = NULL, use_coverage = FALSE, max_cov = NULL,
+                               cov_vector = NULL, n_gram = NULL, n_gram_stride = 1, masked_lm = NULL,
+                               char_sequence = NULL, tokenizer = NULL, adjust_start_ind = FALSE,
+                               return_int = FALSE) {
+
+  ## TODO: add discard_amb_nt, add conditions for return_int
+  use_quality <- ifelse(is.null(quality_vector), FALSE, TRUE)
+  discard_amb_nt <- FALSE
+  maxlen_original <- maxlen
+  if (return_int) ambiguous_nuc <- "zero"
+
+  if (!is.null(char_sequence)) {
+
+    vocabulary <- stringr::str_to_lower(vocabulary)
+    pattern <- paste0("[^", paste0(vocabulary, collapse = ""), "]")
+
+    # token for ambiguous nucleotides
+    for (i in letters) {
+      if (!(i %in% stringr::str_to_lower(vocabulary))) {
+        amb_nuc_token <- i
+        break
+      }
+    }
+
+    if (is.null(tokenizer)) {
+      tokenizer <- keras::fit_text_tokenizer(keras::text_tokenizer(char_level = TRUE, lower = TRUE, oov_token = "0"), c(vocabulary, amb_nuc_token))
+    }
+
+    sequence <- stringr::str_to_lower(char_sequence)
+    sequence <- stringr::str_replace_all(string = sequence, pattern = pattern, amb_nuc_token)
+    sequence <- keras::texts_to_sequences(tokenizer, sequence)[[1]] - 1
+  }
+
+  if (adjust_start_ind) start_ind <- start_ind - start_ind[1] + 1
+  numberOfSamples <- length(start_ind)
+
+  if (is.null(n_gram_stride)) n_gram_stride <- 1
+  voc_len <- length(vocabulary)
+  if (!is.null(n_gram)) {
+    sequence <- int_to_n_gram(int_seq = sequence, n = n_gram, voc_size = length(vocabulary))
+    maxlen <- ceiling((maxlen - n_gram + 1)/n_gram_stride)
+    voc_len <- length(vocabulary)^n_gram
+  }
+
+  if (!is.null(masked_lm)) {
+    l <- mask_seq(int_seq = sequence,
+                  mask_rate = masked_lm$mask_rate,
+                  random_rate = masked_lm$random_rate,
+                  identity_rate = masked_lm$identity_rate,
+                  start_ind = start_ind,
+                  block_len = masked_lm$block_len,
+                  voc_len = voc_len)
+    masked_seq <- l$masked_seq
+    sample_weight_seq <- l$sample_weight_seq
+  }
+
+  if (!return_int) {
+    if (!is.null(masked_lm)) {
+      # every row one-hot encodes one character; the mask/oov token keeps its own column here
+      z_masked <- keras::to_categorical(masked_seq, num_classes = voc_len + 2)[ , -c(1)]
+      z_masked <- matrix(z_masked, ncol = voc_len + 1)
+      z <- keras::to_categorical(sequence, num_classes = voc_len + 2)[ , -c(1)]
+      z <- matrix(z, ncol = voc_len + 1)
+    } else {
+      # every row in z one-hot encodes one character in sequence, oov is zero-vector
+      z <- keras::to_categorical(sequence, num_classes = voc_len + 2)[ , -c(1, voc_len + 2)]
+      z <- matrix(z, ncol = voc_len)
+    }
+  }
+
+  if (use_quality) {
+    ones_pos <- apply(z, 1, which.max)
+    is_zero_row <- apply(z == 0, 1, all)
+    z <- purrr::map(1:length(quality_vector), ~create_quality_vector(pos = ones_pos[.x], prob = quality_vector[.x],
+                                                                     voc_length = voc_len)) %>% unlist() %>% matrix(ncol = voc_len, byrow = TRUE)
+    z[is_zero_row, ] <- 0
+  }
+
+  if (ambiguous_nuc == "equal") {
+    amb_nuc_pos <- which(sequence == (voc_len + 1))
+    z[amb_nuc_pos, ] <- matrix(rep(1/voc_len, ncol(z) * length(amb_nuc_pos)), ncol = ncol(z))
+  }
+
+  if (ambiguous_nuc == "empirical") {
+    amb_nuc_pos <- which(sequence == (voc_len + 1))
+    z[amb_nuc_pos, ] <- matrix(rep(nuc_dist, length(amb_nuc_pos)), nrow = length(amb_nuc_pos), byrow = TRUE)
+  }
+
+  if (use_coverage) {
+    z <- z * (cov_vector/max_cov)
+  }
+
+  remove_end_of_seq <- ifelse(is.null(n_gram), 1, n_gram)
+
+  if (!return_int) {
+    if (is.null(masked_lm)) {
+
+      x <- array(0, dim = c(numberOfSamples, maxlen, voc_len))
+      for (i in 1:numberOfSamples) {
+        start <- start_ind[i]
+        subset_index <- seq(start, (start + maxlen_original - remove_end_of_seq), by = n_gram_stride)
+        x[i, , ] <- z[subset_index, ]
+      }
+      return(x)
+
+    } else {
+
+      x <- array(0, dim = c(numberOfSamples, maxlen, voc_len + 1))
+      y <- array(0, dim = c(numberOfSamples, maxlen, voc_len + 1))
+      sw <- array(0, dim = c(numberOfSamples, maxlen))
+
+      for (i in 1:numberOfSamples) {
+        start <- start_ind[i]
+        subset_index <- seq(start, (start + maxlen_original - remove_end_of_seq), by = n_gram_stride)
+        x[i, , ] <- z_masked[subset_index, ]
+        y[i, , ] <- z[subset_index, ]
+        sw[i, ] <- sample_weight_seq[subset_index]
+      }
+      return(list(x=x, y=y, sample_weight=sw))
+
+    }
+  }
+
+  if (return_int) {
+    if (is.null(masked_lm)) {
+
+      x <- array(0, dim = c(numberOfSamples, maxlen))
+      for (i in 1:numberOfSamples) {
+        start <- start_ind[i]
+        subset_index <- seq(start, (start + maxlen_original - remove_end_of_seq), by = n_gram_stride)
+        x[i, ] <- sequence[subset_index]
+      }
+      return(x)
+
+    } else {
+      x <- array(0, dim = c(numberOfSamples, maxlen))
+      y <- array(0, dim = c(numberOfSamples, maxlen))
+      sw <- array(0, dim = c(numberOfSamples, maxlen))
+      for (i in 1:numberOfSamples) {
+        start <- start_ind[i]
+        subset_index <- seq(start, (start + maxlen_original - remove_end_of_seq), by = n_gram_stride)
+        x[i, ] <- masked_seq[subset_index]
+        y[i, ] <- sequence[subset_index]
+        sw[i, ] <- sample_weight_seq[subset_index]
+      }
+      return(list(x=x, y=y, sample_weight=sw))
+
+    }
+  }
+
+}
+
+#' Computes start position of samples
+#'
+#' Helper function for data generators.
+#' Computes start positions in sequence where samples can be extracted, given maxlen, step size and ambiguous nucleotide constraints.
+#'
+#' @inheritParams train_model
+#' @param seq_vector Vector of character sequences.
+#' @param length_vector Length of sequences in \code{seq_vector}.
+#' @param maxlen Length of one predictor sequence.
+#' @param step Distance between samples from one entry in \code{seq_vector}.
+#' @param train_mode Either `"lm"` for language model or `"label"` for label classification.
+#' @param discard_amb_nuc Whether to discard all samples that contain characters outside vocabulary.
+#' @examples
+#' seq_vector <- c("AAACCCNNNGGGTTT")
+#' get_start_ind(
+#'   seq_vector = seq_vector,
+#'   length_vector = nchar(seq_vector),
+#'   maxlen = 4,
+#'   step = 2,
+#'   train_mode = "label",
+#'   discard_amb_nuc = TRUE,
+#'   vocabulary = c("A", "C", "G", "T"))
+#'
+#' @returns A numeric vector.
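+#' @details
+#' For `train_mode = "lm"` one additional position is reserved for the target, so
+#' the last valid start index is one smaller than for `train_mode = "label"`.
+#' A minimal sketch (no ambiguity filtering):
+#' \preformatted{
+#' get_start_ind(seq_vector = "AAACCCGGGTTT", length_vector = 12,
+#'               maxlen = 4, step = 2, train_mode = "lm")
+#' # 1 3 5 7
+#' }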
+#' @export +get_start_ind <- function(seq_vector, length_vector, maxlen, + step, train_mode = "label", + discard_amb_nuc = FALSE, + vocabulary = c("A", "C", "G", "T")) { + + stopifnot(train_mode == "lm" | train_mode == "label") + if (!discard_amb_nuc) { + if (length(length_vector) > 1) { + startNewEntry <- cumsum(c(1, length_vector[-length(length_vector)])) + if (train_mode == "label") { + indexVector <- purrr::map(1:(length(length_vector) - 1), ~seq(startNewEntry[.x], startNewEntry[.x + 1] - maxlen, by = step)) + } else { + indexVector <- purrr::map(1:(length(length_vector) - 1), ~seq(startNewEntry[.x], startNewEntry[.x + 1] - maxlen - 1, by = step)) + } + indexVector <- unlist(indexVector) + last_seq <- length(seq_vector) + if (!(startNewEntry[last_seq] > (sum(length_vector) - maxlen + 1))) { + if (train_mode == "label") { + indexVector <- c(indexVector, seq(startNewEntry[last_seq], sum(length_vector) - maxlen + 1, by = step)) + } else { + indexVector <- c(indexVector, seq(startNewEntry[last_seq], sum(length_vector) - maxlen, by = step)) + } + } + return(indexVector) + } else { + if (train_mode == "label") { + indexVector <- seq(1, length_vector - maxlen + 1, by = step) + } else { + indexVector <- seq(1, length_vector - maxlen, by = step) + } + } + } else { + indexVector <- start_ind_ignore_amb(seq_vector = seq_vector, length_vector = length_vector, + maxlen = maxlen, step = step, vocabulary = c(vocabulary, "0"), train_mode = train_mode) + } + return(indexVector) +} + + +#' Helper function for get_start_ind, extracts the start positions of all potential samples (considering step size and vocabulary) +#' +#' @param seq Sequences. +#' @param maxlen Length of one sample. +#' @param step How often to take a sample. +#' @param vocabulary Vector of allowed characters in samples. +#' @param train_mode "lm" or "label". 
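+#' @details
+#' Worked sketch: for `seq = "AANCC"`, `maxlen = 2`, `step = 1` and
+#' `train_mode = "label"`, the "N" at position 3 blocks start indices 2 and 3 and
+#' the sequence end blocks index 5, so the function returns c(1, 4)
+#' (windows "AA" and "CC").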
+#' @noRd
+start_ind_ignore_amb_single_seq <- function(seq, maxlen, step, vocabulary, train_mode = "lm") {
+
+  vocabulary <- stringr::str_to_lower(vocabulary)
+  vocabulary <- c(vocabulary, "0")
+  seq <- stringr::str_to_lower(seq)
+  len_seq <- nchar(seq)
+  if (train_mode != "label") maxlen <- maxlen + 1
+  stopifnot(len_seq >= maxlen)
+  # regular expression for characters outside the vocabulary
+  voc_pattern <- paste0("[^", paste0(vocabulary, collapse = ""), "]")
+
+  pos_of_amb_nucleotides <- stringr::str_locate_all(seq, pattern = voc_pattern)[[1]][ , 1]
+  non_start_index <- pos_of_amb_nucleotides - maxlen + 1
+
+  # define range of unallowed start indices
+  non_start_index <- purrr::map(non_start_index, ~(.x:(.x + maxlen - 1))) %>%
+    unlist() %>% union((len_seq - maxlen + 2):len_seq) %>% unique()
+  # drop non-positive values
+  if (length(non_start_index[non_start_index < 1]) > 0) {
+    non_start_index <- unique(c(1, non_start_index[non_start_index >= 1]))
+  }
+
+  non_start_index <- non_start_index %>% sort()
+  allowed_start <- setdiff(1:len_seq, non_start_index)
+  len_start_vector <- length(allowed_start)
+
+  if (len_start_vector < 1) {
+    # message("Can not extract a single sampling point with current settings.")
+    return(NULL)
+  }
+
+  # only keep indices with sufficient distance, as defined by step
+  start_indices <- vector("integer")
+  index <- allowed_start[1]
+  start_indices[1] <- index
+  count <- 1
+  if (length(allowed_start) > 1) {
+    for (j in 1:(length(allowed_start) - 1)) {
+      if (allowed_start[j + 1] - index >= step) {
+        count <- count + 1
+        start_indices[count] <- allowed_start[j + 1]
+        index <- allowed_start[j + 1]
+      }
+    }
+  }
+
+  start_indices
+}
+
+#' Helper function for get_start_ind, extracts the start positions of all potential samples (considering step size and vocabulary)
+#'
+#' @param seq_vector Vector of character sequences.
+#' @param length_vector Lengths of the sequences in \code{seq_vector}.
+#' @param maxlen Length of one sample.
+#' @param step Distance between successive samples.
+#' @param vocabulary Vector of allowed characters in samples.
+#' @param train_mode "lm" or "label".
+#' @noRd
+start_ind_ignore_amb <- function(seq_vector, length_vector, maxlen, step, vocabulary, train_mode = "lm") {
+  start_ind <- purrr::map(1:length(seq_vector), ~start_ind_ignore_amb_single_seq(seq = seq_vector[.x],
+                                                                                 maxlen = maxlen,
+                                                                                 step = step,
+                                                                                 vocabulary = vocabulary,
+                                                                                 train_mode = train_mode))
+
+  cum_sum_length <- cumsum(length_vector)
+  if (length(start_ind) > 1) {
+    for (i in 2:length(start_ind)) {
+      start_ind[[i]] <- start_ind[[i]] + cum_sum_length[i - 1]
+    }
+  }
+  start_ind <- unlist(start_ind)
+  start_ind
+}
+
+quality_to_probability <- function(quality_vector) {
+  Q <- utf8ToInt(quality_vector) - 33
+  1 - 10^(-Q/10)
+}
+
+create_quality_vector <- function(pos, prob, voc_length = 4) {
+  vec <- rep(0, voc_length)
+  vec[pos] <- prob
+  vec[-pos] <- (1 - prob)/(voc_length - 1)
+  vec
+}
+
+remove_amb_nuc_entries <- function(fasta.file, skip_amb_nuc, pattern) {
+  chars_per_row <- nchar(fasta.file$Sequence)
+  amb_per_row <- stringr::str_count(stringr::str_to_lower(fasta.file$Sequence), pattern)
+  threshold_index <- (amb_per_row/chars_per_row) > skip_amb_nuc
+  fasta.file <- fasta.file[!threshold_index, ]
+  fasta.file
+}
+
+#' Estimate frequency of different classes
+#'
+#' Counts the number of nucleotides for each class and uses the counts to estimate the class distribution.
+#' Outputs a list of class weights. Can be used as input for \code{class_weight} in the \code{\link{train_model}} function.
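+#' Weights are computed as `total_count / (class_count * num_classes)`, i.e. rarer
+#' classes receive proportionally larger weights.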
+#'
+#' @inheritParams generator_fasta_lm
+#' @inheritParams generator_fasta_label_header_csv
+#' @inheritParams train_model
+#' @param file_proportion Proportion of files to randomly sample for estimating class distributions.
+#' @param csv_path If `train_type = "label_csv"`, path to csv file containing labels.
+#' @param named_list Whether to give the class weight list names `"0", "1", ...` or no names.
+#' @examples
+#'
+#' # create dummy data
+#' path_1 <- tempfile()
+#' path_2 <- tempfile()
+#'
+#' for (current_path in c(path_1, path_2)) {
+#'
+#'   dir.create(current_path)
+#'   # create twice as much data for first class
+#'   num_files <- ifelse(current_path == path_1, 6, 3)
+#'   create_dummy_data(file_path = current_path,
+#'                     num_files = num_files,
+#'                     seq_length = 10,
+#'                     num_seq = 5,
+#'                     vocabulary = c("a", "c", "g", "t"))
+#' }
+#'
+#' class_weight <- get_class_weight(
+#'   path = c(path_1, path_2),
+#'   vocabulary_label = c("A", "B"),
+#'   format = "fasta",
+#'   file_proportion = 1,
+#'   train_type = "label_folder",
+#'   csv_path = NULL)
+#'
+#' class_weight
+#'
+#' @returns A list of numeric values (class weights).
+#' @export
+get_class_weight <- function(path,
+                             vocabulary_label = NULL,
+                             format = "fasta",
+                             file_proportion = 1,
+                             train_type = "label_folder",
+                             named_list = FALSE,
+                             csv_path = NULL) {
+
+  classes <- count_nuc(path = path,
+                       vocabulary_label = vocabulary_label,
+                       format = format,
+                       file_proportion = file_proportion,
+                       train_type = train_type,
+                       csv_path = csv_path)
+
+  zero_entry <- classes == 0
+  if (sum(zero_entry) > 0) {
+    warning_message <- paste("The following classes have no samples:", paste(vocabulary_label[zero_entry], collapse = " "),
+                             "\n Try bigger file_proportion size or check vocabulary_label.")
+    warning(warning_message)
+  }
+
+  if (!is.list(classes)) {
+    num_classes <- length(classes)
+    total <- sum(classes)
+    weight_list <- list()
+    for (i in 1:(length(classes))) {
+      weight_list[[as.character(i-1)]] <- total/(classes[i] * num_classes)
+    }
+    if (!named_list) names(weight_list) <- NULL # no list names in tf version > 2.8
+    classes <- weight_list
+  } else {
+    weight_collection <- list()
+    for (j in 1:length(classes)) {
+      num_classes <- length(classes[[j]])
+      total <- sum(classes[[j]])
+      weight_list <- list()
+      for (i in 1:(length(classes[[j]]))) {
+        weight_list[[as.character(i-1)]] <- total/(classes[[j]][i] * num_classes)
+      }
+      if (!named_list) names(weight_list) <- NULL
+      weight_collection[[j]] <- weight_list
+    }
+    classes <- weight_collection
+  }
+
+  classes
+}
+
+#' Count nucleotides per class
+#'
+#' @inheritParams get_class_weight
+#' @noRd
+count_nuc <- function(path,
+                      vocabulary_label = NULL,
+                      format = "fasta",
+                      # estimate class distribution from subset
+                      file_proportion = 1,
+                      train_type = "label_folder",
+                      csv_path = NULL) {
+
+  classes <- rep(0, length(vocabulary_label))
+  names(classes) <- vocabulary_label
+
+  # label by folder
+  if (train_type == "label_folder") {
+    for (j in 1:length(path)) {
+      files <- list.files(path[[j]], full.names = TRUE)
+      if (file_proportion < 1) {
+        files <- sample(files, floor(file_proportion * length(files)))
+      }
+      for (i in files) {
+        if (format == "fasta") {
+          fasta.file <- microseq::readFasta(i)
+        }
+        if (format == "fastq") {
+          fasta.file <- microseq::readFastq(i)
+        }
+        freq <- sum(nchar(fasta.file$Sequence))
+        classes[j] <- classes[j] + freq
+      }
+    }
+  }
+
+  # label header
+  if (train_type == "label_header") {
+    files <- list.files(unlist(path), full.names = TRUE)
+    if (file_proportion < 1) {
+      files <- sample(files, floor(file_proportion *
length(files))) + } + for (i in files) { + if (format == "fasta") { + fasta.file <- microseq::readFasta(i) + } + if (format == "fastq") { + fasta.file <- microseq::readFastq(i) + } + df <- data.frame(Header = fasta.file$Header, freq = nchar(fasta.file$Sequence)) + df <- stats::aggregate(df$freq, by = list(Category = df$Header), FUN = sum) + freq <- df$x + names(freq) <- df$Category + for (k in names(freq)) { + classes[k] <- classes[k] + freq[k] + } + } + } + + # label csv + if (train_type == "label_csv") { + + label_csv <- utils::read.csv2(csv_path, header = TRUE, stringsAsFactors = FALSE) + if (dim(label_csv)[2] == 1) { + label_csv <- utils::read.csv(csv_path, header = TRUE, stringsAsFactors = FALSE) + } + if (!("file" %in% names(label_csv))) { + stop('csv file needs one column named "file"') + } + + row_sums <- label_csv %>% dplyr::select(-file) %>% rowSums() + if (!(all(row_sums == 1))) { + stop("Can only estimate class weights if labels are mutually exclusive.") + } + + if (is.null(vocabulary_label) || missing(vocabulary_label)) { + vocabulary_label <- names(label_csv)[!names(label_csv) == "file"] + } else { + label_csv <- label_csv %>% dplyr::select(c(dplyr::all_of(vocabulary_label), "file")) + } + + classes <- rep(0, length(vocabulary_label)) + names(classes) <- vocabulary_label + + path <- unlist(path) + single_file_index <- stringr::str_detect(path, "fasta$|fastq$") + files <- c(list.files(path[!single_file_index], full.names = TRUE), path[single_file_index]) + if (file_proportion < 1) { + files <- sample(files, floor(file_proportion * length(files))) + } + for (i in files) { + if (format == "fasta") { + fasta.file <- microseq::readFasta(i) + } + if (format == "fastq") { + fasta.file <- microseq::readFastq(i) + } + count_nuc <- sum(nchar(fasta.file$Sequence)) + df <- label_csv %>% dplyr::filter(file == basename(i)) + if (nrow(df) == 0) next + index <- df[1, ] == 1 + current_label <- names(df)[index] + classes[current_label] <- classes[current_label] + count_nuc + } + } + return(classes) +} + +read_fasta_fastq <- function(format, skip_amb_nuc, file_index, pattern, shuffle_input, + reverse_complement, fasta.files, use_coverage = FALSE, proportion_entries = NULL, + vocabulary_label = NULL, filter_header = FALSE, target_from_csv = NULL) { + + if (stringr::str_detect(format, "fasta")) { + if (is.null(skip_amb_nuc)) { + fasta.file <- microseq::readFasta(fasta.files[file_index]) + } else { + fasta.file <- remove_amb_nuc_entries(microseq::readFasta(fasta.files[file_index]), skip_amb_nuc = skip_amb_nuc, + pattern = pattern) + } + + if (filter_header & is.null(target_from_csv)) { + label_vector <- trimws(stringr::str_to_lower(fasta.file$Header)) + label_filter <- label_vector %in% vocabulary_label + fasta.file <- fasta.file[label_filter, ] + } + + if (!is.null(proportion_entries) && proportion_entries < 1) { + index <- sample(nrow(fasta.file), max(1, floor(nrow(fasta.file) * proportion_entries))) + fasta.file <- fasta.file[index, ] + } + + if (shuffle_input) { + fasta.file <- fasta.file[sample(nrow(fasta.file)), ] + } + + if (reverse_complement) { + index <- sample(c(TRUE, FALSE), nrow(fasta.file), replace = TRUE) + fasta.file$Sequence[index] <- microseq::reverseComplement(fasta.file$Sequence[index]) + } + + } + + if (stringr::str_detect(format, "fastq")) { + if (is.null(skip_amb_nuc)) { + fasta.file <- microseq::readFastq(fasta.files[file_index]) + } else { + fasta.file <- remove_amb_nuc_entries(microseq::readFastq(fasta.files[file_index]), skip_amb_nuc = skip_amb_nuc, + pattern = pattern) 
+    }
+
+    if (filter_header & is.null(target_from_csv)) {
+      label_vector <- trimws(stringr::str_to_lower(fasta.file$Header))
+      label_filter <- label_vector %in% vocabulary_label
+      fasta.file <- fasta.file[label_filter, ]
+    }
+
+    if (!is.null(proportion_entries) && proportion_entries < 1) {
+      index <- sample(nrow(fasta.file), max(1, floor(nrow(fasta.file) * proportion_entries)))
+      fasta.file <- fasta.file[index, ]
+    }
+
+    if (shuffle_input) {
+      fasta.file <- fasta.file[sample(nrow(fasta.file)), ]
+    }
+
+    if (reverse_complement & sample(c(TRUE, FALSE), 1)) {
+      fasta.file$Sequence <- microseq::reverseComplement(fasta.file$Sequence)
+    }
+  }
+  return(fasta.file)
+}
+
+input_from_csv <- function(added_label_path) {
+  .datatable.aware = TRUE
+  label_csv <- utils::read.csv2(added_label_path, header = TRUE, stringsAsFactors = FALSE)
+  if (dim(label_csv)[2] == 1) {
+    label_csv <- utils::read.csv(added_label_path, header = TRUE, stringsAsFactors = FALSE)
+  }
+  # check for the required column before indexing by it
+  if (!("file" %in% names(label_csv))) {
+    stop('names in added_label_path should contain one column named "file"')
+  }
+  label_csv <- data.table::as.data.table(label_csv)
+  label_csv$file <- stringr::str_to_lower(as.character(label_csv$file))
+  data.table::setkey(label_csv, file)
+  added_label_by_header <- FALSE
+
+  col_name <- ifelse(added_label_by_header, "header", "file")
+  return(list(label_csv = label_csv, col_name = col_name))
+}
+
+#' @rawNamespace import(data.table, except = c(first, last, between))
+#' @noRd
+csv_to_tensor <- function(label_csv, added_label_vector, added_label_by_header, batch_size,
+                          start_index_list) {
+  .datatable.aware = TRUE
+  label_tensor <- matrix(0, ncol = ncol(label_csv) - 1, nrow = batch_size, byrow = TRUE)
+
+  if (added_label_by_header) {
+    header_unique <- unique(added_label_vector)
+    for (i in header_unique) {
+      label_from_csv <- label_csv[ .(i), -"header"]
+      index_label_vector <- added_label_vector == i
+      if (nrow(label_from_csv) > 0) {
+        label_tensor[index_label_vector, ] <- matrix(as.matrix(label_from_csv[1, ]),
+                                                     nrow = sum(index_label_vector), ncol = ncol(label_tensor), byrow = TRUE)
+      }
+    }
+  } else {
+    row_index <- 1
+    for (i in 1:length(added_label_vector)) {
+      row_filter <- added_label_vector[i]
+      label_from_csv <- label_csv[data.table(row_filter), -"file"]
+      samples_per_file <- length(start_index_list[[i]])
+      assign_rows <- row_index:(row_index + samples_per_file - 1)
+
+      if (nrow(stats::na.omit(label_from_csv)) > 0) {
+        label_tensor[assign_rows, ] <- matrix(as.matrix(label_from_csv[1, ]),
+                                              nrow = samples_per_file, ncol = ncol(label_tensor), byrow = TRUE)
+      }
+      row_index <- row_index + samples_per_file
+    }
+  }
+  return(label_tensor)
+}
+
+#' Divide tensor to list of subsets
+#'
+#' @noRd
+slice_tensor <- function(tensor, target_split) {
+
+  num_row <- nrow(tensor)
+  l <- vector("list", length = length(target_split))
+  for (i in 1:length(target_split)) {
+    if (length(target_split[[i]]) == 1 | num_row == 1) {
+      l[[i]] <- matrix(tensor[ , target_split[[i]]], ncol = length(target_split[[i]]))
+    } else {
+      l[[i]] <- tensor[ , target_split[[i]]]
+    }
+  }
+  return(l)
+}
+
+check_header_names <- function(target_split, vocabulary_label) {
+  target_split <- unlist(target_split)
+  if (!all(target_split %in% vocabulary_label)) {
+    stop_text <- paste("Your csv file has no columns named",
+                       paste(target_split[!(target_split %in% vocabulary_label)], collapse = " "))
+    stop(stop_text)
+  }
+  if (!all(vocabulary_label %in% target_split)) {
+    warning_text <- paste("target_split does not cover the following columns:",
+                          paste(vocabulary_label[!(vocabulary_label %in% target_split)], collapse = " "))
+    warning(warning_text)
+  }
+}
+
+count_files <- function(path, format = "fasta", train_type,
+                        target_from_csv = NULL, train_val_split_csv = NULL) {
+
+  num_files <- rep(0, length(path))
+  if (!is.null(target_from_csv) & train_type == "label_csv") {
+    target_files <- utils::read.csv(target_from_csv)
+    if (ncol(target_files) == 1) target_files <- utils::read.csv2(target_from_csv)
+    target_files <- target_files$file
+    # are files given with absolute path
+    full.names <- ifelse(dirname(target_files[1]) == ".", FALSE, TRUE)
+  }
+  if (!is.null(train_val_split_csv)) {
+    tvt_files <- utils::read.csv(train_val_split_csv)
+    if (ncol(tvt_files) == 1) tvt_files <- utils::read.csv2(train_val_split_csv)
+    train_index <- tvt_files$type == "train"
+    tvt_files <- tvt_files$file
+    # target_files only exists if a target_from_csv file was given
+    if (is.null(target_from_csv)) {
+      target_files <- tvt_files[train_index]
+    } else {
+      target_files <- intersect(tvt_files[train_index], target_files)
+    }
+  }
+
+  for (i in 1:length(path)) {
+    for (k in 1:length(path[[i]])) {
+      current_path <- path[[i]][[k]]
+
+      if (!is.null(train_val_split_csv)) {
+        if (!(current_path %in% target_files)) next
+      }
+
+      if (endsWith(current_path, paste0(".", format))) {
+        # remove files not in csv file
+        if (!is.null(target_from_csv)) {
+          current_files <- length(intersect(basename(target_files), basename(current_path)))
+        } else {
+          current_files <- 1
+        }
+      } else {
+        # remove files not in csv file
+        if (!is.null(target_from_csv)) {
+          current_files <- list.files(current_path, pattern = paste0(".", format, "$"), full.names = full.names) %>%
+            intersect(target_files) %>% length()
+        } else {
+          current_files <- list.files(current_path, pattern = paste0(".", format, "$")) %>% length()
+        }
+      }
+      num_files[i] <- num_files[i] + current_files
+
+      if (current_files == 0) {
+        stop(paste0(path[[i]][[k]], " is empty or contains no files with .", format, " ending in this directory"))
+      }
+    }
+  }
+
+  # return number of files per class for "label_folder"
+  if (train_type == "label_folder") {
+    return(num_files)
+  } else {
+    return(sum(num_files))
+  }
+}
+
+list_fasta_files <- function(path_corpus, format, file_filter) {
+
+  fasta.files <- list()
+  path_corpus <- unlist(path_corpus)
+
+  for (i in 1:length(path_corpus)) {
+
+    if (endsWith(path_corpus[[i]], paste0(".", format))) {
+      fasta.files[[i]] <- path_corpus[[i]]
+
+    } else {
+
+      fasta.files[[i]] <- list.files(
+        path = path_corpus[[i]],
+        pattern = paste0("\\.", format, "$"),
+        full.names = TRUE)
+    }
+  }
+  fasta.files <- unlist(fasta.files)
+  num_files <- length(fasta.files)
+
+  if (!is.null(file_filter)) {
+
+    # file filter files given with/without absolute path
+    if (all(basename(file_filter) == file_filter)) {
+      fasta.files <- fasta.files[basename(fasta.files) %in% file_filter]
+    } else {
+      fasta.files <- fasta.files[fasta.files %in% file_filter]
+    }
+
+    if (length(fasta.files) < 1) {
+      stop_text <- paste0("None of the files from ", unlist(path_corpus),
+                          " are present in train_val_split_csv table for either train or validation.\n")
+      stop(stop_text)
+    }
+  }
+
+  fasta.files <- gsub(pattern="/+", replacement="/", x = fasta.files)
+  fasta.files <- gsub(pattern="/$", replacement="", x = fasta.files)
+  return(fasta.files)
+}
+
+get_coverage <- function(fasta.file) {
+  header <- fasta.file$Header
+  cov <- stringr::str_extract(header, "cov_\\d+") %>%
+    stringr::str_extract("\\d+") %>% as.integer()
+  cov[is.na(cov)] <- 1
+  return(cov)
+}
+
+get_coverage_concat <- function(fasta.file, concat_seq) {
+  header <- fasta.file$Header
+  cov <- stringr::str_extract(header, "cov_\\d+") %>%
+    stringr::str_extract("\\d+") %>% as.integer()
+  cov[is.na(cov)] <- 1
+  len_vec <- nchar(fasta.file$Sequence)
+  cov <- purrr::map(1:nrow(fasta.file), ~rep(cov[.x], times = len_vec[.x]))
+  cov <- lapply(cov, append, rep(1, nchar(concat_seq)))
+  cov <- unlist(cov)
+  # drop the trailing concat_seq entries appended after the last sequence
+  cov <- cov[-((length(cov) - nchar(concat_seq) + 1) : length(cov))]
+  return(cov)
+}
+
+#' Reshape tensors for set learning
+#'
+#' Reshape input x and target y. Aggregates multiple samples from x and y into single input/target batches.
+#'
+#' @param x 3D input tensor.
+#' @param y 2D target tensor.
+#' @param samples_per_target How many samples to use for one target.
+#' @param reshape_mode `"time_dist"`, `"multi_input"` or `"concat"`.
+#' \itemize{
+#' \item If `"multi_input"`, will produce `samples_per_target` separate inputs, each of length `maxlen`.
+#' \item If `"time_dist"`, will produce a 4D input array. The dimensions correspond to
+#' `(new_batch_size, samples_per_target, maxlen, length(vocabulary))`.
+#' \item If `"concat"`, will concatenate `samples_per_target` sequences of length `maxlen` into one long sequence.
+#' }
+#' @param buffer_len Only applies if `reshape_mode = "concat"`. If `buffer_len` is an integer, the subsequences are interspaced with `buffer_len` rows. The reshaped x has
+#' new maxlen: (`maxlen` \eqn{*} `samples_per_target`) + `buffer_len` \eqn{*} (`samples_per_target` - 1).
+#' @param new_batch_size Size of first axis of input/targets after reshaping.
+#' @param check_y Check if entries in `y` are consistent with reshape strategy (same label when aggregating).
+#' @examplesIf reticulate::py_module_available("tensorflow")
+#' # create dummy data
+#' batch_size <- 8
+#' maxlen <- 11
+#' voc_len <- 4
+#' x <- sample(0:(voc_len-1), maxlen*batch_size, replace = TRUE)
+#' x <- keras::to_categorical(x, num_classes = voc_len)
+#' x <- array(x, dim = c(batch_size, maxlen, voc_len))
+#' y <- rep(0:1, each = batch_size/2)
+#' y <- keras::to_categorical(y, num_classes = 2)
+#' y
+#'
+#' # reshape data for multi input model
+#' reshaped_data <- reshape_tensor(
+#'   x = x,
+#'   y = y,
+#'   new_batch_size = 2,
+#'   samples_per_target = 4,
+#'   reshape_mode = "multi_input")
+#'
+#' length(reshaped_data[[1]])
+#' dim(reshaped_data[[1]][[1]])
+#' reshaped_data[[2]]
+#'
+#' @returns A list of 2 tensors.
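+#' @details
+#' For `reshape_mode = "concat"`, the new input length follows the formula above;
+#' e.g. `maxlen = 11`, `samples_per_target = 4` and `buffer_len = 2` give
+#' 11 * 4 + 2 * 3 = 50 positions per concatenated sample.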
+#' @export +reshape_tensor <- function(x, y, new_batch_size, + samples_per_target, + buffer_len = NULL, + reshape_mode = "time_dist", + check_y = FALSE) { + + batch_size <- dim(x)[1] + maxlen <- dim(x)[2] + voc_len <- dim(x)[3] + num_classes <- dim(y)[2] + + if (check_y) { + targets <- apply(y, 1, which.max) + test_y_dist <- all(targets == rep(1:num_classes, each = batch_size/num_classes)) + if (!test_y_dist) { + stop("y must have same number of samples for each class") + } + } + + if (reshape_mode == "time_dist") { + + x_new <- array(0, dim = c(new_batch_size, samples_per_target, maxlen, voc_len)) + y_new <- array(0, dim = c(new_batch_size, num_classes)) + for (i in 1:new_batch_size) { + index <- (1:samples_per_target) + (i-1)*samples_per_target + x_new[i, , , ] <- x[index, , ] + y_new[i, ] <- y[index[1], ] + } + + return(list(x = x_new, y = y_new)) + } + + if (reshape_mode == "multi_input") { + + x_list <- vector("list", samples_per_target) + for (i in 1:samples_per_target) { + x_index <- base::seq(i, batch_size, samples_per_target) + x_list[[i]] <- x[x_index, , ] + } + y <- y[base::seq(1, batch_size, samples_per_target), ] + return(list(x = x_list, y = y)) + } + + if (reshape_mode == "concat") { + + use_buffer <- !is.null(buffer_len) && buffer_len > 0 + if (use_buffer) { + buffer_tensor <- array(0, dim = c(buffer_len, voc_len)) + buffer_tensor[ , voc_len] <- 1 + concat_maxlen <- (maxlen * samples_per_target) + (buffer_len * (samples_per_target - 1)) + } else { + concat_maxlen <- maxlen * samples_per_target + } + + x_new <- array(0, dim = c(new_batch_size, concat_maxlen, voc_len)) + y_new <- array(0, dim = c(new_batch_size, num_classes)) + + + for (i in 1:new_batch_size) { + index <- (1:samples_per_target) + (i-1)*samples_per_target + if (!use_buffer) { + x_temp <- x[index, , ] + x_temp <- reticulate::array_reshape(x_temp, dim = c(1, dim(x_temp)[1] * dim(x_temp)[2], voc_len)) + } else { + # create list of subsequences interspaced with buffer tensor + x_list <- vector("list", (2*samples_per_target) - 1) + x_list[seq(2, length(x_list), by = 2)] <- list(buffer_tensor) + for (k in 1:length(index)) { + x_list[[(2*k) - 1]] <- x[index[k], , ] + } + x_temp <- do.call(rbind, x_list) + } + + x_new[i, , ] <- x_temp + y_new[i, ] <- y[index[1], ] + } + return(list(x = x_new, y = y_new)) + } +} + +#' Transform confusion matrix with total numbers to matrix with percentages. +#' +#' @noRd +cm_perc <- function(cm, round_dig = 2) { + col_sums <- colSums(cm) + for (i in 1:ncol(cm)) { + if (col_sums[i] == 0) { + cm[ , i] <- 0 + } else { + cm[ , i] <- cm[ , i]/col_sums[i] + } + } + cm <- round(cm, round_dig) + cm +} + +create_conf_mat_obj <- function(m, confMatLabels) { + dimnames(m) <- list(Prediction = confMatLabels, Truth = confMatLabels) + l <- list() + m <- as.table(m) + l[["table"]] <- m + l[["dots"]] <- list() + class(l) <- "conf_mat" + return(l) +} + +#' Encode sequence of integers to sequence of n-gram +#' +#' Input is sequence of integers from vocabulary of size \code{voc_size}. +#' Returns vector of integers corresponding to n-gram encoding. +#' Integers greater than `voc_size` get encoded as `voc_size^n + 1`. +#' +#' @param int_seq Integer sequence +#' @param n Length of n-gram aggregation +#' @param voc_size Size of vocabulary. +#' @examples +#' int_to_n_gram(int_seq = c(1,1,2,4,4), n = 2, voc_size = 4) +#' +#' @returns A numeric vector. 
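+#' @details
+#' Each window of `n` integers is mapped to
+#' `1 + sum(voc_size^((n-1):0) * (window - 1))`; windows containing 0 (padding)
+#' map to 0 and windows containing values above `voc_size` map to `voc_size^n + 1`.
+#' The example above returns `c(1, 2, 8, 16)`.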
+#' @export
+int_to_n_gram <- function(int_seq, n, voc_size = 4) {
+
+  encoding_len <- length(int_seq) - n + 1
+  n_gram_encoding <- vector("numeric", encoding_len)
+  oov_token <- voc_size^n + 1
+  padding_token <- 0
+
+  for (i in 1:encoding_len) {
+    int_seq_subset <- int_seq[i:(i + n - 1)]
+
+    if (prod(int_seq_subset) == 0) {
+      n_gram_encoding[i] <- padding_token
+    } else {
+      # encoding for amb nuc
+      if (any(int_seq_subset > voc_size)) {
+        n_gram_encoding[i] <- oov_token
+      } else {
+        int_seq_subset <- int_seq_subset - 1
+        n_gram_encoding[i] <- 1 + sum(voc_size^((n-1):0) * (int_seq_subset))
+      }
+    }
+  }
+  n_gram_encoding
+}
+
+#' One-hot encoding matrix to n-gram encoding matrix
+#'
+#' @param input_matrix Matrix with one 1 per row and zeros otherwise.
+#' @param n Length of one n-gram.
+#' @examplesIf reticulate::py_module_available("tensorflow")
+#' x <- c(0,0,1,3,3)
+#' input_matrix <- keras::to_categorical(x, 4)
+#' n_gram_of_matrix(input_matrix, n = 2)
+#'
+#' @returns Matrix of one-hot encodings.
+#' @export
+n_gram_of_matrix <- function(input_matrix, n = 3) {
+  voc_len <- ncol(input_matrix)^n
+  oov_index <- apply(input_matrix, 1, max) != 1
+  max_index <- apply(input_matrix, 1, which.max)
+  max_index[oov_index] <- voc_len + 1
+  int_enc <- int_to_n_gram(int_seq = max_index, n = n, voc_size = ncol(input_matrix))
+  if (length(int_enc) == 1) {
+    n_gram_matrix <- matrix(keras::to_categorical(int_enc, num_classes = voc_len + 2), nrow = 1)[ , -c(1, voc_len + 2)]
+  } else {
+    n_gram_matrix <- keras::to_categorical(int_enc, num_classes = voc_len + 2)[ , -c(1, voc_len + 2)]
+  }
+  n_gram_matrix <- matrix(n_gram_matrix, ncol = voc_len)
+  return(n_gram_matrix)
+}
+
+n_gram_of_3d_tensor <- function(tensor_3d, n) {
+  new_dim <- dim(tensor_3d)
+  new_dim[2] <- new_dim[2] - n + 1
+  new_dim[3] <- new_dim[3]^n
+  new_tensor <- array(0, dim = new_dim)
+  for (i in 1:dim(tensor_3d)[1]) {
+    new_tensor[i, , ] <- n_gram_of_matrix(tensor_3d[i, , ], n = n)
+  }
+  new_tensor
+}
+
+n_gram_vocabulary <- function(n_gram = 3, vocabulary = c("A", "C", "G", "T")) {
+  l <- list()
+  for (i in 1:n_gram) {
+    l[[i]] <- vocabulary
+  }
+  df <- expand.grid(l)
+  df <- df[ , ncol(df) : 1]
+  n_gram_nuc <- apply(df, 1, paste, collapse = "")
+  n_gram_nuc
+}
+
+#' Split fasta file into smaller files.
+#'
+#' Writes smaller files with the same file name plus suffix "_x" (where x is an integer). For example,
+#' assume we have an input file called "abc.fasta" with 100 entries and `split_n = 50`. The function will
+#' create two files called "abc_1.fasta" and "abc_2.fasta" in `target_folder`.
+#'
+#' @param path_input Fasta file to split into smaller files.
+#' @param split_n Maximum number of entries to use in one smaller file.
+#' @param target_folder Directory for the output files.
+#' @param shuffle_entries Whether to shuffle fasta entries before the split.
+#' @param delete_input Whether to delete the original file.
+#' @examples
+#' path_input <- tempfile(fileext = '.fasta')
+#' create_dummy_data(file_path = path_input,
+#'                   num_files = 1,
+#'                   write_to_file_path = TRUE,
+#'                   seq_length = 7,
+#'                   num_seq = 25,
+#'                   vocabulary = c("a", "c", "g", "t"))
+#' target_folder <- tempfile()
+#' dir.create(target_folder)
+#'
+#' # split 25 entries into 5 files
+#' split_fasta(path_input = path_input,
+#'             target_folder = target_folder,
+#'             split_n = 5)
+#' length(list.files(target_folder))
+#'
+#' @returns None. Writes files to output.
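+#' @details
+#' The last output file may contain fewer than `split_n` entries if the number of
+#' entries is not a multiple of `split_n`.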
+#' @export
+split_fasta <- function(path_input,
+                        target_folder,
+                        split_n = 500,
+                        shuffle_entries = TRUE,
+                        delete_input = FALSE) {
+
+  fasta_file <- microseq::readFasta(path_input)
+
+  base_name <- basename(stringr::str_remove(path_input, "\\.fasta$"))
+  new_path <- paste0(target_folder, "/", base_name)
+  count <- 1
+  start_index <- 1
+  end_index <- 1
+
+  if (nrow(fasta_file) == 1) {
+    fasta_name <- paste0(new_path, "_", count, ".fasta")
+    microseq::writeFasta(fasta_file, fasta_name)
+    if (delete_input) {
+      file.remove(path_input)
+    }
+    return(NULL)
+  }
+
+  if (shuffle_entries) {
+    fasta_file <- fasta_file[sample(nrow(fasta_file)), ]
+  }
+
+  while (end_index < nrow(fasta_file)) {
+    end_index <- min(start_index + split_n - 1, nrow(fasta_file))
+    index <- start_index : end_index
+    sub_df <- fasta_file[index, ]
+    fasta_name <- paste0(new_path, "_", count, ".fasta")
+    microseq::writeFasta(sub_df, fasta_name)
+    start_index <- start_index + split_n
+    count <- count + 1
+  }
+
+  if (delete_input) {
+    file.remove(path_input)
+  }
+}
+
+#' Add noise to tensor
+#'
+#' @param noise_type "normal" or "uniform".
+#' @param ... additional arguments for rnorm or runif call.
+#' @noRd
+add_noise_tensor <- function(x, noise_type, ...) {
+
+  stopifnot(noise_type %in% c("normal", "uniform"))
+  random_fn <- ifelse(noise_type == "normal", "rnorm", "runif")
+
+  if (is.list(x)) {
+    for (i in 1:length(x)) {
+      x_dim <- dim(x[[i]])
+      noise_tensor <- do.call(random_fn, list(n = prod(x_dim[-1]), ...))
+      noise_tensor <- array(noise_tensor, dim = x_dim)
+      x[[i]] <- x[[i]] + noise_tensor
+    }
+  } else {
+    x_dim <- dim(x)
+    noise_tensor <- do.call(random_fn, list(n = prod(x_dim[-1]), ...))
+    noise_tensor <- array(noise_tensor, dim = x_dim)
+    x <- x + noise_tensor
+  }
+
+  return(x)
+}
+
+reverse_complement_tensor <- function(x) {
+  stopifnot(dim(x)[3] == 4)
+  x_rev_comp <- x[ , dim(x)[2]:1, 4:1]
+  x_rev_comp <- array(x_rev_comp, dim = dim(x))
+  x_rev_comp
+}
+
+get_pos_enc <- function(pos, i, d_model, n = 10000) {
+
+  pw <- (2 * floor(i/2)) / d_model
+  angle_rates <- 1 / (n ^ pw)
+  angle <- pos * angle_rates
+  pos_enc <- ifelse(i %% 2 == 0, sin(angle), cos(angle))
+  return(pos_enc)
+}
+
+positional_encoding <- function(seq_len, d_model, n = 10000) {
+
+  P <- matrix(0, nrow = seq_len, ncol = d_model)
+
+  for (pos in 0:(seq_len - 1)) {
+    for (i in 0:(d_model - 1)) {
+      P[pos + 1, i + 1] <- get_pos_enc(pos, i, d_model, n)
+    }
+  }
+
+  return(P)
+}
+
+subset_tensor_list <- function(tensor_list, dim_list, subset_index, dim_n_list) {
+
+  for (i in 1:length(tensor_list)) {
+    tensor_list[[i]] <- subset_tensor(tensor = tensor_list[[i]],
+                                      subset_index = subset_index,
+                                      dim_n = dim_n_list[[i]])
+  }
+  # return the updated list (the loop alone would return NULL invisibly)
+  tensor_list
+}
+
+subset_tensor <- function(tensor, subset_index, dim_n) {
+
+  if (dim_n == 1) {
+    subset_tensor <- tensor[subset_index]
+  }
+
+  if (dim_n == 2) {
+    subset_tensor <- tensor[subset_index, ]
+  }
+
+  if (dim_n == 3) {
+    subset_tensor <- tensor[subset_index, , ]
+  }
+
+  if (dim_n == 4) {
+    subset_tensor <- tensor[subset_index, , , ]
+  }
+
+  if (length(subset_index) == 1 & dim_n > 1) {
+    subset_tensor <- tensorflow::tf$expand_dims(subset_tensor, axis = 0L)
+  }
+  subset_tensor
+}
+
+mask_seq <- function(int_seq,
+                     mask_rate = NULL,
+                     random_rate = NULL,
+                     identity_rate = NULL,
+                     block_len = NULL,
+                     start_ind = NULL,
+                     voc_len) {
+
+  mask_token <- voc_len + 1
+  if (is.null(mask_rate)) mask_rate <- 0
+  if (is.null(random_rate)) random_rate <- 0
+  if (is.null(identity_rate)) identity_rate <- 0
+  mask_perc <- mask_rate + random_rate + identity_rate
+  if (mask_perc > 1) {
+    stop("Sum of mask_rate, random_rate, identity_rate bigger than 1")
+  }
+  # don't mask padding or oov positions
+  valid_pos <- which(int_seq != 0 & int_seq != mask_token)
+
+  # randomly decide whether to round up or down
+  ceiling_floor <- sample(c(TRUE, FALSE), 3, replace = TRUE)
+  # adjust for block len
+  block_len_adjust <- ifelse(is.null(block_len), 1, block_len)
+
+  num_mask_pos <- (mask_rate * length(valid_pos))/block_len_adjust
+  num_mask_pos <- ifelse(ceiling_floor[1], floor(num_mask_pos), ceiling(num_mask_pos))
+  num_random_pos <- (random_rate * length(valid_pos))/block_len_adjust
+  num_random_pos <- ifelse(ceiling_floor[2], floor(num_random_pos), ceiling(num_random_pos))
+  num_identity_pos <- (identity_rate * length(valid_pos))/block_len_adjust
+  num_identity_pos <- ifelse(ceiling_floor[3], floor(num_identity_pos), ceiling(num_identity_pos))
+  num_all_pos <- num_mask_pos + num_random_pos + num_identity_pos
+  if (is.null(block_len)) {
+    all_pos <- sample(valid_pos, num_all_pos)
+  } else {
+    valid_pos_block_len <- seq(from = sample(1:(block_len - 1), 1), to = length(valid_pos), by = block_len)
+    valid_pos <- intersect(valid_pos_block_len, valid_pos)
+    all_pos <- sample(valid_pos, min(num_all_pos, length(valid_pos)))
+  }
+
+  sample_weight_seq <- rep(0, length(int_seq))
+  if (is.null(block_len)) {
+    sample_weight_seq[all_pos] <- 1
+  } else {
+    all_pos_blocks <- purrr::map(all_pos, ~seq(.x, .x + block_len - 1, by = 1))
+    sample_weight_seq[unlist(all_pos_blocks)] <- 1
+  }
+
+  if (num_mask_pos > 0) {
+    mask_index <- sample(all_pos, num_mask_pos)
+    all_pos <- setdiff(all_pos, mask_index)
+    if (!is.null(block_len)) {
+      mask_index <- purrr::map(mask_index, ~seq(.x, .x + block_len - 1, by = 1)) %>%
+        unlist()
+    }
+    int_seq[mask_index] <- mask_token
+  }
+
+  if (num_random_pos > 0) {
+    random_index <- sample(all_pos, num_random_pos)
+    all_pos <- setdiff(all_pos, random_index)
+    if (!is.null(block_len)) {
+      random_index <- purrr::map(random_index, ~seq(.x, .x + block_len - 1, by = 1)) %>%
+        unlist()
+    }
+    int_seq[random_index] <- sample(1:voc_len, length(random_index), replace = TRUE)
+  }
+
+  # oov positions share the mask token encoding, so they also get weight 1
+  sample_weight_seq[int_seq == mask_token] <- 1
+
+  return(list(masked_seq = int_seq, sample_weight_seq = sample_weight_seq))
+
+}
+
+#' Char sequence corresponding to one-hot matrix.
+#'
+#' Return character sequence corresponding to one-hot elements in matrix or tensor.
+#'
+#' @inheritParams generator_fasta_lm
+#' @param m One-hot encoding matrix or 3D array where each element of the first axis is a one-hot matrix.
+#' @param amb_enc Either `"zero"` or `"equal"`. How oov tokens were treated for one-hot encoding.
+#' @param amb_char Char to use for oov positions.
+#' @param paste_chars Whether to paste the decoded characters into a single string or return a character vector.
+#' @examples
+#' m <- matrix(c(1,0,0,0,0,1,0,0), 2)
+#' one_hot_to_seq(m)
+#'
+#' @returns A string (if `paste_chars = TRUE`) or a character vector; a list of those
+#' for a 3D input array.
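+#' @details
+#' Rows that match no vocabulary entry (e.g. all-zero rows when `amb_enc = "zero"`)
+#' are decoded as `amb_char`; the example above returns "ag".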
+#' @export
+one_hot_to_seq <- function(m, vocabulary = c("A", "C", "G", "T"), amb_enc = "zero",
+                           amb_char = "N", paste_chars = TRUE) {
+
+  if (length(dim(m)) == 3) {
+    seq_list <- list()
+    for (i in 1:dim(m)[1]) {
+      seq_list[[i]] <- one_hot_to_seq(m = m[i, , ], vocabulary = vocabulary, amb_enc = amb_enc,
+                                      amb_char = amb_char, paste_chars = paste_chars)
+    }
+    return(seq_list)
+  }
+
+  if (amb_enc == "zero") {
+    amb_row <- which(rowSums(m) == 0)
+  }
+
+  if (amb_enc == "equal") {
+    # oov rows carry probability 1/length(vocabulary) in every column
+    amb_row <- which(m[ , 1] == 1/length(vocabulary))
+  }
+
+  nt_seq <- vocabulary[apply(m, 1, which.max)]
+  nt_seq[amb_row] <- amb_char
+
+  if (paste_chars) {
+    nt_seq <- paste(nt_seq, collapse = "")
+  }
+
+  return(nt_seq)
+
+}
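+
+# A minimal sketch of how the exported helpers above compose (illustration only,
+# kept as a comment so it does not run at package load; assumes keras/tensorflow
+# are available):
+# seq_vector <- "ACGTNACGTACGT"
+# starts <- get_start_ind(seq_vector = seq_vector,
+#                         length_vector = nchar(seq_vector),
+#                         maxlen = 4, step = 1, train_mode = "label",
+#                         discard_amb_nuc = TRUE,
+#                         vocabulary = c("A", "C", "G", "T"))
+# x <- seq_encoding_label(maxlen = 4,
+#                         vocabulary = c("a", "c", "g", "t"),
+#                         start_ind = starts,
+#                         char_sequence = seq_vector)
+# one_hot_to_seq(x)  # recover the character windows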