--- a +++ b/tests/testthat/test-generators.R @@ -0,0 +1,2832 @@ +context("generators") + +test_that("Checking the generator for the Fasta files", { + + testthat::skip_if_not_installed("tensorflow") + testthat::skip_if_not(reticulate::py_module_available("tensorflow")) + + testpath <- file.path("fasta_2") + vocabulary <- c("a", "c", "g", "t") + batch_size <- 5 + maxlen <- 3 + gen <- generator_fasta_lm(path_corpus = testpath, batch_size = batch_size, maxlen = maxlen, vocabulary = vocabulary) + + arrays <- gen() + + expect_equivalent(dim(arrays[[1]])[1], batch_size) + expect_equivalent(dim(arrays[[1]])[2], maxlen) + expect_equivalent(dim(arrays[[1]])[3], length(vocabulary)) + expect_equivalent(dim(arrays[[2]])[1], batch_size) + expect_equivalent(dim(arrays[[2]])[2], length(vocabulary)) + expect_equivalent(length(arrays),2) + + # a.fasta file starts with aaccggtt + + expect_equivalent(arrays[[1]][1, 1, ], c(1, 0, 0, 0)) # a + expect_equivalent(arrays[[1]][1, 2, ], c(1, 0, 0, 0)) # a + expect_equivalent(arrays[[1]][1, 3, ], c(0, 1, 0, 0)) # c + expect_equivalent(arrays[[2]][1, ], c(0, 1, 0, 0)) # c + + arrays_2 <- gen() + + expect_equivalent(arrays_2[[1]][2, 1, ], c(1, 0, 0, 0)) # a + expect_equivalent(arrays_2[[1]][2, 2, ], c(1, 0, 0, 0)) # a + expect_equivalent(arrays_2[[1]][2, 3, ], c(1, 0, 0, 0)) # a + expect_equivalent(arrays_2[[2]][2, ], c(0, 0, 0, 1)) # t + + # test transition to second fasta file + gen <- generator_fasta_lm(path_corpus = testpath, batch_size = batch_size, maxlen = maxlen, vocabulary = vocabulary) + for (i in 1:5){ + arrays <- gen() + } + + # samples start at beginning of b.fasta + expect_equivalent(arrays[[1]][1, 1, ], c(1, 0, 0, 0)) # a + expect_equivalent(arrays[[1]][1, 2, ], c(1, 0, 0, 0)) # a + expect_equivalent(arrays[[1]][1, 3, ], c(1, 0, 0, 0)) # a + expect_equivalent(arrays[[2]][1, ], c(1, 0, 0, 0)) # a + + expect_equivalent(arrays[[1]][5, 1, ], c(1, 0, 0, 0)) # a + expect_equivalent(arrays[[1]][5, 2, ], c(1, 0, 0, 0)) # a + expect_equivalent(arrays[[1]][5, 3, ], c(1, 0, 0, 0)) # a + expect_equivalent(arrays[[2]][5, ], c(1, 0, 0, 0)) # a + + # complete one iteration (100 samples) + gen <- generator_fasta_lm(path_corpus = testpath, batch_size = batch_size, maxlen = maxlen, vocabulary = vocabulary) + for (i in 1:9){ + arrays <- gen() + } + + # start from a.fasta again + expect_equivalent(arrays[[1]][1, 1, ], c(1, 0, 0, 0)) # a + expect_equivalent(arrays[[1]][1, 2, ], c(1, 0, 0, 0)) # a + expect_equivalent(arrays[[1]][1, 3, ], c(0, 1, 0, 0)) # c + expect_equivalent(arrays[[2]][1, ], c(0, 1, 0, 0)) # c + + ################### + # test for different step size + gen <- generator_fasta_lm(path_corpus = testpath, batch_size = 4, maxlen = 3, step = 2) + arrays <- gen() + + expect_equivalent(arrays[[1]][3, 1, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][3, 2, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][3, 3, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[2]][3, ], c(0, 0, 0, 1)) + + expect_equivalent(arrays[[1]][4, 1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][4, 2, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][4, 3, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[2]][4, ], c(1, 0, 0, 0)) + + #### + # tests with chars outside vocabulary, vocabulary does not contain "A" + gen <- generator_fasta_lm(path_corpus = testpath, batch_size = 5, maxlen = 3, step = 2, vocabulary = c("c", "g", "t")) + arrays <- gen() + + expect_equivalent(arrays[[1]][1, 1, ], c(0, 0, 0)) # a + expect_equivalent(arrays[[1]][1, 2, ], c(0, 0, 0)) # a + expect_equivalent(arrays[[1]][1, 3, ], c(1, 0, 0)) # c + expect_equivalent(arrays[[2]][1, ], c(1, 0, 0)) # c + + #### + # test padding + gen <- generator_fasta_lm(path_corpus = testpath, batch_size = 1, maxlen = 10, step = 4, + vocabulary = c("a", "c", "g", "t")) + arrays <- gen() + expect_equivalent(arrays[[1]][1, 1, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 2, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 3, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 4, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 5, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 6, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][1, 7, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][1, 8, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][1, 9, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][1, 10, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[2]][1, ], c(0, 0, 0, 1)) + + # no padding + testpath <- file.path("fasta_3") + gen <- generator_fasta_lm(path_corpus = testpath, batch_size = 2, maxlen = 12, step = 1, + vocabulary = c("a", "c", "g", "t"), padding = FALSE) + arrays <- gen() + expect_equivalent(arrays[[1]][1, 1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 2, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 3, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 4, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][1, 5, ], c(0, 1, 0, 0)) + + expect_equivalent(arrays[[1]][2, 1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][2, 2, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][2, 3, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][2, 4, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][2, 5, ], c(0, 1, 0, 0)) + #### + testpath <- file.path("fasta_2") + expect_error(generator_fasta_lm()) + expect_error(generator_fasta_lm("")) + + expect_is(generator_fasta_lm(testpath, batch_size = batch_size, maxlen = maxlen), "function") + expect_is(gen(), "list") + expect_is(gen()[[1]], "array") + expect_is(gen()[[2]], "matrix") + + expect_silent(generator_fasta_lm(testpath, batch_size = batch_size, maxlen = maxlen)) + + expect_type(gen()[[1]], "double") + expect_type(gen()[[2]], "double") + + ############# Test label generator (header) ############# + testpath <- file.path("fasta_2") + + gen <- generator_fasta_label_header_csv(path_corpus = testpath, batch_size = 5, maxlen = 3, step = 2, vocabulary = c("a", "c", "g", "t"), + reverse_complement = FALSE, vocabulary_label = c("w", "x", "y")) + + arrays <- gen() + + expect_equivalent(arrays[[1]][1, 1, ], c(1, 0, 0, 0)) # A + expect_equivalent(arrays[[1]][1, 2, ], c(1, 0, 0, 0)) # A + expect_equivalent(arrays[[1]][1, 3, ], c(0, 1, 0, 0)) # C + expect_equivalent(arrays[[2]][1, ], c(1, 0, 0)) # W + + expect_equivalent(arrays[[1]][5, 1, ], c(1, 0, 0, 0)) # A + expect_equivalent(arrays[[1]][5, 2, ], c(1, 0, 0, 0)) # A + expect_equivalent(arrays[[1]][5, 3, ], c(0, 0, 0, 1)) # T + expect_equivalent(arrays[[2]][5, ], c(0, 1, 0)) # W + + + gen <- generator_fasta_label_header_csv(path_corpus = testpath, batch_size = 5, maxlen = 8, step = 2, vocabulary = c("a", "c", "g", "t"), + reverse_complement = FALSE, vocabulary_label = c("w", "x", "y")) + + arrays <- gen() + expect_equivalent(arrays[[2]][1, ], c(1, 0, 0)) + expect_equivalent(arrays[[2]][2, ], c(0, 1, 0)) + expect_equivalent(arrays[[2]][3, ], c(0, 0, 1)) + expect_equivalent(arrays[[2]][4, ], c(0, 1, 0)) + expect_equivalent(arrays[[2]][5, ], c(0, 1, 0)) + + arrays <- gen() + expect_equivalent(arrays[[1]][5, 1, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][5, 2, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][5, 3, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][5, 4, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][5, 5, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[1]][5, 6, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[1]][5, 7, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[1]][5, 8, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[2]][1, ], c(0, 0, 1)) + expect_equivalent(arrays[[2]][2, ], c(0, 0, 1)) + expect_equivalent(arrays[[2]][3, ], c(1, 0, 0)) + expect_equivalent(arrays[[2]][4, ], c(0, 1, 0)) + expect_equivalent(arrays[[2]][5, ], c(0, 0, 1)) + + + gen <- generator_fasta_label_header_csv(path_corpus = testpath, batch_size = 8, maxlen = 7, step = 2, vocabulary = c("a", "c", "g", "t"), + reverse_complement = FALSE, vocabulary_label = c("w", "x", "y")) + + arrays <- gen() + + # go through a/b.fasta once discard samples with target z + expect_equivalent(arrays[[1]][8, 1, ], c(1, 0, 0, 0)) # A + expect_equivalent(arrays[[1]][8, 2, ], c(1, 0, 0, 0)) # A + expect_equivalent(arrays[[1]][8, 3, ], c(0, 1, 0, 0)) # C + expect_equivalent(arrays[[2]][8, ], c(1, 0, 0)) # W + + + ############# Test label generator (folder) ############# + directories <- c("label_folder/x", "label_folder/y", "label_folder/z") + val <- FALSE + gen_list <- generator_initialize(directories = directories, + val = val, + format = "fasta", + batch_size = 6, + maxlen = 2, + vocabulary = c("a", "c", "g", "t"), + step = 2) + + gen <- generator_fasta_label_folder_wrapper(val = val, path = directories, gen_list = gen_list) + arrays <- gen() + expect_equivalent(arrays[[1]][1, 1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 2, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][2, 1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][2, 2, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][3, 1, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][3, 2, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][4, 1, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][4, 2, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][5, 1, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[1]][5, 2, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[1]][6, 1, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[1]][6, 2, ], c(0, 0, 0, 1)) + + expect_equivalent(arrays[[2]][1, ], c(1, 0, 0)) + expect_equivalent(arrays[[2]][2, ], c(1, 0, 0)) + expect_equivalent(arrays[[2]][3, ], c(0, 1, 0)) + expect_equivalent(arrays[[2]][4, ], c(0, 1, 0)) + expect_equivalent(arrays[[2]][5, ], c(0, 0, 1)) + expect_equivalent(arrays[[2]][6, ], c(0, 0, 1)) + + + # test skipping file + for (i in 1:2){ + arrays <- gen() + } + + expect_equivalent(arrays[[1]][1, 1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 2, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][2, 1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][2, 2, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][3, 1, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][3, 2, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][4, 1, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][4, 2, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][5, 1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][5, 2, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][6, 1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][6, 2, ], c(1, 0, 0, 0)) + + expect_equivalent(arrays[[2]][1, ], c(1, 0, 0)) + expect_equivalent(arrays[[2]][2, ], c(1, 0, 0)) + expect_equivalent(arrays[[2]][3, ], c(0, 1, 0)) + expect_equivalent(arrays[[2]][4, ], c(0, 1, 0)) + expect_equivalent(arrays[[2]][5, ], c(0, 0, 1)) + expect_equivalent(arrays[[2]][6, ], c(0, 0, 1)) + + # + arrays <- gen() + + expect_equivalent(arrays[[1]][1, 1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 2, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][2, 1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][2, 2, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][3, 1, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][3, 2, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][4, 1, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][4, 2, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][5, 1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][5, 2, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][6, 1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][6, 2, ], c(1, 0, 0, 0)) + + expect_equivalent(arrays[[2]][1, ], c(1, 0, 0)) + expect_equivalent(arrays[[2]][2, ], c(1, 0, 0)) + expect_equivalent(arrays[[2]][3, ], c(0, 1, 0)) + expect_equivalent(arrays[[2]][4, ], c(0, 1, 0)) + expect_equivalent(arrays[[2]][5, ], c(0, 0, 1)) + expect_equivalent(arrays[[2]][6, ], c(0, 0, 1)) + + ####### Test discard ambiguous nucleotides ########### + + testpath <- file.path("fasta_3") + vocabulary = c("a", "c", "g", "t") + batch_size <- 6 + maxlen <- 3 + step <- 2 + gen <- generator_fasta_lm(path_corpus = testpath, batch_size = batch_size, maxlen = maxlen, + vocabulary = vocabulary, ambiguous_nuc = "discard", step = step) + arrays <- gen() + + expect_equivalent(arrays[[1]][1, 1, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[1]][1, 2, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][1, 3, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[2]][1, ], c(1, 0, 0, 0)) + + expect_equivalent(arrays[[1]][2, 1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][2, 2, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][2, 3, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[2]][2, ], c(1, 0, 0, 0)) + + expect_equivalent(arrays[[1]][3, 1, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[1]][3, 2, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[1]][3, 3, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[2]][3, ], c(0, 0, 0, 1)) + + expect_equivalent(arrays[[1]][4, 1, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][4, 2, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][4, 3, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[2]][4, ], c(0, 0, 1, 0)) + + expect_equivalent(arrays[[1]][5, 1, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][5, 2, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][5, 3, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[2]][5, ], c(0, 0, 0, 1)) + + expect_equivalent(arrays[[1]][6, 1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][6, 2, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][6, 3, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[2]][6, ], c(0, 0, 0, 1)) + + # label header + + gen <- generator_fasta_label_header_csv(path_corpus = testpath, batch_size = batch_size, maxlen = maxlen, + vocabulary = vocabulary, ambiguous_nuc = "discard", step = step, reverse_complement = FALSE, + vocabulary_label = c("X", "Y")) + arrays <- gen() + + expect_equivalent(arrays[[1]][1, 1, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[1]][1, 2, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][1, 3, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[2]][1, ], c(1, 0)) + + expect_equivalent(arrays[[1]][2, 1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][2, 2, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][2, 3, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[2]][2, ], c(1, 0)) + + expect_equivalent(arrays[[1]][3, 1, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[1]][3, 2, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[1]][3, 3, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[2]][3, ], c(1, 0)) + + expect_equivalent(arrays[[1]][4, 1, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][4, 2, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][4, 3, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[2]][4, ], c(0, 1)) + + expect_equivalent(arrays[[1]][5, 1, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][5, 2, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][5, 3, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[2]][5, ], c(0, 1)) + + expect_equivalent(arrays[[1]][6, 1, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[1]][6, 2, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[1]][6, 3, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[2]][6, ], c(0, 1)) + + # label folder + + directories = c("fasta_2", "fasta_3") + gen <- get_generator(val = FALSE, + train_type = "label_folder", + path = directories, + format = "fasta", + batch_size = 6, + maxlen = 3, + ambiguous_nuc = "discard", + vocabulary = c("a", "c", "g", "t"), + reverse_complement = FALSE, + step = 2) + + arrays <- gen() + + expect_equivalent(arrays[[1]][1, 1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 2, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 3, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[2]][1, ], c(1, 0)) + + expect_equivalent(arrays[[1]][2, 1, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][2, 2, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][2, 3, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[2]][2, ], c(1, 0)) + + expect_equivalent(arrays[[1]][3, 1, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][3, 2, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][3, 3, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[2]][3, ], c(1, 0)) + + expect_equivalent(arrays[[1]][4, 1, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[1]][4, 2, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][4, 3, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[2]][4, ], c(0, 1)) + + expect_equivalent(arrays[[1]][5, 1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][5, 2, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][5, 3, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[2]][5, ], c(0, 1)) + + expect_equivalent(arrays[[1]][6, 1, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[1]][6, 2, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[1]][6, 3, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[2]][6, ], c(0, 1)) + + ####### Test ambiguous nucleotides as 1/length(vocabulary) ########### + + testpath <- file.path("fasta_3") + vocabulary = c("a", "c", "g", "t") + batch_size <- 4 + maxlen <- 3 + step <- 2 + gen <- generator_fasta_lm(path_corpus = testpath, batch_size = batch_size, maxlen = maxlen, + vocabulary = vocabulary, ambiguous_nuc = "equal", step = step) + arrays <- gen() + + expect_equivalent(arrays[[1]][1, 1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 2, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 3, ], c(1/4, 1/4, 1/4, 1/4)) + expect_equivalent(arrays[[2]][1, ], c(0, 1, 0, 0)) + + expect_equivalent(arrays[[1]][2, 1, ], c(1/4, 1/4, 1/4, 1/4)) + expect_equivalent(arrays[[1]][2, 2, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][2, 3, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[2]][2, ], c(1/4, 1/4, 1/4, 1/4)) + + expect_equivalent(arrays[[1]][3, 1, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][3, 2, ], c(1/4, 1/4, 1/4, 1/4)) + expect_equivalent(arrays[[1]][3, 3, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[2]][3, ], c(0, 0, 1, 0)) + + expect_equivalent(arrays[[1]][4, 1, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][4, 2, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][4, 3, ], c(1/4, 1/4, 1/4, 1/4)) + expect_equivalent(arrays[[2]][4, ], c(0, 0, 0, 1)) + + # label header + + gen <- generator_fasta_label_header_csv(path_corpus = testpath, batch_size = batch_size, maxlen = maxlen, + vocabulary = vocabulary, ambiguous_nuc = "equal", step = step, reverse_complement = FALSE, + vocabulary_label = c("X", "Y")) + arrays <- gen() + + expect_equivalent(arrays[[1]][1, 1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 2, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 3, ], c(1/4, 1/4, 1/4, 1/4)) + expect_equivalent(arrays[[2]][1, ], c(1, 0)) + + expect_equivalent(arrays[[1]][2, 1, ], c(1/4, 1/4, 1/4, 1/4)) + expect_equivalent(arrays[[1]][2, 2, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][2, 3, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[2]][2, ], c(1, 0)) + + expect_equivalent(arrays[[1]][3, 1, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][3, 2, ], c(1/4, 1/4, 1/4, 1/4)) + expect_equivalent(arrays[[1]][3, 3, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[2]][3, ], c(1, 0)) + + expect_equivalent(arrays[[1]][4, 1, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][4, 2, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][4, 3, ], c(1/4, 1/4, 1/4, 1/4)) + expect_equivalent(arrays[[2]][4, ], c(1, 0)) + + # label folder + + directories = c("fasta_2", "fasta_3") + gen <- get_generator(train_type = "label_folder", + val = FALSE, + path = directories, + format = "fasta", + batch_size = 4, + maxlen = 3, + vocabulary = c("a", "c", "g", "t"), + reverse_complement = FALSE, + ambiguous_nuc = "equal", + step = 2) + + arrays <- gen() + + expect_equivalent(arrays[[1]][1, 1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 2, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 3, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[2]][1, ], c(1, 0)) + + expect_equivalent(arrays[[1]][2, 1, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][2, 2, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][2, 3, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[2]][2, ], c(1, 0)) + + expect_equivalent(arrays[[1]][3, 1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][3, 2, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][3, 3, ], c(1/4, 1/4, 1/4, 1/4)) + expect_equivalent(arrays[[2]][3, ], c(0, 1)) + + expect_equivalent(arrays[[1]][4, 1, ], c(1/4, 1/4, 1/4, 1/4)) + expect_equivalent(arrays[[1]][4, 2, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][4, 3, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[2]][4, ], c(0, 1)) + + ####### Test ambiguous nucleotides as "empirical" ########### + # LM + + testpath <- file.path("fasta_3") + vocabulary <- c("a", "c", "g", "t") + batch_size <- 4 + maxlen <- 3 + step <- 2 + gen <- generator_fasta_lm(path_corpus = testpath, batch_size = batch_size, maxlen = maxlen, + vocabulary = vocabulary, ambiguous_nuc = "empirical", step = step) + arrays <- gen() + nuc_dist <- 1/18*c(8, 2, 3, 5) + + expect_equivalent(arrays[[1]][1, 1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 2, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 3, ], nuc_dist) + expect_equivalent(arrays[[2]][1, ], c(0, 1, 0, 0)) + + expect_equivalent(arrays[[1]][2, 1, ], nuc_dist) + expect_equivalent(arrays[[1]][2, 2, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][2, 3, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[2]][2, ], nuc_dist) + + expect_equivalent(arrays[[1]][3, 1, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][3, 2, ], nuc_dist) + expect_equivalent(arrays[[1]][3, 3, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[2]][3, ], c(0, 0, 1, 0)) + + expect_equivalent(arrays[[1]][4, 1, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][4, 2, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][4, 3, ], nuc_dist) + expect_equivalent(arrays[[2]][4, ], c(0, 0, 0, 1)) + + # LM second file + + testpath <- file.path("fasta_3") + vocabulary <- c("a", "c", "g", "t") + batch_size <- 4 + maxlen <- 3 + step <- 20 + gen <- generator_fasta_lm(path_corpus = testpath, batch_size = batch_size, maxlen = maxlen, + vocabulary = vocabulary, ambiguous_nuc = "empirical", step = step) + arrays <- gen() + nuc_dist_1 <- 1/18*c(8, 2, 3, 5) + nuc_dist_2 <- 1/17*c(3, 2, 6, 6) + + expect_equivalent(arrays[[1]][1, 1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 2, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 3, ], nuc_dist) + expect_equivalent(arrays[[2]][1, ], c(0, 1, 0, 0)) + + expect_equivalent(arrays[[1]][2, 1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][2, 2, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][2, 3, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[2]][2, ], c(1, 0, 0, 0)) + + expect_equivalent(arrays[[1]][3, 1, ], nuc_dist_2) + expect_equivalent(arrays[[1]][3, 2, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][3, 3, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[2]][3, ], c(0, 0, 1, 0)) + + expect_equivalent(arrays[[1]][4, 1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][4, 2, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][4, 3, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[2]][4, ], c(0, 0, 0, 1)) + + # label header + + testpath <- file.path("fasta_3") + vocabulary <- c("a", "c", "g", "t") + batch_size <- 4 + maxlen <- 3 + step <- 2 + gen <- generator_fasta_label_header_csv(path_corpus = testpath, batch_size = batch_size, maxlen = maxlen, + vocabulary = vocabulary, ambiguous_nuc = "empirical", step = step, reverse_complement = FALSE, + vocabulary_label = c("X", "Y")) + arrays <- gen() + nuc_dist <- 1/18*c(8, 2, 3, 5) + + expect_equivalent(arrays[[1]][1, 1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 2, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 3, ], nuc_dist) + expect_equivalent(arrays[[2]][1, ], c(1, 0)) + + expect_equivalent(arrays[[1]][2, 1, ], nuc_dist) + expect_equivalent(arrays[[1]][2, 2, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][2, 3, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[2]][2, ], c(1, 0)) + + expect_equivalent(arrays[[1]][3, 1, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][3, 2, ], nuc_dist) + expect_equivalent(arrays[[1]][3, 3, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[2]][3, ], c(1, 0)) + + expect_equivalent(arrays[[1]][4, 1, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][4, 2, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][4, 3, ], nuc_dist) + expect_equivalent(arrays[[2]][4, ], c(1, 0)) + + # label folder + + directories = c("fasta_2", "fasta_3") + gen <- get_generator(path = directories, + val = FALSE, + train_type = "label_folder", + format = "fasta", + batch_size = 4, + maxlen = 3, + vocabulary = c("a", "c", "g", "t"), + reverse_complement = FALSE, + ambiguous_nuc = "empirical", + step = 2) + + arrays <- gen() + nuc_dist <- 1/18*c(8, 2, 3, 5) + + expect_equivalent(arrays[[1]][1, 1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 2, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 3, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[2]][1, ], c(1, 0)) + + expect_equivalent(arrays[[1]][2, 1, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][2, 2, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][2, 3, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[2]][2, ], c(1, 0)) + + expect_equivalent(arrays[[1]][3, 1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][3, 2, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][3, 3, ], nuc_dist) + expect_equivalent(arrays[[2]][3, ], c(0, 1)) + + expect_equivalent(arrays[[1]][4, 1, ], nuc_dist) + expect_equivalent(arrays[[1]][4, 2, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][4, 3, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[2]][4, ], c(0, 1)) + + ############# padding/amb nucleotide LM ############ + + gen <- generator_fasta_lm(path_corpus = "fasta_3", + batch_size = 3, + maxlen = 15, + step = 1, + ambiguous_nuc = "equal") + + arrays <- gen() + equal_vector <- rep(0.25, 4) + expect_equivalent(arrays[[1]][1, 1, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 2, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 3, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 4, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 5, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 6, ], equal_vector) + expect_equivalent(arrays[[1]][1, 7, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][1, 8, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][1, 9, ], equal_vector) + expect_equivalent(arrays[[1]][1, 10, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][1, 11, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][1, 12, ], equal_vector) + expect_equivalent(arrays[[1]][1, 13, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[1]][1, 14, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][1, 15, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[2]][1, ], c(1, 0, 0, 0)) + + expect_equivalent(arrays[[1]][3, 1, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][3, 2, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][3, 3, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][3, 4, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][3, 5, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][3, 6, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][3, 7, ], equal_vector) + expect_equivalent(arrays[[1]][3, 8, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][3, 9, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][3, 10, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][3, 11, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][3, 12, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[1]][3, 13, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[1]][3, 14, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[1]][3, 15, ], equal_vector) + expect_equivalent(arrays[[2]][3, ], c(0, 0, 0, 1)) + + ############# padding/amb nucleotide, label_header ############ + + gen <- generator_fasta_label_header_csv(path_corpus = "fasta_3", + batch_size = 3, + maxlen = 15, + step = 1, + vocabulary_label = c("X", "Y"), + reverse_complement = FALSE, + ambiguous_nuc = "empirical") + + nuc_dist_1 <- 1/18*c(8, 2, 3, 5) + nuc_dist_2 <- 1/17*c(3, 2, 6, 6) + arrays <- gen() + expect_equivalent(arrays[[1]][1, 1, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 2, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 3, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 4, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 5, ], nuc_dist_1) + expect_equivalent(arrays[[1]][1, 6, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][1, 7, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][1, 8, ], nuc_dist_1) + expect_equivalent(arrays[[1]][1, 9, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][1, 10, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][1, 11, ], nuc_dist_1) + expect_equivalent(arrays[[1]][1, 12, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[1]][1, 13, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][1, 14, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 14, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[2]][1, ], c(1, 0)) + + expect_equivalent(arrays[[1]][3, 1, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][3, 2, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][3, 3, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][3, 4, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][3, 5, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][3, 6, ], nuc_dist_2) + expect_equivalent(arrays[[1]][3, 7, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][3, 8, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][3, 9, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][3, 10, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][3, 11, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[1]][3, 12, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[1]][3, 13, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[1]][3, 14, ], nuc_dist_2) + expect_equivalent(arrays[[1]][3, 15, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[2]][3, ], c(0, 1)) + + ############# padding/amb nucleotide, label_folder ############ + + directories = c("fasta_2", "fasta_3") + gen <- get_generator(path = directories, + val = FALSE, + train_type = "label_folder", + padding = TRUE, + format = "fasta", + batch_size = 6, + maxlen = 15, + ambiguous_nuc = "equal", + vocabulary = c("a", "c", "g", "t"), + reverse_complement = FALSE, + step = 1) + + equal_vector <- rep(0.25, 4) + arrays <- gen() + + expect_equivalent(arrays[[1]][1, 1, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 2, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 3, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 4, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 5, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 6, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 7, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 8, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 9, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 10, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][1, 11, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][1, 12, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][1, 13, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][1, 14, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[1]][1, 14, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[2]][1, ], c(1, 0)) + + expect_equivalent(arrays[[1]][6, 1, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][6, 2, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][6, 3, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][6, 4, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][6, 5, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][6, 6, ], equal_vector) + expect_equivalent(arrays[[1]][6, 7, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][6, 8, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][6, 9, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][6, 10, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][6, 11, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[1]][6, 12, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[1]][6, 13, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[1]][6, 14, ], equal_vector) + expect_equivalent(arrays[[1]][6, 15, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[2]][6, ], c(0, 1)) + + ###### more than 2 files in one batch ###### + # LM + + gen <- generator_fasta_lm(path_corpus = "fasta_3", + batch_size = 8, + maxlen = 12, + max_iter = 10000, + step = 50, + ambiguous_nuc = "empirical") + + nuc_dist_1 <- 1/18*c(8, 2, 3, 5) + nuc_dist_2 <- 1/17*c(3, 2, 6, 6) + arrays <- gen() + + expect_equivalent(arrays[[1]][1, 1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 2, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 3, ], nuc_dist_1) + expect_equivalent(arrays[[1]][1, 4, ], c(0, 1, 0, 0)) + + expect_equivalent(arrays[[1]][2, 1, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][2, 2, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][2, 3, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][2, 4, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][2, 5, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][2, 6, ], c(1, 0, 0, 0)) + + expect_equivalent(arrays[[1]][3, 1, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][3, 2, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][3, 3, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][3, 4, ], nuc_dist_2) + expect_equivalent(arrays[[1]][3, 5, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][3, 6, ], c(0, 0, 1, 0)) + + expect_equivalent(arrays[[1]][4, 1, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][4, 2, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][4, 3, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][4, 4, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][4, 5, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][4, 6, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][4, 7, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][4, 8, ], c(0, 0, 1, 0)) + + expect_equivalent(arrays[[1]][5, 1, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][5, 2, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][5, 3, ], nuc_dist_2) + expect_equivalent(arrays[[1]][5, 4, ], nuc_dist_2) + expect_equivalent(arrays[[1]][5, 5, ], nuc_dist_2) + expect_equivalent(arrays[[1]][5, 6, ], nuc_dist_2) + expect_equivalent(arrays[[1]][5, 7, ], nuc_dist_2) + expect_equivalent(arrays[[1]][5, 8, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][5, 9, ], nuc_dist_2) + + expect_equivalent(arrays[[1]][6, 1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][6, 2, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][6, 3, ], nuc_dist_1) + expect_equivalent(arrays[[1]][6, 4, ], c(0, 1, 0, 0)) + + expect_equivalent(arrays[[1]][7, 1, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][7, 2, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][7, 3, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][7, 4, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][7, 5, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][7, 6, ], c(1, 0, 0, 0)) + + expect_equivalent(arrays[[1]][8, 1, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][8, 2, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][8, 3, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][8, 4, ], nuc_dist_2) + expect_equivalent(arrays[[1]][8, 5, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][8, 6, ], c(0, 0, 1, 0)) + + arrays <- gen() + + expect_equivalent(arrays[[1]][1, 1, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 2, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 3, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 4, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 5, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 6, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 7, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][1, 8, ], c(0, 0, 1, 0)) + + expect_equivalent(arrays[[1]][2, 1, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][2, 2, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][2, 3, ], nuc_dist_2) + expect_equivalent(arrays[[1]][2, 4, ], nuc_dist_2) + expect_equivalent(arrays[[1]][2, 5, ], nuc_dist_2) + expect_equivalent(arrays[[1]][2, 6, ], nuc_dist_2) + expect_equivalent(arrays[[1]][2, 7, ], nuc_dist_2) + expect_equivalent(arrays[[1]][2, 8, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][2, 9, ], nuc_dist_2) + + expect_equivalent(arrays[[1]][3, 1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][3, 2, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][3, 3, ], nuc_dist_1) + expect_equivalent(arrays[[1]][3, 4, ], c(0, 1, 0, 0)) + + expect_equivalent(arrays[[1]][4, 1, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][4, 2, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][4, 3, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][4, 4, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][4, 5, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][4, 6, ], c(1, 0, 0, 0)) + + expect_equivalent(arrays[[1]][5, 1, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][5, 2, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][5, 3, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][5, 4, ], nuc_dist_2) + expect_equivalent(arrays[[1]][5, 5, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][5, 6, ], c(0, 0, 1, 0)) + + expect_equivalent(arrays[[1]][6, 1, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][6, 2, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][6, 3, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][6, 4, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][6, 5, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][6, 6, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][6, 7, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][6, 8, ], c(0, 0, 1, 0)) + + expect_equivalent(arrays[[1]][7, 1, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][7, 2, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][7, 3, ], nuc_dist_2) + expect_equivalent(arrays[[1]][7, 4, ], nuc_dist_2) + expect_equivalent(arrays[[1]][7, 5, ], nuc_dist_2) + expect_equivalent(arrays[[1]][7, 6, ], nuc_dist_2) + expect_equivalent(arrays[[1]][7, 7, ], nuc_dist_2) + expect_equivalent(arrays[[1]][7, 8, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][7, 9, ], nuc_dist_2) + + expect_equivalent(arrays[[1]][8, 1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][8, 2, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][8, 3, ], nuc_dist_1) + expect_equivalent(arrays[[1]][8, 4, ], c(0, 1, 0, 0)) + + # label header + + gen <- generator_fasta_label_header_csv(path_corpus = "fasta_3", + batch_size = 8, + maxlen = 12, + max_iter = 10000, + step = 50, + ambiguous_nuc = "empirical", + reverse_complement = FALSE, + vocabulary_label = c("X", "Y") + ) + + nuc_dist_1 <- 1/18*c(8, 2, 3, 5) + nuc_dist_2 <- 1/17*c(3, 2, 6, 6) + arrays <- gen() + + expect_equivalent(arrays[[1]][1, 1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 2, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 3, ], nuc_dist_1) + expect_equivalent(arrays[[1]][1, 4, ], c(0, 1, 0, 0)) + + expect_equivalent(arrays[[1]][2, 1, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][2, 2, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][2, 3, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][2, 4, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][2, 5, ], c(1, 0, 0, 0)) + + expect_equivalent(arrays[[1]][3, 1, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][3, 2, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][3, 3, ], nuc_dist_2) + expect_equivalent(arrays[[1]][3, 4, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][3, 5, ], c(0, 0, 1, 0)) + + expect_equivalent(arrays[[1]][4, 1, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][4, 2, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][4, 3, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][4, 4, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][4, 5, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][4, 6, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][4, 7, ], c(0, 0, 1, 0)) + + expect_equivalent(arrays[[1]][5, 1, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][5, 2, ], nuc_dist_2) + expect_equivalent(arrays[[1]][5, 3, ], nuc_dist_2) + expect_equivalent(arrays[[1]][5, 4, ], nuc_dist_2) + expect_equivalent(arrays[[1]][5, 5, ], nuc_dist_2) + expect_equivalent(arrays[[1]][5, 6, ], nuc_dist_2) + expect_equivalent(arrays[[1]][5, 7, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][5, 8, ], nuc_dist_2) + + expect_equivalent(arrays[[1]][6, 1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][6, 2, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][6, 3, ], nuc_dist_1) + expect_equivalent(arrays[[1]][6, 4, ], c(0, 1, 0, 0)) + + expect_equivalent(arrays[[1]][7, 1, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][7, 2, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][7, 3, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][7, 4, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][7, 5, ], c(1, 0, 0, 0)) + + expect_equivalent(arrays[[1]][8, 1, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][8, 2, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][8, 3, ], nuc_dist_2) + expect_equivalent(arrays[[1]][8, 4, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][8, 5, ], c(0, 0, 1, 0)) + + arrays <- gen() + + expect_equivalent(arrays[[1]][1, 1, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 2, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 3, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 4, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 5, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 6, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][1, 7, ], c(0, 0, 1, 0)) + + expect_equivalent(arrays[[1]][2, 1, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][2, 2, ], nuc_dist_2) + expect_equivalent(arrays[[1]][2, 3, ], nuc_dist_2) + expect_equivalent(arrays[[1]][2, 4, ], nuc_dist_2) + expect_equivalent(arrays[[1]][2, 5, ], nuc_dist_2) + expect_equivalent(arrays[[1]][2, 6, ], nuc_dist_2) + expect_equivalent(arrays[[1]][2, 7, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][2, 8, ], nuc_dist_2) + + expect_equivalent(arrays[[1]][3, 1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][3, 2, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][3, 3, ], nuc_dist_1) + expect_equivalent(arrays[[1]][3, 4, ], c(0, 1, 0, 0)) + + expect_equivalent(arrays[[1]][4, 1, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][4, 2, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][4, 3, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][4, 4, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][4, 5, ], c(1, 0, 0, 0)) + + expect_equivalent(arrays[[1]][5, 1, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][5, 2, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][5, 3, ], nuc_dist_2) + expect_equivalent(arrays[[1]][5, 4, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][5, 5, ], c(0, 0, 1, 0)) + + expect_equivalent(arrays[[1]][6, 1, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][6, 2, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][6, 3, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][6, 4, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][6, 5, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][6, 6, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][6, 7, ], c(0, 0, 1, 0)) + + expect_equivalent(arrays[[1]][7, 1, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][7, 2, ], nuc_dist_2) + expect_equivalent(arrays[[1]][7, 3, ], nuc_dist_2) + expect_equivalent(arrays[[1]][7, 4, ], nuc_dist_2) + expect_equivalent(arrays[[1]][7, 5, ], nuc_dist_2) + expect_equivalent(arrays[[1]][7, 6, ], nuc_dist_2) + expect_equivalent(arrays[[1]][7, 7, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][7, 8, ], nuc_dist_2) + + expect_equivalent(arrays[[1]][8, 1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][8, 2, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][8, 3, ], nuc_dist_1) + expect_equivalent(arrays[[1]][8, 4, ], c(0, 1, 0, 0)) + + # label folder + + directories = c("fasta_2", "fasta_3") + gen <- get_generator(path = directories, + train_type = "label_folder", + batch_size = 20, + maxlen = 12, + val = FALSE, + padding = TRUE, + ambiguous_nuc = "empirical", + vocabulary = c("a", "c", "g", "t"), + reverse_complement = FALSE, + step = 1) + + nuc_dist_1 <- 1/18*c(8, 2, 3, 5) + nuc_dist_2 <- 1/17*c(3, 2, 6, 6) + arrays <- gen() + + expect_equivalent(arrays[[1]][9, 1, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][9, 2, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][9, 3, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][9, 4, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][9, 5, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][9, 6, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][9, 7, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][9, 8, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][9, 9, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][9, 10, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][9, 11, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[1]][9, 12, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[2]][9, ], c(1, 0)) + + expect_equivalent(arrays[[1]][12, 1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][12, 2, ], nuc_dist_1) + expect_equivalent(arrays[[1]][12, 3, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][12, 4, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][12, 5, ], nuc_dist_1) + expect_equivalent(arrays[[1]][12, 6, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][12, 7, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][12, 8, ], nuc_dist_1) + expect_equivalent(arrays[[1]][12, 9, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[1]][12, 10, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][12, 11, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][12, 12, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[2]][12, ], c(0, 1)) + + expect_equivalent(arrays[[1]][18, 1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][18, 2, ], nuc_dist_1) + expect_equivalent(arrays[[1]][18, 3, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][18, 4, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][18, 5, ], nuc_dist_1) + expect_equivalent(arrays[[1]][18, 6, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][18, 7, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][18, 8, ], nuc_dist_1) + expect_equivalent(arrays[[1]][18, 9, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[1]][18, 10, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][18, 11, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][18, 12, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[2]][18, ], c(0, 1)) + + arrays <- gen() + + expect_equivalent(arrays[[1]][7, 1, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][7, 2, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][7, 3, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][7, 4, ], c(0, 0, 0, 0)) + expect_equivalent(arrays[[1]][7, 5, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][7, 6, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][7, 7, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][7, 8, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][7, 9, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][7, 10, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][7, 11, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[1]][7, 12, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[2]][7, ], c(1, 0)) + + expect_equivalent(arrays[[1]][14, 1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][14, 2, ], nuc_dist_1) + expect_equivalent(arrays[[1]][14, 3, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][14, 4, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][14, 5, ], nuc_dist_1) + expect_equivalent(arrays[[1]][14, 6, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][14, 7, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][14, 8, ], nuc_dist_1) + expect_equivalent(arrays[[1]][14, 9, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[1]][14, 10, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][14, 11, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][14, 12, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[2]][14, ], c(0, 1)) + + expect_equivalent(arrays[[1]][20, 1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][20, 2, ], nuc_dist_1) + expect_equivalent(arrays[[1]][20, 3, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][20, 4, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][20, 5, ], nuc_dist_1) + expect_equivalent(arrays[[1]][20, 6, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][20, 7, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][20, 8, ], nuc_dist_1) + expect_equivalent(arrays[[1]][20, 9, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[1]][20, 10, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][20, 11, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][20, 12, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[2]][20, ], c(0, 1)) + + # test quality scores LM + + gen <- generator_fasta_lm(path_corpus = "fastq", + format = "fastq", + batch_size = 10, + maxlen = 3, + max_iter = 10000, + vocabulary = c("a", "c", "g", "t"), + verbose = FALSE, + shuffle_file_order = FALSE, + step = 2, + seed = 1234, + shuffle_input = FALSE, + file_limit = NULL, + path_file_log = NULL, + reverse_complement = FALSE, + output_format = "target_right", + ambiguous_nuc = "zeros", + use_quality_score = TRUE, + proportion_per_seq = NULL, + padding = FALSE) + + a <- create_quality_vector(prob = quality_to_probability("J") , pos = 1, voc_length = 4) + c <- create_quality_vector(prob = quality_to_probability("C") , pos = 2, voc_length = 4) + g <- create_quality_vector(prob = quality_to_probability("G") , pos = 3, voc_length = 4) + t <- create_quality_vector(prob = quality_to_probability("?") , pos = 4, voc_length = 4) + + arrays <- gen() + expect_equivalent(arrays[[1]][1, 1, ], a) + expect_equivalent(arrays[[1]][1, 2, ], a) + expect_equivalent(arrays[[1]][1, 3, ], c) + + expect_equivalent(arrays[[1]][2, 1, ], c) + expect_equivalent(arrays[[1]][2, 2, ], c) + expect_equivalent(arrays[[1]][2, 3, ], g) + + expect_equivalent(arrays[[1]][3, 1, ], a) + expect_equivalent(arrays[[1]][3, 2, ], c) + expect_equivalent(arrays[[1]][3, 3, ], g) + + expect_equivalent(arrays[[1]][4, 1, ], g) + expect_equivalent(arrays[[1]][4, 2, ], t) + expect_equivalent(arrays[[1]][4, 3, ], a) + + expect_equivalent(arrays[[1]][5, 1, ], c) + expect_equivalent(arrays[[1]][5, 2, ], g) + expect_equivalent(arrays[[1]][5, 3, ], t) + + expect_equivalent(arrays[[1]][6, 1, ], t) + expect_equivalent(arrays[[1]][6, 2, ], c) + expect_equivalent(arrays[[1]][6, 3, ], g) + + expect_equivalent(arrays[[1]][7, 1, ], a) + expect_equivalent(arrays[[1]][7, 2, ], t) + expect_equivalent(arrays[[1]][7, 3, ], a) + + expect_equivalent(arrays[[1]][8, 1, ], a) + expect_equivalent(arrays[[1]][8, 2, ], a) + expect_equivalent(arrays[[1]][8, 3, ], c) + + expect_equivalent(arrays[[2]][1, ], c) + expect_equivalent(arrays[[2]][2, ], g) + expect_equivalent(arrays[[2]][3, ], t) + expect_equivalent(arrays[[2]][4, ], c) + expect_equivalent(arrays[[2]][5, ], c) + expect_equivalent(arrays[[2]][6, ], t) + expect_equivalent(arrays[[2]][7, ], t) + expect_equivalent(arrays[[2]][8, ], c) + + # test quality scores label + + gen <- generator_fasta_label_folder(path_corpus = "fastq", + format = "fastq", + batch_size = 10, + maxlen = 3, + max_iter = 10000, + vocabulary = c("a", "c", "g", "t"), + verbose = FALSE, + shuffle_file_order = FALSE, + step = 2, + seed = 1234, + shuffle_input = FALSE, + file_limit = NULL, + path_file_log = NULL, + reverse_complement = FALSE, + ambiguous_nuc = "zeros", + use_quality_score = TRUE, + proportion_per_seq = NULL, + num_targets = 2, + ones_column = 1, + padding = FALSE) + + a <- create_quality_vector(prob = quality_to_probability("J") , pos = 1, voc_length = 4) + c <- create_quality_vector(prob = quality_to_probability("C") , pos = 2, voc_length = 4) + g <- create_quality_vector(prob = quality_to_probability("G") , pos = 3, voc_length = 4) + t <- create_quality_vector(prob = quality_to_probability("?") , pos = 4, voc_length = 4) + + arrays <- gen() + expect_equivalent(arrays[[1]][1, 1, ], a) + expect_equivalent(arrays[[1]][1, 2, ], a) + expect_equivalent(arrays[[1]][1, 3, ], c) + + expect_equivalent(arrays[[1]][2, 1, ], c) + expect_equivalent(arrays[[1]][2, 2, ], c) + expect_equivalent(arrays[[1]][2, 3, ], g) + + expect_equivalent(arrays[[1]][3, 1, ], a) + expect_equivalent(arrays[[1]][3, 2, ], c) + expect_equivalent(arrays[[1]][3, 3, ], g) + + expect_equivalent(arrays[[1]][4, 1, ], g) + expect_equivalent(arrays[[1]][4, 2, ], t) + expect_equivalent(arrays[[1]][4, 3, ], a) + + expect_equivalent(arrays[[1]][5, 1, ], a) + expect_equivalent(arrays[[1]][5, 2, ], c) + expect_equivalent(arrays[[1]][5, 3, ], g) + + expect_equivalent(arrays[[1]][6, 1, ], c) + expect_equivalent(arrays[[1]][6, 2, ], g) + expect_equivalent(arrays[[1]][6, 3, ], t) + + expect_equivalent(arrays[[1]][7, 1, ], t) + expect_equivalent(arrays[[1]][7, 2, ], c) + expect_equivalent(arrays[[1]][7, 3, ], g) + + expect_equivalent(arrays[[1]][8, 1, ], a) + expect_equivalent(arrays[[1]][8, 2, ], t) + expect_equivalent(arrays[[1]][8, 3, ], a) + + expect_equivalent(arrays[[1]][9, 1, ], a) + expect_equivalent(arrays[[1]][9, 2, ], t) + expect_equivalent(arrays[[1]][9, 3, ], a) + + expect_equivalent(arrays[[1]][10, 1, ], a) + expect_equivalent(arrays[[1]][10, 2, ], a) + expect_equivalent(arrays[[1]][10, 3, ], c) + + expect_equivalent(arrays[[2]][1, ], c(1,0)) + expect_equivalent(arrays[[2]][10, ], c(1,0)) + + ## test read data with quality + + gen <- generator_fasta_label_folder(path_corpus = "read_data", + format = "fastq", + batch_size = 5, + maxlen = 12, + max_iter = 10000, + vocabulary = c("a", "c", "g", "t"), + verbose = FALSE, + shuffle_file_order = FALSE, + step = 2, + seed = 1234, + shuffle_input = FALSE, + file_limit = NULL, + path_file_log = NULL, + read_data = TRUE, + reverse_complement = FALSE, + ambiguous_nuc = "zeros", + use_quality_score = TRUE, + proportion_per_seq = NULL, + num_targets = 2, + ones_column = 1, + padding = FALSE) + + a <- create_quality_vector(prob = quality_to_probability("J") , pos = 1, voc_length = 4) + c <- create_quality_vector(prob = quality_to_probability("C") , pos = 2, voc_length = 4) + g <- create_quality_vector(prob = quality_to_probability("G") , pos = 3, voc_length = 4) + t <- create_quality_vector(prob = quality_to_probability("?") , pos = 4, voc_length = 4) + + arrays <- gen() + expect_equivalent(arrays[[1]][[1]][1, , ], rbind(a,a,a,c,c,c)) + expect_equivalent(arrays[[1]][[2]][1, , ], rbind(c,c,c,g,g,g)) + + expect_equivalent(arrays[[1]][[1]][2, , ], rbind(a,c,a,c,a,c)) + expect_equivalent(arrays[[1]][[2]][2, , ], rbind(c,g,c,g,c,g)) + + expect_equivalent(arrays[[1]][[1]][3, , ], rbind(g,g,g,t,t,t)) + expect_equivalent(arrays[[1]][[2]][3, , ], rbind(t,t,t,g,g,g)) + + expect_equivalent(arrays[[1]][[1]][4, , ], rbind(g,t,g,t,g,t)) + expect_equivalent(arrays[[1]][[2]][4, , ], rbind(t,g,t,g,t,g)) + + expect_equivalent(arrays[[1]][[1]][5, , ], rbind(a,a,a,c,c,c)) + expect_equivalent(arrays[[1]][[2]][5, , ], rbind(c,c,c,g,g,g)) + + arrays <- gen() + expect_equivalent(arrays[[1]][[1]][1, , ], rbind(a,c,a,c,a,c)) + expect_equivalent(arrays[[1]][[2]][1, , ], rbind(c,g,c,g,c,g)) + + expect_equivalent(arrays[[1]][[1]][2, , ], rbind(g,g,g,t,t,t)) + expect_equivalent(arrays[[1]][[2]][2, , ], rbind(t,t,t,g,g,g)) + + expect_equivalent(arrays[[1]][[1]][3, , ], rbind(g,t,g,t,g,t)) + expect_equivalent(arrays[[1]][[2]][3, , ], rbind(t,g,t,g,t,g)) + + # additional input LM + + gen <- generator_fasta_lm(path_corpus = "fasta_3", + format = "fasta", + batch_size = 10, + maxlen = 5, + vocabulary = c("a", "c", "g", "t"), + shuffle_file_order = FALSE, + step = 4, + shuffle_input = FALSE, + reverse_complement = FALSE, + output_format = "target_right", + ambiguous_nuc = "zeros", + added_label_path = "label.csv", + add_input_as_seq = FALSE, + padding = FALSE) + + arrays <- gen() + expect_equivalent(arrays[[1]][[1]][1,], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][[1]][2,], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][[1]][3,], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][[1]][4,], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][[1]][5,], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][[1]][6,], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][[1]][7,], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][[1]][8,], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][[1]][9,], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][[1]][10,], c(0, 0, 1, 0)) + + expect_equivalent(arrays[[1]][[2]][10, 1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][[2]][10, 3, ], c(0, 0, 0, 0)) + + # additional input label_folder + dir <- c("label_folder/x", "label_folder/y", "label_folder/z") + gen_list <- generator_initialize(directories = dir, + format = "fasta", + batch_size = 15, + maxlen = 4, + step = 2, + val = FALSE, + padding = FALSE, + added_label_path = "label.csv", + add_input_as_seq = FALSE) + + gen <- generator_fasta_label_folder_wrapper(val = FALSE, path = dir, gen_list = gen_list) + arrays <- gen() + expect_equivalent(arrays[[1]][[1]][1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][[1]][2, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][[1]][3, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][[1]][4, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][[1]][5, ], c(1, 0, 0, 1)) + + expect_equivalent(arrays[[1]][[1]][6, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][[1]][7, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][[1]][8, ], c(0, 1, 0, 1)) + expect_equivalent(arrays[[1]][[1]][9, ], c(0, 1, 0, 1)) + expect_equivalent(arrays[[1]][[1]][10, ], c(0, 1, 0, 0)) + + expect_equivalent(arrays[[1]][[1]][11, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][[1]][12, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][[1]][13, ], c(0, 0, 1, 1)) + expect_equivalent(arrays[[1]][[1]][14, ], c(0, 0, 1, 1)) + expect_equivalent(arrays[[1]][[1]][15, ], c(0, 0, 1, 0)) + + expect_equivalent(arrays[[1]][[2]][5, 1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][[2]][5, 2, ], c(0, 0, 1, 0)) + + expect_equivalent(arrays[[1]][[2]][10, 1, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][[2]][10, 2, ], c(0, 1, 0, 0)) + + expect_equivalent(arrays[[1]][[2]][15, 1, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[1]][[2]][15, 2, ], c(0, 0, 0, 1)) + + gen <- generator_fasta_label_folder_wrapper(val = FALSE, path = dir, gen_list = gen_list) + arrays <- gen() + expect_equivalent(arrays[[1]][[1]][1, ], c(1, 0, 0, 1)) + expect_equivalent(arrays[[1]][[1]][2, ], c(1, 0, 0, 1)) + expect_equivalent(arrays[[1]][[1]][3, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][[1]][4, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][[1]][5, ], c(1, 0, 0, 0)) + + expect_equivalent(arrays[[1]][[1]][6, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][[1]][7, ], c(0, 1, 0, 1)) + expect_equivalent(arrays[[1]][[1]][8, ], c(0, 1, 0, 1)) + expect_equivalent(arrays[[1]][[1]][9, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][[1]][10, ], c(0, 1, 0, 0)) + + expect_equivalent(arrays[[1]][[1]][11, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][[1]][12, ], c(0, 0, 1, 1)) + expect_equivalent(arrays[[1]][[1]][13, ], c(0, 0, 1, 1)) + expect_equivalent(arrays[[1]][[1]][14, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][[1]][15, ], c(0, 0, 1, 0)) + + ## read data with quality and 2 classes + + gen <- get_generator(path = c("read_data_2/label_a", "read_data_2/label_b"), + train_type = "label_folder", + format = "fastq", + batch_size = 4, + maxlen = 12, + vocabulary = c("a", "c", "g", "t"), + verbose = FALSE, + shuffle_file_order = FALSE, + step = 1, + seed = 1234, + shuffle_input = FALSE, + file_limit = NULL, + path_file_log = NULL, + reverse_complement = FALSE, + val = FALSE, + ambiguous_nuc = "zero", + proportion_per_seq = NULL, + read_data = TRUE, + use_quality_score = TRUE, + padding = FALSE, + added_label_path = NULL, + skip_amb_nuc = NULL) + + arrays <- gen() + + a <- create_quality_vector(prob = quality_to_probability("J") , pos = 1, voc_length = 4) + c <- create_quality_vector(prob = quality_to_probability("C") , pos = 2, voc_length = 4) + g <- create_quality_vector(prob = quality_to_probability("G") , pos = 3, voc_length = 4) + t <- create_quality_vector(prob = quality_to_probability("?") , pos = 4, voc_length = 4) + + arrays <- gen() + expect_equivalent(arrays[[1]][[1]][1, , ], rbind(a,a,a,a,a,a)) + expect_equivalent(arrays[[1]][[2]][1, , ], rbind(c,c,c,c,c,c)) + expect_equivalent(arrays[[1]][[1]][2, , ], rbind(a,a,a,a,a,a)) + expect_equivalent(arrays[[1]][[2]][2, , ], rbind(c,c,c,c,c,c)) + + expect_equivalent(arrays[[1]][[1]][3, , ], rbind(g,g,g,g,g,g)) + expect_equivalent(arrays[[1]][[2]][3, , ], rbind(t,t,t,t,t,t)) + expect_equivalent(arrays[[1]][[1]][4, , ], rbind(g,g,g,g,g,g)) + expect_equivalent(arrays[[1]][[2]][4, , ], rbind(t,t,t,t,t,t)) + + ### get output tensor from csv file + concat + + testpath <- file.path("fasta_2") + label_from_csv <- "output_label.csv" + gen <- generator_fasta_label_header_csv(path_corpus = testpath, batch_size = 5, + maxlen = 10, step = 10, + vocabulary = c("a", "c", "g", "t", "Z"), + reverse_complement = FALSE, + vocabulary_label = c("w", "x", "y"), + format = "fasta", + max_iter = 10000, + verbose = FALSE, + shuffle_file_order = FALSE, + seed = 1234, + shuffle_input = FALSE, + file_limit = NULL, + path_file_log = NULL, + ambiguous_nuc = "zero", + proportion_per_seq = NULL, + read_data = FALSE, + use_quality_score = FALSE, + padding = TRUE, + skip_amb_nuc = NULL, + max_samples = NULL, + concat_seq = "ZZ", + added_label_path = NULL, + add_input_as_seq = NULL, + target_from_csv = label_from_csv) + + arrays <- gen() + + expect_equivalent(arrays[[1]][1, 8, ], c(0, 0, 0, 1, 0)) + expect_equivalent(arrays[[1]][1, 9, ], c(0, 0, 0, 0, 1)) + expect_equivalent(arrays[[1]][1, 10, ], c(0, 0, 0, 0, 1)) + + expect_equivalent(arrays[[1]][4, 1, ], c(1, 0, 0, 0, 0)) + expect_equivalent(arrays[[1]][4, 2, ], c(1, 0, 0, 0, 0)) + expect_equivalent(arrays[[1]][4, 3, ], c(1, 0, 0, 0, 0)) + expect_equivalent(arrays[[1]][4, 4, ], c(1, 0, 0, 0, 0)) + + expect_equivalent(arrays[[2]][1, ], 1:4) + expect_equivalent(arrays[[2]][2, ], 1:4) + expect_equivalent(arrays[[2]][3, ], 1:4) + expect_equivalent(arrays[[2]][4, ], 11:14) + expect_equivalent(arrays[[2]][5, ], 11:14) + + arrays <- gen() + + expect_equivalent(arrays[[1]][1, 8, ], c(1, 0, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 9, ], c(0, 0, 0, 0, 1)) + expect_equivalent(arrays[[1]][1, 10, ], c(0, 0, 0, 0, 1)) + + expect_equivalent(arrays[[1]][2, 1, ], c(1, 0, 0, 0, 0)) + expect_equivalent(arrays[[1]][2, 2, ], c(1, 0, 0, 0, 0)) + expect_equivalent(arrays[[1]][2, 3, ], c(0, 1, 0, 0, 0)) + + expect_equivalent(arrays[[2]][1, ], 11:14) + expect_equivalent(arrays[[2]][2, ], 1:4) + expect_equivalent(arrays[[2]][3, ], 1:4) + expect_equivalent(arrays[[2]][4, ], 1:4) + expect_equivalent(arrays[[2]][5, ], 11:14) + + + ## 2 added input files LM + + gen <- generator_fasta_lm(path_corpus = "fasta_3", + format = "fasta", + batch_size = 10, + maxlen = 5, + vocabulary = c("a", "c", "g", "t"), + shuffle_file_order = FALSE, + step = 4, + shuffle_input = FALSE, + reverse_complement = FALSE, + output_format = "target_right", + ambiguous_nuc = "zeros", + added_label_path = c("label.csv", + "add_seq.csv"), + add_input_as_seq = c(FALSE, TRUE), + padding = FALSE) + + v1 <- c(0, 0, 1, 0) + v2 <- c(1, 0, 0, 0) + m1 <- matrix(c(1, 0, 0, 0, + 0, 1, 0, 0, + 0, 0, 1, 0, + 0, 0, 0, 1), byrow = TRUE, ncol = 4) + m2 <- matrix(c(0, 0, 0, 1, + 0, 0, 0, 0, + 0, 1, 0, 0, + 0, 0, 0, 0), byrow = TRUE, ncol = 4) + + arrays <- gen() + + expect_equivalent(arrays[[1]][[1]][1, ], v1) + expect_equivalent(arrays[[1]][[1]][2, ], v1) + expect_equivalent(arrays[[1]][[1]][3, ], v1) + expect_equivalent(arrays[[1]][[1]][4, ], v1) + expect_equivalent(arrays[[1]][[1]][5, ], v2) + expect_equivalent(arrays[[1]][[1]][6, ], v2) + expect_equivalent(arrays[[1]][[1]][7, ], v2) + expect_equivalent(arrays[[1]][[1]][8, ], v2) + expect_equivalent(arrays[[1]][[1]][9, ], v2) + expect_equivalent(arrays[[1]][[1]][10, ], v1) + + expect_equivalent(arrays[[1]][[2]][1, , ], m1) + expect_equivalent(arrays[[1]][[2]][2, , ], m1) + expect_equivalent(arrays[[1]][[2]][3, , ], m1) + expect_equivalent(arrays[[1]][[2]][4, , ], m1) + expect_equivalent(arrays[[1]][[2]][5, , ], m2) + expect_equivalent(arrays[[1]][[2]][6, , ], m2) + expect_equivalent(arrays[[1]][[2]][7, , ], m2) + expect_equivalent(arrays[[1]][[2]][8, , ], m2) + expect_equivalent(arrays[[1]][[2]][9, , ], m2) + expect_equivalent(arrays[[1]][[2]][10, , ], m1) + + arrays <- gen() + + expect_equivalent(arrays[[1]][[1]][1, ], v1) + expect_equivalent(arrays[[1]][[1]][2, ], v1) + expect_equivalent(arrays[[1]][[1]][3, ], v1) + expect_equivalent(arrays[[1]][[1]][4, ], v2) + expect_equivalent(arrays[[1]][[1]][5, ], v2) + expect_equivalent(arrays[[1]][[1]][6, ], v2) + expect_equivalent(arrays[[1]][[1]][7, ], v2) + expect_equivalent(arrays[[1]][[1]][8, ], v2) + expect_equivalent(arrays[[1]][[1]][9, ], v1) + expect_equivalent(arrays[[1]][[1]][10, ], v1) + + expect_equivalent(arrays[[1]][[2]][1, , ], m1) + expect_equivalent(arrays[[1]][[2]][2, , ], m1) + expect_equivalent(arrays[[1]][[2]][3, , ], m1) + expect_equivalent(arrays[[1]][[2]][4, , ], m2) + expect_equivalent(arrays[[1]][[2]][5, , ], m2) + expect_equivalent(arrays[[1]][[2]][6, , ], m2) + expect_equivalent(arrays[[1]][[2]][7, , ], m2) + expect_equivalent(arrays[[1]][[2]][8, , ], m2) + expect_equivalent(arrays[[1]][[2]][9, , ], m1) + expect_equivalent(arrays[[1]][[2]][10, , ], m1) + + ## 2 added input files, label_folder + + dir <- c("label_folder/x", "label_folder/y", "label_folder/z") + gen <- get_generator(path = dir, + train_type = "label_folder", + format = "fasta", + batch_size = 15, + maxlen = 4, + step = 2, + val = FALSE, + padding = FALSE, + added_label_path = c("label.csv", + "add_seq.csv"), + add_input_as_seq = c(FALSE, TRUE) + ) + + x1 <- c(1, 0, 0, 0) + x2 <- c(1, 0, 0, 1) + y1 <- c(0, 1, 0, 0) + y2 <- c(0, 1, 0, 1) + z1 <- c(0, 0, 1, 0) + z2 <- c(0, 0, 1, 1) + + mx1 <- matrix(c(1, 0, 0, 0, + 1, 0, 0, 0, + 1, 0, 0, 0, + 1, 0, 0, 0), byrow = TRUE, ncol = 4) + + mx2 <- matrix(c(1, 0, 0, 0, + 0, 1, 0, 0, + 1, 0, 0, 0, + 0, 1, 0, 0), byrow = TRUE, ncol = 4) + + my1 <- matrix(c(0, 1, 0, 0, + 0, 1, 0, 0, + 0, 1, 0, 0, + 0, 1, 0, 0), byrow = TRUE, ncol = 4) + + my2 <- matrix(c(0, 1, 0, 0, + 0, 0, 1, 0, + 0, 1, 0, 0, + 0, 0, 1, 0), byrow = TRUE, ncol = 4) + + mz1 <- matrix(c(0, 0, 1, 0, + 0, 0, 1, 0, + 0, 0, 1, 0, + 0, 0, 1, 0), byrow = TRUE, ncol = 4) + + mz2 <- matrix(c(0, 0, 1, 0, + 0, 0, 0, 1, + 0, 0, 1, 0, + 0, 0, 0, 1), byrow = TRUE, ncol = 4) + + arrays <- gen() + + expect_equivalent(arrays[[1]][[1]][1, ], x1) + expect_equivalent(arrays[[1]][[1]][2, ], x1) + expect_equivalent(arrays[[1]][[1]][3, ], x1) + expect_equivalent(arrays[[1]][[1]][4, ], x1) + expect_equivalent(arrays[[1]][[1]][5, ], x2) + expect_equivalent(arrays[[1]][[1]][6, ], y1) + expect_equivalent(arrays[[1]][[1]][7, ], y1) + expect_equivalent(arrays[[1]][[1]][8, ], y2) + expect_equivalent(arrays[[1]][[1]][9, ], y2) + expect_equivalent(arrays[[1]][[1]][10, ], y1) + expect_equivalent(arrays[[1]][[1]][11, ], z1) + expect_equivalent(arrays[[1]][[1]][12, ], z1) + expect_equivalent(arrays[[1]][[1]][13, ], z2) + expect_equivalent(arrays[[1]][[1]][14, ], z2) + expect_equivalent(arrays[[1]][[1]][15, ], z1) + + expect_equivalent(arrays[[1]][[2]][1, , ], mx1) + expect_equivalent(arrays[[1]][[2]][2, , ], mx1) + expect_equivalent(arrays[[1]][[2]][3, , ], mx1) + expect_equivalent(arrays[[1]][[2]][4, , ], mx1) + expect_equivalent(arrays[[1]][[2]][5, , ], mx2) + expect_equivalent(arrays[[1]][[2]][6, , ], my1) + expect_equivalent(arrays[[1]][[2]][7, , ], my1) + expect_equivalent(arrays[[1]][[2]][8, , ], my2) + expect_equivalent(arrays[[1]][[2]][9, , ], my2) + expect_equivalent(arrays[[1]][[2]][10, , ], my1) + expect_equivalent(arrays[[1]][[2]][11, , ], mz1) + expect_equivalent(arrays[[1]][[2]][12, , ], mz1) + expect_equivalent(arrays[[1]][[2]][13, , ], mz2) + expect_equivalent(arrays[[1]][[2]][14, , ], mz2) + expect_equivalent(arrays[[1]][[2]][15, , ], mz1) + + arrays <- gen() + expect_equivalent(arrays[[1]][[1]][1, ], x2) + expect_equivalent(arrays[[1]][[1]][2, ], x2) + expect_equivalent(arrays[[1]][[1]][3, ], x1) + expect_equivalent(arrays[[1]][[1]][4, ], x1) + expect_equivalent(arrays[[1]][[1]][5, ], x1) + expect_equivalent(arrays[[1]][[1]][6, ], y1) + expect_equivalent(arrays[[1]][[1]][7, ], y2) + expect_equivalent(arrays[[1]][[1]][8, ], y2) + expect_equivalent(arrays[[1]][[1]][9, ], y1) + expect_equivalent(arrays[[1]][[1]][10, ], y1) + expect_equivalent(arrays[[1]][[1]][11, ], z1) + expect_equivalent(arrays[[1]][[1]][12, ], z2) + expect_equivalent(arrays[[1]][[1]][13, ], z2) + expect_equivalent(arrays[[1]][[1]][14, ], z1) + expect_equivalent(arrays[[1]][[1]][15, ], z1) + + expect_equivalent(arrays[[1]][[2]][1, , ], mx2) + expect_equivalent(arrays[[1]][[2]][2, , ], mx2) + expect_equivalent(arrays[[1]][[2]][3, , ], mx1) + expect_equivalent(arrays[[1]][[2]][4, , ], mx1) + expect_equivalent(arrays[[1]][[2]][5, , ], mx1) + expect_equivalent(arrays[[1]][[2]][6, , ], my1) + expect_equivalent(arrays[[1]][[2]][7, , ], my2) + expect_equivalent(arrays[[1]][[2]][8, , ], my2) + expect_equivalent(arrays[[1]][[2]][9, , ], my1) + expect_equivalent(arrays[[1]][[2]][10, , ], my1) + expect_equivalent(arrays[[1]][[2]][11, , ], mz1) + expect_equivalent(arrays[[1]][[2]][12, , ], mz2) + expect_equivalent(arrays[[1]][[2]][13, , ], mz2) + expect_equivalent(arrays[[1]][[2]][14, , ], mz1) + expect_equivalent(arrays[[1]][[2]][15, , ], mz1) + + # 3 targets, target right + + gen <- generator_fasta_lm(path_corpus = "fasta_3", + batch_size = 5, + maxlen = 4, + step = 5, + output_format = "target_right", + padding = FALSE, + target_len = 3) + + arrays <- gen() + + expect_equivalent(arrays[[1]][1, , ], + matrix( + c(1, 0, 0, 0, + 1, 0, 0, 0, + 0, 0, 0, 0, + 0, 1, 0, 0), + byrow = TRUE, ncol = 4 + )) + + expect_equivalent(arrays[[1]][5, , ], + matrix( + c(1, 0, 0, 0, + 0, 1, 0, 0, + 0, 0, 1, 0, + 0, 0, 0, 1), + byrow = TRUE, ncol = 4 + )) + + m1 <- matrix( + c(0, 1, 0, 0, + 0, 0, 0, 1, + 0, 0, 0, 0, + 0, 0, 1, 0, + 1, 0, 0, 0), + byrow = TRUE, ncol = 4 + ) + m2 <- matrix( + c(0, 0, 0, 0, + 0, 0, 1, 0, + 0, 0, 0, 1, + 0, 0, 0, 1, + 0, 1, 0, 0), + byrow = TRUE, ncol = 4) + m3 <- matrix( + c(0, 0, 1, 0, + 1, 0, 0, 0, + 0, 0, 0, 1, + 0, 0, 0, 1, + 0, 0, 1, 0), + byrow = TRUE, ncol = 4) + expect_equivalent(arrays[[2]][ ,1 , ], m1) + expect_equivalent(arrays[[2]][ ,2 , ], m2) + expect_equivalent(arrays[[2]][ ,3 , ], m3) + + # 3 targets, target middle cnn + + gen <- generator_fasta_lm(path_corpus = "fasta_3", + batch_size = 5, + maxlen = 4, + step = 5, + output_format = "target_middle_cnn", + padding = FALSE, + target_len = 3) + + arrays <- gen() + + expect_equivalent(arrays[[1]][1, , ], + matrix( + c(1, 0, 0, 0, + 1, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 1, 0), + byrow = TRUE, ncol = 4 + )) + + expect_equivalent(arrays[[1]][5, , ], + matrix( + c(1, 0, 0, 0, + 0, 1, 0, 0, + 0, 1, 0, 0, + 0, 0, 1, 0), + byrow = TRUE, ncol = 4 + )) + + m1 <- matrix( + c(0, 0, 0, 0, + 0, 0, 1, 0, + 1, 0, 0, 0, + 0, 0, 1, 0, + 0, 0, 1, 0), + byrow = TRUE, ncol = 4 + ) + m2 <- matrix( + c(0, 1, 0, 0, + 0, 0, 0, 0, + 1, 0, 0, 0, + 0, 0, 1, 0, + 0, 0, 0, 1), + byrow = TRUE, ncol = 4) + m3 <- matrix( + c(0, 1, 0, 0, + 0, 0, 0, 1, + 0, 0, 0, 0, + 0, 0, 1, 0, + 1, 0, 0, 0), + byrow = TRUE, ncol = 4) + expect_equivalent(arrays[[2]][ ,1 , ], m1) + expect_equivalent(arrays[[2]][ ,2 , ], m2) + expect_equivalent(arrays[[2]][ ,3 , ], m3) + + # 3 targets, target middle lstm + + gen <- generator_fasta_lm(path_corpus = "fasta_3", + batch_size = 5, + maxlen = 4, + step = 5, + output_format = "target_middle_lstm", + padding = FALSE, + target_len = 3) + + arrays <- gen() + + expect_equivalent(arrays[[1]][[1]][1, , ], + matrix( + c(1, 0, 0, 0, + 1, 0, 0, 0), + byrow = TRUE, ncol = 4 + )) + + expect_equivalent(arrays[[1]][[2]][1, , ], + matrix( + c(0, 0, 1, 0, + 0, 0, 0, 0), + byrow = TRUE, ncol = 4 + )) + + expect_equivalent(arrays[[1]][[1]][5, , ], + matrix( + c(1, 0, 0, 0, + 0, 1, 0, 0), + byrow = TRUE, ncol = 4 + )) + + expect_equivalent(arrays[[1]][[2]][5, , ], + matrix( + c(0, 0, 1, 0, + 0, 1, 0, 0), + byrow = TRUE, ncol = 4 + )) + + m1 <- matrix( + c(0, 0, 0, 0, + 0, 0, 1, 0, + 1, 0, 0, 0, + 0, 0, 1, 0, + 0, 0, 1, 0), + byrow = TRUE, ncol = 4 + ) + m2 <- matrix( + c(0, 1, 0, 0, + 0, 0, 0, 0, + 1, 0, 0, 0, + 0, 0, 1, 0, + 0, 0, 0, 1), + byrow = TRUE, ncol = 4) + m3 <- matrix( + c(0, 1, 0, 0, + 0, 0, 0, 1, + 0, 0, 0, 0, + 0, 0, 1, 0, + 1, 0, 0, 0), + byrow = TRUE, ncol = 4) + expect_equivalent(arrays[[2]][ ,1 , ], m1) + expect_equivalent(arrays[[2]][ ,2 , ], m2) + expect_equivalent(arrays[[2]][ ,3 , ], m3) + + # coverage + set learning for label_folder + + directories <- c("coverage_data/x", "coverage_data/y") + val <- FALSE + batch_size <- 6 + samples_per_target <- 3 + #new_batch_size <- batch_size/samples_per_target + path <- directories + voc_len <- 4 + maxlen <- 7 + reshape_mode <- "time_dist" + set_learning <- list(reshape_mode = reshape_mode, + maxlen = maxlen, + samples_per_target = samples_per_target) + + gen <- get_generator(path = directories, + train_type = "label_folder", + val = FALSE, + padding = TRUE, + format = "fasta", + batch_size = batch_size, + maxlen = maxlen, + vocabulary = c("a", "c", "g", "t"), + step = 4, + use_coverage = 1, + set_learning = set_learning) + + arrays <- gen() + expect_equivalent(arrays[[1]][1, 1, , ], matrix( + c(7,0,0,0, + 7,0,0,0, + 0,0,0,0, + 0,7,0,0, + 0,7,0,0, + 0,0,0,0, + 0,0,7,0), + byrow = TRUE, ncol = 4 + )) + + expect_equivalent(arrays[[1]][1, 3, , ], matrix( + c(11,0,0,0, + 11,0,0,0, + 11,0,0,0, + 11,0,0,0, + 0,0,0,0, + 0,0,0,11, + 0,0,0,11), + byrow = TRUE, ncol = 4 + )) + + expect_equivalent(arrays[[1]][2, 1, , ], matrix( + c(0,0,0,0, + 0,0,1,0, + 0,0,1,0, + 0,0,1,0, + 0,0,1,0, + 0,0,0,1, + 0,0,0,1), + byrow = TRUE, ncol = 4 + )) + + expect_equivalent(arrays[[1]][3, 1, , ], matrix( + c(0,0,0,0, + 17,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0), + byrow = TRUE, ncol = 4 + )) + + expect_equivalent(arrays[[1]][3, 2, , ], matrix( + c(7,0,0,0, + 7,0,0,0, + 0,0,0,0, + 0,7,0,0, + 0,7,0,0, + 0,0,0,0, + 0,0,7,0), + byrow = TRUE, ncol = 4 + )) + + expect_equivalent(arrays[[1]][4, 1, , ], matrix( + c(2,0,0,0, + 0,2,0,0, + 0,0,2,0, + 0,0,0,2, + 2,0,0,0, + 2,0,0,0, + 0,2,0,0), + byrow = TRUE, ncol = 4 + )) + + expect_equivalent(arrays[[1]][5, 3, , ], matrix( + c(0,0,0,0, + 17,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0), + byrow = TRUE, ncol = 4 + )) + + expect_equivalent(arrays[[1]][6, 3, , ], matrix( + c(0,0,0,0, + 0,0,1,0, + 0,0,1,0, + 0,0,1,0, + 0,0,1,0, + 0,0,0,1, + 0,0,0,1), + byrow = TRUE, ncol = 4 + )) + + expect_equivalent(arrays[[2]], matrix( + c(1,0, + 1,0, + 1,0, + 0,1, + 0,1, + 0,1), + byrow = TRUE, ncol = 2 + )) + + arrays <- gen() + + expect_equivalent(arrays[[1]][1, 1, , ], matrix( + c(11,0,0,0, + 11,0,0,0, + 11,0,0,0, + 11,0,0,0, + 0,0,0,0, + 0,0,0,11, + 0,0,0,11), + byrow = TRUE, ncol = 4 + )) + + expect_equivalent(arrays[[1]][4, 3, , ], matrix( + c(0,0,0,0, + 17,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0), + byrow = TRUE, ncol = 4 + )) + + expect_equivalent(arrays[[2]], matrix( + c(1,0, + 1,0, + 1,0, + 0,1, + 0,1, + 0,1), + byrow = TRUE, ncol = 2 + )) + + # coverage + set learning for label_folder + normalizing input tensor + + directories <- c("coverage_data/x", "coverage_data/y") + val <- FALSE + batch_size <- 6 + samples_per_target <- 3 + #new_batch_size <- batch_size/samples_per_target + path <- directories + voc_len <- 4 + maxlen <- 7 + use_coverage <- 17 + reshape_mode <- "time_dist" + set_learning <- list(reshape_mode = reshape_mode, + maxlen = maxlen, + samples_per_target = samples_per_target) + + gen <- get_generator(path = directories, + train_type = "label_folder", + val = FALSE, + padding = TRUE, + format = "fasta", + batch_size = batch_size, + maxlen = maxlen, + vocabulary = c("a", "c", "g", "t"), + step = 4, + use_coverage = use_coverage, + set_learning = set_learning) + + arrays <- gen() + expect_equivalent(arrays[[1]][1, 1, , ], matrix( + c(7,0,0,0, + 7,0,0,0, + 0,0,0,0, + 0,7,0,0, + 0,7,0,0, + 0,0,0,0, + 0,0,7,0), + byrow = TRUE, ncol = 4 + )/use_coverage) + + expect_equivalent(arrays[[1]][1, 3, , ], matrix( + c(11,0,0,0, + 11,0,0,0, + 11,0,0,0, + 11,0,0,0, + 0,0,0,0, + 0,0,0,11, + 0,0,0,11), + byrow = TRUE, ncol = 4 + )/use_coverage) + + expect_equivalent(arrays[[1]][2, 1, , ], matrix( + c(0,0,0,0, + 0,0,1,0, + 0,0,1,0, + 0,0,1,0, + 0,0,1,0, + 0,0,0,1, + 0,0,0,1), + byrow = TRUE, ncol = 4 + )/use_coverage) + + expect_equivalent(arrays[[1]][3, 1, , ], matrix( + c(0,0,0,0, + 17,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0), + byrow = TRUE, ncol = 4 + )/use_coverage) + + expect_equivalent(arrays[[1]][3, 2, , ], matrix( + c(7,0,0,0, + 7,0,0,0, + 0,0,0,0, + 0,7,0,0, + 0,7,0,0, + 0,0,0,0, + 0,0,7,0), + byrow = TRUE, ncol = 4 + )/use_coverage) + + expect_equivalent(arrays[[1]][4, 1, , ], matrix( + c(2,0,0,0, + 0,2,0,0, + 0,0,2,0, + 0,0,0,2, + 2,0,0,0, + 2,0,0,0, + 0,2,0,0), + byrow = TRUE, ncol = 4 + )/use_coverage) + + expect_equivalent(arrays[[1]][5, 3, , ], matrix( + c(0,0,0,0, + 17,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0), + byrow = TRUE, ncol = 4 + )/use_coverage) + + expect_equivalent(arrays[[1]][6, 3, , ], matrix( + c(0,0,0,0, + 0,0,1,0, + 0,0,1,0, + 0,0,1,0, + 0,0,1,0, + 0,0,0,1, + 0,0,0,1), + byrow = TRUE, ncol = 4 + )/use_coverage) + + expect_equivalent(arrays[[2]], matrix( + c(1,0, + 1,0, + 1,0, + 0,1, + 0,1, + 0,1), + byrow = TRUE, ncol = 2 + )) + + arrays <- gen() + + expect_equivalent(arrays[[1]][1, 1, , ], matrix( + c(11,0,0,0, + 11,0,0,0, + 11,0,0,0, + 11,0,0,0, + 0,0,0,0, + 0,0,0,11, + 0,0,0,11), + byrow = TRUE, ncol = 4 + )/use_coverage) + + expect_equivalent(arrays[[1]][4, 3, , ], matrix( + c(0,0,0,0, + 17,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0), + byrow = TRUE, ncol = 4 + )/use_coverage) + + expect_equivalent(arrays[[2]], matrix( + c(1,0, + 1,0, + 1,0, + 0,1, + 0,1, + 0,1), + byrow = TRUE, ncol = 2 + )) + + # rds label generator + + gen <- generator_rds(rds_folder = "rds", batch_size = 1) + l_x <- list() + l_y <- list() + for (i in 1:40) { + z <- gen() + l_x[[i]] <- z[[1]][1,1,1] + l_y[[i]] <- which.max(z[[2]]) + } + expect_equivalent(sort(unlist(l_x)), rep(1:20, each=2)) + expect_equivalent(sort(unlist(l_y)), rep(1:20, each=2)) + + gen <- generator_rds(rds_folder = "rds", batch_size = 10) + l_x <- list() + l_y <- list() + for (i in 1:4) { + z <- gen() + l_x[[i]] <- z[[1]][,1,1] + l_y[[i]] <- apply(z[[2]], 1, which.max) + } + expect_equivalent(sort(unlist(l_x)), rep(1:20, each = 2)) + expect_equivalent(sort(unlist(l_y)), rep(1:20, each=2)) + + # rds lm generator + + target_len <- 3 + batch_size <- 1 + gen <- generator_rds(rds_folder = "rds_lm", batch_size = batch_size, target_len = target_len) + + for (one_iter in 1:3) { + first_input <- 1 + (100*(0:4)) + for (i in 1:5) { + z <- gen() + expect_equivalent(dim(z[[1]]), c(batch_size, 7 - target_len, 4)) + l_x <- z[[1]][1,1,1] + first_input <- setdiff(first_input, l_x) + l_y <- NULL + for (j in 1:target_len) { + l_y[[j]] <- z[[2]][[j]][1,1] + } + expect_equivalent(l_y, l_x + 3 + (1:target_len)) + } + expect_equivalent(length(first_input), 0) + } + + batch_size <- 5 + gen <- generator_rds(rds_folder = "rds_lm", batch_size = batch_size, target_len = target_len) + for (one_iter in 1:3) { + first_input <- 1 + (100*(0:4)) + z <- gen() + expect_equivalent(dim(z[[1]]), c(batch_size, 7 - target_len, 4)) + l_x <- z[[1]][ , 1, 1] + first_input <- setdiff(first_input, l_x) + l_y <- NULL + for (j in 1:target_len) { + l_y[[j]] <- z[[2]][[j]][,1] + } + expect_equivalent(sort(l_y[[1]]), 5 + (100*(0:4))) + expect_equivalent(sort(l_y[[2]]), 6 + (100*(0:4))) + expect_equivalent(sort(l_y[[3]]), 7 + (100*(0:4))) + expect_equivalent(length(first_input), 0) + } + + # n-gram rds + + n_gram <- 3 + gen <- generator_rds(rds_folder = "n_gram_rds", + batch_size = 1, + target_len = 6, + n_gram = n_gram, + n_gram_stride = n_gram) + + arrays <- gen() + y <- arrays[[2]] + y_1_n_gram <- apply(y[[1]], 1, which.max) + y_2_n_gram <- apply(y[[2]], 1, which.max) + + int_seq <- c(1,2,0) + expect_equivalent(y_1_n_gram[1], 1 + sum(4^((n_gram-1):0) * (int_seq))) # cga + int_seq <- c(0,0,1) + expect_equivalent(y_2_n_gram[1], 1 + sum(4^((n_gram-1):0) * (int_seq))) # aac + + # set learning concat with coverage encoding + + directories <- c("coverage_data/x", "coverage_data/y") + val <- FALSE + batch_size <- 8 + samples_per_target <- 3 + #new_batch_size <- batch_size/samples_per_target + path <- directories + voc_len <- 4 + maxlen <- 6 + use_coverage <- 17 + reshape_mode <- "concat" + set_learning <- list(reshape_mode = reshape_mode, + maxlen = maxlen, + buffer_len = NULL, + samples_per_target = samples_per_target) + buffer_size <- 0 + concat_maxlen <- (maxlen * samples_per_target) + (buffer_size * (samples_per_target - 1)) + + gen <- get_generator(path = directories, + train_type = "label_folder", + val = FALSE, + padding = TRUE, + format = "fasta", + batch_size = batch_size, + maxlen = maxlen, + vocabulary = c("a", "c", "g", "t"), + step = maxlen, + use_coverage = use_coverage, + set_learning = set_learning) + + m <- matrix( + c(0,0,0,0, + 0,0,1/17,0, + 0,0,1/17,0, + 0,0,1/17,0, + 0,0,1/17,0, + 0,0,0,1/17, + 13/17,0,0,0, + 0,13/17,0,0, + 0,0,13/17,0, + 0,0,0,13/17, + 13/17,0,0,0, + 0,13/17,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 1,0,0,0), + byrow = TRUE, ncol = 4 + ) + + m2 <- matrix( + c(0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 1,0,0,0, + 2/17,0,0,0, + 0,2/17,0,0, + 0,0,2/17,0, + 0,0,0,2/17, + 2/17,0,0,0, + 2/17,0,0,0, + 0,0,3/17,0, + 0,0,3/17,0, + 0,0,3/17,0, + 0,0,3/17,0, + 0,0,0,3/17, + 0,0,0,3/17), + byrow = TRUE, ncol = 4 + ) + + y <- matrix(c(1,0,1,0,1,0,1,0,0,1,0,1,0,1,0,1), ncol = 2, byrow = TRUE) + + arrays <- gen() + expect_true(all(arrays[[1]][1,,] == arrays[[1]][3,,])) + expect_true(all(arrays[[1]][2,,] == arrays[[1]][4,,])) + expect_equivalent(arrays[[2]], y) + + + expect_equivalent(arrays[[1]][4, , ], m) + expect_equivalent(arrays[[1]][8, , ], m2) + + arrays <- gen() + expect_true(all(arrays[[1]][1,,] == arrays[[1]][3,,])) + expect_true(all(arrays[[1]][2,,] == arrays[[1]][4,,])) + expect_equivalent(arrays[[2]], y) + expect_equivalent(arrays[[1]][4, , ], m) + + arrays <- gen() + expect_true(all(arrays[[1]][1,,] == arrays[[1]][3,,])) + expect_true(all(arrays[[1]][2,,] == arrays[[1]][4,,])) + expect_equivalent(arrays[[2]], y) + expect_equivalent(arrays[[1]][4, , ], m) + expect_equivalent(arrays[[1]][5, , ], m2) + + # rds generator with multi inputs/outputs + + x1 <- array(0, dim = c(9,5,4)) + x2 <- array(0, dim = c(9,5,3)) + y1 <- array(0, dim = c(9,2)) + y2 <- array(0, dim = c(9,6)) + + for (i in 1:dim(x1)[1]) { + x1[i,,] <- i + y1[i, ] <- i + x2[i,,] <- i + 10 + y2[i, ] <- i + 10 + } + + index_1 <- 1:5 + index_2 <- 6:9 + x_list_1 <- list(x1[index_1, , ], x2[index_1, , ]) + x_list_2 <- list(x1[index_2, , ], x2[index_2, , ]) + y_list_1 <- list(y1[index_1, ], y2[index_1, ]) + y_list_2 <- list(y1[index_2, ], y2[index_2, ]) + z1 <- list(x = x_list_1, y = y_list_1) + z2 <- list(x = x_list_2, y = y_list_2) + + temp_dir <- tempfile() + dir.create(temp_dir) + saveRDS(z1, paste0(temp_dir, "/file_1.rds")) + saveRDS(z2, paste0(temp_dir, "/file_2.rds")) + + gen <- generator_rds(rds_folder = temp_dir, + batch_size = 10, path_file_log = NULL, + max_samples = NULL, + proportion_per_seq = NULL, + target_len = NULL, + seed = 1, + reverse_complement = FALSE, + sample_by_file_size = FALSE, + n_gram = NULL, n_gram_stride = 1, + reverse_complement_encoding = FALSE, + add_noise = NULL) + + for (k in 1:5) { + z <- gen() + x1 <- z[[1]][[1]] %>% as.array() + x2 <- z[[1]][[2]] %>% as.array() + y1 <- z[[2]][[1]] %>% as.array() + y2 <- z[[2]][[2]] %>% as.array() + + + for (i in 1:dim(x1)[1]) { + expect_equivalent(min(x1[i,,]), max(y1[i,])) + expect_equivalent(min(x1[i,,]) + 10, max(x2[i,,])) + expect_equivalent(max(x2[i,,]), min(y2[i,])) + expect_equivalent(max(y1[i,]) + 10, min(y2[i,])) + } + } + + # integer encoding label header # + + testpath <- file.path("fasta_2") + gen <- generator_fasta_label_header_csv(path_corpus = testpath, batch_size = 5, maxlen = 3, step = 2, vocabulary = c("a", "c", "g", "t"), + reverse_complement = FALSE, vocabulary_label = c("w", "x", "y"), return_int = TRUE) + + arrays <- gen() + + expect_equivalent(arrays[[1]][1, 1], 1) # A + expect_equivalent(arrays[[1]][1, 2], 1) # A + expect_equivalent(arrays[[1]][1, 3], 2) # C + expect_equivalent(arrays[[2]][1, ], c(1, 0, 0)) # W + + expect_equivalent(arrays[[1]][5, 1], 1) # A + expect_equivalent(arrays[[1]][5, 2], 1) # A + expect_equivalent(arrays[[1]][5, 3], 4) # T + expect_equivalent(arrays[[2]][5, ], c(0, 1, 0)) # W + + + gen <- generator_fasta_label_header_csv(path_corpus = testpath, batch_size = 5, maxlen = 8, step = 2, vocabulary = c("a", "c", "g", "t"), + reverse_complement = FALSE, vocabulary_label = c("w", "x", "y"), return_int = TRUE) + + arrays <- gen() + expect_equivalent(arrays[[2]][1, ], c(1, 0, 0)) + expect_equivalent(arrays[[2]][2, ], c(0, 1, 0)) + expect_equivalent(arrays[[2]][3, ], c(0, 0, 1)) + expect_equivalent(arrays[[2]][4, ], c(0, 1, 0)) + expect_equivalent(arrays[[2]][5, ], c(0, 1, 0)) + + arrays <- gen() + expect_equivalent(arrays[[1]][5, 1], 3) + expect_equivalent(arrays[[1]][5, 2], 3) + expect_equivalent(arrays[[1]][5, 3], 3) + expect_equivalent(arrays[[1]][5, 4], 3) + expect_equivalent(arrays[[1]][5, 5], 4) + expect_equivalent(arrays[[1]][5, 6], 4) + expect_equivalent(arrays[[1]][5, 7], 4) + expect_equivalent(arrays[[1]][5, 8], 4) + expect_equivalent(arrays[[2]][1, ], c(0, 0, 1)) + expect_equivalent(arrays[[2]][2, ], c(0, 0, 1)) + expect_equivalent(arrays[[2]][3, ], c(1, 0, 0)) + expect_equivalent(arrays[[2]][4, ], c(0, 1, 0)) + expect_equivalent(arrays[[2]][5, ], c(0, 0, 1)) + + + gen <- generator_fasta_label_header_csv(path_corpus = testpath, batch_size = 8, maxlen = 7, step = 2, vocabulary = c("a", "c", "g", "t"), + reverse_complement = FALSE, vocabulary_label = c("w", "x", "y"), return_int = TRUE) + + arrays <- gen() + + # go through a/b.fasta once discard samples with target z + expect_equivalent(arrays[[1]][8, 1], 1) # A + expect_equivalent(arrays[[1]][8, 2], 1) # A + expect_equivalent(arrays[[1]][8, 3], 2) # C + expect_equivalent(arrays[[2]][8, ], c(1, 0, 0)) # W + + # label folder with integer encoding + + directories <- c("label_folder/x", "label_folder/y", "label_folder/z") + gen <- get_generator(path = directories, + train_type = "label_folder", + val = FALSE, + padding = TRUE, + format = "fasta", + batch_size = 6, + maxlen = 2, + return_int = TRUE, + vocabulary = c("a", "c", "g", "t"), + step = 2) + + arrays <- gen() + expect_equivalent(arrays[[1]][1, 1], 1) + expect_equivalent(arrays[[1]][1, 2], 2) + expect_equivalent(arrays[[1]][2, 1], 1) + expect_equivalent(arrays[[1]][2, 2], 2) + expect_equivalent(arrays[[1]][3, 1], 3) + expect_equivalent(arrays[[1]][3, 2], 2) + expect_equivalent(arrays[[1]][4, 1], 3) + expect_equivalent(arrays[[1]][4, 2], 2) + expect_equivalent(arrays[[1]][5, 1], 4) + expect_equivalent(arrays[[1]][5, 2], 4) + expect_equivalent(arrays[[1]][6, 1], 4) + expect_equivalent(arrays[[1]][6, 2], 4) + + expect_equivalent(arrays[[2]][1, ], c(1, 0, 0)) + expect_equivalent(arrays[[2]][2, ], c(1, 0, 0)) + expect_equivalent(arrays[[2]][3, ], c(0, 1, 0)) + expect_equivalent(arrays[[2]][4, ], c(0, 1, 0)) + expect_equivalent(arrays[[2]][5, ], c(0, 0, 1)) + expect_equivalent(arrays[[2]][6, ], c(0, 0, 1)) + + + # test skipping file + for (i in 1:2) { + arrays <- gen() + } + + expect_equivalent(arrays[[1]][1, 1], 1) + expect_equivalent(arrays[[1]][1, 2], 2) + expect_equivalent(arrays[[1]][2, 1], 1) + expect_equivalent(arrays[[1]][2, 2], 3) + expect_equivalent(arrays[[1]][3, 1], 2) + expect_equivalent(arrays[[1]][3, 2], 3) + expect_equivalent(arrays[[1]][4, 1], 2) + expect_equivalent(arrays[[1]][4, 2], 3) + expect_equivalent(arrays[[1]][5, 1], 1) + expect_equivalent(arrays[[1]][5, 2], 1) + expect_equivalent(arrays[[1]][6, 1], 1) + expect_equivalent(arrays[[1]][6, 2], 1) + + expect_equivalent(arrays[[2]][1, ], c(1, 0, 0)) + expect_equivalent(arrays[[2]][2, ], c(1, 0, 0)) + expect_equivalent(arrays[[2]][3, ], c(0, 1, 0)) + expect_equivalent(arrays[[2]][4, ], c(0, 1, 0)) + expect_equivalent(arrays[[2]][5, ], c(0, 0, 1)) + expect_equivalent(arrays[[2]][6, ], c(0, 0, 1)) + + + # n-gram integer encoding, label folder # + + directories <- c("label_folder/x", "label_folder/y", "label_folder/z") + gen <- get_generator(path = directories, + train_type = "label_folder", + batch_size = 6, + maxlen = 12, + padding = TRUE, + n_gram = 3, + n_gram_stride = 2, + return_int = TRUE, + vocabulary = c("a", "c", "g", "t"), + step = 2) + + arrays <- gen() + x <- arrays[[1]] + y <- arrays[[2]] + expect_equivalent(dim(x), c(6, 5)) + expect_equivalent(x[1, 1], 0) # padding + expect_equivalent(x[1, 2], 5) # ACA + expect_equivalent(unique(x[5, 1:4]), 0) # padding + expect_equivalent(x[5, 5], 64) # TTT = 4^3 + + # n-gram one-hot encoding, label folder # + + directories <- c("label_folder/x", "label_folder/y", "label_folder/z") + gen <- get_generator(path = directories, + train_type = "label_folder", + batch_size = 6, + maxlen = 12, + padding = TRUE, + n_gram = 3, + n_gram_stride = 2, + return_int = FALSE, + vocabulary = c("a", "c", "g", "t"), + step = 2) + + arrays <- gen() + x <- arrays[[1]] + y <- arrays[[2]] + expect_equivalent(dim(x), c(6, 5, 64)) + expect_equivalent(unique(x[1, 1, ]), 0) # padding + expect_equivalent(which.max(x[1, 2, ]), 5) # ACA + expect_equivalent(unique(as.vector(x[5, 1:4, ])), 0) # padding + expect_equivalent(which.max(x[5, 5, ]), 64) # TTT = 4^3 + + ##### masked lm ##### + + testpath <- file.path("a.fastq") + masked_lm <- list(mask_rate = 0.25, random_rate = 0.25, identity_rate = 0.25, include_sw = TRUE) + gen <- get_generator(path = testpath, + train_type = "masked_lm", + masked_lm = masked_lm, + batch_size = 1, + maxlen = 200, + format = "fastq", + padding = TRUE, + return_int = TRUE) + + z <- gen() + x <- z[[1]] + y <- z[[2]] + sw <- z[[3]] + + expect_equivalent(x[1,1:12], rep(0, 12)) # padding + expect_equivalent(sw[1,1:12], rep(0, 12)) # no sample weights in padding region + sw_pos <- which(sw[1,] == 1) + random_pos <- which(x[1,] %in% c(2,3,4)) + masked_pos <- which(x[1,] == 5) + # masked and random positions must have sw 1 + expect_contains(sw_pos, random_pos) + expect_contains(sw_pos, masked_pos) + + ### + + testpath <- file.path("fasta_2/b.fasta") + masked_lm <- list(mask_rate = 0.25, random_rate = 0.25, identity_rate = 0.25, include_sw = TRUE) + gen <- get_generator(path = testpath, + train_type = "masked_lm", + shuffle_input = FALSE, + masked_lm = masked_lm, + batch_size = 3, + maxlen = 10, + padding = TRUE, + return_int = TRUE) + + z <- gen() + x <- z[[1]] + y <- z[[2]] + sw <- z[[3]] + + expect_equivalent(sum(x[,1:2]), 0) # padding + expect_equivalent(sum(sw[,1:2]), 0) # no sample weights in padding region + for (i in 1:3) { + sw_pos <- which(sw[i,] == 1) + masked_pos <- which(x[i,] == 5) + expect_contains(sw_pos, masked_pos) # masked positions must have sw 1 + } + + #### test reshape #### + + directories <- c("fasta_2", "fasta_3") + fx <- function(x) {return(x)} + reshape_xy <- list(x = fx) + expect_error(gen <- get_generator(path = directories, + reshape_xy = reshape_xy, + train_type = "label_folder", + batch_size = 4, + maxlen = 3)) + + + directories <- c("fasta_2", "fasta_3") + fx <- function(x = NULL, y = NULL) { + return(x + 1) + } + fy <- function(x = NULL, y = NULL) { + return(x) + } + reshape_xy <- list(x = fx, y = fy) + gen <- get_generator(path = directories, + reshape_xy = reshape_xy, + val = FALSE, + train_type = "label_folder", + format = "fasta", + batch_size = 4, + maxlen = 3, + vocabulary = c("a", "c", "g", "t"), + reverse_complement = FALSE, + ambiguous_nuc = "zero", + step = 2) + + arrays <- gen() + arrays[[1]][1,,] + y <- arrays[[2]] + + expect_equivalent(arrays[[1]][1, 1, ], c(1, 0, 0, 0) + 1) + expect_equivalent(arrays[[1]][1, 2, ], c(1, 0, 0, 0) + 1) + expect_equivalent(arrays[[1]][1, 3, ], c(0, 1, 0, 0) + 1) + expect_equivalent(arrays[[2]][1, 1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[2]][1, 2, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[2]][1, 3, ], c(0, 1, 0, 0)) + + expect_equivalent(arrays[[1]][4, 1, ], rep(0, 4) + 1) + expect_equivalent(arrays[[1]][4, 2, ], c(0, 1, 0, 0) + 1) + expect_equivalent(arrays[[1]][4, 3, ], c(0, 1, 0, 0) + 1) + expect_equivalent(arrays[[2]][4, 1, ], rep(0, 4)) + expect_equivalent(arrays[[2]][4, 2, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[2]][4, 3, ], c(0, 1, 0, 0)) + + + testpath <- file.path("fasta_2") + label_from_csv <- "output_label.csv" + fx <- function(x = NULL, y = NULL) { + return(y + 3) + } + fy <- function(x = NULL, y = NULL) { + return(x + 2) + } + reshape_xy <- list(x = fx, y = fy) + gen <- generator_fasta_label_header_csv(path_corpus = testpath, batch_size = 5, + reshape_xy = reshape_xy, + maxlen = 10, step = 10, + vocabulary = c("a", "c", "g", "t", "Z"), + reverse_complement = FALSE, + vocabulary_label = c("w", "x", "y"), + shuffle_file_order = FALSE, + seed = 1234, + shuffle_input = FALSE, + padding = TRUE, + concat_seq = "ZZ", + target_from_csv = label_from_csv) + + arrays <- gen() + + expect_equivalent(arrays[[2]][1, 8, ], c(0, 0, 0, 1, 0) + 2) + expect_equivalent(arrays[[2]][1, 9, ], c(0, 0, 0, 0, 1) + 2) + expect_equivalent(arrays[[2]][1, 10, ], c(0, 0, 0, 0, 1) + 2) + + expect_equivalent(arrays[[2]][4, 3, ], c(1, 0, 0, 0, 0) + 2) + expect_equivalent(arrays[[2]][4, 4, ], c(1, 0, 0, 0, 0) + 2) + + expect_equivalent(arrays[[1]][1, ], 1:4 + 3) + expect_equivalent(arrays[[1]][2, ], 1:4 + 3) + expect_equivalent(arrays[[1]][3, ], 1:4 + 3) + expect_equivalent(arrays[[1]][4, ], 11:14 + 3) + expect_equivalent(arrays[[1]][5, ], 11:14 + 3) + + arrays <- gen() + + expect_equivalent(arrays[[2]][1, 8, ], c(1, 0, 0, 0, 0) + 2) + expect_equivalent(arrays[[2]][2, 3, ], c(0, 1, 0, 0, 0) + 2) + + expect_equivalent(arrays[[1]][1, ], 11:14 + 3) + expect_equivalent(arrays[[1]][5, ], 11:14 + 3) + + # set learning + + directories = c("fasta_2", "fasta_3") + maxlen <- 3 + samples_per_target <- 3 + reshape_mode <- "time_dist" + set_learning <- list(reshape_mode = reshape_mode, + maxlen = maxlen, + samples_per_target = samples_per_target) + + gen <- get_generator(val = FALSE, + set_learning = set_learning, + train_type = "label_folder", + path = directories, + format = "fasta", + batch_size = 2, + maxlen = maxlen, + ambiguous_nuc = "discard", + vocabulary = c("a", "c", "g", "t"), + step = 2) + + arrays <- gen() + + # add axis to previous test + expect_equivalent(arrays[[1]][1, 1, 1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 1, 2, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][1, 1, 3, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[2]][1, ], c(1, 0)) + + expect_equivalent(arrays[[1]][1, 2, 1, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][1, 2, 2, ], c(0, 1, 0, 0)) + expect_equivalent(arrays[[1]][1, 2, 3, ], c(0, 0, 1, 0)) + + expect_equivalent(arrays[[1]][1, 3, 1, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][1, 3, 2, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][1, 3, 3, ], c(0, 0, 0, 1)) + + expect_equivalent(arrays[[1]][2, 1, 1, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[1]][2, 1, 2, ], c(0, 0, 1, 0)) + expect_equivalent(arrays[[1]][2, 1, 3, ], c(1, 0, 0, 0)) + + expect_equivalent(arrays[[1]][2, 2, 1, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][2, 2, 2, ], c(1, 0, 0, 0)) + expect_equivalent(arrays[[1]][2, 2, 3, ], c(1, 0, 0, 0)) + + expect_equivalent(arrays[[1]][2, 3, 1, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[1]][2, 3, 2, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[1]][2, 3, 3, ], c(0, 0, 0, 1)) + expect_equivalent(arrays[[2]][1, ], c(1, 0)) + expect_equivalent(arrays[[2]][2, ], c(0, 1)) + +})