Switch to side-by-side view

--- a
+++ b/docs/articles/data_generator.html
@@ -0,0 +1,1814 @@
+<!DOCTYPE html>
+<!-- Generated by pkgdown: do not edit by hand --><html lang="en">
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
+<meta charset="utf-8">
+<meta http-equiv="X-UA-Compatible" content="IE=edge">
+<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
+<meta name="description" content="deepG">
+<title>Data Generator • deepG</title>
+<!-- favicons --><link rel="icon" type="image/png" sizes="16x16" href="../favicon-16x16.png">
+<link rel="icon" type="image/png" sizes="32x32" href="../favicon-32x32.png">
+<link rel="apple-touch-icon" type="image/png" sizes="180x180" href="../apple-touch-icon.png">
+<link rel="apple-touch-icon" type="image/png" sizes="120x120" href="../apple-touch-icon-120x120.png">
+<link rel="apple-touch-icon" type="image/png" sizes="76x76" href="../apple-touch-icon-76x76.png">
+<link rel="apple-touch-icon" type="image/png" sizes="60x60" href="../apple-touch-icon-60x60.png">
+<script src="../deps/jquery-3.6.0/jquery-3.6.0.min.js"></script><meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
+<link href="../deps/bootstrap-5.3.1/bootstrap.min.css" rel="stylesheet">
+<script src="../deps/bootstrap-5.3.1/bootstrap.bundle.min.js"></script><!-- Font Awesome icons --><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/all.min.css" integrity="sha256-mmgLkCYLUQbXn0B1SRqzHar6dCnv9oZFPEC1g1cwlkk=" crossorigin="anonymous">
+<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/v4-shims.min.css" integrity="sha256-wZjR52fzng1pJHwx4aV2AO3yyTOXrcDW7jBpJtTwVxw=" crossorigin="anonymous">
+<!-- bootstrap-toc --><script src="https://cdn.jsdelivr.net/gh/afeld/bootstrap-toc@v1.0.1/dist/bootstrap-toc.min.js" integrity="sha256-4veVQbu7//Lk5TSmc7YV48MxtMy98e26cf5MrgZYnwo=" crossorigin="anonymous"></script><!-- headroom.js --><script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/headroom.min.js" integrity="sha256-AsUX4SJE1+yuDu5+mAVzJbuYNPHj/WroHuZ8Ir/CkE0=" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/jQuery.headroom.min.js" integrity="sha256-ZX/yNShbjqsohH1k95liqY9Gd8uOiE1S4vZc+9KQ1K4=" crossorigin="anonymous"></script><!-- clipboard.js --><script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.11/clipboard.min.js" integrity="sha512-7O5pXpc0oCRrxk8RUfDYFgn0nO1t+jLuIOQdOMRp4APB7uZ4vSjspzp5y6YDtDs4VzUSTbWzBFZ/LKJhnyFOKw==" crossorigin="anonymous" referrerpolicy="no-referrer"></script><!-- search --><script src="https://cdnjs.cloudflare.com/ajax/libs/fuse.js/6.4.6/fuse.js" integrity="sha512-zv6Ywkjyktsohkbp9bb45V6tEMoWhzFzXis+LrMehmJZZSys19Yxf1dopHx7WzIKxr5tK2dVcYmaCk2uqdjF4A==" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/autocomplete.js/0.38.0/autocomplete.jquery.min.js" integrity="sha512-GU9ayf+66Xx2TmpxqJpliWbT5PiGYxpaG8rfnBEk1LL8l1KGkRShhngwdXK1UgqhAzWpZHSiYPc09/NwDQIGyg==" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/mark.js/8.11.1/mark.min.js" integrity="sha512-5CYOlHXGh6QpOFA/TeTylKLWfB3ftPsde7AnmhuitiTX4K5SqCLBeKro6sPS8ilsz1Q4NRx3v8Ko2IBiszzdww==" crossorigin="anonymous"></script><!-- pkgdown --><script src="../pkgdown.js"></script><meta property="og:title" content="Data Generator">
+<meta property="og:description" content="deepG">
+<meta property="og:image" content="https://genomenet.github.io/deepG/logo.png">
+<!-- mathjax --><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js" integrity="sha256-nvJJv9wWKEm88qvoQl9ekL2J+k/RWIsaSScxxlsrv8k=" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/config/TeX-AMS-MML_HTMLorMML.js" integrity="sha256-84DKXVJXs0/F8OTMzX4UR909+jtl4G7SPypPavF+GfA=" crossorigin="anonymous"></script><!--[if lt IE 9]>
+<script src="https://oss.maxcdn.com/html5shiv/3.7.3/html5shiv.min.js"></script>
+<script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
+<![endif]-->
+</head>
+<body>
+    <a href="#main" class="visually-hidden-focusable">Skip to contents</a>
+    
+
+    <nav class="navbar fixed-top navbar-light navbar-expand-lg bg-light" data-bs-theme="light"><div class="container">
+    
+    <a class="navbar-brand me-2" href="../index.html">deepG</a>
+
+    <small class="nav-text text-default me-auto" data-bs-toggle="tooltip" data-bs-placement="bottom" title="Released version">0.3.0</small>
+
+    
+    <button class="navbar-toggler" type="button" data-bs-toggle="collapse" data-bs-target="#navbar" aria-controls="navbar" aria-expanded="false" aria-label="Toggle navigation">
+      <span class="navbar-toggler-icon"></span>
+    </button>
+
+    <div id="navbar" class="collapse navbar-collapse ms-3">
+      <ul class="navbar-nav me-auto">
+<li class="nav-item">
+  <a class="nav-link" href="../reference/index.html">
+    <span class="fa fa fa fa-file-alt"></span>
+     
+    Reference
+  </a>
+</li>
+<li class="nav-item dropdown">
+  <a href="#" class="nav-link dropdown-toggle" data-bs-toggle="dropdown" role="button" aria-expanded="false" aria-haspopup="true" id="dropdown-notebooks">Notebooks</a>
+  <div class="dropdown-menu" aria-labelledby="dropdown-notebooks">
+    <a class="external-link dropdown-item" href="https://colab.research.google.com/drive/175jIdXcDcgPUvaBo2rH2Lupbpjnp5O7G?usp=sharing">deepG tutorial</a>
+    <a class="external-link dropdown-item" href="https://colab.research.google.com/drive/1Eolc0koMNM1zkuO4XyVM58ImeF1BpRiH?usp=sharing">Read-length level: Human contamination</a>
+    <a class="external-link dropdown-item" href="https://colab.research.google.com/drive/1yiXSwFafXpMLHaov9iBTQLIDZ6bK1zYX?usp=sharing">Locus level: CRISPR detection</a>
+    <a class="external-link dropdown-item" href="https://colab.research.google.com/drive/1G7bOFEX87cZNrM2tdRtTdkrZn5fM__g0?usp=sharing">Gene level: 16S rRNA detection</a>
+    <a class="external-link dropdown-item" href="https://colab.research.google.com/drive/1BCggL-tfQF136YeJ8cKKi-zoBEDMgkNh?usp=sharing">Genome level: Bacterial morphology (Sporulation)</a>
+    <a class="external-link dropdown-item" href="https://colab.research.google.com/drive/10xpRzGd3JeBAbqQYSCxzQUMctt01sx9D?usp=sharing">Full metagenome level: Colorectal cancer prediction</a>
+    <a class="external-link dropdown-item" href="https://colab.research.google.com/drive/1kyYK7IU7GSfdpDzO_a8U3_qD4i3zTu6w?usp=sharing">BERT with deepG</a>
+  </div>
+</li>
+<li class="active nav-item dropdown">
+  <a href="#" class="nav-link dropdown-toggle" data-bs-toggle="dropdown" role="button" aria-expanded="false" aria-haspopup="true" id="dropdown-tutorials">Tutorials</a>
+  <div class="dropdown-menu" aria-labelledby="dropdown-tutorials">
+    <a class="dropdown-item" href="../articles/getting_started.html">Getting Started</a>
+    <a class="dropdown-item" href="../articles/training_types.html">Training types</a>
+    <a class="dropdown-item" href="../articles/data_generator.html">Data generator</a>
+    <a class="dropdown-item" href="../articles/using_tb.html">Using tensorboard</a>
+    <a class="dropdown-item" href="../articles/integrated_gradient.html">Integrated Gradient</a>
+  </div>
+</li>
+      </ul>
+<form class="form-inline my-2 my-lg-0" role="search">
+        <input type="search" class="form-control me-sm-2" aria-label="Search site" name="search-input" data-search-index="../search.json" id="search-input" placeholder="Search for" autocomplete="off">
+</form>
+
+      <ul class="navbar-nav">
+<li class="nav-item">
+  <a class="external-link nav-link" href="https://github.com/GenomeNet/deepG/" aria-label="github">
+    <span class="fab fa fab fa-github fa-lg"></span>
+     
+  </a>
+</li>
+      </ul>
+</div>
+
+    
+  </div>
+</nav><div class="container template-article">
+
+
+
+
+<div class="row">
+  <main id="main" class="col-md-9"><div class="page-header">
+      <img src="../logo.png" class="logo" alt=""><h1>Data Generator</h1>
+            
+      
+      <small class="dont-index">Source: <a href="https://github.com/GenomeNet/deepG/blob/HEAD/vignettes/data_generator.Rmd" class="external-link"><code>vignettes/data_generator.Rmd</code></a></small>
+      <div class="d-none name"><code>data_generator.Rmd</code></div>
+    </div>
+
+    
+    
+<div class="sourceCode" id="cb1"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="co">#devtools::install_github("GenomeNet/deepG")</span></span>
+<span><span class="co">#library(deepG)</span></span>
+<span><span class="co">#library(magrittr)</span></span></code></pre></div>
+<style type="text/css">
+mark.in {
+  background-color: CornflowerBlue;
+}
+
+mark.out {
+  background-color: IndianRed;
+}
+
+</style>
+<div class="section level2">
+<h2 id="introduction">Introduction<a class="anchor" aria-label="anchor" href="#introduction"></a>
+</h2>
+<p>The most common use case for the deepG data generator is to extract
+samples from a collection of fasta (or fastq) files. The generator will
+always return a list of length 2. The first element is the input <span class="math inline">\(X\)</span> and the second the target <span class="math inline">\(Y\)</span>. We can differentiate between 2
+approaches:</p>
+<ul>
+<li>
+<strong>Language model</strong>: One part of a sequence is the input and
+another part is the target.
+<ul>
+<li>Example: Predict the next nucleotide given the previous 100
+nucleotides.</li>
+</ul>
+</li>
+<li>
+<strong>Label classification</strong>: Assign a label to a sequence.
+<ul>
+<li>Example: Assign a label “virus” or “bacteria” to a sequence of
+length 100.</li>
+</ul>
+</li>
+</ul>
+<p>Suppose we are given 2 fasta files called “a.fasta” and “b.fasta”
+that look as follows:</p>
+<div style="float: left;margin-right:10px">
+<table class="table"><tr>
+<td>
+<strong>a.fasta</strong> <br><tt> &gt;header_a1 <br> AACCAAGG <br>
+&gt;header_a2 <br> TTTGGG <br> &gt;header_a3 <br> ACGTACGT <br></tt>
+</td>
+</tr></table>
+</div>
+<div style="float: left">
+<table class="table"><tr>
+<td>
+<strong>b.fasta</strong> <br><tt> &gt;header_b1 <br> GTGTGT <br>
+&gt;header_b2 <br> AAGG <br></tt>
+</td>
+</tr></table>
+</div>
+<p><br><br><br><br><br><br><br><br><br></p>
+<p>If we want to extract sequences of length 4 from these files, there
+would be 17 possible samples (5 from <tt>AACCAAGG</tt>, 3 from
+<tt>TTTGGG</tt>, …). A naive approach would be to extract the samples in
+a sequential manner:</p>
+<p><em>1. sample</em>:</p>
+<div style="float: left;margin-right:10px">
+<table class="table"><tr>
+<td>
+<strong>a.fasta</strong> <br><tt> &gt;header_a1 <br><mark class="in">AACC</mark>AAGG <br> &gt;header_a2 <br> TTTGGG <br>
+&gt;header_a3 <br> ACGTACGT <br></tt>
+</td>
+</tr></table>
+</div>
+<div style="float: left">
+<table class="table"><tr>
+<td>
+<strong>b.fasta</strong> <br><tt> &gt;header_b1 <br> GTGTGT <br>
+&gt;header_b2 <br> AAGG <br></tt>
+</td>
+</tr></table>
+</div>
+<p><br><br><br><br><br><br><br><br><br></p>
+<p><em>2. sample</em>:</p>
+<div style="float: left;margin-right:10px">
+<table class="table"><tr>
+<td>
+<strong>a.fasta</strong> <br><tt> &gt;header_a1 <br>
+A<mark class="in">ACCA</mark>AGG <br> &gt;header_a2 <br> TTTGGG <br>
+&gt;header_a3 <br> ACGTACGT <br></tt>
+</td>
+</tr></table>
+</div>
+<div style="float: left">
+<table class="table"><tr>
+<td>
+<strong>b.fasta</strong> <br><tt> &gt;header_b1 <br> GTGTGT <br>
+&gt;header_b2 <br> AAGG <br></tt>
+</td>
+</tr></table>
+</div>
+<p><br><br><br><br><br><br><br><br><br></p>
+<p>…</p>
+<p><br></p>
+<p><em>17. sample</em>:</p>
+<div style="float: left;margin-right:10px">
+<table class="table"><tr>
+<td>
+<strong>a.fasta</strong> <br><tt> &gt;header_a1 <br> AACCAAGG <br>
+&gt;header_a2 <br> TTTGGG <br> &gt;header_a3 <br> ACGTACGT <br></tt>
+</td>
+</tr></table>
+</div>
+<div style="float: left">
+<table class="table"><tr>
+<td>
+<strong>b.fasta</strong> <br><tt> &gt;header_b1 <br> GTGTGT <br>
+&gt;header_b2 <br><mark class="in">AAGG</mark><br></tt>
+</td>
+</tr></table>
+</div>
+<p><br><br><br><br><br><br><br><br><br></p>
+<p><em>18. sample</em>:</p>
+<div style="float: left;margin-right:10px">
+<table class="table"><tr>
+<td>
+<strong>a.fasta</strong> <br><tt> &gt;header_a1 <br><mark class="in">AACC</mark>AAGG <br> &gt;header_a2 <br> TTTGGG <br>
+&gt;header_a3 <br> ACGTACGT <br></tt>
+</td>
+</tr></table>
+</div>
+<div style="float: left">
+<table class="table"><tr>
+<td>
+<strong>b.fasta</strong> <br><tt> &gt;header_b1 <br> GTGTGT <br>
+&gt;header_b2 <br> AAGG <br></tt>
+</td>
+</tr></table>
+</div>
+<p><br><br><br><br><br><br><br><br><br></p>
+<p>… <br><br></p>
+<p>For longer sequences this is not a desirable strategy since the data
+is very redundant (often just one nucleotide difference) and the model
+would often see long stretches of data from the same source. Choosing
+the samples completely at random can also be problematic since we would
+constantly have to open new files. The deepG generators offer several
+options to control the data sampling strategy to achieve a good balance
+between the two approaches.</p>
+</div>
+<div class="section level2">
+<h2 id="data-generator-options">Data generator options<a class="anchor" aria-label="anchor" href="#data-generator-options"></a>
+</h2>
+<p>In the following code examples, we will mostly use the sequence <tt>
+<strong>abcdefghiiii</strong> </tt> to demonstrate some of the deepG
+data generator options. (In real-world applications you would usually
+have sequences from the <tt>ACGT</tt> vocabulary.)</p>
+<div class="sourceCode" id="cb2"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">sequence</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/paste.html" class="external-link">paste0</a></span><span class="op">(</span><span class="st">"a"</span>, <span class="st">"b"</span>, <span class="st">"c"</span>, <span class="st">"d"</span>, <span class="st">"e"</span>, <span class="st">"f"</span>, <span class="st">"g"</span>, <span class="st">"h"</span>, <span class="st">"i"</span>, <span class="st">"i"</span>, <span class="st">"i"</span>, <span class="st">"i"</span><span class="op">)</span></span>
+<span><span class="va">vocabulary</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"a"</span>, <span class="st">"b"</span>, <span class="st">"c"</span>, <span class="st">"d"</span>, <span class="st">"e"</span>, <span class="st">"f"</span>, <span class="st">"g"</span>, <span class="st">"h"</span>, <span class="st">"i"</span><span class="op">)</span>  </span></code></pre></div>
+<p>We may store this sequence in a fasta file</p>
+<div class="sourceCode" id="cb3"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">temp_dir</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span><span class="op">)</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/files2.html" class="external-link">dir.create</a></span><span class="op">(</span><span class="va">temp_dir</span><span class="op">)</span></span>
+<span><span class="va">dir_path</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/paste.html" class="external-link">paste0</a></span><span class="op">(</span><span class="va">temp_dir</span>, <span class="st">"/dummy_data"</span><span class="op">)</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/files2.html" class="external-link">dir.create</a></span><span class="op">(</span><span class="va">dir_path</span><span class="op">)</span></span>
+<span><span class="va">df</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span>Sequence <span class="op">=</span> <span class="va">sequence</span>, Header <span class="op">=</span> <span class="st">"label_1"</span>, stringsAsFactors <span class="op">=</span> <span class="cn">FALSE</span><span class="op">)</span></span>
+<span><span class="va">file_path</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/file.path.html" class="external-link">file.path</a></span><span class="op">(</span><span class="va">dir_path</span>, <span class="st">"a.fasta"</span><span class="op">)</span></span>
+<span><span class="co"># sequence as fasta file</span></span>
+<span><span class="fu">microseq</span><span class="fu">::</span><span class="fu"><a href="https://rdrr.io/pkg/microseq/man/readFasta.html" class="external-link">writeFasta</a></span><span class="op">(</span>fdta <span class="op">=</span> <span class="fu">dplyr</span><span class="fu">::</span><span class="fu"><a href="https://tibble.tidyverse.org/reference/as_tibble.html" class="external-link">as_tibble</a></span><span class="op">(</span><span class="va">df</span><span class="op">)</span>, out.file <span class="op">=</span> <span class="va">file_path</span><span class="op">)</span></span></code></pre></div>
+<p>Since neural networks can only work with numeric data, we have to
+encode sequences of characters with numeric data. Usually this is
+achieved by one-hot-encoding; there are some other approaches
+implemented: see <code>use_coverage</code>,
+<code>use_quality_score</code> and <code>ambiguous_nuc</code>
+sections.</p>
+<div class="sourceCode" id="cb4"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="co"># one-hot encoding example</span></span>
+<span><span class="va">s</span> <span class="op">&lt;-</span>  <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"a"</span>, <span class="st">"c"</span>, <span class="st">"a"</span>, <span class="st">"f"</span>, <span class="st">"i"</span>, <span class="st">"b"</span><span class="op">)</span></span>
+<span><span class="va">s_as_int_seq</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/vector.html" class="external-link">vector</a></span><span class="op">(</span><span class="st">"integer"</span>, <span class="fu"><a href="https://rdrr.io/r/base/length.html" class="external-link">length</a></span><span class="op">(</span><span class="va">s</span><span class="op">)</span><span class="op">)</span></span>
+<span><span class="kw">for</span> <span class="op">(</span><span class="va">i</span> <span class="kw">in</span> <span class="fl">1</span><span class="op">:</span><span class="fu"><a href="https://rdrr.io/r/base/length.html" class="external-link">length</a></span><span class="op">(</span><span class="va">s</span><span class="op">)</span><span class="op">)</span> <span class="op">{</span></span>
+<span>  <span class="va">s_as_int_seq</span><span class="op">[</span><span class="va">i</span><span class="op">]</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/which.html" class="external-link">which</a></span><span class="op">(</span><span class="va">s</span><span class="op">[</span><span class="va">i</span><span class="op">]</span> <span class="op">==</span> <span class="va">vocabulary</span><span class="op">)</span> <span class="op">-</span> <span class="fl">1</span></span>
+<span><span class="op">}</span></span>
+<span><span class="va">one_hot_sample</span> <span class="op">&lt;-</span> <span class="fu">keras</span><span class="fu">::</span><span class="fu"><a href="https://rdrr.io/pkg/keras/man/to_categorical.html" class="external-link">to_categorical</a></span><span class="op">(</span><span class="va">s_as_int_seq</span><span class="op">)</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">one_hot_sample</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="va">vocabulary</span></span>
+<span><span class="va">one_hot_sample</span></span></code></pre></div>
+<pre><code><span><span class="co">##      a b c d e f g h i</span></span>
+<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span>
+<span><span class="co">## [2,] 0 0 1 0 0 0 0 0 0</span></span>
+<span><span class="co">## [3,] 1 0 0 0 0 0 0 0 0</span></span>
+<span><span class="co">## [4,] 0 0 0 0 0 1 0 0 0</span></span>
+<span><span class="co">## [5,] 0 0 0 0 0 0 0 0 1</span></span>
+<span><span class="co">## [6,] 0 1 0 0 0 0 0 0 0</span></span></code></pre>
+<div class="section level3">
+<h3 id="maxlen">maxlen<a class="anchor" aria-label="anchor" href="#maxlen"></a>
+</h3>
+<p>The length of the input sequence.</p>
+</div>
+<div class="section level3">
+<h3 id="vocabulary">vocabulary<a class="anchor" aria-label="anchor" href="#vocabulary"></a>
+</h3>
+<p>The set of allowed characters in a sequence. What happens to
+characters outside the vocabulary can be controlled with the
+<code>ambiguous_nuc</code> argument.</p>
+</div>
+<div class="section level3">
+<h3 id="train_type">train_type<a class="anchor" aria-label="anchor" href="#train_type"></a>
+</h3>
+<p>The generator will always return a list of length 2. The first
+element is the input <span class="math inline">\(X\)</span> and the
+second the target <span class="math inline">\(Y\)</span>. The
+<code>train_type</code> argument determines how <span class="math inline">\(X\)</span> and <span class="math inline">\(Y\)</span> get extracted. Possible arguments for
+<u> <em>language models</em> </u> are:</p>
+<ul>
+<li>
+<strong>“lm”</strong> or <strong>“lm_rds”</strong>: Given some
+sequence <span class="math inline">\(s\)</span>, we take some subset of
+that sequence as input and the rest as target. How to split <span class="math inline">\(s\)</span> can be specified with the
+<code>output_format</code> argument.</li>
+</ul>
+<p>Besides the language model approach, we can use <u> <em>label
+classification</em> </u>. This means we map some label to a sequence.
+For example, the target for some nucleotide sequence could be one of the
+labels “bacteria” or “virus”. We have to specify how to extract a label
+corresponding to a sequence. Possible arguments are:</p>
+<ul>
+<li><p><strong>“label_header”</strong>: get label from fasta
+headers.</p></li>
+<li><p><strong>“label_folder”</strong>: get label from folder, i.e. all
+files in one folder must belong to the same class.</p></li>
+<li>
+<p><strong>“label_csv”</strong>: get label from csv file. The csv file
+should have one column named “file”. The targets then correspond to the
+entries in the row for that file (except the “file” column). Example: if we are currently
+working with a file called “a.fasta”, there should be a row in our csv
+file with some target information for that file: <br></p>
+<table class="table">
+<thead><tr class="header">
+<th>file</th>
+<th>label_1</th>
+<th>label_2</th>
+</tr></thead>
+<tbody><tr class="odd">
+<td>“a.fasta”</td>
+<td>1</td>
+<td>0</td>
+</tr></tbody>
+</table>
+</li>
+<li><p><strong>“label_rds”</strong>: rds file contains preprocessed list
+of input and target tensors.</p></li>
+</ul>
+<p>Another option is <strong>“dummy_gen”</strong>: the generator creates
+random data once and repeatedly returns the same data.</p>
+<p>Extract target from fasta header (fasta header is “label_1” in
+example file):</p>
+<div class="sourceCode" id="cb6"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="co"># get target from header</span></span>
+<span><span class="va">vocabulary_label</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/paste.html" class="external-link">paste0</a></span><span class="op">(</span><span class="st">"label_"</span>, <span class="fl">1</span><span class="op">:</span><span class="fl">5</span><span class="op">)</span></span>
+<span><span class="va">gen</span> <span class="op">&lt;-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span>
+<span>                     train_type <span class="op">=</span> <span class="st">"label_header"</span>,</span>
+<span>                     batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
+<span>                     maxlen <span class="op">=</span> <span class="fl">6</span>,</span>
+<span>                     vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span>
+<span>                     vocabulary_label <span class="op">=</span> <span class="va">vocabulary_label</span><span class="op">)</span></span>
+<span></span>
+<span><span class="va">z</span> <span class="op">&lt;-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
+<span><span class="va">x</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span> </span>
+<span><span class="va">y</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span> </span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="va">vocabulary</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="va">vocabulary_label</span> </span>
+<span><span class="va">x</span> <span class="co"># abcdef</span></span></code></pre></div>
+<pre><code><span><span class="co">##      a b c d e f g h i</span></span>
+<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span>
+<span><span class="co">## [2,] 0 1 0 0 0 0 0 0 0</span></span>
+<span><span class="co">## [3,] 0 0 1 0 0 0 0 0 0</span></span>
+<span><span class="co">## [4,] 0 0 0 1 0 0 0 0 0</span></span>
+<span><span class="co">## [5,] 0 0 0 0 1 0 0 0 0</span></span>
+<span><span class="co">## [6,] 0 0 0 0 0 1 0 0 0</span></span></code></pre>
+<div class="sourceCode" id="cb8"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">y</span> <span class="co"># label_1 </span></span></code></pre></div>
+<pre><code><span><span class="co">##      label_1 label_2 label_3 label_4 label_5</span></span>
+<span><span class="co">## [1,]       1       0       0       0       0</span></span></code></pre>
+<p>Extract target from fasta folder:</p>
+<div class="sourceCode" id="cb10"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="co"># create data for second class</span></span>
+<span><span class="va">df</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span>Sequence <span class="op">=</span> <span class="st">"AABAACAADAAE"</span>, Header <span class="op">=</span> <span class="st">"header_1"</span><span class="op">)</span></span>
+<span><span class="va">file_path_2</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span>fileext <span class="op">=</span> <span class="st">".fasta"</span><span class="op">)</span></span>
+<span><span class="va">fasta_file</span> <span class="op">&lt;-</span> <span class="fu">microseq</span><span class="fu">::</span><span class="fu"><a href="https://rdrr.io/pkg/microseq/man/readFasta.html" class="external-link">writeFasta</a></span><span class="op">(</span><span class="va">df</span>, <span class="va">file_path_2</span><span class="op">)</span></span>
+<span></span>
+<span><span class="co"># get target from folder</span></span>
+<span><span class="va">vocabulary_label</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/paste.html" class="external-link">paste0</a></span><span class="op">(</span><span class="st">"label_"</span>, <span class="fl">1</span><span class="op">:</span><span class="fl">2</span><span class="op">)</span></span>
+<span><span class="va">gen</span> <span class="op">&lt;-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="va">file_path</span>, <span class="va">file_path_2</span><span class="op">)</span>, <span class="co"># one entry for each class</span></span>
+<span>                     train_type <span class="op">=</span> <span class="st">"label_folder"</span>,</span>
+<span>                     batch_size <span class="op">=</span> <span class="fl">8</span>,</span>
+<span>                     maxlen <span class="op">=</span> <span class="fl">6</span>,</span>
+<span>                     vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span>
+<span>                     vocabulary_label <span class="op">=</span> <span class="va">vocabulary_label</span><span class="op">)</span></span>
+<span></span>
+<span><span class="va">z</span> <span class="op">&lt;-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
+<span><span class="va">x</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span></span>
+<span><span class="va">y</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span> </span>
+<span><span class="va">x_1_1</span> <span class="op">&lt;-</span> <span class="va">x</span><span class="op">[</span><span class="fl">1</span>, , <span class="op">]</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x_1_1</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="va">vocabulary</span></span>
+<span><span class="va">x_1_1</span> <span class="co"># first sample from first class</span></span></code></pre></div>
+<pre><code><span><span class="co">##      a b c d e f g h i</span></span>
+<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span>
+<span><span class="co">## [2,] 0 1 0 0 0 0 0 0 0</span></span>
+<span><span class="co">## [3,] 0 0 1 0 0 0 0 0 0</span></span>
+<span><span class="co">## [4,] 0 0 0 1 0 0 0 0 0</span></span>
+<span><span class="co">## [5,] 0 0 0 0 1 0 0 0 0</span></span>
+<span><span class="co">## [6,] 0 0 0 0 0 1 0 0 0</span></span></code></pre>
+<div class="sourceCode" id="cb12"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">x_2_1</span> <span class="op">&lt;-</span> <span class="va">x</span><span class="op">[</span><span class="fl">5</span>, , <span class="op">]</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x_2_1</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="va">vocabulary</span></span>
+<span><span class="va">x_2_1</span> <span class="co"># first sample from second class</span></span></code></pre></div>
+<pre><code><span><span class="co">##      a b c d e f g h i</span></span>
+<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span>
+<span><span class="co">## [2,] 1 0 0 0 0 0 0 0 0</span></span>
+<span><span class="co">## [3,] 0 1 0 0 0 0 0 0 0</span></span>
+<span><span class="co">## [4,] 1 0 0 0 0 0 0 0 0</span></span>
+<span><span class="co">## [5,] 1 0 0 0 0 0 0 0 0</span></span>
+<span><span class="co">## [6,] 0 0 1 0 0 0 0 0 0</span></span></code></pre>
+<div class="sourceCode" id="cb14"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="va">vocabulary_label</span> </span>
+<span><span class="va">y</span> <span class="co"># 4 samples from each class  </span></span></code></pre></div>
+<pre><code><span><span class="co">##      label_1 label_2</span></span>
+<span><span class="co">## [1,]       1       0</span></span>
+<span><span class="co">## [2,]       1       0</span></span>
+<span><span class="co">## [3,]       1       0</span></span>
+<span><span class="co">## [4,]       1       0</span></span>
+<span><span class="co">## [5,]       0       1</span></span>
+<span><span class="co">## [6,]       0       1</span></span>
+<span><span class="co">## [7,]       0       1</span></span>
+<span><span class="co">## [8,]       0       1</span></span></code></pre>
+<p>Extract target from csv file:</p>
+<div class="sourceCode" id="cb16"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="co"># get target from csv</span></span>
+<span><span class="va">file</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="fu"><a href="https://rdrr.io/r/base/basename.html" class="external-link">basename</a></span><span class="op">(</span><span class="va">file_path</span><span class="op">)</span>, <span class="st">"xyz.fasta"</span>, <span class="st">"abc.fasta"</span>, <span class="st">"x_123.fasta"</span><span class="op">)</span></span>
+<span><span class="va">vocabulary_label</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/paste.html" class="external-link">paste0</a></span><span class="op">(</span><span class="st">"label"</span>, <span class="fl">1</span><span class="op">:</span><span class="fl">4</span><span class="op">)</span></span>
+<span><span class="va">label_1</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="fl">1</span>, <span class="fl">0</span>, <span class="fl">0</span>, <span class="fl">0</span><span class="op">)</span></span>
+<span><span class="va">label_2</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="fl">0</span>, <span class="fl">1</span>, <span class="fl">0</span>, <span class="fl">0</span><span class="op">)</span></span>
+<span><span class="va">label_3</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="fl">0</span>, <span class="fl">0</span>, <span class="fl">1</span>, <span class="fl">0</span><span class="op">)</span></span>
+<span><span class="va">label_4</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="fl">0</span>, <span class="fl">0</span>, <span class="fl">0</span>, <span class="fl">1</span><span class="op">)</span></span>
+<span><span class="va">df</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span><span class="va">file</span>, <span class="va">label_1</span>, <span class="va">label_2</span>, <span class="va">label_3</span>, <span class="va">label_4</span><span class="op">)</span></span>
+<span><span class="va">df</span></span></code></pre></div>
+<pre><code><span><span class="co">##          file label_1 label_2 label_3 label_4</span></span>
+<span><span class="co">## 1     a.fasta       1       0       0       0</span></span>
+<span><span class="co">## 2   xyz.fasta       0       1       0       0</span></span>
+<span><span class="co">## 3   abc.fasta       0       0       1       0</span></span>
+<span><span class="co">## 4 x_123.fasta       0       0       0       1</span></span></code></pre>
+<div class="sourceCode" id="cb18"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">csv_file</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span>fileext <span class="op">=</span> <span class="st">".csv"</span><span class="op">)</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/utils/write.table.html" class="external-link">write.csv</a></span><span class="op">(</span><span class="va">df</span>, <span class="va">csv_file</span>, row.names <span class="op">=</span> <span class="cn">FALSE</span><span class="op">)</span></span>
+<span></span>
+<span><span class="va">gen</span> <span class="op">&lt;-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span>
+<span>                     train_type <span class="op">=</span> <span class="st">"label_csv"</span>,</span>
+<span>                     batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
+<span>                     maxlen <span class="op">=</span> <span class="fl">6</span>,</span>
+<span>                     target_from_csv <span class="op">=</span> <span class="va">csv_file</span>,</span>
+<span>                     vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span>
+<span>                     vocabulary_label <span class="op">=</span> <span class="va">vocabulary_label</span><span class="op">)</span></span>
+<span></span>
+<span><span class="va">z</span> <span class="op">&lt;-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
+<span><span class="va">x</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span> </span>
+<span><span class="va">y</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span> </span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="va">vocabulary</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="va">vocabulary_label</span> </span>
+<span><span class="va">x</span> <span class="co"># abcdef</span></span></code></pre></div>
+<pre><code><span><span class="co">##      a b c d e f g h i</span></span>
+<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span>
+<span><span class="co">## [2,] 0 1 0 0 0 0 0 0 0</span></span>
+<span><span class="co">## [3,] 0 0 1 0 0 0 0 0 0</span></span>
+<span><span class="co">## [4,] 0 0 0 1 0 0 0 0 0</span></span>
+<span><span class="co">## [5,] 0 0 0 0 1 0 0 0 0</span></span>
+<span><span class="co">## [6,] 0 0 0 0 0 1 0 0 0</span></span></code></pre>
+<div class="sourceCode" id="cb20"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">y</span> <span class="co"># label_1 </span></span></code></pre></div>
+<pre><code><span><span class="co">##      label1 label2 label3 label4</span></span>
+<span><span class="co">## [1,]      1      0      0      0</span></span></code></pre>
+<p>Examples for language models follow in the next section.</p>
+</div>
+<div class="section level3">
+<h3 id="output_format">output_format<a class="anchor" aria-label="anchor" href="#output_format"></a>
+</h3>
+<p>The <code>output_format</code> determines the shape of the output for
+a language model, i.e. part of a sequence is the input <span class="math inline">\(X\)</span> and another part is the target <span class="math inline">\(Y\)</span>. Assume a sequence <tt>abcdefg</tt> and
+<code>maxlen = 6</code>. The outputs correspond as follows:</p>
+<p><strong>“target_right”</strong>: <span class="math inline">\(X=\)</span> <tt>abcdef</tt>, <span class="math inline">\(Y=\)</span> <tt>g</tt></p>
+<p><strong>“target_middle_lstm”</strong>: <span class="math inline">\(X
+=\)</span> (<span class="math inline">\(X_1 =\)</span> <tt>abc</tt>,
+<span class="math inline">\(X_2 =\)</span> <tt>gfe</tt>), <span class="math inline">\(Y=\)</span> <tt>d</tt> (note reversed order of
+<span class="math inline">\(X_2\)</span>)</p>
+<p><strong>“target_middle_cnn”</strong>: <span class="math inline">\(X
+=\)</span> <tt>abcefg</tt>, <span class="math inline">\(Y =\)</span>
+<tt>d</tt></p>
+<p><strong>“wavenet”</strong>: <span class="math inline">\(X =\)</span>
+<tt>abcdef</tt>, <span class="math inline">\(Y =\)</span>
+<tt>bcdefg</tt></p>
+<div class="sourceCode" id="cb22"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="co"># target_right</span></span>
+<span><span class="va">gen</span> <span class="op">&lt;-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span>
+<span>                     train_type <span class="op">=</span> <span class="st">"lm"</span>,</span>
+<span>                     batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
+<span>                     maxlen <span class="op">=</span> <span class="fl">6</span>,</span>
+<span>                     vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span>
+<span>                     output_format <span class="op">=</span> <span class="st">"target_right"</span><span class="op">)</span></span>
+<span></span>
+<span><span class="va">z</span> <span class="op">&lt;-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
+<span><span class="va">x</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span> </span>
+<span><span class="va">y</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span> </span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="va">vocabulary</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="va">vocabulary</span></span>
+<span><span class="va">x</span> <span class="co"># abcdef</span></span></code></pre></div>
+<pre><code><span><span class="co">##      a b c d e f g h i</span></span>
+<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span>
+<span><span class="co">## [2,] 0 1 0 0 0 0 0 0 0</span></span>
+<span><span class="co">## [3,] 0 0 1 0 0 0 0 0 0</span></span>
+<span><span class="co">## [4,] 0 0 0 1 0 0 0 0 0</span></span>
+<span><span class="co">## [5,] 0 0 0 0 1 0 0 0 0</span></span>
+<span><span class="co">## [6,] 0 0 0 0 0 1 0 0 0</span></span></code></pre>
+<div class="sourceCode" id="cb24"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">y</span> <span class="co"># g </span></span></code></pre></div>
+<pre><code><span><span class="co">##      a b c d e f g h i</span></span>
+<span><span class="co">## [1,] 0 0 0 0 0 0 1 0 0</span></span></code></pre>
+<div class="sourceCode" id="cb26"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="co"># target_middle_lstm</span></span>
+<span><span class="va">gen</span> <span class="op">&lt;-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span>
+<span>                     train_type <span class="op">=</span> <span class="st">"lm"</span>,</span>
+<span>                     batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
+<span>                     maxlen <span class="op">=</span> <span class="fl">6</span>,</span>
+<span>                     vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span>
+<span>                     output_format <span class="op">=</span> <span class="st">"target_middle_lstm"</span><span class="op">)</span></span>
+<span></span>
+<span><span class="va">z</span> <span class="op">&lt;-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
+<span><span class="va">x_1</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span> </span>
+<span><span class="va">x_2</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span> </span>
+<span><span class="va">y</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span> </span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x_1</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="va">vocabulary</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x_2</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="va">vocabulary</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="va">vocabulary</span></span>
+<span><span class="va">x_1</span> <span class="co"># abc</span></span></code></pre></div>
+<pre><code><span><span class="co">##      a b c d e f g h i</span></span>
+<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span>
+<span><span class="co">## [2,] 0 1 0 0 0 0 0 0 0</span></span>
+<span><span class="co">## [3,] 0 0 1 0 0 0 0 0 0</span></span></code></pre>
+<div class="sourceCode" id="cb28"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">x_2</span> <span class="co"># gfe</span></span></code></pre></div>
+<pre><code><span><span class="co">##      a b c d e f g h i</span></span>
+<span><span class="co">## [1,] 0 0 0 0 0 0 1 0 0</span></span>
+<span><span class="co">## [2,] 0 0 0 0 0 1 0 0 0</span></span>
+<span><span class="co">## [3,] 0 0 0 0 1 0 0 0 0</span></span></code></pre>
+<div class="sourceCode" id="cb30"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">y</span> <span class="co"># d </span></span></code></pre></div>
+<pre><code><span><span class="co">##      a b c d e f g h i</span></span>
+<span><span class="co">## [1,] 0 0 0 1 0 0 0 0 0</span></span></code></pre>
+<div class="sourceCode" id="cb32"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="co"># target_middle_cnn</span></span>
+<span><span class="va">gen</span> <span class="op">&lt;-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span>
+<span>                     train_type <span class="op">=</span> <span class="st">"lm"</span>,</span>
+<span>                     batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
+<span>                     maxlen <span class="op">=</span> <span class="fl">6</span>,</span>
+<span>                     vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span>
+<span>                     output_format <span class="op">=</span> <span class="st">"target_middle_cnn"</span><span class="op">)</span></span>
+<span></span>
+<span><span class="va">z</span> <span class="op">&lt;-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
+<span><span class="va">x</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span></span>
+<span><span class="va">y</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="va">vocabulary</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="va">vocabulary</span></span>
+<span><span class="va">x</span> <span class="co"># abcefg</span></span></code></pre></div>
+<pre><code><span><span class="co">##      a b c d e f g h i</span></span>
+<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span>
+<span><span class="co">## [2,] 0 1 0 0 0 0 0 0 0</span></span>
+<span><span class="co">## [3,] 0 0 1 0 0 0 0 0 0</span></span>
+<span><span class="co">## [4,] 0 0 0 0 1 0 0 0 0</span></span>
+<span><span class="co">## [5,] 0 0 0 0 0 1 0 0 0</span></span>
+<span><span class="co">## [6,] 0 0 0 0 0 0 1 0 0</span></span></code></pre>
+<div class="sourceCode" id="cb34"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">y</span> <span class="co"># d</span></span></code></pre></div>
+<pre><code><span><span class="co">##      a b c d e f g h i</span></span>
+<span><span class="co">## [1,] 0 0 0 1 0 0 0 0 0</span></span></code></pre>
+<div class="sourceCode" id="cb36"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="co"># wavenet</span></span>
+<span><span class="va">gen</span> <span class="op">&lt;-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span>
+<span>                     train_type <span class="op">=</span> <span class="st">"lm"</span>,</span>
+<span>                     batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
+<span>                     maxlen <span class="op">=</span> <span class="fl">6</span>,</span>
+<span>                     vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span>
+<span>                     output_format <span class="op">=</span> <span class="st">"wavenet"</span><span class="op">)</span></span>
+<span></span>
+<span><span class="va">z</span> <span class="op">&lt;-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
+<span><span class="va">x</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span> </span>
+<span><span class="va">y</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="va">vocabulary</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="va">vocabulary</span></span>
+<span><span class="va">x</span> <span class="co"># abcdef</span></span></code></pre></div>
+<pre><code><span><span class="co">##      a b c d e f g h i</span></span>
+<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span>
+<span><span class="co">## [2,] 0 1 0 0 0 0 0 0 0</span></span>
+<span><span class="co">## [3,] 0 0 1 0 0 0 0 0 0</span></span>
+<span><span class="co">## [4,] 0 0 0 1 0 0 0 0 0</span></span>
+<span><span class="co">## [5,] 0 0 0 0 1 0 0 0 0</span></span>
+<span><span class="co">## [6,] 0 0 0 0 0 1 0 0 0</span></span></code></pre>
+<div class="sourceCode" id="cb38"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">y</span> <span class="co"># bcdefg</span></span></code></pre></div>
+<pre><code><span><span class="co">##      a b c d e f g h i</span></span>
+<span><span class="co">## [1,] 0 1 0 0 0 0 0 0 0</span></span>
+<span><span class="co">## [2,] 0 0 1 0 0 0 0 0 0</span></span>
+<span><span class="co">## [3,] 0 0 0 1 0 0 0 0 0</span></span>
+<span><span class="co">## [4,] 0 0 0 0 1 0 0 0 0</span></span>
+<span><span class="co">## [5,] 0 0 0 0 0 1 0 0 0</span></span>
+<span><span class="co">## [6,] 0 0 0 0 0 0 1 0 0</span></span></code></pre>
+</div>
+<div class="section level3">
+<h3 id="batch_size">batch_size<a class="anchor" aria-label="anchor" href="#batch_size"></a>
+</h3>
+<p>Number of samples in one batch.</p>
+<div class="sourceCode" id="cb40"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="co"># target_right</span></span>
+<span><span class="va">gen</span> <span class="op">&lt;-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span>
+<span>                     train_type <span class="op">=</span> <span class="st">"lm"</span>,</span>
+<span>                     batch_size <span class="op">=</span> <span class="fl">7</span>,</span>
+<span>                     maxlen <span class="op">=</span> <span class="fl">6</span>,</span>
+<span>                     vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span>
+<span>                     output_format <span class="op">=</span> <span class="st">"target_right"</span><span class="op">)</span></span>
+<span></span>
+<span><span class="va">z</span> <span class="op">&lt;-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
+<span><span class="va">x</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span></span>
+<span><span class="va">y</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span> </span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/dim.html" class="external-link">dim</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span></span></code></pre></div>
+<pre><code><span><span class="co">## [1] 7 6 9</span></span></code></pre>
+<div class="sourceCode" id="cb42"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="fu"><a href="https://rdrr.io/r/base/dim.html" class="external-link">dim</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span></span></code></pre></div>
+<pre><code><span><span class="co">## [1] 7 9</span></span></code></pre>
+</div>
+<div class="section level3">
+<h3 id="step">step<a class="anchor" aria-label="anchor" href="#step"></a>
+</h3>
+<p>We may determine how frequently we want to take a sample. If
+<code>step = 1</code> we take a sample at every possible step. Let’s
+assume we want to predict the next character, i.e. part of the sequence
+is the <mark class="in">input</mark> and the next character is the
+<mark class="out">target</mark>. If
+<code>maxlen = 3, step = 1</code>:</p>
+<ol style="list-style-type: decimal">
+<li><p>sample:
+<tt><mark class="in">abc</mark><mark class="out">d</mark>efghiiii</tt></p></li>
+<li><p>sample:
+<tt>a<mark class="in">bcd</mark><mark class="out">e</mark>fghiiii</tt></p></li>
+<li><p>sample:
+<tt>ab<mark class="in">cde</mark><mark class="out">f</mark>ghiiii</tt></p></li>
+</ol>
+<p>If <code>step = 3</code>:</p>
+<ol style="list-style-type: decimal">
+<li><p>sample:
+<tt><mark class="in">abc</mark><mark class="out">d</mark>efghiiii</tt></p></li>
+<li><p>sample:
+<tt>abc<mark class="in">def</mark><mark class="out">g</mark>hiiii</tt></p></li>
+<li><p>sample:
+<tt>abcdef<mark class="in">ghi</mark><mark class="out">i</mark>ii</tt></p></li>
+</ol>
+<div class="sourceCode" id="cb44"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">gen</span> <span class="op">&lt;-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span>
+<span>                     train_type <span class="op">=</span> <span class="st">"lm"</span>,</span>
+<span>                     batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
+<span>                     maxlen <span class="op">=</span> <span class="fl">3</span>,</span>
+<span>                     vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span>
+<span>                     step <span class="op">=</span> <span class="fl">3</span>, </span>
+<span>                     output_format <span class="op">=</span> <span class="st">"target_right"</span><span class="op">)</span></span>
+<span></span>
+<span><span class="va">z</span> <span class="op">&lt;-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
+<span><span class="va">x</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span> <span class="co">#encodes abc</span></span>
+<span><span class="va">y</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span> <span class="co"># encodes d</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="va">vocabulary</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="va">vocabulary</span></span>
+<span><span class="va">x</span></span></code></pre></div>
+<pre><code><span><span class="co">##      a b c d e f g h i</span></span>
+<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span>
+<span><span class="co">## [2,] 0 1 0 0 0 0 0 0 0</span></span>
+<span><span class="co">## [3,] 0 0 1 0 0 0 0 0 0</span></span></code></pre>
+<div class="sourceCode" id="cb46"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">y</span></span></code></pre></div>
+<pre><code><span><span class="co">##      a b c d e f g h i</span></span>
+<span><span class="co">## [1,] 0 0 0 1 0 0 0 0 0</span></span></code></pre>
+<div class="sourceCode" id="cb48"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="co"># go 3 steps forward</span></span>
+<span><span class="va">z</span> <span class="op">&lt;-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
+<span><span class="va">x</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span> <span class="co">#encodes def</span></span>
+<span><span class="va">y</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span> <span class="co"># encodes g</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="va">vocabulary</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="va">vocabulary</span></span>
+<span><span class="va">x</span></span></code></pre></div>
+<pre><code><span><span class="co">##      a b c d e f g h i</span></span>
+<span><span class="co">## [1,] 0 0 0 1 0 0 0 0 0</span></span>
+<span><span class="co">## [2,] 0 0 0 0 1 0 0 0 0</span></span>
+<span><span class="co">## [3,] 0 0 0 0 0 1 0 0 0</span></span></code></pre>
+<div class="sourceCode" id="cb50"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">y</span></span></code></pre></div>
+<pre><code><span><span class="co">##      a b c d e f g h i</span></span>
+<span><span class="co">## [1,] 0 0 0 0 0 0 1 0 0</span></span></code></pre>
+</div>
+<div class="section level3">
+<h3 id="padding">padding<a class="anchor" aria-label="anchor" href="#padding"></a>
+</h3>
+<p>If the sequence is too short to create a single sample, we can pad
+the sequence with zero-vectors. If <code>padding = FALSE</code>, the
+generator will go to the next file/FASTA entry until it finds a sequence
+long enough for a sample.</p>
+<div class="sourceCode" id="cb52"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">gen</span> <span class="op">&lt;-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span>
+<span>                     train_type <span class="op">=</span> <span class="st">"lm"</span>,</span>
+<span>                     batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
+<span>                     maxlen <span class="op">=</span> <span class="fl">15</span>, <span class="co"># maxlen is longer than sequence</span></span>
+<span>                     vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span>
+<span>                     step <span class="op">=</span> <span class="fl">3</span>,</span>
+<span>                     padding <span class="op">=</span> <span class="cn">TRUE</span>,</span>
+<span>                     output_format <span class="op">=</span> <span class="st">"target_right"</span><span class="op">)</span></span>
+<span></span>
+<span><span class="va">z</span> <span class="op">&lt;-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
+<span><span class="va">x</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span> </span>
+<span><span class="va">y</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span> </span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="va">vocabulary</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="va">vocabulary</span></span>
+<span><span class="va">x</span> <span class="co"># first 4 entries are zero-vectors</span></span></code></pre></div>
+<pre><code><span><span class="co">##       a b c d e f g h i</span></span>
+<span><span class="co">##  [1,] 0 0 0 0 0 0 0 0 0</span></span>
+<span><span class="co">##  [2,] 0 0 0 0 0 0 0 0 0</span></span>
+<span><span class="co">##  [3,] 0 0 0 0 0 0 0 0 0</span></span>
+<span><span class="co">##  [4,] 0 0 0 0 0 0 0 0 0</span></span>
+<span><span class="co">##  [5,] 1 0 0 0 0 0 0 0 0</span></span>
+<span><span class="co">##  [6,] 0 1 0 0 0 0 0 0 0</span></span>
+<span><span class="co">##  [7,] 0 0 1 0 0 0 0 0 0</span></span>
+<span><span class="co">##  [8,] 0 0 0 1 0 0 0 0 0</span></span>
+<span><span class="co">##  [9,] 0 0 0 0 1 0 0 0 0</span></span>
+<span><span class="co">## [10,] 0 0 0 0 0 1 0 0 0</span></span>
+<span><span class="co">## [11,] 0 0 0 0 0 0 1 0 0</span></span>
+<span><span class="co">## [12,] 0 0 0 0 0 0 0 1 0</span></span>
+<span><span class="co">## [13,] 0 0 0 0 0 0 0 0 1</span></span>
+<span><span class="co">## [14,] 0 0 0 0 0 0 0 0 1</span></span>
+<span><span class="co">## [15,] 0 0 0 0 0 0 0 0 1</span></span></code></pre>
+<div class="sourceCode" id="cb54"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">y</span></span></code></pre></div>
+<pre><code><span><span class="co">##      a b c d e f g h i</span></span>
+<span><span class="co">## [1,] 0 0 0 0 0 0 0 0 1</span></span></code></pre>
+</div>
+<div class="section level3">
+<h3 id="ambiguous_nuc">ambiguous_nuc<a class="anchor" aria-label="anchor" href="#ambiguous_nuc"></a>
+</h3>
+<p>A sequence might contain a character that does not lie inside our
+vocabulary. For example, let’s assume we discard <code>e</code> from our
+vocabulary. We have 4 options to handle this situation:</p>
+<ol style="list-style-type: decimal">
+<li>encode as zero vector</li>
+</ol>
+<div class="sourceCode" id="cb56"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">vocabulary_2</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"a"</span>, <span class="st">"b"</span>, <span class="st">"c"</span>, <span class="st">"d"</span>, <span class="st">"f"</span>, <span class="st">"g"</span>, <span class="st">"h"</span>, <span class="st">"i"</span><span class="op">)</span> <span class="co"># exclude "e" from vocabulary</span></span>
+<span></span>
+<span><span class="co"># zero</span></span>
+<span><span class="va">gen</span> <span class="op">&lt;-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span>
+<span>                     train_type <span class="op">=</span> <span class="st">"lm"</span>,</span>
+<span>                     batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
+<span>                     maxlen <span class="op">=</span> <span class="fl">6</span>,</span>
+<span>                     vocabulary <span class="op">=</span> <span class="va">vocabulary_2</span>,</span>
+<span>                     output_format <span class="op">=</span> <span class="st">"target_right"</span>,</span>
+<span>                     ambiguous_nuc <span class="op">=</span> <span class="st">"zeros"</span><span class="op">)</span></span>
+<span><span class="va">z</span> <span class="op">&lt;-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
+<span><span class="va">x</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span> </span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="va">vocabulary_2</span></span>
+<span><span class="va">x</span> <span class="co"># fifth row is zero vector </span></span></code></pre></div>
+<pre><code><span><span class="co">##      a b c d f g h i</span></span>
+<span><span class="co">## [1,] 1 0 0 0 0 0 0 0</span></span>
+<span><span class="co">## [2,] 0 1 0 0 0 0 0 0</span></span>
+<span><span class="co">## [3,] 0 0 1 0 0 0 0 0</span></span>
+<span><span class="co">## [4,] 0 0 0 1 0 0 0 0</span></span>
+<span><span class="co">## [5,] 0 0 0 0 0 0 0 0</span></span>
+<span><span class="co">## [6,] 0 0 0 0 1 0 0 0</span></span></code></pre>
+<ol start="2" style="list-style-type: decimal">
+<li>equal probability</li>
+</ol>
+<div class="sourceCode" id="cb58"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="co"># equal</span></span>
+<span><span class="va">gen</span> <span class="op">&lt;-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span>
+<span>                    train_type <span class="op">=</span> <span class="st">"lm"</span>,</span>
+<span>                    batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
+<span>                    maxlen <span class="op">=</span> <span class="fl">6</span>,</span>
+<span>                    vocabulary <span class="op">=</span> <span class="va">vocabulary_2</span>,</span>
+<span>                    output_format <span class="op">=</span> <span class="st">"target_right"</span>,</span>
+<span>                    ambiguous_nuc <span class="op">=</span> <span class="st">"equal"</span><span class="op">)</span> </span>
+<span></span>
+<span><span class="va">z</span> <span class="op">&lt;-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
+<span><span class="va">x</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="va">vocabulary_2</span></span>
+<span><span class="va">x</span> <span class="co"># fifth row is 1/8 for every entry </span></span></code></pre></div>
+<pre><code><span><span class="co">##          a     b     c     d     f     g     h     i</span></span>
+<span><span class="co">## [1,] 1.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000</span></span>
+<span><span class="co">## [2,] 0.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000</span></span>
+<span><span class="co">## [3,] 0.000 0.000 1.000 0.000 0.000 0.000 0.000 0.000</span></span>
+<span><span class="co">## [4,] 0.000 0.000 0.000 1.000 0.000 0.000 0.000 0.000</span></span>
+<span><span class="co">## [5,] 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125</span></span>
+<span><span class="co">## [6,] 0.000 0.000 0.000 0.000 1.000 0.000 0.000 0.000</span></span></code></pre>
+<ol start="3" style="list-style-type: decimal">
+<li>use distribution of current file</li>
+</ol>
+<div class="sourceCode" id="cb60"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="co"># empirical</span></span>
+<span><span class="va">gen</span> <span class="op">&lt;-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span>
+<span>                     train_type <span class="op">=</span> <span class="st">"lm"</span>,</span>
+<span>                     batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
+<span>                     maxlen <span class="op">=</span> <span class="fl">6</span>,</span>
+<span>                     vocabulary <span class="op">=</span> <span class="va">vocabulary_2</span>,</span>
+<span>                     output_format <span class="op">=</span> <span class="st">"target_right"</span>,</span>
+<span>                     ambiguous_nuc <span class="op">=</span> <span class="st">"empirical"</span><span class="op">)</span> </span>
+<span></span>
+<span><span class="va">z</span> <span class="op">&lt;-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
+<span><span class="va">x</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span> </span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="va">vocabulary_2</span></span>
+<span><span class="va">x</span> <span class="co"># fifth row is distribution of file</span></span></code></pre></div>
+<pre><code><span><span class="co">##               a          b          c          d          f          g</span></span>
+<span><span class="co">## [1,] 1.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000</span></span>
+<span><span class="co">## [2,] 0.00000000 1.00000000 0.00000000 0.00000000 0.00000000 0.00000000</span></span>
+<span><span class="co">## [3,] 0.00000000 0.00000000 1.00000000 0.00000000 0.00000000 0.00000000</span></span>
+<span><span class="co">## [4,] 0.00000000 0.00000000 0.00000000 1.00000000 0.00000000 0.00000000</span></span>
+<span><span class="co">## [5,] 0.09090909 0.09090909 0.09090909 0.09090909 0.09090909 0.09090909</span></span>
+<span><span class="co">## [6,] 0.00000000 0.00000000 0.00000000 0.00000000 1.00000000 0.00000000</span></span>
+<span><span class="co">##               h         i</span></span>
+<span><span class="co">## [1,] 0.00000000 0.0000000</span></span>
+<span><span class="co">## [2,] 0.00000000 0.0000000</span></span>
+<span><span class="co">## [3,] 0.00000000 0.0000000</span></span>
+<span><span class="co">## [4,] 0.00000000 0.0000000</span></span>
+<span><span class="co">## [5,] 0.09090909 0.3636364</span></span>
+<span><span class="co">## [6,] 0.00000000 0.0000000</span></span></code></pre>
+<ol start="4" style="list-style-type: decimal">
+<li>discard</li>
+</ol>
+<div class="sourceCode" id="cb62"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="co"># discard</span></span>
+<span><span class="va">gen</span> <span class="op">&lt;-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span>
+<span>                     train_type <span class="op">=</span> <span class="st">"lm"</span>,</span>
+<span>                     batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
+<span>                     maxlen <span class="op">=</span> <span class="fl">6</span>,</span>
+<span>                     vocabulary <span class="op">=</span> <span class="va">vocabulary_2</span>,</span>
+<span>                     output_format <span class="op">=</span> <span class="st">"target_right"</span>,</span>
+<span>                     ambiguous_nuc <span class="op">=</span> <span class="st">"discard"</span><span class="op">)</span> </span>
+<span></span>
+<span><span class="va">z</span> <span class="op">&lt;-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
+<span><span class="va">x</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="va">vocabulary_2</span></span>
+<span><span class="va">x</span> <span class="co"># first sample with only characters from vocabulary is fghiii|i</span></span></code></pre></div>
+<pre><code><span><span class="co">##      a b c d f g h i</span></span>
+<span><span class="co">## [1,] 0 0 0 0 1 0 0 0</span></span>
+<span><span class="co">## [2,] 0 0 0 0 0 1 0 0</span></span>
+<span><span class="co">## [3,] 0 0 0 0 0 0 1 0</span></span>
+<span><span class="co">## [4,] 0 0 0 0 0 0 0 1</span></span>
+<span><span class="co">## [5,] 0 0 0 0 0 0 0 1</span></span>
+<span><span class="co">## [6,] 0 0 0 0 0 0 0 1</span></span></code></pre>
+</div>
+<div class="section level3">
+<h3 id="proportion_per_seq">proportion_per_seq<a class="anchor" aria-label="anchor" href="#proportion_per_seq"></a>
+</h3>
+<p>The <code>proportion_per_seq</code> argument gives the option to use
+a random subset instead of the full sequence.</p>
+<div class="sourceCode" id="cb64"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="fu"><a href="https://rdrr.io/r/base/cat.html" class="external-link">cat</a></span><span class="op">(</span><span class="st">"sequence is "</span>, <span class="fu"><a href="https://rdrr.io/r/base/nchar.html" class="external-link">nchar</a></span><span class="op">(</span><span class="va">sequence</span><span class="op">)</span>, <span class="st">"characters long \n"</span><span class="op">)</span></span></code></pre></div>
+<pre><code><span><span class="co">## sequence is  12 characters long</span></span></code></pre>
+<div class="sourceCode" id="cb66"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">gen</span> <span class="op">&lt;-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span>
+<span>                     train_type <span class="op">=</span> <span class="st">"lm"</span>,</span>
+<span>                     batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
+<span>                     maxlen <span class="op">=</span> <span class="fl">5</span>,</span>
+<span>                     seed <span class="op">=</span> <span class="fl">1</span>,</span>
+<span>                     vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span>
+<span>                     output_format <span class="op">=</span> <span class="st">"target_right"</span>,</span>
+<span>                     <span class="co"># take random subsequence using 50% of sequence </span></span>
+<span>                     proportion_per_seq <span class="op">=</span> <span class="fl">0.5</span><span class="op">)</span></span>
+<span></span>
+<span><span class="va">z</span> <span class="op">&lt;-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
+<span><span class="va">x</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>, , <span class="op">]</span></span>
+<span><span class="va">y</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="va">vocabulary</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="va">vocabulary</span></span>
+<span><span class="va">x</span> <span class="co"># abcde</span></span></code></pre></div>
+<pre><code><span><span class="co">##      a b c d e f g h i</span></span>
+<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span>
+<span><span class="co">## [2,] 0 1 0 0 0 0 0 0 0</span></span>
+<span><span class="co">## [3,] 0 0 1 0 0 0 0 0 0</span></span>
+<span><span class="co">## [4,] 0 0 0 1 0 0 0 0 0</span></span>
+<span><span class="co">## [5,] 0 0 0 0 1 0 0 0 0</span></span></code></pre>
+<div class="sourceCode" id="cb68"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">y</span> <span class="co"># f</span></span></code></pre></div>
+<pre><code><span><span class="co">##      a b c d e f g h i</span></span>
+<span><span class="co">## [1,] 0 0 0 0 0 1 0 0 0</span></span></code></pre>
+</div>
+<div class="section level3">
+<h3 id="file_limit">file_limit<a class="anchor" aria-label="anchor" href="#file_limit"></a>
+</h3>
+<p>Integer or NULL. If integer, use only specified number of randomly
+sampled files for training.</p>
+</div>
+<div class="section level3">
+<h3 id="delete_used_files">delete_used_files<a class="anchor" aria-label="anchor" href="#delete_used_files"></a>
+</h3>
+<p>If true, delete a file once it has been used. Only applies to rds files.</p>
+<div class="sourceCode" id="cb70"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">x</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/array.html" class="external-link">array</a></span><span class="op">(</span><span class="fl">0</span>, dim <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="fl">1</span>,<span class="fl">5</span>,<span class="fl">4</span><span class="op">)</span><span class="op">)</span></span>
+<span><span class="va">y</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/matrix.html" class="external-link">matrix</a></span><span class="op">(</span><span class="fl">0</span>, ncol <span class="op">=</span> <span class="fl">1</span><span class="op">)</span></span>
+<span><span class="va">rds_path</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span>fileext <span class="op">=</span> <span class="st">".rds"</span><span class="op">)</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/readRDS.html" class="external-link">saveRDS</a></span><span class="op">(</span><span class="fu"><a href="https://rdrr.io/r/base/list.html" class="external-link">list</a></span><span class="op">(</span><span class="va">x</span>, <span class="va">y</span><span class="op">)</span>, <span class="va">rds_path</span><span class="op">)</span></span>
+<span></span>
+<span><span class="va">gen</span> <span class="op">&lt;-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">rds_path</span>,</span>
+<span>                     delete_used_files <span class="op">=</span> <span class="cn">TRUE</span>,</span>
+<span>                     train_type <span class="op">=</span> <span class="st">"label_rds"</span>,</span>
+<span>                     batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
+<span>                     maxlen <span class="op">=</span> <span class="fl">5</span><span class="op">)</span></span>
+<span></span>
+<span><span class="va">z</span> <span class="op">&lt;-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/files.html" class="external-link">file.exists</a></span><span class="op">(</span><span class="va">rds_path</span><span class="op">)</span></span></code></pre></div>
+<pre><code><span><span class="co">## [1] FALSE</span></span></code></pre>
+<div class="sourceCode" id="cb72"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="co"># z &lt;- gen()</span></span>
+<span><span class="co"># When the generator is called again, it waits until one of the files listed in </span></span>
+<span><span class="co"># the initial `path` argument exists again. Useful when other processes create the rds files.</span></span></code></pre></div>
+</div>
+<div class="section level3">
+<h3 id="max_samples">max_samples<a class="anchor" aria-label="anchor" href="#max_samples"></a>
+</h3>
+<p>Only use a fixed number of samples per file. Randomly choose which
+samples to use. (If <code>random_sampling = FALSE</code>, samples are
+consecutive.)</p>
+<div class="sourceCode" id="cb73"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">gen</span> <span class="op">&lt;-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span>
+<span>                     train_type <span class="op">=</span> <span class="st">"lm"</span>,</span>
+<span>                     batch_size <span class="op">=</span> <span class="fl">2</span>,</span>
+<span>                     maxlen <span class="op">=</span> <span class="fl">5</span>,</span>
+<span>                     step <span class="op">=</span> <span class="fl">1</span>,</span>
+<span>                     seed <span class="op">=</span> <span class="fl">3</span>,</span>
+<span>                     vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span>
+<span>                     output_format <span class="op">=</span> <span class="st">"target_right"</span>,</span>
+<span>                     max_samples <span class="op">=</span> <span class="fl">2</span><span class="op">)</span></span>
+<span></span>
+<span><span class="va">z</span> <span class="op">&lt;-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
+<span><span class="va">x1</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>, , <span class="op">]</span></span>
+<span><span class="va">x2</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">2</span>, , <span class="op">]</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x1</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="va">vocabulary</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x2</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="va">vocabulary</span></span>
+<span><span class="va">x1</span> <span class="co"># efghi</span></span></code></pre></div>
+<pre><code><span><span class="co">##      a b c d e f g h i</span></span>
+<span><span class="co">## [1,] 0 0 0 0 1 0 0 0 0</span></span>
+<span><span class="co">## [2,] 0 0 0 0 0 1 0 0 0</span></span>
+<span><span class="co">## [3,] 0 0 0 0 0 0 1 0 0</span></span>
+<span><span class="co">## [4,] 0 0 0 0 0 0 0 1 0</span></span>
+<span><span class="co">## [5,] 0 0 0 0 0 0 0 0 1</span></span></code></pre>
+<div class="sourceCode" id="cb75"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">x2</span> <span class="co"># fghii</span></span></code></pre></div>
+<pre><code><span><span class="co">##      a b c d e f g h i</span></span>
+<span><span class="co">## [1,] 0 0 0 0 0 1 0 0 0</span></span>
+<span><span class="co">## [2,] 0 0 0 0 0 0 1 0 0</span></span>
+<span><span class="co">## [3,] 0 0 0 0 0 0 0 1 0</span></span>
+<span><span class="co">## [4,] 0 0 0 0 0 0 0 0 1</span></span>
+<span><span class="co">## [5,] 0 0 0 0 0 0 0 0 1</span></span></code></pre>
+</div>
+<div class="section level3">
+<h3 id="random_sampling">random_sampling<a class="anchor" aria-label="anchor" href="#random_sampling"></a>
+</h3>
+<p>If you use <code>max_samples</code>, the generator will randomly choose
+a subset of all possible samples, but those samples are consecutive.
+With <code>random_sampling = TRUE</code>, the samples are completely
+random.</p>
+<div class="sourceCode" id="cb77"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">gen</span> <span class="op">&lt;-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span>
+<span>                     train_type <span class="op">=</span> <span class="st">"lm"</span>,</span>
+<span>                     batch_size <span class="op">=</span> <span class="fl">2</span>,</span>
+<span>                     maxlen <span class="op">=</span> <span class="fl">5</span>,</span>
+<span>                     seed <span class="op">=</span> <span class="fl">66</span>,</span>
+<span>                     random_sampling <span class="op">=</span> <span class="cn">TRUE</span>,</span>
+<span>                     vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span>
+<span>                     output_format <span class="op">=</span> <span class="st">"target_right"</span>,</span>
+<span>                     max_samples <span class="op">=</span> <span class="fl">2</span><span class="op">)</span></span>
+<span></span>
+<span><span class="va">z</span> <span class="op">&lt;-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
+<span><span class="va">x1</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>, , <span class="op">]</span></span>
+<span><span class="va">x2</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">2</span>, , <span class="op">]</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x1</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="va">vocabulary</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x2</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="va">vocabulary</span></span>
+<span><span class="va">x1</span> <span class="co"># ghiii</span></span></code></pre></div>
+<pre><code><span><span class="co">##      a b c d e f g h i</span></span>
+<span><span class="co">## [1,] 0 0 0 0 0 0 1 0 0</span></span>
+<span><span class="co">## [2,] 0 0 0 0 0 0 0 1 0</span></span>
+<span><span class="co">## [3,] 0 0 0 0 0 0 0 0 1</span></span>
+<span><span class="co">## [4,] 0 0 0 0 0 0 0 0 1</span></span>
+<span><span class="co">## [5,] 0 0 0 0 0 0 0 0 1</span></span></code></pre>
+<div class="sourceCode" id="cb79"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">x2</span> <span class="co"># efghi</span></span></code></pre></div>
+<pre><code><span><span class="co">##      a b c d e f g h i</span></span>
+<span><span class="co">## [1,] 0 0 0 0 1 0 0 0 0</span></span>
+<span><span class="co">## [2,] 0 0 0 0 0 1 0 0 0</span></span>
+<span><span class="co">## [3,] 0 0 0 0 0 0 1 0 0</span></span>
+<span><span class="co">## [4,] 0 0 0 0 0 0 0 1 0</span></span>
+<span><span class="co">## [5,] 0 0 0 0 0 0 0 0 1</span></span></code></pre>
+</div>
+<div class="section level3">
+<h3 id="target_len">target_len<a class="anchor" aria-label="anchor" href="#target_len"></a>
+</h3>
+<p>Target length for language model.</p>
+<div class="sourceCode" id="cb81"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">gen</span> <span class="op">&lt;-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span>
+<span>                     train_type <span class="op">=</span> <span class="st">"lm"</span>,</span>
+<span>                     batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
+<span>                     target_len <span class="op">=</span> <span class="fl">3</span>, </span>
+<span>                     maxlen <span class="op">=</span> <span class="fl">5</span>,</span>
+<span>                     vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span>
+<span>                     output_format <span class="op">=</span> <span class="st">"target_right"</span><span class="op">)</span></span>
+<span></span>
+<span><span class="va">z</span> <span class="op">&lt;-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
+<span><span class="va">x</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>, , <span class="op">]</span></span>
+<span><span class="va">y1</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span><span class="op">[</span> , <span class="fl">1</span>, <span class="op">]</span></span>
+<span><span class="va">y2</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span><span class="op">[</span> , <span class="fl">2</span>, <span class="op">]</span></span>
+<span><span class="va">y3</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span><span class="op">[</span> , <span class="fl">3</span>, <span class="op">]</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="va">vocabulary</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/names.html" class="external-link">names</a></span><span class="op">(</span><span class="va">y1</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="va">vocabulary</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/names.html" class="external-link">names</a></span><span class="op">(</span><span class="va">y2</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="va">vocabulary</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/names.html" class="external-link">names</a></span><span class="op">(</span><span class="va">y3</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="va">vocabulary</span></span>
+<span><span class="va">x</span> <span class="co"># abcde</span></span></code></pre></div>
+<pre><code><span><span class="co">##      a b c d e f g h i</span></span>
+<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span>
+<span><span class="co">## [2,] 0 1 0 0 0 0 0 0 0</span></span>
+<span><span class="co">## [3,] 0 0 1 0 0 0 0 0 0</span></span>
+<span><span class="co">## [4,] 0 0 0 1 0 0 0 0 0</span></span>
+<span><span class="co">## [5,] 0 0 0 0 1 0 0 0 0</span></span></code></pre>
+<div class="sourceCode" id="cb83"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">y1</span> <span class="co"># f</span></span></code></pre></div>
+<pre><code><span><span class="co">## a b c d e f g h i </span></span>
+<span><span class="co">## 0 0 0 0 0 1 0 0 0</span></span></code></pre>
+<div class="sourceCode" id="cb85"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">y2</span> <span class="co"># g</span></span></code></pre></div>
+<pre><code><span><span class="co">## a b c d e f g h i </span></span>
+<span><span class="co">## 0 0 0 0 0 0 1 0 0</span></span></code></pre>
+<div class="sourceCode" id="cb87"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">y3</span> <span class="co"># h</span></span></code></pre></div>
+<pre><code><span><span class="co">## a b c d e f g h i </span></span>
+<span><span class="co">## 0 0 0 0 0 0 0 1 0</span></span></code></pre>
+</div>
+<div class="section level3">
+<h3 id="n_gram-n_gram_stride">n_gram / n_gram_stride<a class="anchor" aria-label="anchor" href="#n_gram-n_gram_stride"></a>
+</h3>
+<p>Instead of encoding the language model target character-wise, combine n
+consecutive characters into one target. <code>n_gram_stride</code> determines the
+frequency of the n-gram encoding.</p>
+<div class="sourceCode" id="cb89"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">gen</span> <span class="op">&lt;-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span>
+<span>                     train_type <span class="op">=</span> <span class="st">"lm"</span>,</span>
+<span>                     batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
+<span>                     target_len <span class="op">=</span> <span class="fl">6</span>, </span>
+<span>                     n_gram <span class="op">=</span> <span class="fl">3</span>,</span>
+<span>                     n_gram_stride <span class="op">=</span> <span class="fl">3</span>,</span>
+<span>                     maxlen <span class="op">=</span> <span class="fl">3</span>,</span>
+<span>                     vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span>
+<span>                     output_format <span class="op">=</span> <span class="st">"target_right"</span><span class="op">)</span></span>
+<span></span>
+<span><span class="va">z</span> <span class="op">&lt;-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
+<span><span class="va">x</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span></span>
+<span><span class="va">y1</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span><span class="op">[</span> , <span class="fl">1</span>, <span class="op">]</span></span>
+<span><span class="va">y2</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span><span class="op">[</span> , <span class="fl">2</span>, <span class="op">]</span></span>
+<span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/dim.html" class="external-link">dim</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span><span class="op">[</span><span class="fl">3</span><span class="op">]</span> <span class="op">==</span> <span class="fu"><a href="https://rdrr.io/r/base/length.html" class="external-link">length</a></span><span class="op">(</span><span class="va">vocabulary</span><span class="op">)</span><span class="op">^</span><span class="fl">3</span></span></code></pre></div>
+<pre><code><span><span class="co">## [1] TRUE</span></span></code></pre>
+<div class="sourceCode" id="cb91"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="co"># x = abc as 3-gram</span></span>
+<span><span class="co"># y1 = def as 3-gram</span></span>
+<span><span class="co"># y2 = ghi as 3-gram</span></span></code></pre></div>
+</div>
+<div class="section level3">
+<h3 id="add_noise">add_noise<a class="anchor" aria-label="anchor" href="#add_noise"></a>
+</h3>
+<p>Add noise to input. Must be a list that specifies the noise distribution,
+or <code>NULL</code> (no noise). The list contains the argument
+<code>noise_type</code>, which is either <code>"normal"</code> or
+<code>"uniform"</code>. Optional arguments are <code>sd</code> and
+<code>mean</code> if <code>noise_type</code> is <code>"normal"</code>
+(defaults are <code>sd=1</code> and <code>mean=0</code>), or
+<code>min</code> and <code>max</code> if <code>noise_type</code> is
+<code>"uniform"</code> (defaults are <code>min=0</code> and <code>max=1</code>).</p>
+<div class="sourceCode" id="cb92"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">gen</span> <span class="op">&lt;-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span>
+<span>                     train_type <span class="op">=</span> <span class="st">"lm"</span>,</span>
+<span>                     batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
+<span>                     add_noise <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/list.html" class="external-link">list</a></span><span class="op">(</span>noise_type <span class="op">=</span> <span class="st">"normal"</span>, mean <span class="op">=</span> <span class="fl">0</span>, sd <span class="op">=</span> <span class="fl">0.01</span><span class="op">)</span>,</span>
+<span>                     maxlen <span class="op">=</span> <span class="fl">5</span>,</span>
+<span>                     vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span>
+<span>                     output_format <span class="op">=</span> <span class="st">"target_right"</span><span class="op">)</span></span>
+<span></span>
+<span><span class="va">z</span> <span class="op">&lt;-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
+<span><span class="va">x</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>, , <span class="op">]</span></span>
+<span><span class="va">y</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span></span>
+<span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="va">vocabulary</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="va">vocabulary</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/Round.html" class="external-link">round</a></span><span class="op">(</span><span class="va">x</span>, <span class="fl">3</span><span class="op">)</span> <span class="co"># abcde + noise</span></span></code></pre></div>
+<pre><code><span><span class="co">##           a      b      c     d      e      f      g      h      i</span></span>
+<span><span class="co">## [1,]  0.994  0.005 -0.006 0.008  0.006  0.014 -0.004  0.007 -0.001</span></span>
+<span><span class="co">## [2,]  0.002  1.007 -0.022 0.006 -0.001 -0.001 -0.001  0.006  0.009</span></span>
+<span><span class="co">## [3,] -0.008  0.006  1.011 0.009 -0.002  0.004  0.011 -0.007  0.004</span></span>
+<span><span class="co">## [4,]  0.016 -0.003  0.000 1.008 -0.015 -0.001  0.008 -0.007 -0.006</span></span>
+<span><span class="co">## [5,]  0.003  0.015  0.000 0.001  0.995 -0.014 -0.002  0.004  0.003</span></span></code></pre>
+<div class="sourceCode" id="cb94"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">y</span> <span class="co"># f</span></span></code></pre></div>
+<pre><code><span><span class="co">##                 a           b           c           d           e         f</span></span>
+<span><span class="co">## [1,] -0.008204684 0.003898432 0.009438362 -0.01989352 0.004179416 0.9958501</span></span>
+<span><span class="co">##                 g           h           i</span></span>
+<span><span class="co">## [1,] -0.002533617 0.007685329 -0.01129363</span></span></code></pre>
+</div>
+<div class="section level3">
+<h3 id="proportion_entries">proportion_entries<a class="anchor" aria-label="anchor" href="#proportion_entries"></a>
+</h3>
+<p>If a fasta file has multiple entries, you can randomly choose a
+subset. For example, if the file has 6 entries and
+<code>proportion_entries = 0.5</code> the generator will randomly choose
+only 3 of the entries.</p>
+</div>
+<div class="section level3">
+<h3 id="shuffle_file_order">shuffle_file_order<a class="anchor" aria-label="anchor" href="#shuffle_file_order"></a>
+</h3>
+<p>Shuffle file order before iterating through files. Order gets
+reshuffled after every iteration.</p>
+</div>
+<div class="section level3">
+<h3 id="shuffle_input">shuffle_input<a class="anchor" aria-label="anchor" href="#shuffle_input"></a>
+</h3>
+<p>Whether to shuffle fasta entries if fasta file has multiple
+entries.</p>
+</div>
+<div class="section level3">
+<h3 id="reverse_complement">reverse_complement<a class="anchor" aria-label="anchor" href="#reverse_complement"></a>
+</h3>
+<p>If <code>TRUE</code>, randomly decide for every batch whether to use the
+original sequence or its reverse complement. Only implemented for
+<code>ACGT</code> vocabulary.</p>
+</div>
+<div class="section level3">
+<h3 id="sample_by_file_size">sample_by_file_size<a class="anchor" aria-label="anchor" href="#sample_by_file_size"></a>
+</h3>
+<p>Randomly choose new file by sampling according to file size (bigger
+files more likely).</p>
+</div>
+<div class="section level3">
+<h3 id="concat_seq">concat_seq<a class="anchor" aria-label="anchor" href="#concat_seq"></a>
+</h3>
+<p>Character string or <code>NULL</code>. If not <code>NULL</code>, all
+entries from a file get concatenated into one sequence with the
+<code>concat_seq</code> string between them. Use
+<code>concat_seq = ""</code> if you don’t want to add a new token.</p>
+<div class="sourceCode" id="cb96"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">df</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span>Sequence <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"AC"</span>, <span class="st">"AG"</span>, <span class="st">"AT"</span><span class="op">)</span>, Header <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/paste.html" class="external-link">paste0</a></span><span class="op">(</span><span class="st">"header"</span>, <span class="fl">1</span><span class="op">:</span><span class="fl">3</span><span class="op">)</span><span class="op">)</span></span>
+<span><span class="va">fasta_path</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span>fileext <span class="op">=</span> <span class="st">".fasta"</span><span class="op">)</span></span>
+<span><span class="va">fasta_file</span> <span class="op">&lt;-</span> <span class="fu">microseq</span><span class="fu">::</span><span class="fu"><a href="https://rdrr.io/pkg/microseq/man/readFasta.html" class="external-link">writeFasta</a></span><span class="op">(</span><span class="va">df</span>, <span class="va">fasta_path</span><span class="op">)</span></span>
+<span><span class="va">gen</span> <span class="op">&lt;-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">fasta_path</span>,</span>
+<span>                     train_type <span class="op">=</span> <span class="st">"lm"</span>,</span>
+<span>                     batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
+<span>                     maxlen <span class="op">=</span> <span class="fl">9</span>,</span>
+<span>                     vocabulary <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"A"</span>, <span class="st">"C"</span>, <span class="st">"G"</span>, <span class="st">"T"</span>, <span class="st">"Z"</span><span class="op">)</span>,</span>
+<span>                     concat_seq <span class="op">=</span> <span class="st">"ZZ"</span>,</span>
+<span>                     output_format <span class="op">=</span> <span class="st">"target_right"</span><span class="op">)</span></span>
+<span></span>
+<span><span class="va">z</span> <span class="op">&lt;-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
+<span><span class="va">x</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>, , <span class="op">]</span></span>
+<span><span class="va">y</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span></span>
+<span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"A"</span>, <span class="st">"C"</span>, <span class="st">"G"</span>, <span class="st">"T"</span>, <span class="st">"Z"</span><span class="op">)</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"A"</span>, <span class="st">"C"</span>, <span class="st">"G"</span>, <span class="st">"T"</span>, <span class="st">"Z"</span><span class="op">)</span></span>
+<span><span class="va">x</span> <span class="co"># ACZZAGZZA</span></span></code></pre></div>
+<pre><code><span><span class="co">##       A C G T Z</span></span>
+<span><span class="co">##  [1,] 1 0 0 0 0</span></span>
+<span><span class="co">##  [2,] 0 1 0 0 0</span></span>
+<span><span class="co">##  [3,] 0 0 0 0 1</span></span>
+<span><span class="co">##  [4,] 0 0 0 0 1</span></span>
+<span><span class="co">##  [5,] 1 0 0 0 0</span></span>
+<span><span class="co">##  [6,] 0 0 1 0 0</span></span>
+<span><span class="co">##  [7,] 0 0 0 0 1</span></span>
+<span><span class="co">##  [8,] 0 0 0 0 1</span></span>
+<span><span class="co">##  [9,] 1 0 0 0 0</span></span></code></pre>
+<div class="sourceCode" id="cb98"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">y</span> <span class="co"># T</span></span></code></pre></div>
+<pre><code><span><span class="co">##      A C G T Z</span></span>
+<span><span class="co">## [1,] 0 0 0 1 0</span></span></code></pre>
+</div>
+<div class="section level3">
+<h3 id="set_learning">set_learning<a class="anchor" aria-label="anchor" href="#set_learning"></a>
+</h3>
+<p>Use this when you want to assign one label to a set of samples. Only
+implemented for <code>train_type = "label_folder"</code>. Input is a list
+with the following parameters:</p>
+<ul>
+<li>
+<code>samples_per_target</code>: how many samples to use for one
+target</li>
+<li>
+<code>maxlen</code>: length of one sample</li>
+<li>
+<code>reshape_mode</code>: <code>"time_dist"</code>, <code>"multi_input"</code>
+or <code>"concat"</code>.
+<ul>
+<li>If <code>reshape_mode = "multi_input"</code>, generator will produce
+<code>samples_per_target</code> separate inputs, each of length
+<code>maxlen</code>.</li>
+<li>If <code>reshape_mode = "time_dist"</code>, generator will produce a
+4D input array. The dimensions correspond to
+<code>(batch_size, samples_per_target, maxlen, length(vocabulary))</code>.<br>
+</li>
+<li>If <code>reshape_mode</code> is <code>"concat"</code>, generator
+will concatenate <code>samples_per_target</code> sequences of length
+<code>maxlen</code> to one long sequence.</li>
+</ul>
+</li>
+<li>If <code>reshape_mode = "concat"</code>, there is an additional
+<code>buffer_len</code> argument, which adds a new token between concatenated
+samples.
+<ul>
+<li>If <code>buffer_len</code> is an integer, the sub-sequences are
+interspaced with <code>buffer_len</code> rows. The input length is
+(<code>maxlen</code> * <code>samples_per_target</code>) +
+<code>buffer_len</code> * (<code>samples_per_target</code> - 1).</li>
+</ul>
+</li>
+</ul>
+<div class="sourceCode" id="cb100"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="co"># create data for second label</span></span>
+<span><span class="va">df</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span>Sequence <span class="op">=</span> <span class="st">"AABAACAADAAE"</span>, Header <span class="op">=</span> <span class="st">"header_1"</span><span class="op">)</span></span>
+<span><span class="va">file_path_2</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span>fileext <span class="op">=</span> <span class="st">".fasta"</span><span class="op">)</span></span>
+<span><span class="va">fasta_file</span> <span class="op">&lt;-</span> <span class="fu">microseq</span><span class="fu">::</span><span class="fu"><a href="https://rdrr.io/pkg/microseq/man/readFasta.html" class="external-link">writeFasta</a></span><span class="op">(</span><span class="va">df</span>, <span class="va">file_path_2</span><span class="op">)</span></span>
+<span></span>
+<span><span class="co"># multi_input </span></span>
+<span><span class="va">set_learning</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/list.html" class="external-link">list</a></span><span class="op">(</span>reshape_mode <span class="op">=</span> <span class="st">"multi_input"</span>,</span>
+<span>                     maxlen <span class="op">=</span> <span class="fl">4</span>,</span>
+<span>                     samples_per_target <span class="op">=</span> <span class="fl">3</span><span class="op">)</span></span>
+<span></span>
+<span><span class="va">gen</span> <span class="op">&lt;-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="va">file_path</span>, <span class="va">file_path_2</span><span class="op">)</span>, <span class="co"># path has length 2 =&gt; 2 classes</span></span>
+<span>                     train_type <span class="op">=</span> <span class="st">"label_folder"</span>,</span>
+<span>                     batch_size <span class="op">=</span> <span class="fl">2</span>,</span>
+<span>                     maxlen <span class="op">=</span> <span class="fl">4</span>,</span>
+<span>                     step <span class="op">=</span> <span class="fl">1</span>, </span>
+<span>                     vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span>
+<span>                     set_learning <span class="op">=</span> <span class="va">set_learning</span><span class="op">)</span></span>
+<span></span>
+<span><span class="va">z</span> <span class="op">&lt;-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
+<span><span class="va">x</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span></span>
+<span><span class="va">y</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/length.html" class="external-link">length</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="co"># 3 samples per target</span></span></code></pre></div>
+<pre><code><span><span class="co">## [1] 3</span></span></code></pre>
+<div class="sourceCode" id="cb102"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">x_1_1</span> <span class="op">&lt;-</span> <span class="va">x</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>, , <span class="op">]</span></span>
+<span><span class="va">x_1_1</span> <span class="co"># abcd</span></span></code></pre></div>
+<pre><code><span><span class="co">##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]</span></span>
+<span><span class="co">## [1,]    1    0    0    0    0    0    0    0    0</span></span>
+<span><span class="co">## [2,]    0    1    0    0    0    0    0    0    0</span></span>
+<span><span class="co">## [3,]    0    0    1    0    0    0    0    0    0</span></span>
+<span><span class="co">## [4,]    0    0    0    1    0    0    0    0    0</span></span></code></pre>
+<div class="sourceCode" id="cb104"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">x_1_2</span> <span class="op">&lt;-</span> <span class="va">x</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>, , <span class="op">]</span></span>
+<span><span class="va">x_1_2</span> <span class="co"># bcde</span></span></code></pre></div>
+<pre><code><span><span class="co">##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]</span></span>
+<span><span class="co">## [1,]    0    1    0    0    0    0    0    0    0</span></span>
+<span><span class="co">## [2,]    0    0    1    0    0    0    0    0    0</span></span>
+<span><span class="co">## [3,]    0    0    0    1    0    0    0    0    0</span></span>
+<span><span class="co">## [4,]    0    0    0    0    1    0    0    0    0</span></span></code></pre>
+<div class="sourceCode" id="cb106"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">x_1_3</span> <span class="op">&lt;-</span> <span class="va">x</span><span class="op">[[</span><span class="fl">3</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>, , <span class="op">]</span></span>
+<span><span class="va">x_1_3</span> <span class="co"># cdef</span></span></code></pre></div>
+<pre><code><span><span class="co">##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]</span></span>
+<span><span class="co">## [1,]    0    0    1    0    0    0    0    0    0</span></span>
+<span><span class="co">## [2,]    0    0    0    1    0    0    0    0    0</span></span>
+<span><span class="co">## [3,]    0    0    0    0    1    0    0    0    0</span></span>
+<span><span class="co">## [4,]    0    0    0    0    0    1    0    0    0</span></span></code></pre>
+<div class="sourceCode" id="cb108"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">x_2_1</span> <span class="op">&lt;-</span> <span class="va">x</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">2</span>, , <span class="op">]</span></span>
+<span><span class="va">x_2_1</span> <span class="co"># aaba</span></span></code></pre></div>
+<pre><code><span><span class="co">##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]</span></span>
+<span><span class="co">## [1,]    1    0    0    0    0    0    0    0    0</span></span>
+<span><span class="co">## [2,]    1    0    0    0    0    0    0    0    0</span></span>
+<span><span class="co">## [3,]    0    1    0    0    0    0    0    0    0</span></span>
+<span><span class="co">## [4,]    1    0    0    0    0    0    0    0    0</span></span></code></pre>
+<div class="sourceCode" id="cb110"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">x_2_2</span> <span class="op">&lt;-</span> <span class="va">x</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">2</span>, , <span class="op">]</span></span>
+<span><span class="va">x_2_2</span> <span class="co"># abaa</span></span></code></pre></div>
+<pre><code><span><span class="co">##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]</span></span>
+<span><span class="co">## [1,]    1    0    0    0    0    0    0    0    0</span></span>
+<span><span class="co">## [2,]    0    1    0    0    0    0    0    0    0</span></span>
+<span><span class="co">## [3,]    1    0    0    0    0    0    0    0    0</span></span>
+<span><span class="co">## [4,]    1    0    0    0    0    0    0    0    0</span></span></code></pre>
+<div class="sourceCode" id="cb112"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">x_2_3</span> <span class="op">&lt;-</span> <span class="va">x</span><span class="op">[[</span><span class="fl">3</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">2</span>, , <span class="op">]</span></span>
+<span><span class="va">x_2_3</span> <span class="co"># baac</span></span></code></pre></div>
+<pre><code><span><span class="co">##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]</span></span>
+<span><span class="co">## [1,]    0    1    0    0    0    0    0    0    0</span></span>
+<span><span class="co">## [2,]    1    0    0    0    0    0    0    0    0</span></span>
+<span><span class="co">## [3,]    1    0    0    0    0    0    0    0    0</span></span>
+<span><span class="co">## [4,]    0    0    1    0    0    0    0    0    0</span></span></code></pre>
+<div class="sourceCode" id="cb114"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"label_1"</span>, <span class="st">"label_2"</span><span class="op">)</span></span>
+<span><span class="va">y</span> </span></code></pre></div>
+<pre><code><span><span class="co">##      label_1 label_2</span></span>
+<span><span class="co">## [1,]       1       0</span></span>
+<span><span class="co">## [2,]       0       1</span></span></code></pre>
+<div class="sourceCode" id="cb116"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="co"># concat </span></span>
+<span><span class="va">set_learning</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/list.html" class="external-link">list</a></span><span class="op">(</span>reshape_mode <span class="op">=</span> <span class="st">"concat"</span>,</span>
+<span>                     maxlen <span class="op">=</span> <span class="fl">4</span>,</span>
+<span>                     samples_per_target <span class="op">=</span> <span class="fl">3</span><span class="op">)</span></span>
+<span></span>
+<span><span class="va">gen</span> <span class="op">&lt;-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="va">file_path</span>, <span class="va">file_path_2</span><span class="op">)</span>, <span class="co"># path has length 2 =&gt; 2 classes</span></span>
+<span>                     train_type <span class="op">=</span> <span class="st">"label_folder"</span>,</span>
+<span>                     batch_size <span class="op">=</span> <span class="fl">2</span>,</span>
+<span>                     maxlen <span class="op">=</span> <span class="fl">4</span>,</span>
+<span>                     step <span class="op">=</span> <span class="fl">2</span>, </span>
+<span>                     vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span>
+<span>                     set_learning <span class="op">=</span> <span class="va">set_learning</span><span class="op">)</span></span>
+<span></span>
+<span><span class="va">z</span> <span class="op">&lt;-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
+<span><span class="va">x</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span></span>
+<span><span class="va">y</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/dim.html" class="external-link">dim</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> </span></code></pre></div>
+<pre><code><span><span class="co">## [1]  2 12  9</span></span></code></pre>
+<div class="sourceCode" id="cb118"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">x_1</span> <span class="op">&lt;-</span> <span class="va">x</span><span class="op">[</span><span class="fl">1</span>, , <span class="op">]</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x_1</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="va">vocabulary</span></span>
+<span><span class="va">x_1</span> <span class="co"># abcd | cdef | efgh</span></span></code></pre></div>
+<pre><code><span><span class="co">##       a b c d e f g h i</span></span>
+<span><span class="co">##  [1,] 1 0 0 0 0 0 0 0 0</span></span>
+<span><span class="co">##  [2,] 0 1 0 0 0 0 0 0 0</span></span>
+<span><span class="co">##  [3,] 0 0 1 0 0 0 0 0 0</span></span>
+<span><span class="co">##  [4,] 0 0 0 1 0 0 0 0 0</span></span>
+<span><span class="co">##  [5,] 0 0 1 0 0 0 0 0 0</span></span>
+<span><span class="co">##  [6,] 0 0 0 1 0 0 0 0 0</span></span>
+<span><span class="co">##  [7,] 0 0 0 0 1 0 0 0 0</span></span>
+<span><span class="co">##  [8,] 0 0 0 0 0 1 0 0 0</span></span>
+<span><span class="co">##  [9,] 0 0 0 0 1 0 0 0 0</span></span>
+<span><span class="co">## [10,] 0 0 0 0 0 1 0 0 0</span></span>
+<span><span class="co">## [11,] 0 0 0 0 0 0 1 0 0</span></span>
+<span><span class="co">## [12,] 0 0 0 0 0 0 0 1 0</span></span></code></pre>
+<div class="sourceCode" id="cb120"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">x_2</span> <span class="op">&lt;-</span> <span class="va">x</span><span class="op">[</span><span class="fl">2</span>, , <span class="op">]</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x_2</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="va">vocabulary</span></span>
+<span><span class="va">x_2</span> <span class="co"># aaba | baac | acaa</span></span></code></pre></div>
+<pre><code><span><span class="co">##       a b c d e f g h i</span></span>
+<span><span class="co">##  [1,] 1 0 0 0 0 0 0 0 0</span></span>
+<span><span class="co">##  [2,] 1 0 0 0 0 0 0 0 0</span></span>
+<span><span class="co">##  [3,] 0 1 0 0 0 0 0 0 0</span></span>
+<span><span class="co">##  [4,] 1 0 0 0 0 0 0 0 0</span></span>
+<span><span class="co">##  [5,] 0 1 0 0 0 0 0 0 0</span></span>
+<span><span class="co">##  [6,] 1 0 0 0 0 0 0 0 0</span></span>
+<span><span class="co">##  [7,] 1 0 0 0 0 0 0 0 0</span></span>
+<span><span class="co">##  [8,] 0 0 1 0 0 0 0 0 0</span></span>
+<span><span class="co">##  [9,] 1 0 0 0 0 0 0 0 0</span></span>
+<span><span class="co">## [10,] 0 0 1 0 0 0 0 0 0</span></span>
+<span><span class="co">## [11,] 1 0 0 0 0 0 0 0 0</span></span>
+<span><span class="co">## [12,] 1 0 0 0 0 0 0 0 0</span></span></code></pre>
+<div class="sourceCode" id="cb122"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"label_1"</span>, <span class="st">"label_2"</span><span class="op">)</span></span>
+<span><span class="va">y</span> </span></code></pre></div>
+<pre><code><span><span class="co">##      label_1 label_2</span></span>
+<span><span class="co">## [1,]       1       0</span></span>
+<span><span class="co">## [2,]       0       1</span></span></code></pre>
+</div>
+<div class="section level3">
+<h3 id="use_quality_score">use_quality_score<a class="anchor" aria-label="anchor" href="#use_quality_score"></a>
+</h3>
+<p>If <code>TRUE</code>, use the quality scores of the fastq file to
+encode the sequence instead of one-hot encoding.</p>
+<div class="sourceCode" id="cb124"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">df</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span>Sequence <span class="op">=</span> <span class="st">"ACAGAT"</span>, Header <span class="op">=</span> <span class="st">"header_1"</span>, Quality <span class="op">=</span> <span class="st">"!#*=?I"</span><span class="op">)</span></span>
+<span><span class="va">fastq_path</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span>fileext <span class="op">=</span> <span class="st">".fastq"</span><span class="op">)</span></span>
+<span><span class="va">fastq_file</span> <span class="op">&lt;-</span> <span class="fu">microseq</span><span class="fu">::</span><span class="fu"><a href="https://rdrr.io/pkg/microseq/man/readFastq.html" class="external-link">writeFastq</a></span><span class="op">(</span><span class="va">df</span>, <span class="va">fastq_path</span><span class="op">)</span></span>
+<span><span class="va">gen</span> <span class="op">&lt;-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">fastq_path</span>,</span>
+<span>                     train_type <span class="op">=</span> <span class="st">"lm"</span>,</span>
+<span>                     batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
+<span>                     maxlen <span class="op">=</span> <span class="fl">5</span>,</span>
+<span>                     format <span class="op">=</span> <span class="st">"fastq"</span>,</span>
+<span>                     vocabulary <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"A"</span>, <span class="st">"C"</span>, <span class="st">"G"</span>, <span class="st">"T"</span><span class="op">)</span>,</span>
+<span>                     use_quality_score <span class="op">=</span> <span class="cn">TRUE</span>,</span>
+<span>                     output_format <span class="op">=</span> <span class="st">"target_right"</span><span class="op">)</span></span>
+<span></span>
+<span><span class="va">z</span> <span class="op">&lt;-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
+<span><span class="va">x</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>, , <span class="op">]</span></span>
+<span><span class="va">y</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span></span>
+<span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"A"</span>, <span class="st">"C"</span>, <span class="st">"G"</span>, <span class="st">"T"</span><span class="op">)</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"A"</span>, <span class="st">"C"</span>, <span class="st">"G"</span>, <span class="st">"T"</span><span class="op">)</span></span>
+<span><span class="va">x</span> <span class="co"># ACAGA</span></span></code></pre></div>
+<pre><code><span><span class="co">##                 A            C            G            T</span></span>
+<span><span class="co">## [1,] 0.0000000000 0.3333333333 0.3333333333 0.3333333333</span></span>
+<span><span class="co">## [2,] 0.2103191148 0.3690426555 0.2103191148 0.2103191148</span></span>
+<span><span class="co">## [3,] 0.8741074588 0.0419641804 0.0419641804 0.0419641804</span></span>
+<span><span class="co">## [4,] 0.0005282977 0.0005282977 0.9984151068 0.0005282977</span></span>
+<span><span class="co">## [5,] 0.9990000000 0.0003333333 0.0003333333 0.0003333333</span></span></code></pre>
+<div class="sourceCode" id="cb126"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">y</span> <span class="co"># T</span></span></code></pre></div>
+<pre><code><span><span class="co">##                 A            C            G      T</span></span>
+<span><span class="co">## [1,] 3.333333e-05 3.333333e-05 3.333333e-05 0.9999</span></span></code></pre>
+</div>
+<div class="section level3">
+<h3 id="use_coverage">use_coverage<a class="anchor" aria-label="anchor" href="#use_coverage"></a>
+</h3>
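+<p>Integer or <code>NULL</code>. If not <code>NULL</code>, use coverage
+as encoding rather than one-hot encoding. Coverage information must be
+contained in the fasta header: there must be a string “cov_n” in the header,
+where n is some integer. In the example below, the header coverage of 8
+is divided by <code>use_coverage = 25</code>, which gives the encoded
+value 0.32.</p>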
+<div class="sourceCode" id="cb128"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">df</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span>Sequence <span class="op">=</span> <span class="st">"ACAGAT"</span>, Header <span class="op">=</span> <span class="st">"header_1_cov_8"</span><span class="op">)</span></span>
+<span><span class="va">fasta_path</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span>fileext <span class="op">=</span> <span class="st">".fasta"</span><span class="op">)</span></span>
+<span><span class="va">fasta_file</span> <span class="op">&lt;-</span> <span class="fu">microseq</span><span class="fu">::</span><span class="fu"><a href="https://rdrr.io/pkg/microseq/man/readFasta.html" class="external-link">writeFasta</a></span><span class="op">(</span><span class="va">df</span>, <span class="va">fasta_path</span><span class="op">)</span></span>
+<span><span class="va">gen</span> <span class="op">&lt;-</span>  <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">fasta_path</span>,</span>
+<span>                      train_type <span class="op">=</span> <span class="st">"lm"</span>,</span>
+<span>                      batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
+<span>                      maxlen <span class="op">=</span> <span class="fl">5</span>,</span>
+<span>                      vocabulary <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"A"</span>, <span class="st">"C"</span>, <span class="st">"G"</span>, <span class="st">"T"</span><span class="op">)</span>,</span>
+<span>                      use_coverage <span class="op">=</span> <span class="fl">25</span>,</span>
+<span>                      output_format <span class="op">=</span> <span class="st">"target_right"</span><span class="op">)</span></span>
+<span></span>
+<span><span class="va">z</span> <span class="op">&lt;-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
+<span><span class="va">x</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>, , <span class="op">]</span></span>
+<span><span class="va">y</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span></span>
+<span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"A"</span>, <span class="st">"C"</span>, <span class="st">"G"</span>, <span class="st">"T"</span><span class="op">)</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"A"</span>, <span class="st">"C"</span>, <span class="st">"G"</span>, <span class="st">"T"</span><span class="op">)</span></span>
+<span><span class="va">x</span> <span class="co"># ACAGA; 0.32 = 8/25</span></span></code></pre></div>
+<pre><code><span><span class="co">##         A    C    G T</span></span>
+<span><span class="co">## [1,] 0.32 0.00 0.00 0</span></span>
+<span><span class="co">## [2,] 0.00 0.32 0.00 0</span></span>
+<span><span class="co">## [3,] 0.32 0.00 0.00 0</span></span>
+<span><span class="co">## [4,] 0.00 0.00 0.32 0</span></span>
+<span><span class="co">## [5,] 0.32 0.00 0.00 0</span></span></code></pre>
+<div class="sourceCode" id="cb130"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">y</span> <span class="co"># T</span></span></code></pre></div>
+<pre><code><span><span class="co">##      A C G    T</span></span>
+<span><span class="co">## [1,] 0 0 0 0.32</span></span></code></pre>
+</div>
+<div class="section level3">
+<h3 id="added_label_path">added_label_path<a class="anchor" aria-label="anchor" href="#added_label_path"></a>
+</h3>
+<p>It is possible to feed a network additional information associated with
+a sequence. This information needs to be in a csv file. If all sequences
+in one file share the same label, the csv file should have a column
+named “file” containing the file names; the remaining columns hold the
+additional input values.</p>
+<p>We may add some additional input to our dummy data:</p>
+<div class="sourceCode" id="cb132"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">file</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="fu"><a href="https://rdrr.io/r/base/basename.html" class="external-link">basename</a></span><span class="op">(</span><span class="va">file_path</span><span class="op">)</span>, <span class="st">"some_file_name.fasta"</span><span class="op">)</span></span>
+<span><span class="va">df</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span>file <span class="op">=</span> <span class="va">file</span>,</span>
+<span>                 label_1 <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="fl">0</span>, <span class="fl">1</span><span class="op">)</span>, label_2 <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="fl">1</span>, <span class="fl">0</span><span class="op">)</span>, label_3 <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="fl">1</span>, <span class="fl">0</span><span class="op">)</span><span class="op">)</span></span>
+<span><span class="va">df</span></span></code></pre></div>
+<pre><code><span><span class="co">##                   file label_1 label_2 label_3</span></span>
+<span><span class="co">## 1              a.fasta       0       1       1</span></span>
+<span><span class="co">## 2 some_file_name.fasta       1       0       0</span></span></code></pre>
+<div class="sourceCode" id="cb134"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="fu"><a href="https://rdrr.io/r/utils/write.table.html" class="external-link">write.csv</a></span><span class="op">(</span>x <span class="op">=</span> <span class="va">df</span>, file <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/file.path.html" class="external-link">file.path</a></span><span class="op">(</span><span class="va">dir_path</span>, <span class="st">"add_input.csv"</span><span class="op">)</span>, row.names <span class="op">=</span> <span class="cn">FALSE</span><span class="op">)</span></span></code></pre></div>
+<p>If we add the path to the csv file, the generator will map additional
+input to sequences:</p>
+<div class="sourceCode" id="cb135"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">gen</span> <span class="op">&lt;-</span>  <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">dir_path</span>,</span>
+<span>                      train_type <span class="op">=</span> <span class="st">"lm"</span>, </span>
+<span>                      batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
+<span>                      maxlen <span class="op">=</span> <span class="fl">5</span>,</span>
+<span>                      output_format <span class="op">=</span> <span class="st">"target_right"</span>,</span>
+<span>                      vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span>
+<span>                      added_label_path <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/file.path.html" class="external-link">file.path</a></span><span class="op">(</span><span class="va">dir_path</span>, <span class="st">"add_input.csv"</span><span class="op">)</span>,</span>
+<span>                      add_input_as_seq <span class="op">=</span> <span class="cn">FALSE</span><span class="op">)</span>  <span class="co"># don't treat added input as sequence</span></span>
+<span>                      </span>
+<span><span class="va">z</span> <span class="op">&lt;-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
+<span><span class="va">added_label_input</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span></span>
+<span><span class="va">added_label_input</span></span></code></pre></div>
+<pre><code><span><span class="co">##      [,1] [,2] [,3]</span></span>
+<span><span class="co">## [1,]    0    1    1</span></span></code></pre>
+<div class="sourceCode" id="cb137"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">x</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span></span>
+<span><span class="va">x</span><span class="op">[</span><span class="fl">1</span>, , <span class="op">]</span></span></code></pre></div>
+<pre><code><span><span class="co">##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]</span></span>
+<span><span class="co">## [1,]    1    0    0    0    0    0    0    0    0</span></span>
+<span><span class="co">## [2,]    0    1    0    0    0    0    0    0    0</span></span>
+<span><span class="co">## [3,]    0    0    1    0    0    0    0    0    0</span></span>
+<span><span class="co">## [4,]    0    0    0    1    0    0    0    0    0</span></span>
+<span><span class="co">## [5,]    0    0    0    0    1    0    0    0    0</span></span></code></pre>
+<div class="sourceCode" id="cb139"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">y</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span> </span>
+<span><span class="va">y</span></span></code></pre></div>
+<pre><code><span><span class="co">##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]</span></span>
+<span><span class="co">## [1,]    0    0    0    0    0    1    0    0    0</span></span></code></pre>
+<p>If we want to train a network with additional labels, we have to add
+an additional input layer.</p>
+<div class="sourceCode" id="cb141"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">model</span> <span class="op">&lt;-</span> <span class="fu"><a href="../reference/create_model_lstm_cnn.html">create_model_lstm_cnn</a></span><span class="op">(</span></span>
+<span>  maxlen <span class="op">=</span> <span class="fl">5</span>,</span>
+<span>  layer_lstm <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="fl">8</span>, <span class="fl">8</span><span class="op">)</span>,</span>
+<span>  layer_dense <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="fl">4</span><span class="op">)</span>,</span>
+<span>  label_input <span class="op">=</span> <span class="fl">3</span> <span class="co"># additional input vector has length 3</span></span>
+<span><span class="op">)</span></span></code></pre></div>
+<pre><code><span><span class="co">## Model: "model"</span></span>
+<span><span class="co">## __________________________________________________________________________________________________</span></span>
+<span><span class="co">##  Layer (type)                Output Shape                 Param #   Connected to                  </span></span>
+<span><span class="co">## ==================================================================================================</span></span>
+<span><span class="co">##  input_1 (InputLayer)        [(None, 5, 4)]               0         []                            </span></span>
+<span><span class="co">##                                                                                                   </span></span>
+<span><span class="co">##  lstm (LSTM)                 (None, 5, 8)                 416       ['input_1[0][0]']             </span></span>
+<span><span class="co">##                                                                                                   </span></span>
+<span><span class="co">##  input_2 (InputLayer)        [(None, 3)]                  0         []                            </span></span>
+<span><span class="co">##                                                                                                   </span></span>
+<span><span class="co">##  lstm_1 (LSTM)               (None, 8)                    544       ['lstm[0][0]']                </span></span>
+<span><span class="co">##                                                                                                   </span></span>
+<span><span class="co">##  concatenate (Concatenate)   (None, 11)                   0         ['input_2[0][0]',             </span></span>
+<span><span class="co">##                                                                      'lstm_1[0][0]']              </span></span>
+<span><span class="co">##                                                                                                   </span></span>
+<span><span class="co">##  dense (Dense)               (None, 4)                    48        ['concatenate[0][0]']         </span></span>
+<span><span class="co">##                                                                                                   </span></span>
+<span><span class="co">## ==================================================================================================</span></span>
+<span><span class="co">## Total params: 1008 (3.94 KB)</span></span>
+<span><span class="co">## Trainable params: 1008 (3.94 KB)</span></span>
+<span><span class="co">## Non-trainable params: 0 (0.00 Byte)</span></span>
+<span><span class="co">## __________________________________________________________________________________________________</span></span></code></pre>
+<div class="sourceCode" id="cb143"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="co"># train_model(train_type = "lm", </span></span>
+<span><span class="co">#             model = model,</span></span>
+<span><span class="co">#             path = file.path(dir_path, "train_files_1"),</span></span>
+<span><span class="co">#             path_val = file.path(dir_path, "validation_files_1"),</span></span>
+<span><span class="co">#             added_label_path = file.path(dir_path, "add_input.csv"),</span></span>
+<span><span class="co">#             steps_per_epoch = 5,</span></span>
+<span><span class="co">#             batch_size = 8,</span></span>
+<span><span class="co">#             epochs = 2)</span></span></code></pre></div>
+</div>
+<div class="section level3">
+<h3 id="return_int">return_int<a class="anchor" aria-label="anchor" href="#return_int"></a>
+</h3>
+<p>Whether to return integer encoding rather than one-hot encoding.</p>
+<div class="sourceCode" id="cb144"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">df</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span>Sequence <span class="op">=</span> <span class="st">"ATCGC"</span>, Header <span class="op">=</span> <span class="st">"seq_1"</span><span class="op">)</span></span>
+<span><span class="va">fasta_path</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span>fileext <span class="op">=</span> <span class="st">".fasta"</span><span class="op">)</span></span>
+<span><span class="va">fasta_file</span> <span class="op">&lt;-</span> <span class="fu">microseq</span><span class="fu">::</span><span class="fu"><a href="https://rdrr.io/pkg/microseq/man/readFasta.html" class="external-link">writeFasta</a></span><span class="op">(</span><span class="va">df</span>, <span class="va">fasta_path</span><span class="op">)</span></span>
+<span><span class="va">gen</span> <span class="op">&lt;-</span>  <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">fasta_path</span>,</span>
+<span>                      train_type <span class="op">=</span> <span class="st">"lm"</span>,</span>
+<span>                      batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
+<span>                      return_int <span class="op">=</span> <span class="cn">TRUE</span>,</span>
+<span>                      padding <span class="op">=</span> <span class="cn">TRUE</span>,</span>
+<span>                      maxlen <span class="op">=</span> <span class="fl">8</span>,</span>
+<span>                      vocabulary <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"A"</span>, <span class="st">"C"</span>, <span class="st">"G"</span>, <span class="st">"T"</span><span class="op">)</span>,</span>
+<span>                      output_format <span class="op">=</span> <span class="st">"target_right"</span><span class="op">)</span></span>
+<span></span>
+<span><span class="va">z</span> <span class="op">&lt;-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
+<span><span class="va">x</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span></span>
+<span><span class="va">y</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"pad"</span>, <span class="st">"pad"</span>, <span class="st">"pad"</span>, <span class="st">"pad"</span>, <span class="st">"A"</span>, <span class="st">"T"</span>, <span class="st">"C"</span>, <span class="st">"G"</span><span class="op">)</span></span>
+<span><span class="va">x</span></span></code></pre></div>
+<pre><code><span><span class="co">##      pad pad pad pad A T C G</span></span>
+<span><span class="co">## [1,]   0   0   0   0 1 4 2 3</span></span></code></pre>
+<div class="sourceCode" id="cb146"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="st">"C"</span></span>
+<span><span class="va">y</span></span></code></pre></div>
+<pre><code><span><span class="co">##      C</span></span>
+<span><span class="co">## [1,] 2</span></span></code></pre>
+<p><code>return_int</code> can also be combined with n-gram encoding:</p>
+<div class="sourceCode" id="cb148"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">df</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span>Sequence <span class="op">=</span> <span class="st">"AAACCCTTT"</span>, Header <span class="op">=</span> <span class="st">"seq_1"</span><span class="op">)</span></span>
+<span><span class="va">fasta_path</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span>fileext <span class="op">=</span> <span class="st">".fasta"</span><span class="op">)</span></span>
+<span><span class="va">fasta_file</span> <span class="op">&lt;-</span> <span class="fu">microseq</span><span class="fu">::</span><span class="fu"><a href="https://rdrr.io/pkg/microseq/man/readFasta.html" class="external-link">writeFasta</a></span><span class="op">(</span><span class="va">df</span>, <span class="va">fasta_path</span><span class="op">)</span></span>
+<span><span class="va">gen</span> <span class="op">&lt;-</span>  <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">fasta_path</span>,</span>
+<span>                      train_type <span class="op">=</span> <span class="st">"lm"</span>,</span>
+<span>                      batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
+<span>                      n_gram <span class="op">=</span> <span class="fl">3</span>,</span>
+<span>                      n_gram_stride <span class="op">=</span> <span class="fl">3</span>,</span>
+<span>                      return_int <span class="op">=</span> <span class="cn">TRUE</span>,</span>
+<span>                      maxlen <span class="op">=</span> <span class="fl">6</span>,</span>
+<span>                      target_len <span class="op">=</span> <span class="fl">3</span>,</span>
+<span>                      vocabulary <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"A"</span>, <span class="st">"C"</span>, <span class="st">"G"</span>, <span class="st">"T"</span><span class="op">)</span>,</span>
+<span>                      output_format <span class="op">=</span> <span class="st">"target_right"</span><span class="op">)</span></span>
+<span></span>
+<span><span class="va">z</span> <span class="op">&lt;-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
+<span><span class="va">x</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span></span>
+<span><span class="va">y</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"AAA"</span>, <span class="st">"CCC"</span><span class="op">)</span></span>
+<span><span class="va">x</span></span></code></pre></div>
+<pre><code><span><span class="co">##      AAA CCC</span></span>
+<span><span class="co">## [1,]   1  22</span></span></code></pre>
+<div class="sourceCode" id="cb150"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op">&lt;-</span> <span class="st">"TTT"</span></span>
+<span><span class="va">y</span></span></code></pre></div>
+<pre><code><span><span class="co">##      TTT</span></span>
+<span><span class="co">## [1,]  64</span></span></code></pre>
+</div>
+<div class="section level3">
+<h3 id="reshape_xy">reshape_xy<a class="anchor" aria-label="anchor" href="#reshape_xy"></a>
+</h3>
+<p>Apply user-defined functions to the x and y outputs of a generator call.</p>
+<div class="sourceCode" id="cb152"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">df</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span>Sequence <span class="op">=</span> <span class="st">"AAAATTTT"</span>, Header <span class="op">=</span> <span class="st">"header_1"</span><span class="op">)</span></span>
+<span><span class="va">fasta_path</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span>fileext <span class="op">=</span> <span class="st">".fasta"</span><span class="op">)</span></span>
+<span><span class="va">fasta_file</span> <span class="op">&lt;-</span> <span class="fu">microseq</span><span class="fu">::</span><span class="fu"><a href="https://rdrr.io/pkg/microseq/man/readFasta.html" class="external-link">writeFasta</a></span><span class="op">(</span><span class="va">df</span>, <span class="va">fasta_path</span><span class="op">)</span></span>
+<span><span class="va">fx</span> <span class="op">&lt;-</span> <span class="kw">function</span><span class="op">(</span><span class="va">x</span> <span class="op">=</span> <span class="cn">NULL</span>, <span class="va">y</span> <span class="op">=</span> <span class="cn">NULL</span><span class="op">)</span> <span class="op">{</span></span>
+<span>  <span class="kw"><a href="https://rdrr.io/r/base/function.html" class="external-link">return</a></span><span class="op">(</span><span class="va">x</span> <span class="op">-</span> <span class="fl">1</span><span class="op">)</span></span>
+<span><span class="op">}</span></span>
+<span><span class="va">fy</span> <span class="op">&lt;-</span> <span class="kw">function</span><span class="op">(</span><span class="va">x</span> <span class="op">=</span> <span class="cn">NULL</span>, <span class="va">y</span> <span class="op">=</span> <span class="cn">NULL</span><span class="op">)</span> <span class="op">{</span></span>
+<span>  <span class="kw"><a href="https://rdrr.io/r/base/function.html" class="external-link">return</a></span><span class="op">(</span><span class="fu"><a href="https://rdrr.io/r/base/Log.html" class="external-link">exp</a></span><span class="op">(</span><span class="va">y</span> <span class="op">*</span> <span class="fl">5</span><span class="op">)</span><span class="op">)</span></span>
+<span><span class="op">}</span></span>
+<span></span>
+<span><span class="va">gen</span> <span class="op">&lt;-</span>  <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">fasta_path</span>,</span>
+<span>                      reshape_xy <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/list.html" class="external-link">list</a></span><span class="op">(</span>x <span class="op">=</span> <span class="va">fx</span>, y <span class="op">=</span> <span class="va">fy</span><span class="op">)</span>,</span>
+<span>                      train_type <span class="op">=</span> <span class="st">"label_folder"</span>,</span>
+<span>                      batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
+<span>                      maxlen <span class="op">=</span> <span class="fl">8</span><span class="op">)</span></span>
+<span></span>
+<span><span class="va">z</span> <span class="op">&lt;-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
+<span><span class="va">x</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span></span>
+<span><span class="va">x</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span></span></code></pre></div>
+<pre><code><span><span class="co">##      [,1] [,2] [,3] [,4]</span></span>
+<span><span class="co">## [1,]    0   -1   -1   -1</span></span>
+<span><span class="co">## [2,]    0   -1   -1   -1</span></span>
+<span><span class="co">## [3,]    0   -1   -1   -1</span></span>
+<span><span class="co">## [4,]    0   -1   -1   -1</span></span>
+<span><span class="co">## [5,]   -1   -1   -1    0</span></span>
+<span><span class="co">## [6,]   -1   -1   -1    0</span></span>
+<span><span class="co">## [7,]   -1   -1   -1    0</span></span>
+<span><span class="co">## [8,]   -1   -1   -1    0</span></span></code></pre>
+<div class="sourceCode" id="cb154"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">y</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span></span>
+<span><span class="va">y</span></span></code></pre></div>
+<pre><code><span><span class="co">##          [,1]</span></span>
+<span><span class="co">## [1,] 148.4132</span></span></code></pre>
+</div>
+<div class="section level3">
+<h3 id="masked_lm">masked_lm<a class="anchor" aria-label="anchor" href="#masked_lm"></a>
+</h3>
+<p>Masks some parts of the input sequence. This can be used for training
+BERT-like models.</p>
+<div class="sourceCode" id="cb156"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">nt_seq</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/rep.html" class="external-link">rep</a></span><span class="op">(</span><span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"A"</span>, <span class="st">"C"</span>, <span class="st">"G"</span>, <span class="st">"T"</span><span class="op">)</span>, each <span class="op">=</span> <span class="fl">25</span><span class="op">)</span> <span class="op"><a href="../reference/pipe.html">%&gt;%</a></span> <span class="fu"><a href="https://rdrr.io/r/base/paste.html" class="external-link">paste</a></span><span class="op">(</span>collapse <span class="op">=</span> <span class="st">""</span><span class="op">)</span></span>
+<span><span class="va">df</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span>Sequence <span class="op">=</span> <span class="va">nt_seq</span>, Header <span class="op">=</span> <span class="st">"seq_1"</span><span class="op">)</span></span>
+<span><span class="va">fasta_path</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span>fileext <span class="op">=</span> <span class="st">".fasta"</span><span class="op">)</span></span>
+<span><span class="va">fasta_file</span> <span class="op">&lt;-</span> <span class="fu">microseq</span><span class="fu">::</span><span class="fu"><a href="https://rdrr.io/pkg/microseq/man/readFasta.html" class="external-link">writeFasta</a></span><span class="op">(</span><span class="va">df</span>, <span class="va">fasta_path</span><span class="op">)</span></span>
+<span><span class="va">masked_lm</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/list.html" class="external-link">list</a></span><span class="op">(</span>mask_rate <span class="op">=</span> <span class="fl">0.10</span>, <span class="co"># replace 10% of input with special mask token</span></span>
+<span>                  random_rate <span class="op">=</span> <span class="fl">0.025</span>, <span class="co"># set 2.5% of input to random value</span></span>
+<span>                  identity_rate <span class="op">=</span> <span class="fl">0.05</span>, <span class="co"># leave 5% unchanged</span></span>
+<span>                  include_sw <span class="op">=</span> <span class="cn">TRUE</span><span class="op">)</span> <span class="co"># 0,1 matrix showing where masking was applied</span></span>
+<span><span class="va">gen</span> <span class="op">&lt;-</span>  <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">fasta_path</span>,</span>
+<span>                      train_type <span class="op">=</span> <span class="st">"masked_lm"</span>,</span>
+<span>                      masked_lm <span class="op">=</span> <span class="va">masked_lm</span>,</span>
+<span>                      batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
+<span>                      n_gram <span class="op">=</span> <span class="fl">1</span>,</span>
+<span>                      n_gram_stride <span class="op">=</span> <span class="fl">1</span>,</span>
+<span>                      return_int <span class="op">=</span> <span class="cn">TRUE</span>,</span>
+<span>                      maxlen <span class="op">=</span> <span class="fl">100</span>,</span>
+<span>                      vocabulary <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"A"</span>, <span class="st">"C"</span>, <span class="st">"G"</span>, <span class="st">"T"</span><span class="op">)</span><span class="op">)</span></span>
+<span></span>
+<span><span class="va">z</span> <span class="op">&lt;-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
+<span><span class="va">x</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span></span>
+<span><span class="va">y</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span></span>
+<span><span class="va">sw</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">3</span><span class="op">]</span><span class="op">]</span></span>
+<span><span class="va">df</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span>x <span class="op">=</span> <span class="va">x</span><span class="op">[</span><span class="fl">1</span>, <span class="op">]</span>, y <span class="op">=</span> <span class="va">y</span><span class="op">[</span><span class="fl">1</span>, <span class="op">]</span>, sw <span class="op">=</span> <span class="va">sw</span><span class="op">[</span><span class="fl">1</span>, <span class="op">]</span><span class="op">)</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/utils/head.html" class="external-link">head</a></span><span class="op">(</span><span class="va">df</span><span class="op">)</span></span></code></pre></div>
+<pre><code><span><span class="co">##   x y sw</span></span>
+<span><span class="co">## 1 5 1  1</span></span>
+<span><span class="co">## 2 1 1  0</span></span>
+<span><span class="co">## 3 1 1  0</span></span>
+<span><span class="co">## 4 1 1  0</span></span>
+<span><span class="co">## 5 1 1  0</span></span>
+<span><span class="co">## 6 1 1  0</span></span></code></pre>
+<p>Whenever the sw (sample weight) column is 0, the x and y columns are
+identical. Let’s look at the rows where sw is 1:</p>
+<div class="sourceCode" id="cb158"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">df</span> <span class="op"><a href="../reference/pipe.html">%&gt;%</a></span> <span class="fu">dplyr</span><span class="fu">::</span><span class="fu"><a href="https://dplyr.tidyverse.org/reference/filter.html" class="external-link">filter</a></span><span class="op">(</span><span class="va">sw</span> <span class="op">==</span> <span class="fl">1</span><span class="op">)</span></span></code></pre></div>
+<pre><code><span><span class="co">##    x y sw</span></span>
+<span><span class="co">## 1  5 1  1</span></span>
+<span><span class="co">## 2  1 1  1</span></span>
+<span><span class="co">## 3  1 1  1</span></span>
+<span><span class="co">## 4  5 1  1</span></span>
+<span><span class="co">## 5  5 2  1</span></span>
+<span><span class="co">## 6  5 2  1</span></span>
+<span><span class="co">## 7  5 2  1</span></span>
+<span><span class="co">## 8  3 3  1</span></span>
+<span><span class="co">## 9  2 3  1</span></span>
+<span><span class="co">## 10 3 3  1</span></span>
+<span><span class="co">## 11 5 3  1</span></span>
+<span><span class="co">## 12 5 3  1</span></span>
+<span><span class="co">## 13 4 4  1</span></span>
+<span><span class="co">## 14 5 4  1</span></span>
+<span><span class="co">## 15 4 4  1</span></span>
+<span><span class="co">## 16 5 4  1</span></span>
+<span><span class="co">## 17 5 4  1</span></span>
+<span><span class="co">## 18 4 4  1</span></span></code></pre>
+<p>Here 5 is the mask token; its value is always the size of the vocabulary +
+1.</p>
+<div class="sourceCode" id="cb160"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">df</span> <span class="op"><a href="../reference/pipe.html">%&gt;%</a></span> <span class="fu">dplyr</span><span class="fu">::</span><span class="fu"><a href="https://dplyr.tidyverse.org/reference/filter.html" class="external-link">filter</a></span><span class="op">(</span><span class="va">sw</span> <span class="op">==</span> <span class="fl">1</span> <span class="op">&amp;</span> <span class="va">x</span> <span class="op">==</span> <span class="fl">5</span><span class="op">)</span> <span class="co"># 10% masked part</span></span></code></pre></div>
+<pre><code><span><span class="co">##    x y sw</span></span>
+<span><span class="co">## 1  5 1  1</span></span>
+<span><span class="co">## 2  5 1  1</span></span>
+<span><span class="co">## 3  5 2  1</span></span>
+<span><span class="co">## 4  5 2  1</span></span>
+<span><span class="co">## 5  5 2  1</span></span>
+<span><span class="co">## 6  5 3  1</span></span>
+<span><span class="co">## 7  5 3  1</span></span>
+<span><span class="co">## 8  5 4  1</span></span>
+<span><span class="co">## 9  5 4  1</span></span>
+<span><span class="co">## 10 5 4  1</span></span></code></pre>
+<div class="sourceCode" id="cb162"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">df</span> <span class="op"><a href="../reference/pipe.html">%&gt;%</a></span> <span class="fu">dplyr</span><span class="fu">::</span><span class="fu"><a href="https://dplyr.tidyverse.org/reference/filter.html" class="external-link">filter</a></span><span class="op">(</span><span class="va">sw</span> <span class="op">==</span> <span class="fl">1</span> <span class="op">&amp;</span> <span class="va">x</span> <span class="op">!=</span> <span class="fl">5</span><span class="op">)</span> <span class="co"># 5% identity part and 2.5% random part (can randomly be the true value)</span></span></code></pre></div>
+<pre><code><span><span class="co">##   x y sw</span></span>
+<span><span class="co">## 1 1 1  1</span></span>
+<span><span class="co">## 2 1 1  1</span></span>
+<span><span class="co">## 3 3 3  1</span></span>
+<span><span class="co">## 4 2 3  1</span></span>
+<span><span class="co">## 5 3 3  1</span></span>
+<span><span class="co">## 6 4 4  1</span></span>
+<span><span class="co">## 7 4 4  1</span></span>
+<span><span class="co">## 8 4 4  1</span></span></code></pre>
+<p>This can also be combined with n-gram encoding and with masking in blocks
+of a fixed length:</p>
+<div class="sourceCode" id="cb164"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">nt_seq</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/rep.html" class="external-link">rep</a></span><span class="op">(</span><span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"A"</span>, <span class="st">"C"</span>, <span class="st">"G"</span>, <span class="st">"T"</span><span class="op">)</span>, each <span class="op">=</span> <span class="fl">25</span><span class="op">)</span> <span class="op"><a href="../reference/pipe.html">%&gt;%</a></span> <span class="fu"><a href="https://rdrr.io/r/base/paste.html" class="external-link">paste</a></span><span class="op">(</span>collapse <span class="op">=</span> <span class="st">""</span><span class="op">)</span></span>
+<span><span class="va">df</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span>Sequence <span class="op">=</span> <span class="va">nt_seq</span>, Header <span class="op">=</span> <span class="st">"seq_1"</span><span class="op">)</span></span>
+<span><span class="va">fasta_path</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span>fileext <span class="op">=</span> <span class="st">".fasta"</span><span class="op">)</span></span>
+<span><span class="va">fasta_file</span> <span class="op">&lt;-</span> <span class="fu">microseq</span><span class="fu">::</span><span class="fu"><a href="https://rdrr.io/pkg/microseq/man/readFasta.html" class="external-link">writeFasta</a></span><span class="op">(</span><span class="va">df</span>, <span class="va">fasta_path</span><span class="op">)</span></span>
+<span><span class="va">masked_lm</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/list.html" class="external-link">list</a></span><span class="op">(</span>mask_rate <span class="op">=</span> <span class="fl">0.10</span>, <span class="co"># replace 10% of input with special mask token</span></span>
+<span>                  random_rate <span class="op">=</span> <span class="fl">0.05</span>, <span class="co"># set 5% of input to random value</span></span>
+<span>                  identity_rate <span class="op">=</span> <span class="fl">0.05</span>, <span class="co"># leave 5% unchanged</span></span>
+<span>                  include_sw <span class="op">=</span> <span class="cn">TRUE</span>, <span class="co"># 0,1 matrix showing where masking was applied</span></span>
+<span>                  block_len <span class="op">=</span> <span class="fl">3</span><span class="op">)</span> <span class="co"># always mask at least 3 tokens in a row </span></span>
+<span><span class="va">gen</span> <span class="op">&lt;-</span>  <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">fasta_path</span>,</span>
+<span>                      train_type <span class="op">=</span> <span class="st">"masked_lm"</span>,</span>
+<span>                      masked_lm <span class="op">=</span> <span class="va">masked_lm</span>,</span>
+<span>                      batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
+<span>                      n_gram <span class="op">=</span> <span class="fl">3</span>,</span>
+<span>                      seed <span class="op">=</span> <span class="fl">12</span>,</span>
+<span>                      n_gram_stride <span class="op">=</span> <span class="fl">1</span>,</span>
+<span>                      return_int <span class="op">=</span> <span class="cn">TRUE</span>,</span>
+<span>                      maxlen <span class="op">=</span> <span class="fl">100</span>,</span>
+<span>                      vocabulary <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"A"</span>, <span class="st">"C"</span>, <span class="st">"G"</span>, <span class="st">"T"</span><span class="op">)</span><span class="op">)</span></span>
+<span></span>
+<span><span class="va">z</span> <span class="op">&lt;-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
+<span><span class="va">x</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span></span>
+<span><span class="va">y</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span></span>
+<span><span class="va">sw</span> <span class="op">&lt;-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">3</span><span class="op">]</span><span class="op">]</span></span>
+<span><span class="va">df</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span>x <span class="op">=</span> <span class="va">x</span><span class="op">[</span><span class="fl">1</span>, <span class="op">]</span>, y <span class="op">=</span> <span class="va">y</span><span class="op">[</span><span class="fl">1</span>, <span class="op">]</span>, sw <span class="op">=</span> <span class="va">sw</span><span class="op">[</span><span class="fl">1</span>, <span class="op">]</span>, position <span class="op">=</span> <span class="fl">1</span><span class="op">:</span><span class="fu"><a href="https://rdrr.io/r/base/nrow.html" class="external-link">ncol</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span><span class="op">)</span></span>
+<span><span class="fu"><a href="https://rdrr.io/r/utils/head.html" class="external-link">head</a></span><span class="op">(</span><span class="va">df</span><span class="op">)</span></span></code></pre></div>
+<pre><code><span><span class="co">##    x y sw position</span></span>
+<span><span class="co">## 1  1 1  0        1</span></span>
+<span><span class="co">## 2  1 1  0        2</span></span>
+<span><span class="co">## 3  1 1  0        3</span></span>
+<span><span class="co">## 4 39 1  1        4</span></span>
+<span><span class="co">## 5 48 1  1        5</span></span>
+<span><span class="co">## 6 13 1  1        6</span></span></code></pre>
+<div class="sourceCode" id="cb166"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="fu"><a href="https://rdrr.io/r/utils/head.html" class="external-link">tail</a></span><span class="op">(</span><span class="va">df</span><span class="op">)</span></span></code></pre></div>
+<pre><code><span><span class="co">##     x  y sw position</span></span>
+<span><span class="co">## 93 65 64  1       93</span></span>
+<span><span class="co">## 94 64 64  0       94</span></span>
+<span><span class="co">## 95 64 64  0       95</span></span>
+<span><span class="co">## 96 64 64  0       96</span></span>
+<span><span class="co">## 97 64 64  0       97</span></span>
+<span><span class="co">## 98 64 64  0       98</span></span></code></pre>
+<p>We can check that the positions with sample weight 1 appear only in blocks of consecutive positions.</p>
+<div class="sourceCode" id="cb168"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="fu"><a href="https://rdrr.io/r/base/which.html" class="external-link">which</a></span><span class="op">(</span><span class="va">sw</span> <span class="op">==</span> <span class="fl">1</span><span class="op">)</span></span></code></pre></div>
+<pre><code><span><span class="co">##  [1]  4  5  6 13 14 15 22 23 24 40 41 42 52 53 54 79 80 81 82 83 84 91 92 93</span></span></code></pre>
+<p>Here 65 is the mask token (4^3 + 1 = size of the vocabulary + 1).</p>
+<div class="sourceCode" id="cb170"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">df</span> <span class="op"><a href="../reference/pipe.html">%&gt;%</a></span> <span class="fu">dplyr</span><span class="fu">::</span><span class="fu"><a href="https://dplyr.tidyverse.org/reference/filter.html" class="external-link">filter</a></span><span class="op">(</span><span class="va">sw</span> <span class="op">==</span> <span class="fl">1</span> <span class="op">&amp;</span> <span class="va">x</span> <span class="op">==</span> <span class="fl">65</span><span class="op">)</span> <span class="co"># 10% masked part</span></span></code></pre></div>
+<pre><code><span><span class="co">##     x  y sw position</span></span>
+<span><span class="co">## 1  65 22  1       40</span></span>
+<span><span class="co">## 2  65 22  1       41</span></span>
+<span><span class="co">## 3  65 22  1       42</span></span>
+<span><span class="co">## 4  65 64  1       79</span></span>
+<span><span class="co">## 5  65 64  1       80</span></span>
+<span><span class="co">## 6  65 64  1       81</span></span>
+<span><span class="co">## 7  65 64  1       82</span></span>
+<span><span class="co">## 8  65 64  1       83</span></span>
+<span><span class="co">## 9  65 64  1       84</span></span>
+<span><span class="co">## 10 65 64  1       91</span></span>
+<span><span class="co">## 11 65 64  1       92</span></span>
+<span><span class="co">## 12 65 64  1       93</span></span></code></pre>
+<div class="sourceCode" id="cb172"><pre class="downlit sourceCode r">
+<code class="sourceCode R"><span><span class="va">df</span> <span class="op"><a href="../reference/pipe.html">%&gt;%</a></span> <span class="fu">dplyr</span><span class="fu">::</span><span class="fu"><a href="https://dplyr.tidyverse.org/reference/filter.html" class="external-link">filter</a></span><span class="op">(</span><span class="va">sw</span> <span class="op">==</span> <span class="fl">1</span> <span class="op">&amp;</span> <span class="va">x</span> <span class="op">!=</span> <span class="fl">65</span><span class="op">)</span> <span class="co"># 5% identity part and 5% random part (can randomly be the true value)</span></span></code></pre></div>
+<pre><code><span><span class="co">##     x  y sw position</span></span>
+<span><span class="co">## 1  39  1  1        4</span></span>
+<span><span class="co">## 2  48  1  1        5</span></span>
+<span><span class="co">## 3  13  1  1        6</span></span>
+<span><span class="co">## 4   1  1  1       13</span></span>
+<span><span class="co">## 5   1  1  1       14</span></span>
+<span><span class="co">## 6   1  1  1       15</span></span>
+<span><span class="co">## 7   1  1  1       22</span></span>
+<span><span class="co">## 8   1  1  1       23</span></span>
+<span><span class="co">## 9   2  2  1       24</span></span>
+<span><span class="co">## 10 56 43  1       52</span></span>
+<span><span class="co">## 11  4 43  1       53</span></span>
+<span><span class="co">## 12 24 43  1       54</span></span></code></pre>
+</div>
+</div>
+  </main><aside class="col-md-3"><nav id="toc"><h2>On this page</h2>
+    </nav></aside>
+</div>
+
+
+
+    <footer><div class="pkgdown-footer-left">
+  <p>Developed by Philipp Münch, René Mreches, Martin Binder, Hüseyin Anil Gündüz, Xiao-Yin To, Alice McHardy.</p>
+</div>
+
+<div class="pkgdown-footer-right">
+  <p>Site built with <a href="https://pkgdown.r-lib.org/" class="external-link">pkgdown</a> 2.0.9.</p>
+</div>
+
+    </footer>
+</div>
+
+  
+
+  
+
+  </body>
+</html>