<!DOCTYPE html>
<!-- Generated by pkgdown: do not edit by hand --><html lang="en">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
<meta name="description" content="deepG">
<title>Data Generator • deepG</title>
<!-- favicons --><link rel="icon" type="image/png" sizes="16x16" href="../favicon-16x16.png">
<link rel="icon" type="image/png" sizes="32x32" href="../favicon-32x32.png">
<link rel="apple-touch-icon" type="image/png" sizes="180x180" href="../apple-touch-icon.png">
<link rel="apple-touch-icon" type="image/png" sizes="120x120" href="../apple-touch-icon-120x120.png">
<link rel="apple-touch-icon" type="image/png" sizes="76x76" href="../apple-touch-icon-76x76.png">
<link rel="apple-touch-icon" type="image/png" sizes="60x60" href="../apple-touch-icon-60x60.png">
<script src="../deps/jquery-3.6.0/jquery-3.6.0.min.js"></script><meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
<link href="../deps/bootstrap-5.3.1/bootstrap.min.css" rel="stylesheet">
<script src="../deps/bootstrap-5.3.1/bootstrap.bundle.min.js"></script><!-- Font Awesome icons --><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/all.min.css" integrity="sha256-mmgLkCYLUQbXn0B1SRqzHar6dCnv9oZFPEC1g1cwlkk=" crossorigin="anonymous">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/v4-shims.min.css" integrity="sha256-wZjR52fzng1pJHwx4aV2AO3yyTOXrcDW7jBpJtTwVxw=" crossorigin="anonymous">
<!-- bootstrap-toc --><script src="https://cdn.jsdelivr.net/gh/afeld/bootstrap-toc@v1.0.1/dist/bootstrap-toc.min.js" integrity="sha256-4veVQbu7//Lk5TSmc7YV48MxtMy98e26cf5MrgZYnwo=" crossorigin="anonymous"></script><!-- headroom.js --><script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/headroom.min.js" integrity="sha256-AsUX4SJE1+yuDu5+mAVzJbuYNPHj/WroHuZ8Ir/CkE0=" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/jQuery.headroom.min.js" integrity="sha256-ZX/yNShbjqsohH1k95liqY9Gd8uOiE1S4vZc+9KQ1K4=" crossorigin="anonymous"></script><!-- clipboard.js --><script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.11/clipboard.min.js" integrity="sha512-7O5pXpc0oCRrxk8RUfDYFgn0nO1t+jLuIOQdOMRp4APB7uZ4vSjspzp5y6YDtDs4VzUSTbWzBFZ/LKJhnyFOKw==" crossorigin="anonymous" referrerpolicy="no-referrer"></script><!-- search --><script src="https://cdnjs.cloudflare.com/ajax/libs/fuse.js/6.4.6/fuse.js" integrity="sha512-zv6Ywkjyktsohkbp9bb45V6tEMoWhzFzXis+LrMehmJZZSys19Yxf1dopHx7WzIKxr5tK2dVcYmaCk2uqdjF4A==" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/autocomplete.js/0.38.0/autocomplete.jquery.min.js" integrity="sha512-GU9ayf+66Xx2TmpxqJpliWbT5PiGYxpaG8rfnBEk1LL8l1KGkRShhngwdXK1UgqhAzWpZHSiYPc09/NwDQIGyg==" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/mark.js/8.11.1/mark.min.js" integrity="sha512-5CYOlHXGh6QpOFA/TeTylKLWfB3ftPsde7AnmhuitiTX4K5SqCLBeKro6sPS8ilsz1Q4NRx3v8Ko2IBiszzdww==" crossorigin="anonymous"></script><!-- pkgdown --><script src="../pkgdown.js"></script><meta property="og:title" content="Data Generator">
<meta property="og:description" content="deepG">
<meta property="og:image" content="https://genomenet.github.io/deepG/logo.png">
<!-- mathjax --><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js" integrity="sha256-nvJJv9wWKEm88qvoQl9ekL2J+k/RWIsaSScxxlsrv8k=" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/config/TeX-AMS-MML_HTMLorMML.js" integrity="sha256-84DKXVJXs0/F8OTMzX4UR909+jtl4G7SPypPavF+GfA=" crossorigin="anonymous"></script><!--[if lt IE 9]>
<script src="https://oss.maxcdn.com/html5shiv/3.7.3/html5shiv.min.js"></script>
<script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
<![endif]-->
</head>
<body>
<a href="#main" class="visually-hidden-focusable">Skip to contents</a>
<nav class="navbar fixed-top navbar-light navbar-expand-lg bg-light" data-bs-theme="light"><div class="container">
<a class="navbar-brand me-2" href="../index.html">deepG</a>
<small class="nav-text text-default me-auto" data-bs-toggle="tooltip" data-bs-placement="bottom" title="Released version">0.3.0</small>
<button class="navbar-toggler" type="button" data-bs-toggle="collapse" data-bs-target="#navbar" aria-controls="navbar" aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
</button>
<div id="navbar" class="collapse navbar-collapse ms-3">
<ul class="navbar-nav me-auto">
<li class="nav-item">
<a class="nav-link" href="../reference/index.html">
<span class="fa fa fa fa-file-alt"></span>
Reference
</a>
</li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" data-bs-toggle="dropdown" role="button" aria-expanded="false" aria-haspopup="true" id="dropdown-notebooks">Notebooks</a>
<div class="dropdown-menu" aria-labelledby="dropdown-notebooks">
<a class="external-link dropdown-item" href="https://colab.research.google.com/drive/175jIdXcDcgPUvaBo2rH2Lupbpjnp5O7G?usp=sharing">deepG tutorial</a>
<a class="external-link dropdown-item" href="https://colab.research.google.com/drive/1Eolc0koMNM1zkuO4XyVM58ImeF1BpRiH?usp=sharing">Read-length level: Human contamination</a>
<a class="external-link dropdown-item" href="https://colab.research.google.com/drive/1yiXSwFafXpMLHaov9iBTQLIDZ6bK1zYX?usp=sharing">Locus level: CRISPR detection</a>
<a class="external-link dropdown-item" href="https://colab.research.google.com/drive/1G7bOFEX87cZNrM2tdRtTdkrZn5fM__g0?usp=sharing">Gene level: 16S rRNA detection</a>
<a class="external-link dropdown-item" href="https://colab.research.google.com/drive/1BCggL-tfQF136YeJ8cKKi-zoBEDMgkNh?usp=sharing">Genome level: Bacterial morphology (Sporulation)</a>
<a class="external-link dropdown-item" href="https://colab.research.google.com/drive/10xpRzGd3JeBAbqQYSCxzQUMctt01sx9D?usp=sharing">Full metagenome level: Colorectal cancer prediction</a>
<a class="external-link dropdown-item" href="https://colab.research.google.com/drive/1kyYK7IU7GSfdpDzO_a8U3_qD4i3zTu6w?usp=sharing">BERT with deepG</a>
</div>
</li>
<li class="active nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" data-bs-toggle="dropdown" role="button" aria-expanded="false" aria-haspopup="true" id="dropdown-tutorials">Tutorials</a>
<div class="dropdown-menu" aria-labelledby="dropdown-tutorials">
<a class="dropdown-item" href="../articles/getting_started.html">Getting Started</a>
<a class="dropdown-item" href="../articles/training_types.html">Training types</a>
<a class="dropdown-item" href="../articles/data_generator.html">Data generator</a>
<a class="dropdown-item" href="../articles/using_tb.html">Using tensorboard</a>
<a class="dropdown-item" href="../articles/integrated_gradient.html">Integrated Gradient</a>
</div>
</li>
</ul>
<form class="form-inline my-2 my-lg-0" role="search">
<input type="search" class="form-control me-sm-2" aria-label="Search site" name="search-input" data-search-index="../search.json" id="search-input" placeholder="Search for" autocomplete="off">
</form>
<ul class="navbar-nav">
<li class="nav-item">
<a class="external-link nav-link" href="https://github.com/GenomeNet/deepG/" aria-label="github">
<span class="fab fa fab fa-github fa-lg"></span>
</a>
</li>
</ul>
</div>
</div>
</nav><div class="container template-article">
<div class="row">
<main id="main" class="col-md-9"><div class="page-header">
<img src="../logo.png" class="logo" alt=""><h1>Data Generator</h1>
<small class="dont-index">Source: <a href="https://github.com/GenomeNet/deepG/blob/HEAD/vignettes/data_generator.Rmd" class="external-link"><code>vignettes/data_generator.Rmd</code></a></small>
<div class="d-none name"><code>data_generator.Rmd</code></div>
</div>
<div class="sourceCode" id="cb1"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="co">#devtools::install_github("GenomeNet/deepG")</span></span>
<span><span class="co">#library(deepG)</span></span>
<span><span class="co">#library(magrittr)</span></span></code></pre></div>
<style type="text/css">
mark.in {
background-color: CornflowerBlue;
}
mark.out {
background-color: IndianRed;
}
</style>
<div class="section level2">
<h2 id="introduction">Introduction<a class="anchor" aria-label="anchor" href="#introduction"></a>
</h2>
<p>The most common use case for the deepG data generator is to extract
samples from a collection of fasta (or fastq) files. The generator will
always return a list of length 2. The first element is the input <span class="math inline">\(X\)</span> and the second the target <span class="math inline">\(Y\)</span>. We can differentiate between two
approaches:</p>
<ul>
<li>
<strong>Language model</strong>: Part of a sequence is the input and
another part is the target.
<ul>
<li>Example: Predict the next nucleotide given the previous 100
nucleotides.</li>
</ul>
</li>
<li>
<strong>Label classification</strong>: Assign a label to a sequence.
<ul>
<li>Example: Assign a label “virus” or “bacteria” to a sequence of
length 100.</li>
</ul>
</li>
</ul>
<p>Suppose we are given 2 fasta files called “a.fasta” and “b.fasta”
that look as follows:</p>
<div style="float: left;margin-right:10px">
<table class="table"><tr>
<td>
<strong>a.fasta</strong> <br><tt> >header_a1 <br> AACCAAGG <br>
>header_a2 <br> TTTGGG <br> >header_a3 <br> ACGTACGT <br></tt>
</td>
</tr></table>
</div>
<div style="float: left">
<table class="table"><tr>
<td>
<strong>b.fasta</strong> <br><tt> >header_b1 <br> GTGTGT <br>
>header_b2 <br> AAGG <br></tt>
</td>
</tr></table>
</div>
<p><br><br><br><br><br><br><br><br><br></p>
<p>If we want to extract sequences of length 4 from these files, there
would be 17 possible samples (5 from <tt>AACCAAGG</tt>, 3 from
<tt>TTTGGG</tt>, …). A naive approach would be to extract the samples in
a sequential manner:</p>
<p><em>1. sample</em>:</p>
<div style="float: left;margin-right:10px">
<table class="table"><tr>
<td>
<strong>a.fasta</strong> <br><tt> >header_a1 <br><mark class="in">AACC</mark>AAGG <br> >header_a2 <br> TTTGGG <br>
>header_a3 <br> ACGTACGT <br></tt>
</td>
</tr></table>
</div>
<div style="float: left">
<table class="table"><tr>
<td>
<strong>b.fasta</strong> <br><tt> >header_b1 <br> GTGTGT <br>
>header_b2 <br> AAGG <br></tt>
</td>
</tr></table>
</div>
<p><br><br><br><br><br><br><br><br><br></p>
<p><em>2. sample</em>:</p>
<div style="float: left;margin-right:10px">
<table class="table"><tr>
<td>
<strong>a.fasta</strong> <br><tt> >header_a1 <br>
A<mark class="in">ACCA</mark>AGG <br> >header_a2 <br> TTTGGG <br>
>header_a3 <br> ACGTACGT <br></tt>
</td>
</tr></table>
</div>
<div style="float: left">
<table class="table"><tr>
<td>
<strong>b.fasta</strong> <br><tt> >header_b1 <br> GTGTGT <br>
>header_b2 <br> AAGG <br></tt>
</td>
</tr></table>
</div>
<p><br><br><br><br><br><br><br><br><br></p>
<p>…</p>
<p><br></p>
<p><em>17. sample</em>:</p>
<div style="float: left;margin-right:10px">
<table class="table"><tr>
<td>
<strong>a.fasta</strong> <br><tt> >header_a1 <br> AACCAAGG <br>
>header_a2 <br> TTTGGG <br> >header_a3 <br> ACGTACGT <br></tt>
</td>
</tr></table>
</div>
<div style="float: left">
<table class="table"><tr>
<td>
<strong>b.fasta</strong> <br><tt> >header_b1 <br> GTGTGT <br>
>header_b2 <br><mark class="in">AAGG</mark><br></tt>
</td>
</tr></table>
</div>
<p><br><br><br><br><br><br><br><br><br></p>
<p><em>18. sample</em>:</p>
<div style="float: left;margin-right:10px">
<table class="table"><tr>
<td>
<strong>a.fasta</strong> <br><tt> >header_a1 <br><mark class="in">AACC</mark>AAGG <br> >header_a2 <br> TTTGGG <br>
>header_a3 <br> ACGTACGT <br></tt>
</td>
</tr></table>
</div>
<div style="float: left">
<table class="table"><tr>
<td>
<strong>b.fasta</strong> <br><tt> >header_b1 <br> GTGTGT <br>
>header_b2 <br> AAGG <br></tt>
</td>
</tr></table>
</div>
<p><br><br><br><br><br><br><br><br><br></p>
<p>… <br><br></p>
<p>For longer sequences this is not a desirable strategy since the data
is very redundant (often just one nucleotide difference) and the model
would often see long stretches of data from the same source. Choosing
the samples completely at random can also be problematic since we would
constantly have to open new files. The deepG generators offer several
options to control the data sampling strategy and achieve a good balance
between the two approaches.</p>
</div>
<div class="section level2">
<h2 id="data-generator-options">Data generator options<a class="anchor" aria-label="anchor" href="#data-generator-options"></a>
</h2>
<p>In the following code examples, we will mostly use the sequence <tt>
<strong>abcdefghiiii</strong> </tt> to demonstrate some of the deepG
data generator options. (In real-world applications you would usually
have sequences from the <tt>ACGT</tt> vocabulary.)</p>
<div class="sourceCode" id="cb2"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">sequence</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/paste.html" class="external-link">paste0</a></span><span class="op">(</span><span class="st">"a"</span>, <span class="st">"b"</span>, <span class="st">"c"</span>, <span class="st">"d"</span>, <span class="st">"e"</span>, <span class="st">"f"</span>, <span class="st">"g"</span>, <span class="st">"h"</span>, <span class="st">"i"</span>, <span class="st">"i"</span>, <span class="st">"i"</span>, <span class="st">"i"</span><span class="op">)</span></span>
<span><span class="va">vocabulary</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"a"</span>, <span class="st">"b"</span>, <span class="st">"c"</span>, <span class="st">"d"</span>, <span class="st">"e"</span>, <span class="st">"f"</span>, <span class="st">"g"</span>, <span class="st">"h"</span>, <span class="st">"i"</span><span class="op">)</span> </span></code></pre></div>
<p>We may store this sequence in a fasta file</p>
<div class="sourceCode" id="cb3"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">temp_dir</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span><span class="op">)</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/files2.html" class="external-link">dir.create</a></span><span class="op">(</span><span class="va">temp_dir</span><span class="op">)</span></span>
<span><span class="va">dir_path</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/paste.html" class="external-link">paste0</a></span><span class="op">(</span><span class="va">temp_dir</span>, <span class="st">"/dummy_data"</span><span class="op">)</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/files2.html" class="external-link">dir.create</a></span><span class="op">(</span><span class="va">dir_path</span><span class="op">)</span></span>
<span><span class="va">df</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span>Sequence <span class="op">=</span> <span class="va">sequence</span>, Header <span class="op">=</span> <span class="st">"label_1"</span>, stringsAsFactors <span class="op">=</span> <span class="cn">FALSE</span><span class="op">)</span></span>
<span><span class="va">file_path</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/file.path.html" class="external-link">file.path</a></span><span class="op">(</span><span class="va">dir_path</span>, <span class="st">"a.fasta"</span><span class="op">)</span></span>
<span><span class="co"># sequence as fasta file</span></span>
<span><span class="fu">microseq</span><span class="fu">::</span><span class="fu"><a href="https://rdrr.io/pkg/microseq/man/readFasta.html" class="external-link">writeFasta</a></span><span class="op">(</span>fdta <span class="op">=</span> <span class="fu">dplyr</span><span class="fu">::</span><span class="fu"><a href="https://tibble.tidyverse.org/reference/as_tibble.html" class="external-link">as_tibble</a></span><span class="op">(</span><span class="va">df</span><span class="op">)</span>, out.file <span class="op">=</span> <span class="va">file_path</span><span class="op">)</span></span></code></pre></div>
<p>Since neural networks can only work with numeric data, we have to
encode sequences of characters as numeric data. Usually this is
achieved by one-hot encoding; some other approaches are also
implemented: see <code>use_coverage</code>,
<code>use_quality_score</code> and <code>ambiguous_nuc</code>
sections.</p>
<div class="sourceCode" id="cb4"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="co"># one-hot encoding example</span></span>
<span><span class="va">s</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"a"</span>, <span class="st">"c"</span>, <span class="st">"a"</span>, <span class="st">"f"</span>, <span class="st">"i"</span>, <span class="st">"b"</span><span class="op">)</span></span>
<span><span class="va">s_as_int_seq</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/vector.html" class="external-link">vector</a></span><span class="op">(</span><span class="st">"integer"</span>, <span class="fu"><a href="https://rdrr.io/r/base/length.html" class="external-link">length</a></span><span class="op">(</span><span class="va">s</span><span class="op">)</span><span class="op">)</span></span>
<span><span class="kw">for</span> <span class="op">(</span><span class="va">i</span> <span class="kw">in</span> <span class="fl">1</span><span class="op">:</span><span class="fu"><a href="https://rdrr.io/r/base/length.html" class="external-link">length</a></span><span class="op">(</span><span class="va">s</span><span class="op">)</span><span class="op">)</span> <span class="op">{</span></span>
<span> <span class="va">s_as_int_seq</span><span class="op">[</span><span class="va">i</span><span class="op">]</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/which.html" class="external-link">which</a></span><span class="op">(</span><span class="va">s</span><span class="op">[</span><span class="va">i</span><span class="op">]</span> <span class="op">==</span> <span class="va">vocabulary</span><span class="op">)</span> <span class="op">-</span> <span class="fl">1</span></span>
<span><span class="op">}</span></span>
<span><span class="va">one_hot_sample</span> <span class="op"><-</span> <span class="fu">keras</span><span class="fu">::</span><span class="fu"><a href="https://rdrr.io/pkg/keras/man/to_categorical.html" class="external-link">to_categorical</a></span><span class="op">(</span><span class="va">s_as_int_seq</span><span class="op">)</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">one_hot_sample</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span>
<span><span class="va">one_hot_sample</span></span></code></pre></div>
<pre><code><span><span class="co">## a b c d e f g h i</span></span>
<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [2,] 0 0 1 0 0 0 0 0 0</span></span>
<span><span class="co">## [3,] 1 0 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [4,] 0 0 0 0 0 1 0 0 0</span></span>
<span><span class="co">## [5,] 0 0 0 0 0 0 0 0 1</span></span>
<span><span class="co">## [6,] 0 1 0 0 0 0 0 0 0</span></span></code></pre>
<div class="section level3">
<h3 id="maxlen">maxlen<a class="anchor" aria-label="anchor" href="#maxlen"></a>
</h3>
<p>The length of the input sequence.</p>
</div>
<div class="section level3">
<h3 id="vocabulary">vocabulary<a class="anchor" aria-label="anchor" href="#vocabulary"></a>
</h3>
<p>The set of allowed characters in a sequence. What happens to
characters outside the vocabulary can be controlled with the
<code>ambiguous_nuc</code> argument.</p>
</div>
<div class="section level3">
<h3 id="train_type">train_type<a class="anchor" aria-label="anchor" href="#train_type"></a>
</h3>
<p>The generator will always return a list of length 2. The first
element is the input <span class="math inline">\(X\)</span> and the
second the target <span class="math inline">\(Y\)</span>. The
<code>train_type</code> argument determines how <span class="math inline">\(X\)</span> and <span class="math inline">\(Y\)</span> get extracted. Possible arguments for
<u> <em>language models</em> </u> are:</p>
<ul>
<li>
<strong>“lm”</strong> or <strong>“lm_rds”</strong>: Given some
sequence <span class="math inline">\(s\)</span>, we take some subset of
that sequence as input and the rest as target. How to split <span class="math inline">\(s\)</span> can be specified in the
<code>output_format</code> argument.</li>
</ul>
<p>Besides the language model approach, we can use <u> <em>label
classification</em> </u>. This means we map some label to a sequence.
For example, the target for some nucleotide sequence could be one of the
labels “bacteria” or “virus”. We have to specify how to extract a label
corresponding to a sequence. Possible arguments are:</p>
<ul>
<li><p><strong>“label_header”</strong>: get label from fasta
headers.</p></li>
<li><p><strong>“label_folder”</strong>: get label from folder, i.e. all
files in one folder must belong to the same class.</p></li>
<li>
<p><strong>“label_csv”</strong>: get label from a csv file. The csv file
should have one column named “file”. The targets then correspond to the
entries in that row (except the “file” column). Example: if we are currently
working with a file called “a.fasta”, there should be a row in our csv
file with some target information for that file: <br></p>
<table class="table">
<thead><tr class="header">
<th>file</th>
<th>label_1</th>
<th>label_2</th>
</tr></thead>
<tbody><tr class="odd">
<td>“a.fasta”</td>
<td>1</td>
<td>0</td>
</tr></tbody>
</table>
</li>
<li><p><strong>“label_rds”</strong>: the rds file contains a preprocessed list
of input and target tensors.</p></li>
</ul>
<p>Another option is <strong>“dummy_gen”</strong>: the generator creates
random data once and repeatedly returns it.</p>
<p>Extract target from fasta header (fasta header is “label_1” in
example file):</p>
<div class="sourceCode" id="cb6"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="co"># get target from header</span></span>
<span><span class="va">vocabulary_label</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/paste.html" class="external-link">paste0</a></span><span class="op">(</span><span class="st">"label_"</span>, <span class="fl">1</span><span class="op">:</span><span class="fl">5</span><span class="op">)</span></span>
<span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span>
<span> train_type <span class="op">=</span> <span class="st">"label_header"</span>,</span>
<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
<span> maxlen <span class="op">=</span> <span class="fl">6</span>,</span>
<span> vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span>
<span> vocabulary_label <span class="op">=</span> <span class="va">vocabulary_label</span><span class="op">)</span></span>
<span></span>
<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span> </span>
<span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span> </span>
<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary_label</span> </span>
<span><span class="va">x</span> <span class="co"># abcdef</span></span></code></pre></div>
<pre><code><span><span class="co">## a b c d e f g h i</span></span>
<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [2,] 0 1 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [3,] 0 0 1 0 0 0 0 0 0</span></span>
<span><span class="co">## [4,] 0 0 0 1 0 0 0 0 0</span></span>
<span><span class="co">## [5,] 0 0 0 0 1 0 0 0 0</span></span>
<span><span class="co">## [6,] 0 0 0 0 0 1 0 0 0</span></span></code></pre>
<div class="sourceCode" id="cb8"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">y</span> <span class="co"># label_1 </span></span></code></pre></div>
<pre><code><span><span class="co">## label_1 label_2 label_3 label_4 label_5</span></span>
<span><span class="co">## [1,] 1 0 0 0 0</span></span></code></pre>
<p>Extract target from fasta folder:</p>
<div class="sourceCode" id="cb10"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="co"># create data for second class</span></span>
<span><span class="va">df</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span>Sequence <span class="op">=</span> <span class="st">"AABAACAADAAE"</span>, Header <span class="op">=</span> <span class="st">"header_1"</span><span class="op">)</span></span>
<span><span class="va">file_path_2</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span>fileext <span class="op">=</span> <span class="st">".fasta"</span><span class="op">)</span></span>
<span><span class="va">fasta_file</span> <span class="op"><-</span> <span class="fu">microseq</span><span class="fu">::</span><span class="fu"><a href="https://rdrr.io/pkg/microseq/man/readFasta.html" class="external-link">writeFasta</a></span><span class="op">(</span><span class="va">df</span>, <span class="va">file_path_2</span><span class="op">)</span></span>
<span></span>
<span><span class="co"># get target from folder</span></span>
<span><span class="va">vocabulary_label</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/paste.html" class="external-link">paste0</a></span><span class="op">(</span><span class="st">"label_"</span>, <span class="fl">1</span><span class="op">:</span><span class="fl">2</span><span class="op">)</span></span>
<span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="va">file_path</span>, <span class="va">file_path_2</span><span class="op">)</span>, <span class="co"># one entry for each class</span></span>
<span> train_type <span class="op">=</span> <span class="st">"label_folder"</span>,</span>
<span> batch_size <span class="op">=</span> <span class="fl">8</span>,</span>
<span> maxlen <span class="op">=</span> <span class="fl">6</span>,</span>
<span> vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span>
<span> vocabulary_label <span class="op">=</span> <span class="va">vocabulary_label</span><span class="op">)</span></span>
<span></span>
<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span></span>
<span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span> </span>
<span><span class="va">x_1_1</span> <span class="op"><-</span> <span class="va">x</span><span class="op">[</span><span class="fl">1</span>, , <span class="op">]</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x_1_1</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span>
<span><span class="va">x_1_1</span> <span class="co"># first sample from first class</span></span></code></pre></div>
<pre><code><span><span class="co">## a b c d e f g h i</span></span>
<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [2,] 0 1 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [3,] 0 0 1 0 0 0 0 0 0</span></span>
<span><span class="co">## [4,] 0 0 0 1 0 0 0 0 0</span></span>
<span><span class="co">## [5,] 0 0 0 0 1 0 0 0 0</span></span>
<span><span class="co">## [6,] 0 0 0 0 0 1 0 0 0</span></span></code></pre>
<div class="sourceCode" id="cb12"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">x_2_1</span> <span class="op"><-</span> <span class="va">x</span><span class="op">[</span><span class="fl">5</span>, , <span class="op">]</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x_2_1</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span>
<span><span class="va">x_2_1</span> <span class="co"># first sample from second class</span></span></code></pre></div>
<pre><code><span><span class="co">## a b c d e f g h i</span></span>
<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [2,] 1 0 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [3,] 0 1 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [4,] 1 0 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [5,] 1 0 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [6,] 0 0 1 0 0 0 0 0 0</span></span></code></pre>
<div class="sourceCode" id="cb14"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary_label</span> </span>
<span><span class="va">y</span> <span class="co"># 4 samples from each class </span></span></code></pre></div>
<pre><code><span><span class="co">## label_1 label_2</span></span>
<span><span class="co">## [1,] 1 0</span></span>
<span><span class="co">## [2,] 1 0</span></span>
<span><span class="co">## [3,] 1 0</span></span>
<span><span class="co">## [4,] 1 0</span></span>
<span><span class="co">## [5,] 0 1</span></span>
<span><span class="co">## [6,] 0 1</span></span>
<span><span class="co">## [7,] 0 1</span></span>
<span><span class="co">## [8,] 0 1</span></span></code></pre>
<p>Extract the target from a CSV file:</p>
<div class="sourceCode" id="cb16"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="co"># get target from csv</span></span>
<span><span class="va">file</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="fu"><a href="https://rdrr.io/r/base/basename.html" class="external-link">basename</a></span><span class="op">(</span><span class="va">file_path</span><span class="op">)</span>, <span class="st">"xyz.fasta"</span>, <span class="st">"abc.fasta"</span>, <span class="st">"x_123.fasta"</span><span class="op">)</span></span>
<span><span class="va">vocabulary_label</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/paste.html" class="external-link">paste0</a></span><span class="op">(</span><span class="st">"label"</span>, <span class="fl">1</span><span class="op">:</span><span class="fl">4</span><span class="op">)</span></span>
<span><span class="va">label_1</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="fl">1</span>, <span class="fl">0</span>, <span class="fl">0</span>, <span class="fl">0</span><span class="op">)</span></span>
<span><span class="va">label_2</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="fl">0</span>, <span class="fl">1</span>, <span class="fl">0</span>, <span class="fl">0</span><span class="op">)</span></span>
<span><span class="va">label_3</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="fl">0</span>, <span class="fl">0</span>, <span class="fl">1</span>, <span class="fl">0</span><span class="op">)</span></span>
<span><span class="va">label_4</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="fl">0</span>, <span class="fl">0</span>, <span class="fl">0</span>, <span class="fl">1</span><span class="op">)</span></span>
<span><span class="va">df</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span><span class="va">file</span>, <span class="va">label_1</span>, <span class="va">label_2</span>, <span class="va">label_3</span>, <span class="va">label_4</span><span class="op">)</span></span>
<span><span class="va">df</span></span></code></pre></div>
<pre><code><span><span class="co">## file label_1 label_2 label_3 label_4</span></span>
<span><span class="co">## 1 a.fasta 1 0 0 0</span></span>
<span><span class="co">## 2 xyz.fasta 0 1 0 0</span></span>
<span><span class="co">## 3 abc.fasta 0 0 1 0</span></span>
<span><span class="co">## 4 x_123.fasta 0 0 0 1</span></span></code></pre>
<div class="sourceCode" id="cb18"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">csv_file</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span>fileext <span class="op">=</span> <span class="st">".csv"</span><span class="op">)</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/utils/write.table.html" class="external-link">write.csv</a></span><span class="op">(</span><span class="va">df</span>, <span class="va">csv_file</span>, row.names <span class="op">=</span> <span class="cn">FALSE</span><span class="op">)</span></span>
<span></span>
<span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span>
<span> train_type <span class="op">=</span> <span class="st">"label_csv"</span>,</span>
<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
<span> maxlen <span class="op">=</span> <span class="fl">6</span>,</span>
<span> target_from_csv <span class="op">=</span> <span class="va">csv_file</span>,</span>
<span> vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span>
<span> vocabulary_label <span class="op">=</span> <span class="va">vocabulary_label</span><span class="op">)</span></span>
<span></span>
<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span> </span>
<span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span> </span>
<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary_label</span> </span>
<span><span class="va">x</span> <span class="co"># abcdef</span></span></code></pre></div>
<pre><code><span><span class="co">## a b c d e f g h i</span></span>
<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [2,] 0 1 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [3,] 0 0 1 0 0 0 0 0 0</span></span>
<span><span class="co">## [4,] 0 0 0 1 0 0 0 0 0</span></span>
<span><span class="co">## [5,] 0 0 0 0 1 0 0 0 0</span></span>
<span><span class="co">## [6,] 0 0 0 0 0 1 0 0 0</span></span></code></pre>
<div class="sourceCode" id="cb20"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">y</span> <span class="co"># label_1 </span></span></code></pre></div>
<pre><code><span><span class="co">## label1 label2 label3 label4</span></span>
<span><span class="co">## [1,] 1 0 0 0</span></span></code></pre>
<p>Examples for language models follow in the next section.</p>
</div>
<div class="section level3">
<h3 id="output_format">output_format<a class="anchor" aria-label="anchor" href="#output_format"></a>
</h3>
<p>The <code>output_format</code> determines the shape of the output for
a language model, i.e. one part of a sequence is the input <span class="math inline">\(X\)</span> and another part is the target <span class="math inline">\(Y\)</span>. Assume a sequence <tt>abcdefg</tt> and
<code>maxlen = 6</code>. The outputs correspond as follows:</p>
<p><strong>“target_right”</strong>: <span class="math inline">\(X=\)</span> <tt>abcdef</tt>, <span class="math inline">\(Y=\)</span> <tt>g</tt></p>
<p><strong>“target_middle_lstm”</strong>: <span class="math inline">\(X
=\)</span> (<span class="math inline">\(X_1 =\)</span> <tt>abc</tt>,
<span class="math inline">\(X_2 =\)</span> <tt>gfe</tt>), <span class="math inline">\(Y=\)</span> <tt>d</tt> (note reversed order of
<span class="math inline">\(X_2\)</span>)</p>
<p><strong>“target_middle_cnn”</strong>: <span class="math inline">\(X
=\)</span> <tt>abcefg</tt>, <span class="math inline">\(Y =\)</span>
<tt>d</tt></p>
<p><strong>“wavenet”</strong>: <span class="math inline">\(X =\)</span>
<tt>abcdef</tt>, <span class="math inline">\(Y =\)</span>
<tt>bcdefg</tt></p>
<div class="sourceCode" id="cb22"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="co"># target_right</span></span>
<span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span>
<span> train_type <span class="op">=</span> <span class="st">"lm"</span>,</span>
<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
<span> maxlen <span class="op">=</span> <span class="fl">6</span>,</span>
<span> vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span>
<span> output_format <span class="op">=</span> <span class="st">"target_right"</span><span class="op">)</span></span>
<span></span>
<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span> </span>
<span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span> </span>
<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span>
<span><span class="va">x</span> <span class="co"># abcdef</span></span></code></pre></div>
<pre><code><span><span class="co">## a b c d e f g h i</span></span>
<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [2,] 0 1 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [3,] 0 0 1 0 0 0 0 0 0</span></span>
<span><span class="co">## [4,] 0 0 0 1 0 0 0 0 0</span></span>
<span><span class="co">## [5,] 0 0 0 0 1 0 0 0 0</span></span>
<span><span class="co">## [6,] 0 0 0 0 0 1 0 0 0</span></span></code></pre>
<div class="sourceCode" id="cb24"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">y</span> <span class="co"># g </span></span></code></pre></div>
<pre><code><span><span class="co">## a b c d e f g h i</span></span>
<span><span class="co">## [1,] 0 0 0 0 0 0 1 0 0</span></span></code></pre>
<div class="sourceCode" id="cb26"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="co"># target_middle_lstm</span></span>
<span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span>
<span> train_type <span class="op">=</span> <span class="st">"lm"</span>,</span>
<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
<span> maxlen <span class="op">=</span> <span class="fl">6</span>,</span>
<span> vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span>
<span> output_format <span class="op">=</span> <span class="st">"target_middle_lstm"</span><span class="op">)</span></span>
<span></span>
<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
<span><span class="va">x_1</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span> </span>
<span><span class="va">x_2</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span> </span>
<span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span> </span>
<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x_1</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x_2</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span>
<span><span class="va">x_1</span> <span class="co"># abc</span></span></code></pre></div>
<pre><code><span><span class="co">## a b c d e f g h i</span></span>
<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [2,] 0 1 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [3,] 0 0 1 0 0 0 0 0 0</span></span></code></pre>
<div class="sourceCode" id="cb28"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">x_2</span> <span class="co"># gfe</span></span></code></pre></div>
<pre><code><span><span class="co">## a b c d e f g h i</span></span>
<span><span class="co">## [1,] 0 0 0 0 0 0 1 0 0</span></span>
<span><span class="co">## [2,] 0 0 0 0 0 1 0 0 0</span></span>
<span><span class="co">## [3,] 0 0 0 0 1 0 0 0 0</span></span></code></pre>
<div class="sourceCode" id="cb30"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">y</span> <span class="co"># d </span></span></code></pre></div>
<pre><code><span><span class="co">## a b c d e f g h i</span></span>
<span><span class="co">## [1,] 0 0 0 1 0 0 0 0 0</span></span></code></pre>
<div class="sourceCode" id="cb32"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="co"># target_middle_cnn</span></span>
<span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span>
<span> train_type <span class="op">=</span> <span class="st">"lm"</span>,</span>
<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
<span> maxlen <span class="op">=</span> <span class="fl">6</span>,</span>
<span> vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span>
<span> output_format <span class="op">=</span> <span class="st">"target_middle_cnn"</span><span class="op">)</span></span>
<span></span>
<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span></span>
<span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span>
<span><span class="va">x</span> <span class="co"># abcefg</span></span></code></pre></div>
<pre><code><span><span class="co">## a b c d e f g h i</span></span>
<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [2,] 0 1 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [3,] 0 0 1 0 0 0 0 0 0</span></span>
<span><span class="co">## [4,] 0 0 0 0 1 0 0 0 0</span></span>
<span><span class="co">## [5,] 0 0 0 0 0 1 0 0 0</span></span>
<span><span class="co">## [6,] 0 0 0 0 0 0 1 0 0</span></span></code></pre>
<div class="sourceCode" id="cb34"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">y</span> <span class="co"># d</span></span></code></pre></div>
<pre><code><span><span class="co">## a b c d e f g h i</span></span>
<span><span class="co">## [1,] 0 0 0 1 0 0 0 0 0</span></span></code></pre>
<div class="sourceCode" id="cb36"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="co"># wavenet</span></span>
<span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span>
<span> train_type <span class="op">=</span> <span class="st">"lm"</span>,</span>
<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
<span> maxlen <span class="op">=</span> <span class="fl">6</span>,</span>
<span> vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span>
<span> output_format <span class="op">=</span> <span class="st">"wavenet"</span><span class="op">)</span></span>
<span></span>
<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span> </span>
<span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span>
<span><span class="va">x</span> <span class="co"># abcdef</span></span></code></pre></div>
<pre><code><span><span class="co">## a b c d e f g h i</span></span>
<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [2,] 0 1 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [3,] 0 0 1 0 0 0 0 0 0</span></span>
<span><span class="co">## [4,] 0 0 0 1 0 0 0 0 0</span></span>
<span><span class="co">## [5,] 0 0 0 0 1 0 0 0 0</span></span>
<span><span class="co">## [6,] 0 0 0 0 0 1 0 0 0</span></span></code></pre>
<div class="sourceCode" id="cb38"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">y</span> <span class="co"># bcdefg</span></span></code></pre></div>
<pre><code><span><span class="co">## a b c d e f g h i</span></span>
<span><span class="co">## [1,] 0 1 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [2,] 0 0 1 0 0 0 0 0 0</span></span>
<span><span class="co">## [3,] 0 0 0 1 0 0 0 0 0</span></span>
<span><span class="co">## [4,] 0 0 0 0 1 0 0 0 0</span></span>
<span><span class="co">## [5,] 0 0 0 0 0 1 0 0 0</span></span>
<span><span class="co">## [6,] 0 0 0 0 0 0 1 0 0</span></span></code></pre>
</div>
<div class="section level3">
<h3 id="batch_size">batch_size<a class="anchor" aria-label="anchor" href="#batch_size"></a>
</h3>
<p>Number of samples in one batch.</p>
<div class="sourceCode" id="cb40"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="co"># target_right</span></span>
<span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span>
<span> train_type <span class="op">=</span> <span class="st">"lm"</span>,</span>
<span> batch_size <span class="op">=</span> <span class="fl">7</span>,</span>
<span> maxlen <span class="op">=</span> <span class="fl">6</span>,</span>
<span> vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span>
<span> output_format <span class="op">=</span> <span class="st">"target_right"</span><span class="op">)</span></span>
<span></span>
<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span></span>
<span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span> </span>
<span><span class="fu"><a href="https://rdrr.io/r/base/dim.html" class="external-link">dim</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span></span></code></pre></div>
<pre><code><span><span class="co">## [1] 7 6 9</span></span></code></pre>
<div class="sourceCode" id="cb42"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="fu"><a href="https://rdrr.io/r/base/dim.html" class="external-link">dim</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span></span></code></pre></div>
<pre><code><span><span class="co">## [1] 7 9</span></span></code></pre>
</div>
<div class="section level3">
<h3 id="step">step<a class="anchor" aria-label="anchor" href="#step"></a>
</h3>
<p>We may determine how frequently we want to take a sample. If
<code>step = 1</code>, we take a sample at every possible step. Let’s
assume we want to predict the next character, i.e. part of the sequence
is the <mark class="in">input</mark> and the next character is the
<mark class="out">target</mark>. If
<code>maxlen = 3, step = 1</code>:</p>
<ol style="list-style-type: decimal">
<li><p>sample:
<tt><mark class="in">abc</mark><mark class="out">d</mark>efghiiii</tt></p></li>
<li><p>sample:
<tt>a<mark class="in">bcd</mark><mark class="out">e</mark>fghiiii</tt></p></li>
<li><p>sample:
<tt>ab<mark class="in">cde</mark><mark class="out">f</mark>ghiiii</tt></p></li>
</ol>
<p>If <code>step = 3</code>:</p>
<ol style="list-style-type: decimal">
<li><p>sample:
<tt><mark class="in">abc</mark><mark class="out">d</mark>efghiiii</tt></p></li>
<li><p>sample:
<tt>abc<mark class="in">def</mark><mark class="out">g</mark>hiiii</tt></p></li>
<li><p>sample:
<tt>abcdef<mark class="in">ghi</mark><mark class="out">i</mark>ii</tt></p></li>
</ol>
<div class="sourceCode" id="cb44"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span>
<span> train_type <span class="op">=</span> <span class="st">"lm"</span>,</span>
<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
<span> maxlen <span class="op">=</span> <span class="fl">3</span>,</span>
<span> vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span>
<span> step <span class="op">=</span> <span class="fl">3</span>, </span>
<span> output_format <span class="op">=</span> <span class="st">"target_right"</span><span class="op">)</span></span>
<span></span>
<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span> <span class="co">#encodes abc</span></span>
<span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span> <span class="co"># encodes d</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span>
<span><span class="va">x</span></span></code></pre></div>
<pre><code><span><span class="co">## a b c d e f g h i</span></span>
<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [2,] 0 1 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [3,] 0 0 1 0 0 0 0 0 0</span></span></code></pre>
<div class="sourceCode" id="cb46"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">y</span></span></code></pre></div>
<pre><code><span><span class="co">## a b c d e f g h i</span></span>
<span><span class="co">## [1,] 0 0 0 1 0 0 0 0 0</span></span></code></pre>
<div class="sourceCode" id="cb48"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="co"># go 3 steps forward</span></span>
<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span> <span class="co">#encodes def</span></span>
<span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span> <span class="co"># encodes g</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span>
<span><span class="va">x</span></span></code></pre></div>
<pre><code><span><span class="co">## a b c d e f g h i</span></span>
<span><span class="co">## [1,] 0 0 0 1 0 0 0 0 0</span></span>
<span><span class="co">## [2,] 0 0 0 0 1 0 0 0 0</span></span>
<span><span class="co">## [3,] 0 0 0 0 0 1 0 0 0</span></span></code></pre>
<div class="sourceCode" id="cb50"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">y</span></span></code></pre></div>
<pre><code><span><span class="co">## a b c d e f g h i</span></span>
<span><span class="co">## [1,] 0 0 0 0 0 0 1 0 0</span></span></code></pre>
</div>
<div class="section level3">
<h3 id="padding">padding<a class="anchor" aria-label="anchor" href="#padding"></a>
</h3>
<p>If the sequence is too short to create a single sample, we can pad
the sequence with zero-vectors. If <code>padding = FALSE</code>, the
generator will go to the next file/FASTA entry until it finds a sequence
long enough for a sample.</p>
<div class="sourceCode" id="cb52"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span>
<span> train_type <span class="op">=</span> <span class="st">"lm"</span>,</span>
<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
<span> maxlen <span class="op">=</span> <span class="fl">15</span>, <span class="co"># maxlen is longer than sequence</span></span>
<span> vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span>
<span> step <span class="op">=</span> <span class="fl">3</span>,</span>
<span> padding <span class="op">=</span> <span class="cn">TRUE</span>,</span>
<span> output_format <span class="op">=</span> <span class="st">"target_right"</span><span class="op">)</span></span>
<span></span>
<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span> </span>
<span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span> </span>
<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span>
<span><span class="va">x</span> <span class="co"># first 4 entries are zero-vectors</span></span></code></pre></div>
<pre><code><span><span class="co">## a b c d e f g h i</span></span>
<span><span class="co">## [1,] 0 0 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [2,] 0 0 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [3,] 0 0 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [4,] 0 0 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [5,] 1 0 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [6,] 0 1 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [7,] 0 0 1 0 0 0 0 0 0</span></span>
<span><span class="co">## [8,] 0 0 0 1 0 0 0 0 0</span></span>
<span><span class="co">## [9,] 0 0 0 0 1 0 0 0 0</span></span>
<span><span class="co">## [10,] 0 0 0 0 0 1 0 0 0</span></span>
<span><span class="co">## [11,] 0 0 0 0 0 0 1 0 0</span></span>
<span><span class="co">## [12,] 0 0 0 0 0 0 0 1 0</span></span>
<span><span class="co">## [13,] 0 0 0 0 0 0 0 0 1</span></span>
<span><span class="co">## [14,] 0 0 0 0 0 0 0 0 1</span></span>
<span><span class="co">## [15,] 0 0 0 0 0 0 0 0 1</span></span></code></pre>
<div class="sourceCode" id="cb54"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">y</span></span></code></pre></div>
<pre><code><span><span class="co">## a b c d e f g h i</span></span>
<span><span class="co">## [1,] 0 0 0 0 0 0 0 0 1</span></span></code></pre>
</div>
<div class="section level3">
<h3 id="ambiguous_nuc">ambiguous_nuc<a class="anchor" aria-label="anchor" href="#ambiguous_nuc"></a>
</h3>
<p>A sequence might contain a character that does not lie inside our
vocabulary. For example, let’s assume we discard <code>e</code> from our
vocabulary. We have 4 options to handle this situation:</p>
<ol style="list-style-type: decimal">
<li>encode as zero vector</li>
</ol>
<div class="sourceCode" id="cb56"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">vocabulary_2</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"a"</span>, <span class="st">"b"</span>, <span class="st">"c"</span>, <span class="st">"d"</span>, <span class="st">"f"</span>, <span class="st">"g"</span>, <span class="st">"h"</span>, <span class="st">"i"</span><span class="op">)</span> <span class="co"># exclude "e" from vocabulary</span></span>
<span></span>
<span><span class="co"># zero</span></span>
<span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span>
<span> train_type <span class="op">=</span> <span class="st">"lm"</span>,</span>
<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
<span> maxlen <span class="op">=</span> <span class="fl">6</span>,</span>
<span> vocabulary <span class="op">=</span> <span class="va">vocabulary_2</span>,</span>
<span> output_format <span class="op">=</span> <span class="st">"target_right"</span>,</span>
<span> ambiguous_nuc <span class="op">=</span> <span class="st">"zeros"</span><span class="op">)</span></span>
<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span> </span>
<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary_2</span></span>
<span><span class="va">x</span> <span class="co"># fifth row is zero vector </span></span></code></pre></div>
<pre><code><span><span class="co">## a b c d f g h i</span></span>
<span><span class="co">## [1,] 1 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [2,] 0 1 0 0 0 0 0 0</span></span>
<span><span class="co">## [3,] 0 0 1 0 0 0 0 0</span></span>
<span><span class="co">## [4,] 0 0 0 1 0 0 0 0</span></span>
<span><span class="co">## [5,] 0 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [6,] 0 0 0 0 1 0 0 0</span></span></code></pre>
<ol start="2" style="list-style-type: decimal">
<li>equal probability</li>
</ol>
<div class="sourceCode" id="cb58"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="co"># equal</span></span>
<span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span>
<span> train_type <span class="op">=</span> <span class="st">"lm"</span>,</span>
<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
<span> maxlen <span class="op">=</span> <span class="fl">6</span>,</span>
<span> vocabulary <span class="op">=</span> <span class="va">vocabulary_2</span>,</span>
<span> output_format <span class="op">=</span> <span class="st">"target_right"</span>,</span>
<span> ambiguous_nuc <span class="op">=</span> <span class="st">"equal"</span><span class="op">)</span> </span>
<span></span>
<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary_2</span></span>
<span><span class="va">x</span> <span class="co"># fifth row is 1/8 for every entry </span></span></code></pre></div>
<pre><code><span><span class="co">## a b c d f g h i</span></span>
<span><span class="co">## [1,] 1.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000</span></span>
<span><span class="co">## [2,] 0.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000</span></span>
<span><span class="co">## [3,] 0.000 0.000 1.000 0.000 0.000 0.000 0.000 0.000</span></span>
<span><span class="co">## [4,] 0.000 0.000 0.000 1.000 0.000 0.000 0.000 0.000</span></span>
<span><span class="co">## [5,] 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125</span></span>
<span><span class="co">## [6,] 0.000 0.000 0.000 0.000 1.000 0.000 0.000 0.000</span></span></code></pre>
<ol start="3" style="list-style-type: decimal">
<li>use distribution of current file</li>
</ol>
<div class="sourceCode" id="cb60"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="co"># empirical</span></span>
<span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span>
<span> train_type <span class="op">=</span> <span class="st">"lm"</span>,</span>
<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
<span> maxlen <span class="op">=</span> <span class="fl">6</span>,</span>
<span> vocabulary <span class="op">=</span> <span class="va">vocabulary_2</span>,</span>
<span> output_format <span class="op">=</span> <span class="st">"target_right"</span>,</span>
<span> ambiguous_nuc <span class="op">=</span> <span class="st">"empirical"</span><span class="op">)</span> </span>
<span></span>
<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span> </span>
<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary_2</span></span>
<span><span class="va">x</span> <span class="co"># fifth row is distribution of file</span></span></code></pre></div>
<pre><code><span><span class="co">## a b c d f g</span></span>
<span><span class="co">## [1,] 1.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000</span></span>
<span><span class="co">## [2,] 0.00000000 1.00000000 0.00000000 0.00000000 0.00000000 0.00000000</span></span>
<span><span class="co">## [3,] 0.00000000 0.00000000 1.00000000 0.00000000 0.00000000 0.00000000</span></span>
<span><span class="co">## [4,] 0.00000000 0.00000000 0.00000000 1.00000000 0.00000000 0.00000000</span></span>
<span><span class="co">## [5,] 0.09090909 0.09090909 0.09090909 0.09090909 0.09090909 0.09090909</span></span>
<span><span class="co">## [6,] 0.00000000 0.00000000 0.00000000 0.00000000 1.00000000 0.00000000</span></span>
<span><span class="co">## h i</span></span>
<span><span class="co">## [1,] 0.00000000 0.0000000</span></span>
<span><span class="co">## [2,] 0.00000000 0.0000000</span></span>
<span><span class="co">## [3,] 0.00000000 0.0000000</span></span>
<span><span class="co">## [4,] 0.00000000 0.0000000</span></span>
<span><span class="co">## [5,] 0.09090909 0.3636364</span></span>
<span><span class="co">## [6,] 0.00000000 0.0000000</span></span></code></pre>
<ol start="4" style="list-style-type: decimal">
<li>discard</li>
</ol>
<div class="sourceCode" id="cb62"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="co"># discard</span></span>
<span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span>
<span> train_type <span class="op">=</span> <span class="st">"lm"</span>,</span>
<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
<span> maxlen <span class="op">=</span> <span class="fl">6</span>,</span>
<span> vocabulary <span class="op">=</span> <span class="va">vocabulary_2</span>,</span>
<span> output_format <span class="op">=</span> <span class="st">"target_right"</span>,</span>
<span> ambiguous_nuc <span class="op">=</span> <span class="st">"discard"</span><span class="op">)</span> </span>
<span></span>
<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary_2</span></span>
<span><span class="va">x</span> <span class="co"># first sample with only characters from vocabulary is fghiii|i</span></span></code></pre></div>
<pre><code><span><span class="co">## a b c d f g h i</span></span>
<span><span class="co">## [1,] 0 0 0 0 1 0 0 0</span></span>
<span><span class="co">## [2,] 0 0 0 0 0 1 0 0</span></span>
<span><span class="co">## [3,] 0 0 0 0 0 0 1 0</span></span>
<span><span class="co">## [4,] 0 0 0 0 0 0 0 1</span></span>
<span><span class="co">## [5,] 0 0 0 0 0 0 0 1</span></span>
<span><span class="co">## [6,] 0 0 0 0 0 0 0 1</span></span></code></pre>
</div>
<div class="section level3">
<h3 id="proportion_per_seq">proportion_per_seq<a class="anchor" aria-label="anchor" href="#proportion_per_seq"></a>
</h3>
<p>The <code>proportion_per_seq</code> argument gives the option to use
a random subsequence instead of the full sequence.</p>
<div class="sourceCode" id="cb64"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="fu"><a href="https://rdrr.io/r/base/cat.html" class="external-link">cat</a></span><span class="op">(</span><span class="st">"sequence is "</span>, <span class="fu"><a href="https://rdrr.io/r/base/nchar.html" class="external-link">nchar</a></span><span class="op">(</span><span class="va">sequence</span><span class="op">)</span>, <span class="st">"characters long \n"</span><span class="op">)</span></span></code></pre></div>
<pre><code><span><span class="co">## sequence is 12 characters long</span></span></code></pre>
<div class="sourceCode" id="cb66"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span>
<span> train_type <span class="op">=</span> <span class="st">"lm"</span>,</span>
<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
<span> maxlen <span class="op">=</span> <span class="fl">5</span>,</span>
<span> seed <span class="op">=</span> <span class="fl">1</span>,</span>
<span> vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span>
<span> output_format <span class="op">=</span> <span class="st">"target_right"</span>,</span>
<span> <span class="co"># take random subsequence using 50% of sequence </span></span>
<span> proportion_per_seq <span class="op">=</span> <span class="fl">0.5</span><span class="op">)</span></span>
<span></span>
<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>, , <span class="op">]</span></span>
<span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span>
<span><span class="va">x</span> <span class="co"># abcde</span></span></code></pre></div>
<pre><code><span><span class="co">## a b c d e f g h i</span></span>
<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [2,] 0 1 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [3,] 0 0 1 0 0 0 0 0 0</span></span>
<span><span class="co">## [4,] 0 0 0 1 0 0 0 0 0</span></span>
<span><span class="co">## [5,] 0 0 0 0 1 0 0 0 0</span></span></code></pre>
<div class="sourceCode" id="cb68"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">y</span> <span class="co"># f</span></span></code></pre></div>
<pre><code><span><span class="co">## a b c d e f g h i</span></span>
<span><span class="co">## [1,] 0 0 0 0 0 1 0 0 0</span></span></code></pre>
</div>
<div class="section level3">
<h3 id="file_limit">file_limit<a class="anchor" aria-label="anchor" href="#file_limit"></a>
</h3>
<p>Integer or NULL. If integer, use only specified number of randomly
sampled files for training.</p>
</div>
<div class="section level3">
<h3 id="delete_used_files">delete_used_files<a class="anchor" aria-label="anchor" href="#delete_used_files"></a>
</h3>
<p>If true, delete each file once it has been used. Only applies to rds files.</p>
<div class="sourceCode" id="cb70"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">x</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/array.html" class="external-link">array</a></span><span class="op">(</span><span class="fl">0</span>, dim <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="fl">1</span>,<span class="fl">5</span>,<span class="fl">4</span><span class="op">)</span><span class="op">)</span></span>
<span><span class="va">y</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/matrix.html" class="external-link">matrix</a></span><span class="op">(</span><span class="fl">0</span>, ncol <span class="op">=</span> <span class="fl">1</span><span class="op">)</span></span>
<span><span class="va">rds_path</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span>fileext <span class="op">=</span> <span class="st">".rds"</span><span class="op">)</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/readRDS.html" class="external-link">saveRDS</a></span><span class="op">(</span><span class="fu"><a href="https://rdrr.io/r/base/list.html" class="external-link">list</a></span><span class="op">(</span><span class="va">x</span>, <span class="va">y</span><span class="op">)</span>, <span class="va">rds_path</span><span class="op">)</span></span>
<span></span>
<span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">rds_path</span>,</span>
<span> delete_used_files <span class="op">=</span> <span class="cn">TRUE</span>,</span>
<span> train_type <span class="op">=</span> <span class="st">"label_rds"</span>,</span>
<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
<span> maxlen <span class="op">=</span> <span class="fl">5</span><span class="op">)</span></span>
<span></span>
<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/files.html" class="external-link">file.exists</a></span><span class="op">(</span><span class="va">rds_path</span><span class="op">)</span></span></code></pre></div>
<pre><code><span><span class="co">## [1] FALSE</span></span></code></pre>
<div class="sourceCode" id="cb72"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="co"># z <- gen()</span></span>
<span><span class="co"># When calling the generator again, it will wait until it finds a file again from the files listed in </span></span>
<span><span class="co"># the initial `path` argument. Can be used if other processes create rds files.</span></span></code></pre></div>
</div>
<div class="section level3">
<h3 id="max_samples">max_samples<a class="anchor" aria-label="anchor" href="#max_samples"></a>
</h3>
<p>Only use fixed number of samples per file. Randomly choose which
samples to use. (If <code>random_sampling = FALSE</code>, samples are
consecutive.)</p>
<div class="sourceCode" id="cb73"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span>
<span> train_type <span class="op">=</span> <span class="st">"lm"</span>,</span>
<span> batch_size <span class="op">=</span> <span class="fl">2</span>,</span>
<span> maxlen <span class="op">=</span> <span class="fl">5</span>,</span>
<span> step <span class="op">=</span> <span class="fl">1</span>,</span>
<span> seed <span class="op">=</span> <span class="fl">3</span>,</span>
<span> vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span>
<span> output_format <span class="op">=</span> <span class="st">"target_right"</span>,</span>
<span> max_samples <span class="op">=</span> <span class="fl">2</span><span class="op">)</span></span>
<span></span>
<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
<span><span class="va">x1</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>, , <span class="op">]</span></span>
<span><span class="va">x2</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">2</span>, , <span class="op">]</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x1</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x2</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span>
<span><span class="va">x1</span> <span class="co"># efghi</span></span></code></pre></div>
<pre><code><span><span class="co">## a b c d e f g h i</span></span>
<span><span class="co">## [1,] 0 0 0 0 1 0 0 0 0</span></span>
<span><span class="co">## [2,] 0 0 0 0 0 1 0 0 0</span></span>
<span><span class="co">## [3,] 0 0 0 0 0 0 1 0 0</span></span>
<span><span class="co">## [4,] 0 0 0 0 0 0 0 1 0</span></span>
<span><span class="co">## [5,] 0 0 0 0 0 0 0 0 1</span></span></code></pre>
<div class="sourceCode" id="cb75"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">x2</span> <span class="co"># fghii</span></span></code></pre></div>
<pre><code><span><span class="co">## a b c d e f g h i</span></span>
<span><span class="co">## [1,] 0 0 0 0 0 1 0 0 0</span></span>
<span><span class="co">## [2,] 0 0 0 0 0 0 1 0 0</span></span>
<span><span class="co">## [3,] 0 0 0 0 0 0 0 1 0</span></span>
<span><span class="co">## [4,] 0 0 0 0 0 0 0 0 1</span></span>
<span><span class="co">## [5,] 0 0 0 0 0 0 0 0 1</span></span></code></pre>
</div>
<div class="section level3">
<h3 id="random_sampling">random_sampling<a class="anchor" aria-label="anchor" href="#random_sampling"></a>
</h3>
<p>If you use <code>max_samples</code>, the generator will randomly choose
a subset of all possible samples, but those samples are consecutive.
With <code>random_sampling = TRUE</code>, samples are completely
random.</p>
<div class="sourceCode" id="cb77"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span>
<span> train_type <span class="op">=</span> <span class="st">"lm"</span>,</span>
<span> batch_size <span class="op">=</span> <span class="fl">2</span>,</span>
<span> maxlen <span class="op">=</span> <span class="fl">5</span>,</span>
<span> seed <span class="op">=</span> <span class="fl">66</span>,</span>
<span> random_sampling <span class="op">=</span> <span class="cn">TRUE</span>,</span>
<span> vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span>
<span> output_format <span class="op">=</span> <span class="st">"target_right"</span>,</span>
<span> max_samples <span class="op">=</span> <span class="fl">2</span><span class="op">)</span></span>
<span></span>
<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
<span><span class="va">x1</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>, , <span class="op">]</span></span>
<span><span class="va">x2</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">2</span>, , <span class="op">]</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x1</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x2</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span>
<span><span class="va">x1</span> <span class="co"># ghiii</span></span></code></pre></div>
<pre><code><span><span class="co">## a b c d e f g h i</span></span>
<span><span class="co">## [1,] 0 0 0 0 0 0 1 0 0</span></span>
<span><span class="co">## [2,] 0 0 0 0 0 0 0 1 0</span></span>
<span><span class="co">## [3,] 0 0 0 0 0 0 0 0 1</span></span>
<span><span class="co">## [4,] 0 0 0 0 0 0 0 0 1</span></span>
<span><span class="co">## [5,] 0 0 0 0 0 0 0 0 1</span></span></code></pre>
<div class="sourceCode" id="cb79"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">x2</span> <span class="co"># efghi</span></span></code></pre></div>
<pre><code><span><span class="co">## a b c d e f g h i</span></span>
<span><span class="co">## [1,] 0 0 0 0 1 0 0 0 0</span></span>
<span><span class="co">## [2,] 0 0 0 0 0 1 0 0 0</span></span>
<span><span class="co">## [3,] 0 0 0 0 0 0 1 0 0</span></span>
<span><span class="co">## [4,] 0 0 0 0 0 0 0 1 0</span></span>
<span><span class="co">## [5,] 0 0 0 0 0 0 0 0 1</span></span></code></pre>
</div>
<div class="section level3">
<h3 id="target_len">target_len<a class="anchor" aria-label="anchor" href="#target_len"></a>
</h3>
<p>Target length for language model.</p>
<div class="sourceCode" id="cb81"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span>
<span> train_type <span class="op">=</span> <span class="st">"lm"</span>,</span>
<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
<span> target_len <span class="op">=</span> <span class="fl">3</span>, </span>
<span> maxlen <span class="op">=</span> <span class="fl">5</span>,</span>
<span> vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span>
<span> output_format <span class="op">=</span> <span class="st">"target_right"</span><span class="op">)</span></span>
<span></span>
<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>, , <span class="op">]</span></span>
<span><span class="va">y1</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span><span class="op">[</span> , <span class="fl">1</span>, <span class="op">]</span></span>
<span><span class="va">y2</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span><span class="op">[</span> , <span class="fl">2</span>, <span class="op">]</span></span>
<span><span class="va">y3</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span><span class="op">[</span> , <span class="fl">3</span>, <span class="op">]</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/names.html" class="external-link">names</a></span><span class="op">(</span><span class="va">y1</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/names.html" class="external-link">names</a></span><span class="op">(</span><span class="va">y2</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/names.html" class="external-link">names</a></span><span class="op">(</span><span class="va">y3</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span>
<span><span class="va">x</span> <span class="co"># abcde</span></span></code></pre></div>
<pre><code><span><span class="co">## a b c d e f g h i</span></span>
<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [2,] 0 1 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [3,] 0 0 1 0 0 0 0 0 0</span></span>
<span><span class="co">## [4,] 0 0 0 1 0 0 0 0 0</span></span>
<span><span class="co">## [5,] 0 0 0 0 1 0 0 0 0</span></span></code></pre>
<div class="sourceCode" id="cb83"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">y1</span> <span class="co"># f</span></span></code></pre></div>
<pre><code><span><span class="co">## a b c d e f g h i </span></span>
<span><span class="co">## 0 0 0 0 0 1 0 0 0</span></span></code></pre>
<div class="sourceCode" id="cb85"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">y2</span> <span class="co"># g</span></span></code></pre></div>
<pre><code><span><span class="co">## a b c d e f g h i </span></span>
<span><span class="co">## 0 0 0 0 0 0 1 0 0</span></span></code></pre>
<div class="sourceCode" id="cb87"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">y3</span> <span class="co"># h</span></span></code></pre></div>
<pre><code><span><span class="co">## a b c d e f g h i </span></span>
<span><span class="co">## 0 0 0 0 0 0 0 1 0</span></span></code></pre>
</div>
<div class="section level3">
<h3 id="n_gram-n_gram_stride">n_gram / n_gram_stride<a class="anchor" aria-label="anchor" href="#n_gram-n_gram_stride"></a>
</h3>
<p>Encode the target of the language model not character-wise, but by
combining n characters into one target. <code>n_gram_stride</code> determines the
frequency of the n-gram encoding.</p>
<div class="sourceCode" id="cb89"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span>
<span> train_type <span class="op">=</span> <span class="st">"lm"</span>,</span>
<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
<span> target_len <span class="op">=</span> <span class="fl">6</span>, </span>
<span> n_gram <span class="op">=</span> <span class="fl">3</span>,</span>
<span> n_gram_stride <span class="op">=</span> <span class="fl">3</span>,</span>
<span> maxlen <span class="op">=</span> <span class="fl">3</span>,</span>
<span> vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span>
<span> output_format <span class="op">=</span> <span class="st">"target_right"</span><span class="op">)</span></span>
<span></span>
<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span></span>
<span><span class="va">y1</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span><span class="op">[</span> , <span class="fl">1</span>, <span class="op">]</span></span>
<span><span class="va">y2</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span><span class="op">[</span> , <span class="fl">2</span>, <span class="op">]</span></span>
<span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/dim.html" class="external-link">dim</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span><span class="op">[</span><span class="fl">3</span><span class="op">]</span> <span class="op">==</span> <span class="fu"><a href="https://rdrr.io/r/base/length.html" class="external-link">length</a></span><span class="op">(</span><span class="va">vocabulary</span><span class="op">)</span><span class="op">^</span><span class="fl">3</span></span></code></pre></div>
<pre><code><span><span class="co">## [1] TRUE</span></span></code></pre>
<div class="sourceCode" id="cb91"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="co"># x = abc as 3-gram</span></span>
<span><span class="co"># y1 = def as 3-gram</span></span>
<span><span class="co"># y2 = ghi as 3-gram</span></span></code></pre></div>
</div>
<div class="section level3">
<h3 id="add_noise">add_noise<a class="anchor" aria-label="anchor" href="#add_noise"></a>
</h3>
<p>Add noise to input. Must be a list that specifies the noise distribution
or <code>NULL</code> (no noise). The list contains the argument <code>noise_type</code>:
either <code>"normal"</code> or <code>"uniform"</code>. Optional
arguments are <code>sd</code> and <code>mean</code> if
<code>noise_type</code> is <code>"normal"</code> (defaults are
<code>sd=1</code> and <code>mean=0</code>), or <code>min</code> and
<code>max</code> if <code>noise_type</code> is <code>"uniform"</code>
(defaults are <code>min=0</code> and <code>max=1</code>).</p>
<div class="sourceCode" id="cb92"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span>
<span> train_type <span class="op">=</span> <span class="st">"lm"</span>,</span>
<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
<span> add_noise <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/list.html" class="external-link">list</a></span><span class="op">(</span>noise_type <span class="op">=</span> <span class="st">"normal"</span>, mean <span class="op">=</span> <span class="fl">0</span>, sd <span class="op">=</span> <span class="fl">0.01</span><span class="op">)</span>,</span>
<span> maxlen <span class="op">=</span> <span class="fl">5</span>,</span>
<span> vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span>
<span> output_format <span class="op">=</span> <span class="st">"target_right"</span><span class="op">)</span></span>
<span></span>
<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>, , <span class="op">]</span></span>
<span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span></span>
<span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/Round.html" class="external-link">round</a></span><span class="op">(</span><span class="va">x</span>, <span class="fl">3</span><span class="op">)</span> <span class="co"># abcde + noise</span></span></code></pre></div>
<pre><code><span><span class="co">## a b c d e f g h i</span></span>
<span><span class="co">## [1,] 0.994 0.005 -0.006 0.008 0.006 0.014 -0.004 0.007 -0.001</span></span>
<span><span class="co">## [2,] 0.002 1.007 -0.022 0.006 -0.001 -0.001 -0.001 0.006 0.009</span></span>
<span><span class="co">## [3,] -0.008 0.006 1.011 0.009 -0.002 0.004 0.011 -0.007 0.004</span></span>
<span><span class="co">## [4,] 0.016 -0.003 0.000 1.008 -0.015 -0.001 0.008 -0.007 -0.006</span></span>
<span><span class="co">## [5,] 0.003 0.015 0.000 0.001 0.995 -0.014 -0.002 0.004 0.003</span></span></code></pre>
<div class="sourceCode" id="cb94"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">y</span> <span class="co"># f</span></span></code></pre></div>
<pre><code><span><span class="co">## a b c d e f</span></span>
<span><span class="co">## [1,] -0.008204684 0.003898432 0.009438362 -0.01989352 0.004179416 0.9958501</span></span>
<span><span class="co">## g h i</span></span>
<span><span class="co">## [1,] -0.002533617 0.007685329 -0.01129363</span></span></code></pre>
</div>
<div class="section level3">
<h3 id="proportion_entries">proportion_entries<a class="anchor" aria-label="anchor" href="#proportion_entries"></a>
</h3>
<p>If a fasta file has multiple entries, you can randomly choose a
subset. For example, if the file has 6 entries and
<code>proportion_entries = 0.5</code>, the generator will randomly choose
only 3 of the entries.</p>
</div>
<div class="section level3">
<h3 id="shuffle_file_order">shuffle_file_order<a class="anchor" aria-label="anchor" href="#shuffle_file_order"></a>
</h3>
<p>Shuffle file order before iterating through files. Order gets
reshuffled after every iteration.</p>
</div>
<div class="section level3">
<h3 id="shuffle_input">shuffle_input<a class="anchor" aria-label="anchor" href="#shuffle_input"></a>
</h3>
<p>Whether to shuffle fasta entries if fasta file has multiple
entries.</p>
</div>
<div class="section level3">
<h3 id="reverse_complement">reverse_complement<a class="anchor" aria-label="anchor" href="#reverse_complement"></a>
</h3>
<p>If <code>TRUE</code>, randomly decide for every batch to use original
sequence or its reverse complement. Only implemented for <code>ACGT</code>
vocabulary.</p>
</div>
<div class="section level3">
<h3 id="sample_by_file_size">sample_by_file_size<a class="anchor" aria-label="anchor" href="#sample_by_file_size"></a>
</h3>
<p>Randomly choose new file by sampling according to file size (bigger
files more likely).</p>
</div>
<div class="section level3">
<h3 id="concat_seq">concat_seq<a class="anchor" aria-label="anchor" href="#concat_seq"></a>
</h3>
<p>Character string or <code>NULL</code>. If not <code>NULL</code>, all
entries from the file get concatenated into one sequence with the
<code>concat_seq</code> string between them. Use
<code>concat_seq = ""</code> if you don’t want to add a new token.</p>
<div class="sourceCode" id="cb96"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">df</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span>Sequence <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"AC"</span>, <span class="st">"AG"</span>, <span class="st">"AT"</span><span class="op">)</span>, Header <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/paste.html" class="external-link">paste0</a></span><span class="op">(</span><span class="st">"header"</span>, <span class="fl">1</span><span class="op">:</span><span class="fl">3</span><span class="op">)</span><span class="op">)</span></span>
<span><span class="va">fasta_path</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span>fileext <span class="op">=</span> <span class="st">".fasta"</span><span class="op">)</span></span>
<span><span class="va">fasta_file</span> <span class="op"><-</span> <span class="fu">microseq</span><span class="fu">::</span><span class="fu"><a href="https://rdrr.io/pkg/microseq/man/readFasta.html" class="external-link">writeFasta</a></span><span class="op">(</span><span class="va">df</span>, <span class="va">fasta_path</span><span class="op">)</span></span>
<span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">fasta_path</span>,</span>
<span> train_type <span class="op">=</span> <span class="st">"lm"</span>,</span>
<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
<span> maxlen <span class="op">=</span> <span class="fl">9</span>,</span>
<span> vocabulary <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"A"</span>, <span class="st">"C"</span>, <span class="st">"G"</span>, <span class="st">"T"</span>, <span class="st">"Z"</span><span class="op">)</span>,</span>
<span> concat_seq <span class="op">=</span> <span class="st">"ZZ"</span>,</span>
<span> output_format <span class="op">=</span> <span class="st">"target_right"</span><span class="op">)</span></span>
<span></span>
<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>, , <span class="op">]</span></span>
<span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span></span>
<span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"A"</span>, <span class="st">"C"</span>, <span class="st">"G"</span>, <span class="st">"T"</span>, <span class="st">"Z"</span><span class="op">)</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"A"</span>, <span class="st">"C"</span>, <span class="st">"G"</span>, <span class="st">"T"</span>, <span class="st">"Z"</span><span class="op">)</span></span>
<span><span class="va">x</span> <span class="co"># ACZZAGZZA</span></span></code></pre></div>
<pre><code><span><span class="co">## A C G T Z</span></span>
<span><span class="co">## [1,] 1 0 0 0 0</span></span>
<span><span class="co">## [2,] 0 1 0 0 0</span></span>
<span><span class="co">## [3,] 0 0 0 0 1</span></span>
<span><span class="co">## [4,] 0 0 0 0 1</span></span>
<span><span class="co">## [5,] 1 0 0 0 0</span></span>
<span><span class="co">## [6,] 0 0 1 0 0</span></span>
<span><span class="co">## [7,] 0 0 0 0 1</span></span>
<span><span class="co">## [8,] 0 0 0 0 1</span></span>
<span><span class="co">## [9,] 1 0 0 0 0</span></span></code></pre>
<div class="sourceCode" id="cb98"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">y</span> <span class="co"># T</span></span></code></pre></div>
<pre><code><span><span class="co">## A C G T Z</span></span>
<span><span class="co">## [1,] 0 0 0 1 0</span></span></code></pre>
</div>
<div class="section level3">
<h3 id="set_learning">set_learning<a class="anchor" aria-label="anchor" href="#set_learning"></a>
</h3>
<p>When you want to assign one label to a set of samples. Only implemented
for <code>train_type = "label_folder"</code>. Input is a list with the
following parameters:</p>
<ul>
<li>
<code>samples_per_target</code>: how many samples to use for one
target</li>
<li>
<code>maxlen</code>: length of one sample</li>
<li>
<code>reshape_mode</code>: <code>"time_dist"</code>, <code>"multi_input"</code>
or <code>"concat"</code>.
<ul>
<li>If <code>reshape_mode = "multi_input"</code>, generator will produce
<code>samples_per_target</code> separate inputs, each of length
<code>maxlen</code>.</li>
<li>If <code>reshape_mode = "time_dist"</code>, generator will produce a
4D input array. The dimensions correspond to
<code>(batch_size, samples_per_target, maxlen, length(vocabulary))</code>.
</li>
<li>If <code>reshape_mode</code> is <code>"concat"</code>, generator
will concatenate <code>samples_per_target</code> sequences of length
<code>maxlen</code> to one long sequence.</li>
</ul>
</li>
<li>If <code>reshape_mode = "concat"</code>, there is an additional
<code>buffer_len</code> argument that adds a new token between concatenated
samples.
<ul>
<li>If <code>buffer_len</code> is an integer, the sub-sequences are
interspaced with <code>buffer_len</code> rows. The input length is
(<code>maxlen</code> * <code>samples_per_target</code>) +
<code>buffer_len</code> * (<code>samples_per_target</code> - 1).</li>
</ul>
</li>
</ul>
<div class="sourceCode" id="cb100"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="co"># create data for second label</span></span>
<span><span class="va">df</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span>Sequence <span class="op">=</span> <span class="st">"AABAACAADAAE"</span>, Header <span class="op">=</span> <span class="st">"header_1"</span><span class="op">)</span></span>
<span><span class="va">file_path_2</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span>fileext <span class="op">=</span> <span class="st">".fasta"</span><span class="op">)</span></span>
<span><span class="va">fasta_file</span> <span class="op"><-</span> <span class="fu">microseq</span><span class="fu">::</span><span class="fu"><a href="https://rdrr.io/pkg/microseq/man/readFasta.html" class="external-link">writeFasta</a></span><span class="op">(</span><span class="va">df</span>, <span class="va">file_path_2</span><span class="op">)</span></span>
<span></span>
<span><span class="co"># multi_input </span></span>
<span><span class="va">set_learning</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/list.html" class="external-link">list</a></span><span class="op">(</span>reshape_mode <span class="op">=</span> <span class="st">"multi_input"</span>,</span>
<span> maxlen <span class="op">=</span> <span class="fl">4</span>,</span>
<span> samples_per_target <span class="op">=</span> <span class="fl">3</span><span class="op">)</span></span>
<span></span>
<span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="va">file_path</span>, <span class="va">file_path_2</span><span class="op">)</span>, <span class="co"># path has length 2 => 2 classes</span></span>
<span> train_type <span class="op">=</span> <span class="st">"label_folder"</span>,</span>
<span> batch_size <span class="op">=</span> <span class="fl">2</span>,</span>
<span> maxlen <span class="op">=</span> <span class="fl">4</span>,</span>
<span> step <span class="op">=</span> <span class="fl">1</span>, </span>
<span> vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span>
<span> set_learning <span class="op">=</span> <span class="va">set_learning</span><span class="op">)</span></span>
<span></span>
<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span></span>
<span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/length.html" class="external-link">length</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="co"># 3 samples per target</span></span></code></pre></div>
<pre><code><span><span class="co">## [1] 3</span></span></code></pre>
<div class="sourceCode" id="cb102"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">x_1_1</span> <span class="op"><-</span> <span class="va">x</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>, , <span class="op">]</span></span>
<span><span class="va">x_1_1</span> <span class="co"># abcd</span></span></code></pre></div>
<pre><code><span><span class="co">## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]</span></span>
<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [2,] 0 1 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [3,] 0 0 1 0 0 0 0 0 0</span></span>
<span><span class="co">## [4,] 0 0 0 1 0 0 0 0 0</span></span></code></pre>
<div class="sourceCode" id="cb104"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">x_1_2</span> <span class="op"><-</span> <span class="va">x</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>, , <span class="op">]</span></span>
<span><span class="va">x_1_2</span> <span class="co"># bcde</span></span></code></pre></div>
<pre><code><span><span class="co">## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]</span></span>
<span><span class="co">## [1,] 0 1 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [2,] 0 0 1 0 0 0 0 0 0</span></span>
<span><span class="co">## [3,] 0 0 0 1 0 0 0 0 0</span></span>
<span><span class="co">## [4,] 0 0 0 0 1 0 0 0 0</span></span></code></pre>
<div class="sourceCode" id="cb106"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">x_1_3</span> <span class="op"><-</span> <span class="va">x</span><span class="op">[[</span><span class="fl">3</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>, , <span class="op">]</span></span>
<span><span class="va">x_1_3</span> <span class="co"># cdef</span></span></code></pre></div>
<pre><code><span><span class="co">## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]</span></span>
<span><span class="co">## [1,] 0 0 1 0 0 0 0 0 0</span></span>
<span><span class="co">## [2,] 0 0 0 1 0 0 0 0 0</span></span>
<span><span class="co">## [3,] 0 0 0 0 1 0 0 0 0</span></span>
<span><span class="co">## [4,] 0 0 0 0 0 1 0 0 0</span></span></code></pre>
<div class="sourceCode" id="cb108"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">x_2_1</span> <span class="op"><-</span> <span class="va">x</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">2</span>, , <span class="op">]</span></span>
<span><span class="va">x_2_1</span> <span class="co"># aaba</span></span></code></pre></div>
<pre><code><span><span class="co">## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]</span></span>
<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [2,] 1 0 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [3,] 0 1 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [4,] 1 0 0 0 0 0 0 0 0</span></span></code></pre>
<div class="sourceCode" id="cb110"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">x_2_2</span> <span class="op"><-</span> <span class="va">x</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">2</span>, , <span class="op">]</span></span>
<span><span class="va">x_2_2</span> <span class="co"># abaa</span></span></code></pre></div>
<pre><code><span><span class="co">## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]</span></span>
<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [2,] 0 1 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [3,] 1 0 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [4,] 1 0 0 0 0 0 0 0 0</span></span></code></pre>
<div class="sourceCode" id="cb112"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">x_2_3</span> <span class="op"><-</span> <span class="va">x</span><span class="op">[[</span><span class="fl">3</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">2</span>, , <span class="op">]</span></span>
<span><span class="va">x_2_3</span> <span class="co"># baac</span></span></code></pre></div>
<pre><code><span><span class="co">## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]</span></span>
<span><span class="co">## [1,] 0 1 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [2,] 1 0 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [3,] 1 0 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [4,] 0 0 1 0 0 0 0 0 0</span></span></code></pre>
<div class="sourceCode" id="cb114"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"label_1"</span>, <span class="st">"label_2"</span><span class="op">)</span></span>
<span><span class="va">y</span> </span></code></pre></div>
<pre><code><span><span class="co">## label_1 label_2</span></span>
<span><span class="co">## [1,] 1 0</span></span>
<span><span class="co">## [2,] 0 1</span></span></code></pre>
<div class="sourceCode" id="cb116"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="co"># concat </span></span>
<span><span class="va">set_learning</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/list.html" class="external-link">list</a></span><span class="op">(</span>reshape_mode <span class="op">=</span> <span class="st">"concat"</span>,</span>
<span> maxlen <span class="op">=</span> <span class="fl">4</span>,</span>
<span> samples_per_target <span class="op">=</span> <span class="fl">3</span><span class="op">)</span></span>
<span></span>
<span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="va">file_path</span>, <span class="va">file_path_2</span><span class="op">)</span>, <span class="co"># path has length 2 => 2 classes</span></span>
<span> train_type <span class="op">=</span> <span class="st">"label_folder"</span>,</span>
<span> batch_size <span class="op">=</span> <span class="fl">2</span>,</span>
<span> maxlen <span class="op">=</span> <span class="fl">4</span>,</span>
<span> step <span class="op">=</span> <span class="fl">2</span>, </span>
<span> vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span>
<span> set_learning <span class="op">=</span> <span class="va">set_learning</span><span class="op">)</span></span>
<span></span>
<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span></span>
<span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/dim.html" class="external-link">dim</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> </span></code></pre></div>
<pre><code><span><span class="co">## [1] 2 12 9</span></span></code></pre>
<div class="sourceCode" id="cb118"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">x_1</span> <span class="op"><-</span> <span class="va">x</span><span class="op">[</span><span class="fl">1</span>, , <span class="op">]</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x_1</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span>
<span><span class="va">x_1</span> <span class="co"># abcd | cdef | efgh</span></span></code></pre></div>
<pre><code><span><span class="co">## a b c d e f g h i</span></span>
<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [2,] 0 1 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [3,] 0 0 1 0 0 0 0 0 0</span></span>
<span><span class="co">## [4,] 0 0 0 1 0 0 0 0 0</span></span>
<span><span class="co">## [5,] 0 0 1 0 0 0 0 0 0</span></span>
<span><span class="co">## [6,] 0 0 0 1 0 0 0 0 0</span></span>
<span><span class="co">## [7,] 0 0 0 0 1 0 0 0 0</span></span>
<span><span class="co">## [8,] 0 0 0 0 0 1 0 0 0</span></span>
<span><span class="co">## [9,] 0 0 0 0 1 0 0 0 0</span></span>
<span><span class="co">## [10,] 0 0 0 0 0 1 0 0 0</span></span>
<span><span class="co">## [11,] 0 0 0 0 0 0 1 0 0</span></span>
<span><span class="co">## [12,] 0 0 0 0 0 0 0 1 0</span></span></code></pre>
<div class="sourceCode" id="cb120"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">x_2</span> <span class="op"><-</span> <span class="va">x</span><span class="op">[</span><span class="fl">2</span>, , <span class="op">]</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x_2</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span>
<span><span class="va">x_2</span> <span class="co"># aaba | baac | acaa</span></span></code></pre></div>
<pre><code><span><span class="co">## a b c d e f g h i</span></span>
<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [2,] 1 0 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [3,] 0 1 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [4,] 1 0 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [5,] 0 1 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [6,] 1 0 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [7,] 1 0 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [8,] 0 0 1 0 0 0 0 0 0</span></span>
<span><span class="co">## [9,] 1 0 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [10,] 0 0 1 0 0 0 0 0 0</span></span>
<span><span class="co">## [11,] 1 0 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [12,] 1 0 0 0 0 0 0 0 0</span></span></code></pre>
<div class="sourceCode" id="cb122"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"label_1"</span>, <span class="st">"label_2"</span><span class="op">)</span></span>
<span><span class="va">y</span> </span></code></pre></div>
<pre><code><span><span class="co">## label_1 label_2</span></span>
<span><span class="co">## [1,] 1 0</span></span>
<span><span class="co">## [2,] 0 1</span></span></code></pre>
</div>
<div class="section level3">
<h3 id="use_quality_score">use_quality_score<a class="anchor" aria-label="anchor" href="#use_quality_score"></a>
</h3>
<p>If <code>TRUE</code>, instead of one-hot encoding, use quality score
of fastq file.</p>
<div class="sourceCode" id="cb124"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">df</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span>Sequence <span class="op">=</span> <span class="st">"ACAGAT"</span>, Header <span class="op">=</span> <span class="st">"header_1"</span>, Quality <span class="op">=</span> <span class="st">"!#*=?I"</span><span class="op">)</span></span>
<span><span class="va">fastq_path</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span>fileext <span class="op">=</span> <span class="st">".fastq"</span><span class="op">)</span></span>
<span><span class="va">fastq_file</span> <span class="op"><-</span> <span class="fu">microseq</span><span class="fu">::</span><span class="fu"><a href="https://rdrr.io/pkg/microseq/man/readFastq.html" class="external-link">writeFastq</a></span><span class="op">(</span><span class="va">df</span>, <span class="va">fastq_path</span><span class="op">)</span></span>
<span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">fastq_path</span>,</span>
<span> train_type <span class="op">=</span> <span class="st">"lm"</span>,</span>
<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
<span> maxlen <span class="op">=</span> <span class="fl">5</span>,</span>
<span> format <span class="op">=</span> <span class="st">"fastq"</span>,</span>
<span> vocabulary <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"A"</span>, <span class="st">"C"</span>, <span class="st">"G"</span>, <span class="st">"T"</span><span class="op">)</span>,</span>
<span> use_quality_score <span class="op">=</span> <span class="cn">TRUE</span>,</span>
<span> output_format <span class="op">=</span> <span class="st">"target_right"</span><span class="op">)</span></span>
<span></span>
<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>, , <span class="op">]</span></span>
<span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span></span>
<span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"A"</span>, <span class="st">"C"</span>, <span class="st">"G"</span>, <span class="st">"T"</span><span class="op">)</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"A"</span>, <span class="st">"C"</span>, <span class="st">"G"</span>, <span class="st">"T"</span><span class="op">)</span></span>
<span><span class="va">x</span> <span class="co"># ACAGA</span></span></code></pre></div>
<pre><code><span><span class="co">## A C G T</span></span>
<span><span class="co">## [1,] 0.0000000000 0.3333333333 0.3333333333 0.3333333333</span></span>
<span><span class="co">## [2,] 0.2103191148 0.3690426555 0.2103191148 0.2103191148</span></span>
<span><span class="co">## [3,] 0.8741074588 0.0419641804 0.0419641804 0.0419641804</span></span>
<span><span class="co">## [4,] 0.0005282977 0.0005282977 0.9984151068 0.0005282977</span></span>
<span><span class="co">## [5,] 0.9990000000 0.0003333333 0.0003333333 0.0003333333</span></span></code></pre>
<div class="sourceCode" id="cb126"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">y</span> <span class="co"># T</span></span></code></pre></div>
<pre><code><span><span class="co">## A C G T</span></span>
<span><span class="co">## [1,] 3.333333e-05 3.333333e-05 3.333333e-05 0.9999</span></span></code></pre>
</div>
<div class="section level3">
<h3 id="use_coverage">use_coverage<a class="anchor" aria-label="anchor" href="#use_coverage"></a>
</h3>
<p>Integer or <code>NULL</code>. If not <code>NULL</code>, use coverage
as encoding rather than one-hot encoding. Coverage information must be
contained in the fasta header: there must be a string “cov_n” in the header,
where n is some integer. In the example below, a header coverage of 8 with
<code>use_coverage = 25</code> yields entries of 8/25 = 0.32.</p>
<div class="sourceCode" id="cb128"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">df</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span>Sequence <span class="op">=</span> <span class="st">"ACAGAT"</span>, Header <span class="op">=</span> <span class="st">"header_1_cov_8"</span><span class="op">)</span></span>
<span><span class="va">fasta_path</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span>fileext <span class="op">=</span> <span class="st">".fasta"</span><span class="op">)</span></span>
<span><span class="va">fasta_file</span> <span class="op"><-</span> <span class="fu">microseq</span><span class="fu">::</span><span class="fu"><a href="https://rdrr.io/pkg/microseq/man/readFasta.html" class="external-link">writeFasta</a></span><span class="op">(</span><span class="va">df</span>, <span class="va">fasta_path</span><span class="op">)</span></span>
<span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">fasta_path</span>,</span>
<span> train_type <span class="op">=</span> <span class="st">"lm"</span>,</span>
<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
<span> maxlen <span class="op">=</span> <span class="fl">5</span>,</span>
<span> vocabulary <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"A"</span>, <span class="st">"C"</span>, <span class="st">"G"</span>, <span class="st">"T"</span><span class="op">)</span>,</span>
<span> use_coverage <span class="op">=</span> <span class="fl">25</span>,</span>
<span> output_format <span class="op">=</span> <span class="st">"target_right"</span><span class="op">)</span></span>
<span></span>
<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>, , <span class="op">]</span></span>
<span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span></span>
<span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"A"</span>, <span class="st">"C"</span>, <span class="st">"G"</span>, <span class="st">"T"</span><span class="op">)</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"A"</span>, <span class="st">"C"</span>, <span class="st">"G"</span>, <span class="st">"T"</span><span class="op">)</span></span>
<span><span class="va">x</span> <span class="co"># ACAGA; 0.32 = 8/25</span></span></code></pre></div>
<pre><code><span><span class="co">## A C G T</span></span>
<span><span class="co">## [1,] 0.32 0.00 0.00 0</span></span>
<span><span class="co">## [2,] 0.00 0.32 0.00 0</span></span>
<span><span class="co">## [3,] 0.32 0.00 0.00 0</span></span>
<span><span class="co">## [4,] 0.00 0.00 0.32 0</span></span>
<span><span class="co">## [5,] 0.32 0.00 0.00 0</span></span></code></pre>
<div class="sourceCode" id="cb130"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">y</span> <span class="co"># T</span></span></code></pre></div>
<pre><code><span><span class="co">## A C G T</span></span>
<span><span class="co">## [1,] 0 0 0 0.32</span></span></code></pre>
</div>
<div class="section level3">
<h3 id="added_label_path">added_label_path<a class="anchor" aria-label="anchor" href="#added_label_path"></a>
</h3>
<p>It is possible to feed a network additional information associated with
a sequence. This information needs to be in a csv file. If all sequences
in one file share the same label, the csv file should have a column
named “file” containing the file names, alongside the label columns.</p>
<p>We may add some additional input to our dummy data:</p>
<div class="sourceCode" id="cb132"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">file</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="fu"><a href="https://rdrr.io/r/base/basename.html" class="external-link">basename</a></span><span class="op">(</span><span class="va">file_path</span><span class="op">)</span>, <span class="st">"some_file_name.fasta"</span><span class="op">)</span></span>
<span><span class="va">df</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span>file <span class="op">=</span> <span class="va">file</span>,</span>
<span> label_1 <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="fl">0</span>, <span class="fl">1</span><span class="op">)</span>, label_2 <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="fl">1</span>, <span class="fl">0</span><span class="op">)</span>, label_3 <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="fl">1</span>, <span class="fl">0</span><span class="op">)</span><span class="op">)</span></span>
<span><span class="va">df</span></span></code></pre></div>
<pre><code><span><span class="co">## file label_1 label_2 label_3</span></span>
<span><span class="co">## 1 a.fasta 0 1 1</span></span>
<span><span class="co">## 2 some_file_name.fasta 1 0 0</span></span></code></pre>
<div class="sourceCode" id="cb134"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="fu"><a href="https://rdrr.io/r/utils/write.table.html" class="external-link">write.csv</a></span><span class="op">(</span>x <span class="op">=</span> <span class="va">df</span>, file <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/file.path.html" class="external-link">file.path</a></span><span class="op">(</span><span class="va">dir_path</span>, <span class="st">"add_input.csv"</span><span class="op">)</span>, row.names <span class="op">=</span> <span class="cn">FALSE</span><span class="op">)</span></span></code></pre></div>
<p>If we add the path to the csv file, the generator will map additional
input to sequences:</p>
<div class="sourceCode" id="cb135"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">dir_path</span>,</span>
<span> train_type <span class="op">=</span> <span class="st">"lm"</span>, </span>
<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
<span> maxlen <span class="op">=</span> <span class="fl">5</span>,</span>
<span> output_format <span class="op">=</span> <span class="st">"target_right"</span>,</span>
<span> vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span>
<span> added_label_path <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/file.path.html" class="external-link">file.path</a></span><span class="op">(</span><span class="va">dir_path</span>, <span class="st">"add_input.csv"</span><span class="op">)</span>,</span>
<span> add_input_as_seq <span class="op">=</span> <span class="cn">FALSE</span><span class="op">)</span> <span class="co"># don't treat added input as sequence</span></span>
<span> </span>
<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
<span><span class="va">added_label_input</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span></span>
<span><span class="va">added_label_input</span></span></code></pre></div>
<pre><code><span><span class="co">## [,1] [,2] [,3]</span></span>
<span><span class="co">## [1,] 0 1 1</span></span></code></pre>
<div class="sourceCode" id="cb137"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span></span>
<span><span class="va">x</span><span class="op">[</span><span class="fl">1</span>, , <span class="op">]</span></span></code></pre></div>
<pre><code><span><span class="co">## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]</span></span>
<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [2,] 0 1 0 0 0 0 0 0 0</span></span>
<span><span class="co">## [3,] 0 0 1 0 0 0 0 0 0</span></span>
<span><span class="co">## [4,] 0 0 0 1 0 0 0 0 0</span></span>
<span><span class="co">## [5,] 0 0 0 0 1 0 0 0 0</span></span></code></pre>
<div class="sourceCode" id="cb139"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span> </span>
<span><span class="va">y</span></span></code></pre></div>
<pre><code><span><span class="co">## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]</span></span>
<span><span class="co">## [1,] 0 0 0 0 0 1 0 0 0</span></span></code></pre>
<p>If we want to train a network with additional labels, we have to add
an additional input layer.</p>
<div class="sourceCode" id="cb141"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">model</span> <span class="op"><-</span> <span class="fu"><a href="../reference/create_model_lstm_cnn.html">create_model_lstm_cnn</a></span><span class="op">(</span></span>
<span> maxlen <span class="op">=</span> <span class="fl">5</span>,</span>
<span> layer_lstm <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="fl">8</span>, <span class="fl">8</span><span class="op">)</span>,</span>
<span> layer_dense <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="fl">4</span><span class="op">)</span>,</span>
<span> label_input <span class="op">=</span> <span class="fl">3</span> <span class="co"># additional input vector has length 3</span></span>
<span><span class="op">)</span></span></code></pre></div>
<pre><code><span><span class="co">## Model: "model"</span></span>
<span><span class="co">## __________________________________________________________________________________________________</span></span>
<span><span class="co">## Layer (type) Output Shape Param # Connected to </span></span>
<span><span class="co">## ==================================================================================================</span></span>
<span><span class="co">## input_1 (InputLayer) [(None, 5, 4)] 0 [] </span></span>
<span><span class="co">## </span></span>
<span><span class="co">## lstm (LSTM) (None, 5, 8) 416 ['input_1[0][0]'] </span></span>
<span><span class="co">## </span></span>
<span><span class="co">## input_2 (InputLayer) [(None, 3)] 0 [] </span></span>
<span><span class="co">## </span></span>
<span><span class="co">## lstm_1 (LSTM) (None, 8) 544 ['lstm[0][0]'] </span></span>
<span><span class="co">## </span></span>
<span><span class="co">## concatenate (Concatenate) (None, 11) 0 ['input_2[0][0]', </span></span>
<span><span class="co">## 'lstm_1[0][0]'] </span></span>
<span><span class="co">## </span></span>
<span><span class="co">## dense (Dense) (None, 4) 48 ['concatenate[0][0]'] </span></span>
<span><span class="co">## </span></span>
<span><span class="co">## ==================================================================================================</span></span>
<span><span class="co">## Total params: 1008 (3.94 KB)</span></span>
<span><span class="co">## Trainable params: 1008 (3.94 KB)</span></span>
<span><span class="co">## Non-trainable params: 0 (0.00 Byte)</span></span>
<span><span class="co">## __________________________________________________________________________________________________</span></span></code></pre>
<div class="sourceCode" id="cb143"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="co"># train_model(train_type = "lm", </span></span>
<span><span class="co"># model = model,</span></span>
<span><span class="co"># path = file.path(dir_path, "train_files_1"),</span></span>
<span><span class="co"># path_val = file.path(dir_path, "validation_files_1"),</span></span>
<span><span class="co"># added_label_path = file.path(dir_path, "add_input.csv"),</span></span>
<span><span class="co"># steps_per_epoch = 5,</span></span>
<span><span class="co"># batch_size = 8,</span></span>
<span><span class="co"># epochs = 2)</span></span></code></pre></div>
</div>
<div class="section level3">
<h3 id="return_int">return_int<a class="anchor" aria-label="anchor" href="#return_int"></a>
</h3>
<p>Whether to return integer encoding rather than one-hot encoding.</p>
<div class="sourceCode" id="cb144"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">df</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span>Sequence <span class="op">=</span> <span class="st">"ATCGC"</span>, Header <span class="op">=</span> <span class="st">"seq_1"</span><span class="op">)</span></span>
<span><span class="va">fasta_path</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span>fileext <span class="op">=</span> <span class="st">".fasta"</span><span class="op">)</span></span>
<span><span class="va">fasta_file</span> <span class="op"><-</span> <span class="fu">microseq</span><span class="fu">::</span><span class="fu"><a href="https://rdrr.io/pkg/microseq/man/readFasta.html" class="external-link">writeFasta</a></span><span class="op">(</span><span class="va">df</span>, <span class="va">fasta_path</span><span class="op">)</span></span>
<span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">fasta_path</span>,</span>
<span> train_type <span class="op">=</span> <span class="st">"lm"</span>,</span>
<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
<span> return_int <span class="op">=</span> <span class="cn">TRUE</span>,</span>
<span> padding <span class="op">=</span> <span class="cn">TRUE</span>,</span>
<span> maxlen <span class="op">=</span> <span class="fl">8</span>,</span>
<span> vocabulary <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"A"</span>, <span class="st">"C"</span>, <span class="st">"G"</span>, <span class="st">"T"</span><span class="op">)</span>,</span>
<span> output_format <span class="op">=</span> <span class="st">"target_right"</span><span class="op">)</span></span>
<span></span>
<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span></span>
<span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"pad"</span>, <span class="st">"pad"</span>, <span class="st">"pad"</span>, <span class="st">"pad"</span>, <span class="st">"A"</span>, <span class="st">"T"</span>, <span class="st">"C"</span>, <span class="st">"G"</span><span class="op">)</span></span>
<span><span class="va">x</span></span></code></pre></div>
<pre><code><span><span class="co">## pad pad pad pad A T C G</span></span>
<span><span class="co">## [1,] 0 0 0 0 1 4 2 3</span></span></code></pre>
<div class="sourceCode" id="cb146"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op"><-</span> <span class="st">"C"</span></span>
<span><span class="va">y</span></span></code></pre></div>
<pre><code><span><span class="co">## C</span></span>
<span><span class="co">## [1,] 2</span></span></code></pre>
<p><code>return_int</code> can also be combined with n-gram encoding:</p>
<div class="sourceCode" id="cb148"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">df</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span>Sequence <span class="op">=</span> <span class="st">"AAACCCTTT"</span>, Header <span class="op">=</span> <span class="st">"seq_1"</span><span class="op">)</span></span>
<span><span class="va">fasta_path</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span>fileext <span class="op">=</span> <span class="st">".fasta"</span><span class="op">)</span></span>
<span><span class="va">fasta_file</span> <span class="op"><-</span> <span class="fu">microseq</span><span class="fu">::</span><span class="fu"><a href="https://rdrr.io/pkg/microseq/man/readFasta.html" class="external-link">writeFasta</a></span><span class="op">(</span><span class="va">df</span>, <span class="va">fasta_path</span><span class="op">)</span></span>
<span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">fasta_path</span>,</span>
<span> train_type <span class="op">=</span> <span class="st">"lm"</span>,</span>
<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
<span> n_gram <span class="op">=</span> <span class="fl">3</span>,</span>
<span> n_gram_stride <span class="op">=</span> <span class="fl">3</span>,</span>
<span> return_int <span class="op">=</span> <span class="cn">TRUE</span>,</span>
<span> maxlen <span class="op">=</span> <span class="fl">6</span>,</span>
<span> target_len <span class="op">=</span> <span class="fl">3</span>,</span>
<span> vocabulary <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"A"</span>, <span class="st">"C"</span>, <span class="st">"G"</span>, <span class="st">"T"</span><span class="op">)</span>,</span>
<span> output_format <span class="op">=</span> <span class="st">"target_right"</span><span class="op">)</span></span>
<span></span>
<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span></span>
<span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"AAA"</span>, <span class="st">"CCC"</span><span class="op">)</span></span>
<span><span class="va">x</span></span></code></pre></div>
<pre><code><span><span class="co">## AAA CCC</span></span>
<span><span class="co">## [1,] 1 22</span></span></code></pre>
<div class="sourceCode" id="cb150"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op"><-</span> <span class="st">"TTT"</span></span>
<span><span class="va">y</span></span></code></pre></div>
<pre><code><span><span class="co">## TTT</span></span>
<span><span class="co">## [1,] 64</span></span></code></pre>
</div>
<div class="section level3">
<h3 id="reshape_xy">reshape_xy<a class="anchor" aria-label="anchor" href="#reshape_xy"></a>
</h3>
<p>Apply custom functions to the <code>x</code> and <code>y</code> output of a generator call.</p>
<div class="sourceCode" id="cb152"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">df</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span>Sequence <span class="op">=</span> <span class="st">"AAAATTTT"</span>, Header <span class="op">=</span> <span class="st">"header_1"</span><span class="op">)</span></span>
<span><span class="va">fasta_path</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span>fileext <span class="op">=</span> <span class="st">".fasta"</span><span class="op">)</span></span>
<span><span class="va">fasta_file</span> <span class="op"><-</span> <span class="fu">microseq</span><span class="fu">::</span><span class="fu"><a href="https://rdrr.io/pkg/microseq/man/readFasta.html" class="external-link">writeFasta</a></span><span class="op">(</span><span class="va">df</span>, <span class="va">fasta_path</span><span class="op">)</span></span>
<span><span class="va">fx</span> <span class="op"><-</span> <span class="kw">function</span><span class="op">(</span><span class="va">x</span> <span class="op">=</span> <span class="cn">NULL</span>, <span class="va">y</span> <span class="op">=</span> <span class="cn">NULL</span><span class="op">)</span> <span class="op">{</span></span>
<span> <span class="kw"><a href="https://rdrr.io/r/base/function.html" class="external-link">return</a></span><span class="op">(</span><span class="va">x</span> <span class="op">-</span> <span class="fl">1</span><span class="op">)</span></span>
<span><span class="op">}</span></span>
<span><span class="va">fy</span> <span class="op"><-</span> <span class="kw">function</span><span class="op">(</span><span class="va">x</span> <span class="op">=</span> <span class="cn">NULL</span>, <span class="va">y</span> <span class="op">=</span> <span class="cn">NULL</span><span class="op">)</span> <span class="op">{</span></span>
<span> <span class="kw"><a href="https://rdrr.io/r/base/function.html" class="external-link">return</a></span><span class="op">(</span><span class="fu"><a href="https://rdrr.io/r/base/Log.html" class="external-link">exp</a></span><span class="op">(</span><span class="va">y</span> <span class="op">*</span> <span class="fl">5</span><span class="op">)</span><span class="op">)</span></span>
<span><span class="op">}</span></span>
<span></span>
<span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">fasta_path</span>,</span>
<span> reshape_xy <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/list.html" class="external-link">list</a></span><span class="op">(</span>x <span class="op">=</span> <span class="va">fx</span>, y <span class="op">=</span> <span class="va">fy</span><span class="op">)</span>,</span>
<span> train_type <span class="op">=</span> <span class="st">"label_folder"</span>,</span>
<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
<span> maxlen <span class="op">=</span> <span class="fl">8</span><span class="op">)</span></span>
<span></span>
<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span></span>
<span><span class="va">x</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span></span></code></pre></div>
<pre><code><span><span class="co">## [,1] [,2] [,3] [,4]</span></span>
<span><span class="co">## [1,] 0 -1 -1 -1</span></span>
<span><span class="co">## [2,] 0 -1 -1 -1</span></span>
<span><span class="co">## [3,] 0 -1 -1 -1</span></span>
<span><span class="co">## [4,] 0 -1 -1 -1</span></span>
<span><span class="co">## [5,] -1 -1 -1 0</span></span>
<span><span class="co">## [6,] -1 -1 -1 0</span></span>
<span><span class="co">## [7,] -1 -1 -1 0</span></span>
<span><span class="co">## [8,] -1 -1 -1 0</span></span></code></pre>
<div class="sourceCode" id="cb154"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span></span>
<span><span class="va">y</span></span></code></pre></div>
<pre><code><span><span class="co">## [,1]</span></span>
<span><span class="co">## [1,] 148.4132</span></span></code></pre>
</div>
<div class="section level3">
<h3 id="masked_lm">masked_lm<a class="anchor" aria-label="anchor" href="#masked_lm"></a>
</h3>
<p>Masks some parts of the input sequence. This can be used for training
BERT-like models.</p>
<div class="sourceCode" id="cb156"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">nt_seq</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/rep.html" class="external-link">rep</a></span><span class="op">(</span><span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"A"</span>, <span class="st">"C"</span>, <span class="st">"G"</span>, <span class="st">"T"</span><span class="op">)</span>, each <span class="op">=</span> <span class="fl">25</span><span class="op">)</span> <span class="op"><a href="../reference/pipe.html">%>%</a></span> <span class="fu"><a href="https://rdrr.io/r/base/paste.html" class="external-link">paste</a></span><span class="op">(</span>collapse <span class="op">=</span> <span class="st">""</span><span class="op">)</span></span>
<span><span class="va">df</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span>Sequence <span class="op">=</span> <span class="va">nt_seq</span>, Header <span class="op">=</span> <span class="st">"seq_1"</span><span class="op">)</span></span>
<span><span class="va">fasta_path</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span>fileext <span class="op">=</span> <span class="st">".fasta"</span><span class="op">)</span></span>
<span><span class="va">fasta_file</span> <span class="op"><-</span> <span class="fu">microseq</span><span class="fu">::</span><span class="fu"><a href="https://rdrr.io/pkg/microseq/man/readFasta.html" class="external-link">writeFasta</a></span><span class="op">(</span><span class="va">df</span>, <span class="va">fasta_path</span><span class="op">)</span></span>
<span><span class="va">masked_lm</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/list.html" class="external-link">list</a></span><span class="op">(</span>mask_rate <span class="op">=</span> <span class="fl">0.10</span>, <span class="co"># replace 10% of input with special mask token</span></span>
<span> random_rate <span class="op">=</span> <span class="fl">0.025</span>, <span class="co"># set 2.5% of input to random value</span></span>
<span> identity_rate <span class="op">=</span> <span class="fl">0.05</span>, <span class="co"># leave 5% unchanged</span></span>
<span> include_sw <span class="op">=</span> <span class="cn">TRUE</span><span class="op">)</span> <span class="co"># 0,1 matrix showing where masking was applied</span></span>
<span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">fasta_path</span>,</span>
<span> train_type <span class="op">=</span> <span class="st">"masked_lm"</span>,</span>
<span> masked_lm <span class="op">=</span> <span class="va">masked_lm</span>,</span>
<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
<span> n_gram <span class="op">=</span> <span class="fl">1</span>,</span>
<span> n_gram_stride <span class="op">=</span> <span class="fl">1</span>,</span>
<span> return_int <span class="op">=</span> <span class="cn">TRUE</span>,</span>
<span> maxlen <span class="op">=</span> <span class="fl">100</span>,</span>
<span> vocabulary <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"A"</span>, <span class="st">"C"</span>, <span class="st">"G"</span>, <span class="st">"T"</span><span class="op">)</span><span class="op">)</span></span>
<span></span>
<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span></span>
<span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span></span>
<span><span class="va">sw</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">3</span><span class="op">]</span><span class="op">]</span></span>
<span><span class="va">df</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span>x <span class="op">=</span> <span class="va">x</span><span class="op">[</span><span class="fl">1</span>, <span class="op">]</span>, y <span class="op">=</span> <span class="va">y</span><span class="op">[</span><span class="fl">1</span>, <span class="op">]</span>, sw <span class="op">=</span> <span class="va">sw</span><span class="op">[</span><span class="fl">1</span>, <span class="op">]</span><span class="op">)</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/utils/head.html" class="external-link">head</a></span><span class="op">(</span><span class="va">df</span><span class="op">)</span></span></code></pre></div>
<pre><code><span><span class="co">## x y sw</span></span>
<span><span class="co">## 1 5 1 1</span></span>
<span><span class="co">## 2 1 1 0</span></span>
<span><span class="co">## 3 1 1 0</span></span>
<span><span class="co">## 4 1 1 0</span></span>
<span><span class="co">## 5 1 1 0</span></span>
<span><span class="co">## 6 1 1 0</span></span></code></pre>
<p>Whenever the sw (sample weight) column is 0, the x and y columns are
identical. Let’s look at rows where sw is 1:</p>
<div class="sourceCode" id="cb158"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">df</span> <span class="op"><a href="../reference/pipe.html">%>%</a></span> <span class="fu">dplyr</span><span class="fu">::</span><span class="fu"><a href="https://dplyr.tidyverse.org/reference/filter.html" class="external-link">filter</a></span><span class="op">(</span><span class="va">sw</span> <span class="op">==</span> <span class="fl">1</span><span class="op">)</span></span></code></pre></div>
<pre><code><span><span class="co">## x y sw</span></span>
<span><span class="co">## 1 5 1 1</span></span>
<span><span class="co">## 2 1 1 1</span></span>
<span><span class="co">## 3 1 1 1</span></span>
<span><span class="co">## 4 5 1 1</span></span>
<span><span class="co">## 5 5 2 1</span></span>
<span><span class="co">## 6 5 2 1</span></span>
<span><span class="co">## 7 5 2 1</span></span>
<span><span class="co">## 8 3 3 1</span></span>
<span><span class="co">## 9 2 3 1</span></span>
<span><span class="co">## 10 3 3 1</span></span>
<span><span class="co">## 11 5 3 1</span></span>
<span><span class="co">## 12 5 3 1</span></span>
<span><span class="co">## 13 4 4 1</span></span>
<span><span class="co">## 14 5 4 1</span></span>
<span><span class="co">## 15 4 4 1</span></span>
<span><span class="co">## 16 5 4 1</span></span>
<span><span class="co">## 17 5 4 1</span></span>
<span><span class="co">## 18 4 4 1</span></span></code></pre>
<p>Here 5 is the mask token; its value is always the size of the
vocabulary + 1.</p>
<div class="sourceCode" id="cb160"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">df</span> <span class="op"><a href="../reference/pipe.html">%>%</a></span> <span class="fu">dplyr</span><span class="fu">::</span><span class="fu"><a href="https://dplyr.tidyverse.org/reference/filter.html" class="external-link">filter</a></span><span class="op">(</span><span class="va">sw</span> <span class="op">==</span> <span class="fl">1</span> <span class="op">&</span> <span class="va">x</span> <span class="op">==</span> <span class="fl">5</span><span class="op">)</span> <span class="co"># 10% masked part</span></span></code></pre></div>
<pre><code><span><span class="co">## x y sw</span></span>
<span><span class="co">## 1 5 1 1</span></span>
<span><span class="co">## 2 5 1 1</span></span>
<span><span class="co">## 3 5 2 1</span></span>
<span><span class="co">## 4 5 2 1</span></span>
<span><span class="co">## 5 5 2 1</span></span>
<span><span class="co">## 6 5 3 1</span></span>
<span><span class="co">## 7 5 3 1</span></span>
<span><span class="co">## 8 5 4 1</span></span>
<span><span class="co">## 9 5 4 1</span></span>
<span><span class="co">## 10 5 4 1</span></span></code></pre>
<div class="sourceCode" id="cb162"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">df</span> <span class="op"><a href="../reference/pipe.html">%>%</a></span> <span class="fu">dplyr</span><span class="fu">::</span><span class="fu"><a href="https://dplyr.tidyverse.org/reference/filter.html" class="external-link">filter</a></span><span class="op">(</span><span class="va">sw</span> <span class="op">==</span> <span class="fl">1</span> <span class="op">&</span> <span class="va">x</span> <span class="op">!=</span> <span class="fl">5</span><span class="op">)</span> <span class="co"># 5% identity part and 2.5% random part (can randomly be the true value)</span></span></code></pre></div>
<pre><code><span><span class="co">## x y sw</span></span>
<span><span class="co">## 1 1 1 1</span></span>
<span><span class="co">## 2 1 1 1</span></span>
<span><span class="co">## 3 3 3 1</span></span>
<span><span class="co">## 4 2 3 1</span></span>
<span><span class="co">## 5 3 3 1</span></span>
<span><span class="co">## 6 4 4 1</span></span>
<span><span class="co">## 7 4 4 1</span></span>
<span><span class="co">## 8 4 4 1</span></span></code></pre>
<p>Masking can be combined with n-gram encoding and with masking in
blocks of fixed length:</p>
<div class="sourceCode" id="cb164"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">nt_seq</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/rep.html" class="external-link">rep</a></span><span class="op">(</span><span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"A"</span>, <span class="st">"C"</span>, <span class="st">"G"</span>, <span class="st">"T"</span><span class="op">)</span>, each <span class="op">=</span> <span class="fl">25</span><span class="op">)</span> <span class="op"><a href="../reference/pipe.html">%>%</a></span> <span class="fu"><a href="https://rdrr.io/r/base/paste.html" class="external-link">paste</a></span><span class="op">(</span>collapse <span class="op">=</span> <span class="st">""</span><span class="op">)</span></span>
<span><span class="va">df</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span>Sequence <span class="op">=</span> <span class="va">nt_seq</span>, Header <span class="op">=</span> <span class="st">"seq_1"</span><span class="op">)</span></span>
<span><span class="va">fasta_path</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span>fileext <span class="op">=</span> <span class="st">".fasta"</span><span class="op">)</span></span>
<span><span class="va">fasta_file</span> <span class="op"><-</span> <span class="fu">microseq</span><span class="fu">::</span><span class="fu"><a href="https://rdrr.io/pkg/microseq/man/readFasta.html" class="external-link">writeFasta</a></span><span class="op">(</span><span class="va">df</span>, <span class="va">fasta_path</span><span class="op">)</span></span>
<span><span class="va">masked_lm</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/list.html" class="external-link">list</a></span><span class="op">(</span>mask_rate <span class="op">=</span> <span class="fl">0.10</span>, <span class="co"># replace 10% of input with special mask token</span></span>
<span> random_rate <span class="op">=</span> <span class="fl">0.05</span>, <span class="co"># set 5% of input to random value</span></span>
<span> identity_rate <span class="op">=</span> <span class="fl">0.05</span>, <span class="co"># leave 5% unchanged</span></span>
<span> include_sw <span class="op">=</span> <span class="cn">TRUE</span>, <span class="co"># 0,1 matrix showing where masking was applied</span></span>
<span> block_len <span class="op">=</span> <span class="fl">3</span><span class="op">)</span> <span class="co"># always mask at least 3 tokens in a row </span></span>
<span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">fasta_path</span>,</span>
<span> train_type <span class="op">=</span> <span class="st">"masked_lm"</span>,</span>
<span> masked_lm <span class="op">=</span> <span class="va">masked_lm</span>,</span>
<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span>
<span> n_gram <span class="op">=</span> <span class="fl">3</span>,</span>
<span> seed <span class="op">=</span> <span class="fl">12</span>,</span>
<span> n_gram_stride <span class="op">=</span> <span class="fl">1</span>,</span>
<span> return_int <span class="op">=</span> <span class="cn">TRUE</span>,</span>
<span> maxlen <span class="op">=</span> <span class="fl">100</span>,</span>
<span> vocabulary <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"A"</span>, <span class="st">"C"</span>, <span class="st">"G"</span>, <span class="st">"T"</span><span class="op">)</span><span class="op">)</span></span>
<span></span>
<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span>
<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span></span>
<span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span></span>
<span><span class="va">sw</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">3</span><span class="op">]</span><span class="op">]</span></span>
<span><span class="va">df</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span>x <span class="op">=</span> <span class="va">x</span><span class="op">[</span><span class="fl">1</span>, <span class="op">]</span>, y <span class="op">=</span> <span class="va">y</span><span class="op">[</span><span class="fl">1</span>, <span class="op">]</span>, sw <span class="op">=</span> <span class="va">sw</span><span class="op">[</span><span class="fl">1</span>, <span class="op">]</span>, position <span class="op">=</span> <span class="fl">1</span><span class="op">:</span><span class="fu"><a href="https://rdrr.io/r/base/nrow.html" class="external-link">ncol</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span><span class="op">)</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/utils/head.html" class="external-link">head</a></span><span class="op">(</span><span class="va">df</span><span class="op">)</span></span></code></pre></div>
<pre><code><span><span class="co">## x y sw position</span></span>
<span><span class="co">## 1 1 1 0 1</span></span>
<span><span class="co">## 2 1 1 0 2</span></span>
<span><span class="co">## 3 1 1 0 3</span></span>
<span><span class="co">## 4 39 1 1 4</span></span>
<span><span class="co">## 5 48 1 1 5</span></span>
<span><span class="co">## 6 13 1 1 6</span></span></code></pre>
<div class="sourceCode" id="cb166"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="fu"><a href="https://rdrr.io/r/utils/head.html" class="external-link">tail</a></span><span class="op">(</span><span class="va">df</span><span class="op">)</span></span></code></pre></div>
<pre><code><span><span class="co">## x y sw position</span></span>
<span><span class="co">## 93 65 64 1 93</span></span>
<span><span class="co">## 94 64 64 0 94</span></span>
<span><span class="co">## 95 64 64 0 95</span></span>
<span><span class="co">## 96 64 64 0 96</span></span>
<span><span class="co">## 97 64 64 0 97</span></span>
<span><span class="co">## 98 64 64 0 98</span></span></code></pre>
<p>We can check that the nonzero sample weights appear only in contiguous blocks of at least 3 positions.</p>
<div class="sourceCode" id="cb168"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="fu"><a href="https://rdrr.io/r/base/which.html" class="external-link">which</a></span><span class="op">(</span><span class="va">sw</span> <span class="op">==</span> <span class="fl">1</span><span class="op">)</span></span></code></pre></div>
<pre><code><span><span class="co">## [1] 4 5 6 13 14 15 22 23 24 40 41 42 52 53 54 79 80 81 82 83 84 91 92 93</span></span></code></pre>
<p>Here 65 is the mask token (4^3 + 1 = size of the vocabulary + 1).</p>
<div class="sourceCode" id="cb170"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">df</span> <span class="op"><a href="../reference/pipe.html">%>%</a></span> <span class="fu">dplyr</span><span class="fu">::</span><span class="fu"><a href="https://dplyr.tidyverse.org/reference/filter.html" class="external-link">filter</a></span><span class="op">(</span><span class="va">sw</span> <span class="op">==</span> <span class="fl">1</span> <span class="op">&</span> <span class="va">x</span> <span class="op">==</span> <span class="fl">65</span><span class="op">)</span> <span class="co"># 10% masked part</span></span></code></pre></div>
<pre><code><span><span class="co">## x y sw position</span></span>
<span><span class="co">## 1 65 22 1 40</span></span>
<span><span class="co">## 2 65 22 1 41</span></span>
<span><span class="co">## 3 65 22 1 42</span></span>
<span><span class="co">## 4 65 64 1 79</span></span>
<span><span class="co">## 5 65 64 1 80</span></span>
<span><span class="co">## 6 65 64 1 81</span></span>
<span><span class="co">## 7 65 64 1 82</span></span>
<span><span class="co">## 8 65 64 1 83</span></span>
<span><span class="co">## 9 65 64 1 84</span></span>
<span><span class="co">## 10 65 64 1 91</span></span>
<span><span class="co">## 11 65 64 1 92</span></span>
<span><span class="co">## 12 65 64 1 93</span></span></code></pre>
<div class="sourceCode" id="cb172"><pre class="downlit sourceCode r">
<code class="sourceCode R"><span><span class="va">df</span> <span class="op"><a href="../reference/pipe.html">%>%</a></span> <span class="fu">dplyr</span><span class="fu">::</span><span class="fu"><a href="https://dplyr.tidyverse.org/reference/filter.html" class="external-link">filter</a></span><span class="op">(</span><span class="va">sw</span> <span class="op">==</span> <span class="fl">1</span> <span class="op">&</span> <span class="va">x</span> <span class="op">!=</span> <span class="fl">65</span><span class="op">)</span> <span class="co"># 5% identity part and 5% random part (can randomly be the true value)</span></span></code></pre></div>
<pre><code><span><span class="co">## x y sw position</span></span>
<span><span class="co">## 1 39 1 1 4</span></span>
<span><span class="co">## 2 48 1 1 5</span></span>
<span><span class="co">## 3 13 1 1 6</span></span>
<span><span class="co">## 4 1 1 1 13</span></span>
<span><span class="co">## 5 1 1 1 14</span></span>
<span><span class="co">## 6 1 1 1 15</span></span>
<span><span class="co">## 7 1 1 1 22</span></span>
<span><span class="co">## 8 1 1 1 23</span></span>
<span><span class="co">## 9 2 2 1 24</span></span>
<span><span class="co">## 10 56 43 1 52</span></span>
<span><span class="co">## 11 4 43 1 53</span></span>
<span><span class="co">## 12 24 43 1 54</span></span></code></pre>
</div>
</div>
</main><aside class="col-md-3"><nav id="toc"><h2>On this page</h2>
</nav></aside>
</div>
<footer><div class="pkgdown-footer-left">
<p>Developed by Philipp Münch, René Mreches, Martin Binder, Hüseyin Anil Gündüz, Xiao-Yin To, Alice McHardy.</p>
</div>
<div class="pkgdown-footer-right">
<p>Site built with <a href="https://pkgdown.r-lib.org/" class="external-link">pkgdown</a> 2.0.9.</p>
</div>
</footer>
</div>
</body>
</html>