--- a +++ b/docs/articles/data_generator.html @@ -0,0 +1,1814 @@ +<!DOCTYPE html> +<!-- Generated by pkgdown: do not edit by hand --><html lang="en"> +<head> +<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"> +<meta charset="utf-8"> +<meta http-equiv="X-UA-Compatible" content="IE=edge"> +<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"> +<meta name="description" content="deepG"> +<title>Data Generator • deepG</title> +<!-- favicons --><link rel="icon" type="image/png" sizes="16x16" href="../favicon-16x16.png"> +<link rel="icon" type="image/png" sizes="32x32" href="../favicon-32x32.png"> +<link rel="apple-touch-icon" type="image/png" sizes="180x180" href="../apple-touch-icon.png"> +<link rel="apple-touch-icon" type="image/png" sizes="120x120" href="../apple-touch-icon-120x120.png"> +<link rel="apple-touch-icon" type="image/png" sizes="76x76" href="../apple-touch-icon-76x76.png"> +<link rel="apple-touch-icon" type="image/png" sizes="60x60" href="../apple-touch-icon-60x60.png"> +<script src="../deps/jquery-3.6.0/jquery-3.6.0.min.js"></script><meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"> +<link href="../deps/bootstrap-5.3.1/bootstrap.min.css" rel="stylesheet"> +<script src="../deps/bootstrap-5.3.1/bootstrap.bundle.min.js"></script><!-- Font Awesome icons --><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/all.min.css" integrity="sha256-mmgLkCYLUQbXn0B1SRqzHar6dCnv9oZFPEC1g1cwlkk=" crossorigin="anonymous"> +<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/v4-shims.min.css" integrity="sha256-wZjR52fzng1pJHwx4aV2AO3yyTOXrcDW7jBpJtTwVxw=" crossorigin="anonymous"> +<!-- bootstrap-toc --><script src="https://cdn.jsdelivr.net/gh/afeld/bootstrap-toc@v1.0.1/dist/bootstrap-toc.min.js" integrity="sha256-4veVQbu7//Lk5TSmc7YV48MxtMy98e26cf5MrgZYnwo=" crossorigin="anonymous"></script><!-- headroom.js 
--><script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/headroom.min.js" integrity="sha256-AsUX4SJE1+yuDu5+mAVzJbuYNPHj/WroHuZ8Ir/CkE0=" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/jQuery.headroom.min.js" integrity="sha256-ZX/yNShbjqsohH1k95liqY9Gd8uOiE1S4vZc+9KQ1K4=" crossorigin="anonymous"></script><!-- clipboard.js --><script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.11/clipboard.min.js" integrity="sha512-7O5pXpc0oCRrxk8RUfDYFgn0nO1t+jLuIOQdOMRp4APB7uZ4vSjspzp5y6YDtDs4VzUSTbWzBFZ/LKJhnyFOKw==" crossorigin="anonymous" referrerpolicy="no-referrer"></script><!-- search --><script src="https://cdnjs.cloudflare.com/ajax/libs/fuse.js/6.4.6/fuse.js" integrity="sha512-zv6Ywkjyktsohkbp9bb45V6tEMoWhzFzXis+LrMehmJZZSys19Yxf1dopHx7WzIKxr5tK2dVcYmaCk2uqdjF4A==" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/autocomplete.js/0.38.0/autocomplete.jquery.min.js" integrity="sha512-GU9ayf+66Xx2TmpxqJpliWbT5PiGYxpaG8rfnBEk1LL8l1KGkRShhngwdXK1UgqhAzWpZHSiYPc09/NwDQIGyg==" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/mark.js/8.11.1/mark.min.js" integrity="sha512-5CYOlHXGh6QpOFA/TeTylKLWfB3ftPsde7AnmhuitiTX4K5SqCLBeKro6sPS8ilsz1Q4NRx3v8Ko2IBiszzdww==" crossorigin="anonymous"></script><!-- pkgdown --><script src="../pkgdown.js"></script><meta property="og:title" content="Data Generator"> +<meta property="og:description" content="deepG"> +<meta property="og:image" content="https://genomenet.github.io/deepG/logo.png"> +<!-- mathjax --><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js" integrity="sha256-nvJJv9wWKEm88qvoQl9ekL2J+k/RWIsaSScxxlsrv8k=" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/config/TeX-AMS-MML_HTMLorMML.js" integrity="sha256-84DKXVJXs0/F8OTMzX4UR909+jtl4G7SPypPavF+GfA=" crossorigin="anonymous"></script><!--[if lt IE 9]> 
+<script src="https://oss.maxcdn.com/html5shiv/3.7.3/html5shiv.min.js"></script> +<script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script> +<![endif]--> +</head> +<body> + <a href="#main" class="visually-hidden-focusable">Skip to contents</a> + + + <nav class="navbar fixed-top navbar-light navbar-expand-lg bg-light" data-bs-theme="light"><div class="container"> + + <a class="navbar-brand me-2" href="../index.html">deepG</a> + + <small class="nav-text text-default me-auto" data-bs-toggle="tooltip" data-bs-placement="bottom" title="Released version">0.3.0</small> + + + <button class="navbar-toggler" type="button" data-bs-toggle="collapse" data-bs-target="#navbar" aria-controls="navbar" aria-expanded="false" aria-label="Toggle navigation"> + <span class="navbar-toggler-icon"></span> + </button> + + <div id="navbar" class="collapse navbar-collapse ms-3"> + <ul class="navbar-nav me-auto"> +<li class="nav-item"> + <a class="nav-link" href="../reference/index.html"> + <span class="fa fa fa fa-file-alt"></span> + + Reference + </a> +</li> +<li class="nav-item dropdown"> + <a href="#" class="nav-link dropdown-toggle" data-bs-toggle="dropdown" role="button" aria-expanded="false" aria-haspopup="true" id="dropdown-notebooks">Notebooks</a> + <div class="dropdown-menu" aria-labelledby="dropdown-notebooks"> + <a class="external-link dropdown-item" href="https://colab.research.google.com/drive/175jIdXcDcgPUvaBo2rH2Lupbpjnp5O7G?usp=sharing">deepG tutorial</a> + <a class="external-link dropdown-item" href="https://colab.research.google.com/drive/1Eolc0koMNM1zkuO4XyVM58ImeF1BpRiH?usp=sharing">Read-length level: Human contamination</a> + <a class="external-link dropdown-item" href="https://colab.research.google.com/drive/1yiXSwFafXpMLHaov9iBTQLIDZ6bK1zYX?usp=sharing">Locus level: CRISPR detection</a> + <a class="external-link dropdown-item" href="https://colab.research.google.com/drive/1G7bOFEX87cZNrM2tdRtTdkrZn5fM__g0?usp=sharing">Gene level: 16S rRNA detection</a> 
+ <a class="external-link dropdown-item" href="https://colab.research.google.com/drive/1BCggL-tfQF136YeJ8cKKi-zoBEDMgkNh?usp=sharing">Genome level: Bacterial morphology (Sporulation)</a> + <a class="external-link dropdown-item" href="https://colab.research.google.com/drive/10xpRzGd3JeBAbqQYSCxzQUMctt01sx9D?usp=sharing">Full metagenome level: Colorectal cancer prediction</a> + <a class="external-link dropdown-item" href="https://colab.research.google.com/drive/1kyYK7IU7GSfdpDzO_a8U3_qD4i3zTu6w?usp=sharing">BERT with deepG</a> + </div> +</li> +<li class="active nav-item dropdown"> + <a href="#" class="nav-link dropdown-toggle" data-bs-toggle="dropdown" role="button" aria-expanded="false" aria-haspopup="true" id="dropdown-tutorials">Tutorials</a> + <div class="dropdown-menu" aria-labelledby="dropdown-tutorials"> + <a class="dropdown-item" href="../articles/getting_started.html">Getting Started</a> + <a class="dropdown-item" href="../articles/training_types.html">Training types</a> + <a class="dropdown-item" href="../articles/data_generator.html">Data generator</a> + <a class="dropdown-item" href="../articles/using_tb.html">Using tensorboard</a> + <a class="dropdown-item" href="../articles/integrated_gradient.html">Integrated Gradient</a> + </div> +</li> + </ul> +<form class="form-inline my-2 my-lg-0" role="search"> + <input type="search" class="form-control me-sm-2" aria-label="Search" name="search-input" data-search-index="../search.json" id="search-input" placeholder="Search for" autocomplete="off"> +</form> + + <ul class="navbar-nav"> +<li class="nav-item"> + <a class="external-link nav-link" href="https://github.com/GenomeNet/deepG/" aria-label="github"> + <span class="fab fa fab fa-github fa-lg"></span> + + </a> +</li> + </ul> +</div> + + + </div> +</nav><div class="container template-article"> + + + + +<div class="row"> + <main id="main" class="col-md-9"><div class="page-header"> + <img src="../logo.png" class="logo" alt=""><h1>Data Generator</h1> + + 
+ <small class="dont-index">Source: <a href="https://github.com/GenomeNet/deepG/blob/HEAD/vignettes/data_generator.Rmd" class="external-link"><code>vignettes/data_generator.Rmd</code></a></small> + <div class="d-none name"><code>data_generator.Rmd</code></div> + </div> + + + +<div class="sourceCode" id="cb1"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="co">#devtools::install_github("GenomeNet/deepG")</span></span> +<span><span class="co">#library(deepG)</span></span> +<span><span class="co">#library(magrittr)</span></span></code></pre></div> +<style type="text/css"> +mark.in { + background-color: CornflowerBlue; +} + +mark.out { + background-color: IndianRed; +} + +</style> +<div class="section level2"> +<h2 id="introduction">Introduction<a class="anchor" aria-label="anchor" href="#introduction"></a> +</h2> +<p>The most common use case for the deepG data generator is to extract +samples from a collection of fasta (or fastq) files. The generator will +always return a list of length 2. The first element is the input <span class="math inline">\(X\)</span> and the second the target <span class="math inline">\(Y\)</span>. We can differentiate between 2 +approaches:</p> +<ul> +<li> +<strong>Language model</strong>: One part of a sequence is the input and +another part is the target. +<ul> +<li>Example: Predict the next nucleotide given the previous 100 +nucleotides.</li> +</ul> +</li> +<li> +<strong>Label classification</strong>: Assign a label to a sequence. 
+<ul> +<li>Example: Assign a label “virus” or “bacteria” to a sequence of +length 100.</li> +</ul> +</li> +</ul> +<p>Suppose we are given 2 fasta files called “a.fasta” and “b.fasta” +that look as follows:</p> +<div style="float: left;margin-right:10px"> +<table class="table"><tr> +<td> +<strong>a.fasta</strong> <br><tt> >header_a1 <br> AACCAAGG <br> +>header_a2 <br> TTTGGG <br> >header_a3 <br> ACGTACGT <br></tt> +</td> +</tr></table> +</div> +<div style="float: left"> +<table class="table"><tr> +<td> +<strong>b.fasta</strong> <br><tt> >header_b1 <br> GTGTGT <br> +>header_b2 <br> AAGG <br></tt> +</td> +</tr></table> +</div> +<p><br><br><br><br><br><br><br><br><br></p> +<p>If we want to extract sequences of length 4 from these files, there +would be 17 possible samples (5 from <tt>AACCAAGG</tt>, 3 from +<tt>TTTGGG</tt>, …). A naive approach would be to extract the samples in +a sequential manner:</p> +<p><em>1. sample</em>:</p> +<div style="float: left;margin-right:10px"> +<table class="table"><tr> +<td> +<strong>a.fasta</strong> <br><tt> >header_a1 <br><mark class="in">AACC</mark>AAGG <br> >header_a2 <br> TTTGGG <br> +>header_a3 <br> ACGTACGT <br></tt> +</td> +</tr></table> +</div> +<div style="float: left"> +<table class="table"><tr> +<td> +<strong>b.fasta</strong> <br><tt> >header_b1 <br> GTGTGT <br> +>header_b2 <br> AAGG <br></tt> +</td> +</tr></table> +</div> +<p><br><br><br><br><br><br><br><br><br></p> +<p><em>2. sample</em>:</p> +<div style="float: left;margin-right:10px"> +<table class="table"><tr> +<td> +<strong>a.fasta</strong> <br><tt> >header_a1 <br> +A<mark class="in">ACCA</mark>AGG <br> >header_a2 <br> TTTGGG <br> +>header_a3 <br> ACGTACGT <br></tt> +</td> +</tr></table> +</div> +<div style="float: left"> +<table class="table"><tr> +<td> +<strong>b.fasta</strong> <br><tt> >header_b1 <br> GTGTGT <br> +>header_b2 <br> AAGG <br></tt> +</td> +</tr></table> +</div> +<p><br><br><br><br><br><br><br><br><br></p> +<p>…</p> +<p><br></p> +<p><em>17. 
sample</em>:</p> +<div style="float: left;margin-right:10px"> +<table class="table"><tr> +<td> +<strong>a.fasta</strong> <br><tt> >header_a1 <br> AACCAAGG <br> +>header_a2 <br> TTTGGG <br> >header_a3 <br> ACGTACGT <br></tt> +</td> +</tr></table> +</div> +<div style="float: left"> +<table class="table"><tr> +<td> +<strong>b.fasta</strong> <br><tt> >header_b1 <br> GTGTGT <br> +>header_b2 <br><mark class="in">AAGG</mark><br></tt> +</td> +</tr></table> +</div> +<p><br><br><br><br><br><br><br><br><br></p> +<p><em>18. sample</em>:</p> +<div style="float: left;margin-right:10px"> +<table class="table"><tr> +<td> +<strong>a.fasta</strong> <br><tt> >header_a1 <br><mark class="in">AACC</mark>AAGG <br> >header_a2 <br> TTTGGG <br> +>header_a3 <br> ACGTACGT <br></tt> +</td> +</tr></table> +</div> +<div style="float: left"> +<table class="table"><tr> +<td> +<strong>b.fasta</strong> <br><tt> >header_b1 <br> GTGTGT <br> +>header_b2 <br> AAGG <br></tt> +</td> +</tr></table> +</div> +<p><br><br><br><br><br><br><br><br><br></p> +<p>… <br><br></p> +<p>For longer sequences this is not a desirable strategy since the data +is very redundant (often just one nucleotide difference) and the model +would often see long stretches of data from the same source. Choosing +the samples completely at random can also be problematic since we would +constantly have to open new files. The deepG generators offer several +options to control the data sampling strategy and achieve a good balance +between the two approaches.</p> +</div> +<div class="section level2"> +<h2 id="data-generator-options">Data generator options<a class="anchor" aria-label="anchor" href="#data-generator-options"></a> +</h2> +<p>In the following code examples, we will mostly use the sequence <tt> +<strong>abcdefghiiii</strong> </tt> to demonstrate some of the deepG +data generator options. 
(In real world application you would usually +have sequences from the <tt>ACGT</tt> vocabulary.)</p> +<div class="sourceCode" id="cb2"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">sequence</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/paste.html" class="external-link">paste0</a></span><span class="op">(</span><span class="st">"a"</span>, <span class="st">"b"</span>, <span class="st">"c"</span>, <span class="st">"d"</span>, <span class="st">"e"</span>, <span class="st">"f"</span>, <span class="st">"g"</span>, <span class="st">"h"</span>, <span class="st">"i"</span>, <span class="st">"i"</span>, <span class="st">"i"</span>, <span class="st">"i"</span><span class="op">)</span></span> +<span><span class="va">vocabulary</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"a"</span>, <span class="st">"b"</span>, <span class="st">"c"</span>, <span class="st">"d"</span>, <span class="st">"e"</span>, <span class="st">"f"</span>, <span class="st">"g"</span>, <span class="st">"h"</span>, <span class="st">"i"</span><span class="op">)</span> </span></code></pre></div> +<p>We may store this sequence in a fasta file</p> +<div class="sourceCode" id="cb3"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">temp_dir</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span><span class="op">)</span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/files2.html" class="external-link">dir.create</a></span><span class="op">(</span><span class="va">temp_dir</span><span class="op">)</span></span> +<span><span class="va">dir_path</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/paste.html" 
class="external-link">paste0</a></span><span class="op">(</span><span class="va">temp_dir</span>, <span class="st">"/dummy_data"</span><span class="op">)</span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/files2.html" class="external-link">dir.create</a></span><span class="op">(</span><span class="va">dir_path</span><span class="op">)</span></span> +<span><span class="va">df</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span>Sequence <span class="op">=</span> <span class="va">sequence</span>, Header <span class="op">=</span> <span class="st">"label_1"</span>, stringsAsFactors <span class="op">=</span> <span class="cn">FALSE</span><span class="op">)</span></span> +<span><span class="va">file_path</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/file.path.html" class="external-link">file.path</a></span><span class="op">(</span><span class="va">dir_path</span>, <span class="st">"a.fasta"</span><span class="op">)</span></span> +<span><span class="co"># sequence as fasta file</span></span> +<span><span class="fu">microseq</span><span class="fu">::</span><span class="fu"><a href="https://rdrr.io/pkg/microseq/man/readFasta.html" class="external-link">writeFasta</a></span><span class="op">(</span>fdta <span class="op">=</span> <span class="fu">dplyr</span><span class="fu">::</span><span class="fu"><a href="https://tibble.tidyverse.org/reference/as_tibble.html" class="external-link">as_tibble</a></span><span class="op">(</span><span class="va">df</span><span class="op">)</span>, out.file <span class="op">=</span> <span class="va">file_path</span><span class="op">)</span></span></code></pre></div> +<p>Since neural networks can only work with numeric data, we have to +encode sequences of characters with numeric data. 
Usually this is +achieved by one-hot-encoding; there are some other approaches +implemented: see <code>use_coverage</code>, +<code>use_quality_score</code> and <code>ambiguous_nuc</code> +sections.</p> +<div class="sourceCode" id="cb4"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="co"># one-hot encoding example</span></span> +<span><span class="va">s</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"a"</span>, <span class="st">"c"</span>, <span class="st">"a"</span>, <span class="st">"f"</span>, <span class="st">"i"</span>, <span class="st">"b"</span><span class="op">)</span></span> +<span><span class="va">s_as_int_seq</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/vector.html" class="external-link">vector</a></span><span class="op">(</span><span class="st">"integer"</span>, <span class="fu"><a href="https://rdrr.io/r/base/length.html" class="external-link">length</a></span><span class="op">(</span><span class="va">s</span><span class="op">)</span><span class="op">)</span></span> +<span><span class="kw">for</span> <span class="op">(</span><span class="va">i</span> <span class="kw">in</span> <span class="fl">1</span><span class="op">:</span><span class="fu"><a href="https://rdrr.io/r/base/length.html" class="external-link">length</a></span><span class="op">(</span><span class="va">s</span><span class="op">)</span><span class="op">)</span> <span class="op">{</span></span> +<span> <span class="va">s_as_int_seq</span><span class="op">[</span><span class="va">i</span><span class="op">]</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/which.html" class="external-link">which</a></span><span class="op">(</span><span class="va">s</span><span class="op">[</span><span class="va">i</span><span class="op">]</span> <span class="op">==</span> <span 
class="va">vocabulary</span><span class="op">)</span> <span class="op">-</span> <span class="fl">1</span></span> +<span><span class="op">}</span></span> +<span><span class="va">one_hot_sample</span> <span class="op"><-</span> <span class="fu">keras</span><span class="fu">::</span><span class="fu"><a href="https://rdrr.io/pkg/keras/man/to_categorical.html" class="external-link">to_categorical</a></span><span class="op">(</span><span class="va">s_as_int_seq</span><span class="op">)</span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">one_hot_sample</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span> +<span><span class="va">one_hot_sample</span></span></code></pre></div> +<pre><code><span><span class="co">## a b c d e f g h i</span></span> +<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [2,] 0 0 1 0 0 0 0 0 0</span></span> +<span><span class="co">## [3,] 1 0 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [4,] 0 0 0 0 0 1 0 0 0</span></span> +<span><span class="co">## [5,] 0 0 0 0 0 0 0 0 1</span></span> +<span><span class="co">## [6,] 0 1 0 0 0 0 0 0 0</span></span></code></pre> +<div class="section level3"> +<h3 id="maxlen">maxlen<a class="anchor" aria-label="anchor" href="#maxlen"></a> +</h3> +<p>The length of the input sequence.</p> +</div> +<div class="section level3"> +<h3 id="vocabulary">vocabulary<a class="anchor" aria-label="anchor" href="#vocabulary"></a> +</h3> +<p>The set of allowed characters in a sequence. What happens to +characters outside the vocabulary can be controlled with the +<code>ambiguous_nuc</code> argument.</p> +</div> +<div class="section level3"> +<h3 id="train_type">train_type<a class="anchor" aria-label="anchor" href="#train_type"></a> +</h3> +<p>The generator will always return a list of length 2. 
The first +element is the input <span class="math inline">\(X\)</span> and the +second the target <span class="math inline">\(Y\)</span>. The +<code>train_type</code> argument determines how <span class="math inline">\(X\)</span> and <span class="math inline">\(Y\)</span> get extracted. Possible values for +<u> <em>language models</em> </u> are:</p> +<ul> +<li> +<strong>“lm”</strong> or <strong>“lm_rds”</strong>: Given some +sequence <span class="math inline">\(s\)</span>, we take some subset of +that sequence as input and the rest as target. How to split <span class="math inline">\(s\)</span> can be specified with the +<code>output_format</code> argument.</li> +</ul> +<p>Besides the language model approach, we can use <u> <em>label +classification</em> </u>. This means we map some label to a sequence. +For example, the target for some nucleotide sequence could be one of the +labels “bacteria” or “virus”. We have to specify how to extract a label +corresponding to a sequence. Possible values are:</p> +<ul> +<li><p><strong>“label_header”</strong>: get label from fasta +headers.</p></li> +<li><p><strong>“label_folder”</strong>: get label from folder, i.e. all +files in one folder must belong to the same class.</p></li> +<li> +<p><strong>“label_csv”</strong>: get label from csv file. The csv file +should have one column named “file”. The targets then correspond to +the entries in the row for that file (except the “file” column). 
Example: if we are currently +working with a file called “a.fasta”, there should be a row in our csv +file with some target information for that file <br></p> +<table class="table"> +<thead><tr class="header"> +<th>file</th> +<th>label_1</th> +<th>label_2</th> +</tr></thead> +<tbody><tr class="odd"> +<td>“a.fasta”</td> +<td>1</td> +<td>0</td> +</tr></tbody> +</table> +</li> +<li><p><strong>“label_rds”</strong>: rds file contains preprocessed list +of input and target tensors.</p></li> +</ul> +<p>Another option is <strong>“dummy_gen”</strong>: generator creates +random data once and repeatedly returns them.</p> +<p>Extract target from fasta header (fasta header is “label_1” in +example file):</p> +<div class="sourceCode" id="cb6"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="co"># get target from header</span></span> +<span><span class="va">vocabulary_label</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/paste.html" class="external-link">paste0</a></span><span class="op">(</span><span class="st">"label_"</span>, <span class="fl">1</span><span class="op">:</span><span class="fl">5</span><span class="op">)</span></span> +<span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span> +<span> train_type <span class="op">=</span> <span class="st">"label_header"</span>,</span> +<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span> +<span> maxlen <span class="op">=</span> <span class="fl">6</span>,</span> +<span> vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span> +<span> vocabulary_label <span class="op">=</span> <span class="va">vocabulary_label</span><span class="op">)</span></span> +<span></span> +<span><span class="va">z</span> <span class="op"><-</span> <span 
class="fu">gen</span><span class="op">(</span><span class="op">)</span></span> +<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span> </span> +<span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span> </span> +<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary_label</span> </span> +<span><span class="va">x</span> <span class="co"># abcdef</span></span></code></pre></div> +<pre><code><span><span class="co">## a b c d e f g h i</span></span> +<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [2,] 0 1 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [3,] 0 0 1 0 0 0 0 0 0</span></span> +<span><span class="co">## [4,] 0 0 0 1 0 0 0 0 0</span></span> +<span><span class="co">## [5,] 0 0 0 0 1 0 0 0 0</span></span> +<span><span class="co">## [6,] 0 0 0 0 0 1 0 0 0</span></span></code></pre> +<div class="sourceCode" id="cb8"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">y</span> <span class="co"># label_1 </span></span></code></pre></div> +<pre><code><span><span class="co">## label_1 label_2 label_3 label_4 label_5</span></span> +<span><span class="co">## [1,] 1 0 0 0 0</span></span></code></pre> +<p>Extract target 
from fasta folder:</p> +<div class="sourceCode" id="cb10"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="co"># create data for second class</span></span> +<span><span class="va">df</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span>Sequence <span class="op">=</span> <span class="st">"AABAACAADAAE"</span>, Header <span class="op">=</span> <span class="st">"header_1"</span><span class="op">)</span></span> +<span><span class="va">file_path_2</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span>fileext <span class="op">=</span> <span class="st">".fasta"</span><span class="op">)</span></span> +<span><span class="va">fasta_file</span> <span class="op"><-</span> <span class="fu">microseq</span><span class="fu">::</span><span class="fu"><a href="https://rdrr.io/pkg/microseq/man/readFasta.html" class="external-link">writeFasta</a></span><span class="op">(</span><span class="va">df</span>, <span class="va">file_path_2</span><span class="op">)</span></span> +<span></span> +<span><span class="co"># get target from folder</span></span> +<span><span class="va">vocabulary_label</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/paste.html" class="external-link">paste0</a></span><span class="op">(</span><span class="st">"label_"</span>, <span class="fl">1</span><span class="op">:</span><span class="fl">2</span><span class="op">)</span></span> +<span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="va">file_path</span>, 
<span class="va">file_path_2</span><span class="op">)</span>, <span class="co"># one entry for each class</span></span> +<span> train_type <span class="op">=</span> <span class="st">"label_folder"</span>,</span> +<span> batch_size <span class="op">=</span> <span class="fl">8</span>,</span> +<span> maxlen <span class="op">=</span> <span class="fl">6</span>,</span> +<span> vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span> +<span> vocabulary_label <span class="op">=</span> <span class="va">vocabulary_label</span><span class="op">)</span></span> +<span></span> +<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span> +<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span></span> +<span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span> </span> +<span><span class="va">x_1_1</span> <span class="op"><-</span> <span class="va">x</span><span class="op">[</span><span class="fl">1</span>, , <span class="op">]</span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x_1_1</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span> +<span><span class="va">x_1_1</span> <span class="co"># first sample from first class</span></span></code></pre></div> +<pre><code><span><span class="co">## a b c d e f g h i</span></span> +<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [2,] 0 1 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [3,] 0 0 1 0 0 0 0 0 0</span></span> +<span><span class="co">## [4,] 0 0 0 1 0 0 0 0 
0</span></span> +<span><span class="co">## [5,] 0 0 0 0 1 0 0 0 0</span></span> +<span><span class="co">## [6,] 0 0 0 0 0 1 0 0 0</span></span></code></pre> +<div class="sourceCode" id="cb12"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">x_2_1</span> <span class="op"><-</span> <span class="va">x</span><span class="op">[</span><span class="fl">5</span>, , <span class="op">]</span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x_2_1</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span> +<span><span class="va">x_2_1</span> <span class="co"># first sample from second class</span></span></code></pre></div> +<pre><code><span><span class="co">## a b c d e f g h i</span></span> +<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [2,] 1 0 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [3,] 0 1 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [4,] 1 0 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [5,] 1 0 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [6,] 0 0 1 0 0 0 0 0 0</span></span></code></pre> +<div class="sourceCode" id="cb14"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary_label</span> </span> +<span><span class="va">y</span> <span class="co"># 4 samples from each class </span></span></code></pre></div> +<pre><code><span><span class="co">## label_1 label_2</span></span> +<span><span class="co">## [1,] 1 0</span></span> +<span><span class="co">## [2,] 1 0</span></span> +<span><span class="co">## [3,] 1 0</span></span> +<span><span class="co">## 
[4,] 1 0</span></span> +<span><span class="co">## [5,] 0 1</span></span> +<span><span class="co">## [6,] 0 1</span></span> +<span><span class="co">## [7,] 0 1</span></span> +<span><span class="co">## [8,] 0 1</span></span></code></pre> +<p>Extract target from csv file:</p> +<div class="sourceCode" id="cb16"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="co"># get target from csv</span></span> +<span><span class="va">file</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="fu"><a href="https://rdrr.io/r/base/basename.html" class="external-link">basename</a></span><span class="op">(</span><span class="va">file_path</span><span class="op">)</span>, <span class="st">"xyz.fasta"</span>, <span class="st">"abc.fasta"</span>, <span class="st">"x_123.fasta"</span><span class="op">)</span></span> +<span><span class="va">vocabulary_label</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/paste.html" class="external-link">paste0</a></span><span class="op">(</span><span class="st">"label"</span>, <span class="fl">1</span><span class="op">:</span><span class="fl">4</span><span class="op">)</span></span> +<span><span class="va">label_1</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="fl">1</span>, <span class="fl">0</span>, <span class="fl">0</span>, <span class="fl">0</span><span class="op">)</span></span> +<span><span class="va">label_2</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="fl">0</span>, <span class="fl">1</span>, <span class="fl">0</span>, <span class="fl">0</span><span class="op">)</span></span> +<span><span class="va">label_3</span> <span class="op"><-</span> <span 
class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="fl">0</span>, <span class="fl">0</span>, <span class="fl">1</span>, <span class="fl">0</span><span class="op">)</span></span> +<span><span class="va">label_4</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="fl">0</span>, <span class="fl">0</span>, <span class="fl">0</span>, <span class="fl">1</span><span class="op">)</span></span> +<span><span class="va">df</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span><span class="va">file</span>, <span class="va">label_1</span>, <span class="va">label_2</span>, <span class="va">label_3</span>, <span class="va">label_4</span><span class="op">)</span></span> +<span><span class="va">df</span></span></code></pre></div> +<pre><code><span><span class="co">## file label_1 label_2 label_3 label_4</span></span> +<span><span class="co">## 1 a.fasta 1 0 0 0</span></span> +<span><span class="co">## 2 xyz.fasta 0 1 0 0</span></span> +<span><span class="co">## 3 abc.fasta 0 0 1 0</span></span> +<span><span class="co">## 4 x_123.fasta 0 0 0 1</span></span></code></pre> +<div class="sourceCode" id="cb18"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">csv_file</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span>fileext <span class="op">=</span> <span class="st">".csv"</span><span class="op">)</span></span> +<span><span class="fu"><a href="https://rdrr.io/r/utils/write.table.html" class="external-link">write.csv</a></span><span class="op">(</span><span class="va">df</span>, <span class="va">csv_file</span>, row.names <span class="op">=</span> 
<span class="cn">FALSE</span><span class="op">)</span></span> +<span></span> +<span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span> +<span> train_type <span class="op">=</span> <span class="st">"label_csv"</span>,</span> +<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span> +<span> maxlen <span class="op">=</span> <span class="fl">6</span>,</span> +<span> target_from_csv <span class="op">=</span> <span class="va">csv_file</span>,</span> +<span> vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span> +<span> vocabulary_label <span class="op">=</span> <span class="va">vocabulary_label</span><span class="op">)</span></span> +<span></span> +<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span> +<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span> </span> +<span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span> </span> +<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op"><-</span> <span 
class="va">vocabulary_label</span> </span> +<span><span class="va">x</span> <span class="co"># abcdef</span></span></code></pre></div> +<pre><code><span><span class="co">## a b c d e f g h i</span></span> +<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [2,] 0 1 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [3,] 0 0 1 0 0 0 0 0 0</span></span> +<span><span class="co">## [4,] 0 0 0 1 0 0 0 0 0</span></span> +<span><span class="co">## [5,] 0 0 0 0 1 0 0 0 0</span></span> +<span><span class="co">## [6,] 0 0 0 0 0 1 0 0 0</span></span></code></pre> +<div class="sourceCode" id="cb20"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">y</span> <span class="co"># label_1 </span></span></code></pre></div> +<pre><code><span><span class="co">## label1 label2 label3 label4</span></span> +<span><span class="co">## [1,] 1 0 0 0</span></span></code></pre> +<p>Examples for language models follow in the next section.</p> +</div> +<div class="section level3"> +<h3 id="output_format">output_format<a class="anchor" aria-label="anchor" href="#output_format"></a> +</h3> +<p>The <code>output_format</code> determines the shape of the output for +a language model, i.e. part of a sequence is the input <span class="math inline">\(X\)</span> and another the target <span class="math inline">\(Y\)</span>. Assume a sequence <tt>abcdefg</tt> and +<code>maxlen = 6</code>. 
The outputs correspond as follows:</p> +<p><strong>“target_right”</strong>: <span class="math inline">\(X=\)</span> <tt>abcdef</tt>, <span class="math inline">\(Y=\)</span> <tt>g</tt></p> +<p><strong>“target_middle_lstm”</strong>: <span class="math inline">\(X +=\)</span> (<span class="math inline">\(X_1 =\)</span> <tt>abc</tt>, +<span class="math inline">\(X_2 =\)</span> <tt>gfe</tt>), <span class="math inline">\(Y=\)</span> <tt>d</tt> (note the reversed order of +<span class="math inline">\(X_2\)</span>)</p> +<p><strong>“target_middle_cnn”</strong>: <span class="math inline">\(X +=\)</span> <tt>abcefg</tt>, <span class="math inline">\(Y =\)</span> +<tt>d</tt></p> +<p><strong>“wavenet”</strong>: <span class="math inline">\(X =\)</span> +<tt>abcdef</tt>, <span class="math inline">\(Y =\)</span> +<tt>bcdefg</tt></p> +<div class="sourceCode" id="cb22"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="co"># target_right</span></span> +<span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span> +<span> train_type <span class="op">=</span> <span class="st">"lm"</span>,</span> +<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span> +<span> maxlen <span class="op">=</span> <span class="fl">6</span>,</span> +<span> vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span> +<span> output_format <span class="op">=</span> <span class="st">"target_right"</span><span class="op">)</span></span> +<span></span> +<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span> +<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span
class="op">]</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span> </span> +<span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span> </span> +<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span> +<span><span class="va">x</span> <span class="co"># abcdef</span></span></code></pre></div> +<pre><code><span><span class="co">## a b c d e f g h i</span></span> +<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [2,] 0 1 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [3,] 0 0 1 0 0 0 0 0 0</span></span> +<span><span class="co">## [4,] 0 0 0 1 0 0 0 0 0</span></span> +<span><span class="co">## [5,] 0 0 0 0 1 0 0 0 0</span></span> +<span><span class="co">## [6,] 0 0 0 0 0 1 0 0 0</span></span></code></pre> +<div class="sourceCode" id="cb24"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">y</span> <span class="co"># g </span></span></code></pre></div> +<pre><code><span><span class="co">## a b c d e f g h i</span></span> +<span><span class="co">## [1,] 0 0 0 0 0 0 1 0 0</span></span></code></pre> +<div class="sourceCode" id="cb26"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="co"># target_middle_lstm</span></span> +<span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a 
href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span> +<span> train_type <span class="op">=</span> <span class="st">"lm"</span>,</span> +<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span> +<span> maxlen <span class="op">=</span> <span class="fl">6</span>,</span> +<span> vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span> +<span> output_format <span class="op">=</span> <span class="st">"target_middle_lstm"</span><span class="op">)</span></span> +<span></span> +<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span> +<span><span class="va">x_1</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span> </span> +<span><span class="va">x_2</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span> </span> +<span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span> </span> +<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x_1</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span> +<span><span 
class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x_2</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span> +<span><span class="va">x_1</span> <span class="co"># abc</span></span></code></pre></div> +<pre><code><span><span class="co">## a b c d e f g h i</span></span> +<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [2,] 0 1 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [3,] 0 0 1 0 0 0 0 0 0</span></span></code></pre> +<div class="sourceCode" id="cb28"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">x_2</span> <span class="co"># gfe</span></span></code></pre></div> +<pre><code><span><span class="co">## a b c d e f g h i</span></span> +<span><span class="co">## [1,] 0 0 0 0 0 0 1 0 0</span></span> +<span><span class="co">## [2,] 0 0 0 0 0 1 0 0 0</span></span> +<span><span class="co">## [3,] 0 0 0 0 1 0 0 0 0</span></span></code></pre> +<div class="sourceCode" id="cb30"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">y</span> <span class="co"># d </span></span></code></pre></div> +<pre><code><span><span class="co">## a b c d e f g h i</span></span> +<span><span class="co">## [1,] 0 0 0 1 0 0 0 0 0</span></span></code></pre> +<div class="sourceCode" id="cb32"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="co"># target_middle_cnn</span></span> +<span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span 
class="op">=</span> <span class="va">file_path</span>,</span> +<span> train_type <span class="op">=</span> <span class="st">"lm"</span>,</span> +<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span> +<span> maxlen <span class="op">=</span> <span class="fl">6</span>,</span> +<span> vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span> +<span> output_format <span class="op">=</span> <span class="st">"target_middle_cnn"</span><span class="op">)</span></span> +<span></span> +<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span> +<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span></span> +<span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span> +<span><span class="va">x</span> <span class="co"># abcefg</span></span></code></pre></div> +<pre><code><span><span class="co">## a b c d e f g h i</span></span> +<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [2,] 0 1 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [3,] 0 0 1 0 0 0 0 0 
0</span></span> +<span><span class="co">## [4,] 0 0 0 0 1 0 0 0 0</span></span> +<span><span class="co">## [5,] 0 0 0 0 0 1 0 0 0</span></span> +<span><span class="co">## [6,] 0 0 0 0 0 0 1 0 0</span></span></code></pre> +<div class="sourceCode" id="cb34"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">y</span> <span class="co"># d</span></span></code></pre></div> +<pre><code><span><span class="co">## a b c d e f g h i</span></span> +<span><span class="co">## [1,] 0 0 0 1 0 0 0 0 0</span></span></code></pre> +<div class="sourceCode" id="cb36"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="co"># wavenet</span></span> +<span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span> +<span> train_type <span class="op">=</span> <span class="st">"lm"</span>,</span> +<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span> +<span> maxlen <span class="op">=</span> <span class="fl">6</span>,</span> +<span> vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span> +<span> output_format <span class="op">=</span> <span class="st">"wavenet"</span><span class="op">)</span></span> +<span></span> +<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span> +<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span> </span> +<span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span 
class="op">]</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span> +<span><span class="va">x</span> <span class="co"># abcdef</span></span></code></pre></div> +<pre><code><span><span class="co">## a b c d e f g h i</span></span> +<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [2,] 0 1 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [3,] 0 0 1 0 0 0 0 0 0</span></span> +<span><span class="co">## [4,] 0 0 0 1 0 0 0 0 0</span></span> +<span><span class="co">## [5,] 0 0 0 0 1 0 0 0 0</span></span> +<span><span class="co">## [6,] 0 0 0 0 0 1 0 0 0</span></span></code></pre> +<div class="sourceCode" id="cb38"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">y</span> <span class="co"># bcdefg</span></span></code></pre></div> +<pre><code><span><span class="co">## a b c d e f g h i</span></span> +<span><span class="co">## [1,] 0 1 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [2,] 0 0 1 0 0 0 0 0 0</span></span> +<span><span class="co">## [3,] 0 0 0 1 0 0 0 0 0</span></span> +<span><span class="co">## [4,] 0 0 0 0 1 0 0 0 0</span></span> +<span><span class="co">## [5,] 0 0 0 0 0 1 0 0 0</span></span> +<span><span class="co">## [6,] 0 0 0 0 0 0 1 0 0</span></span></code></pre> +</div> +<div class="section level3"> +<h3 id="batch_size">batch_size<a class="anchor" aria-label="anchor" href="#batch_size"></a> +</h3> +<p>Number of samples in one 
batch.</p> +<div class="sourceCode" id="cb40"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="co"># target_right</span></span> +<span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span> +<span> train_type <span class="op">=</span> <span class="st">"lm"</span>,</span> +<span> batch_size <span class="op">=</span> <span class="fl">7</span>,</span> +<span> maxlen <span class="op">=</span> <span class="fl">6</span>,</span> +<span> vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span> +<span> output_format <span class="op">=</span> <span class="st">"target_right"</span><span class="op">)</span></span> +<span></span> +<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span> +<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span></span> +<span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span> </span> +<span><span class="fu"><a href="https://rdrr.io/r/base/dim.html" class="external-link">dim</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span></span></code></pre></div> +<pre><code><span><span class="co">## [1] 7 6 9</span></span></code></pre> +<div class="sourceCode" id="cb42"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="fu"><a href="https://rdrr.io/r/base/dim.html" class="external-link">dim</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span></span></code></pre></div> +<pre><code><span><span 
class="co">## [1] 7 9</span></span></code></pre> +</div> +<div class="section level3"> +<h3 id="step">step<a class="anchor" aria-label="anchor" href="#step"></a> +</h3> +<p>The <code>step</code> argument determines how frequently we take a sample. If +<code>step = 1</code> we take a sample at every possible step. Let’s +assume we want to predict the next character, i.e. part of the sequence +is the <mark class="in">input</mark> and the next character is the +<mark class="out">target</mark>. If +<code>maxlen = 3, step = 1</code>:</p> +<ol style="list-style-type: decimal"> +<li><p>sample: +<tt><mark class="in">abc</mark><mark class="out">d</mark>efghiiii</tt></p></li> +<li><p>sample: +<tt>a<mark class="in">bcd</mark><mark class="out">e</mark>fghiiii</tt></p></li> +<li><p>sample: +<tt>ab<mark class="in">cde</mark><mark class="out">f</mark>ghiiii</tt></p></li> +</ol> +<p>If <code>step = 3</code>:</p> +<ol style="list-style-type: decimal"> +<li><p>sample: +<tt><mark class="in">abc</mark><mark class="out">d</mark>efghiiii</tt></p></li> +<li><p>sample: +<tt>abc<mark class="in">def</mark><mark class="out">g</mark>hiiii</tt></p></li> +<li><p>sample: +<tt>abcdef<mark class="in">ghi</mark><mark class="out">i</mark>ii</tt></p></li> +</ol> +<div class="sourceCode" id="cb44"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span> +<span> train_type <span class="op">=</span> <span class="st">"lm"</span>,</span> +<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span> +<span> maxlen <span class="op">=</span> <span class="fl">3</span>,</span> +<span> vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span> +<span> step <span class="op">=</span> <span class="fl">3</span>, </span> +<span> output_format <span
class="op">=</span> <span class="st">"target_right"</span><span class="op">)</span></span> +<span></span> +<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span> +<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span> <span class="co">#encodes abc</span></span> +<span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span> <span class="co"># encodes d</span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span> +<span><span class="va">x</span></span></code></pre></div> +<pre><code><span><span class="co">## a b c d e f g h i</span></span> +<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [2,] 0 1 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [3,] 0 0 1 0 0 0 0 0 0</span></span></code></pre> +<div class="sourceCode" id="cb46"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">y</span></span></code></pre></div> +<pre><code><span><span class="co">## a b c d e f g h i</span></span> +<span><span class="co">## [1,] 0 0 0 1 0 0 0 0 0</span></span></code></pre> +<div class="sourceCode" id="cb48"><pre class="downlit 
sourceCode r"> +<code class="sourceCode R"><span><span class="co"># go 3 steps forward</span></span> +<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span> +<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span> <span class="co">#encodes def</span></span> +<span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span> <span class="co"># encodes g</span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span> +<span><span class="va">x</span></span></code></pre></div> +<pre><code><span><span class="co">## a b c d e f g h i</span></span> +<span><span class="co">## [1,] 0 0 0 1 0 0 0 0 0</span></span> +<span><span class="co">## [2,] 0 0 0 0 1 0 0 0 0</span></span> +<span><span class="co">## [3,] 0 0 0 0 0 1 0 0 0</span></span></code></pre> +<div class="sourceCode" id="cb50"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">y</span></span></code></pre></div> +<pre><code><span><span class="co">## a b c d e f g h i</span></span> +<span><span class="co">## [1,] 0 0 0 0 0 0 1 0 0</span></span></code></pre> +</div> +<div class="section level3"> +<h3 
id="padding">padding<a class="anchor" aria-label="anchor" href="#padding"></a> +</h3> +<p>If the sequence is too short to create a single sample, we can pad +the sequence with zero-vectors. If <code>padding = FALSE</code>, the +generator will move on to the next file/FASTA entry until it finds a sequence +long enough for a sample.</p> +<div class="sourceCode" id="cb52"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span> +<span> train_type <span class="op">=</span> <span class="st">"lm"</span>,</span> +<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span> +<span> maxlen <span class="op">=</span> <span class="fl">15</span>, <span class="co"># maxlen is longer than sequence</span></span> +<span> vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span> +<span> step <span class="op">=</span> <span class="fl">3</span>,</span> +<span> padding <span class="op">=</span> <span class="cn">TRUE</span>,</span> +<span> output_format <span class="op">=</span> <span class="st">"target_right"</span><span class="op">)</span></span> +<span></span> +<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span> +<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span> </span> +<span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span> </span> +<span><span class="fu"><a 
href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span> +<span><span class="va">x</span> <span class="co"># first 4 entries are zero-vectors</span></span></code></pre></div> +<pre><code><span><span class="co">## a b c d e f g h i</span></span> +<span><span class="co">## [1,] 0 0 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [2,] 0 0 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [3,] 0 0 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [4,] 0 0 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [5,] 1 0 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [6,] 0 1 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [7,] 0 0 1 0 0 0 0 0 0</span></span> +<span><span class="co">## [8,] 0 0 0 1 0 0 0 0 0</span></span> +<span><span class="co">## [9,] 0 0 0 0 1 0 0 0 0</span></span> +<span><span class="co">## [10,] 0 0 0 0 0 1 0 0 0</span></span> +<span><span class="co">## [11,] 0 0 0 0 0 0 1 0 0</span></span> +<span><span class="co">## [12,] 0 0 0 0 0 0 0 1 0</span></span> +<span><span class="co">## [13,] 0 0 0 0 0 0 0 0 1</span></span> +<span><span class="co">## [14,] 0 0 0 0 0 0 0 0 1</span></span> +<span><span class="co">## [15,] 0 0 0 0 0 0 0 0 1</span></span></code></pre> +<div class="sourceCode" id="cb54"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">y</span></span></code></pre></div> +<pre><code><span><span class="co">## a b c d e f g h i</span></span> +<span><span class="co">## [1,] 0 0 0 0 0 0 0 0 1</span></span></code></pre> +</div> +<div class="section level3"> +<h3 
id="ambiguous_nuc">ambiguous_nuc<a class="anchor" aria-label="anchor" href="#ambiguous_nuc"></a> +</h3> +<p>A sequence might contain a character that does not lie inside our +vocabulary. For example, let’s assume we discard <tt>e</tt> from our +vocabulary. We have 4 options to handle this situation</p> +<ol style="list-style-type: decimal"> +<li>encode as zero vector</li> +</ol> +<div class="sourceCode" id="cb56"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">vocabulary_2</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"a"</span>, <span class="st">"b"</span>, <span class="st">"c"</span>, <span class="st">"d"</span>, <span class="st">"f"</span>, <span class="st">"g"</span>, <span class="st">"h"</span>, <span class="st">"i"</span><span class="op">)</span> <span class="co"># exclude "e" from vocabulary</span></span> +<span></span> +<span><span class="co"># zero</span></span> +<span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span> +<span> train_type <span class="op">=</span> <span class="st">"lm"</span>,</span> +<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span> +<span> maxlen <span class="op">=</span> <span class="fl">6</span>,</span> +<span> vocabulary <span class="op">=</span> <span class="va">vocabulary_2</span>,</span> +<span> output_format <span class="op">=</span> <span class="st">"target_right"</span>,</span> +<span> ambiguous_nuc <span class="op">=</span> <span class="st">"zeros"</span><span class="op">)</span></span> +<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span> +<span><span class="va">x</span> 
<span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span> </span> +<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary_2</span></span> +<span><span class="va">x</span> <span class="co"># fifth row is zero vector </span></span></code></pre></div> +<pre><code><span><span class="co">## a b c d f g h i</span></span> +<span><span class="co">## [1,] 1 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [2,] 0 1 0 0 0 0 0 0</span></span> +<span><span class="co">## [3,] 0 0 1 0 0 0 0 0</span></span> +<span><span class="co">## [4,] 0 0 0 1 0 0 0 0</span></span> +<span><span class="co">## [5,] 0 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [6,] 0 0 0 0 1 0 0 0</span></span></code></pre> +<ol start="2" style="list-style-type: decimal"> +<li>equal probability</li> +</ol> +<div class="sourceCode" id="cb58"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="co"># equal</span></span> +<span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span> +<span> train_type <span class="op">=</span> <span class="st">"lm"</span>,</span> +<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span> +<span> maxlen <span class="op">=</span> <span class="fl">6</span>,</span> +<span> vocabulary <span class="op">=</span> <span class="va">vocabulary_2</span>,</span> +<span> output_format <span class="op">=</span> <span class="st">"target_right"</span>,</span> +<span> ambiguous_nuc <span 
class="op">=</span> <span class="st">"equal"</span><span class="op">)</span> </span> +<span></span> +<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span> +<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary_2</span></span> +<span><span class="va">x</span> <span class="co"># fifth row is 1/8 for every entry </span></span></code></pre></div> +<pre><code><span><span class="co">## a b c d f g h i</span></span> +<span><span class="co">## [1,] 1.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000</span></span> +<span><span class="co">## [2,] 0.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000</span></span> +<span><span class="co">## [3,] 0.000 0.000 1.000 0.000 0.000 0.000 0.000 0.000</span></span> +<span><span class="co">## [4,] 0.000 0.000 0.000 1.000 0.000 0.000 0.000 0.000</span></span> +<span><span class="co">## [5,] 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125</span></span> +<span><span class="co">## [6,] 0.000 0.000 0.000 0.000 1.000 0.000 0.000 0.000</span></span></code></pre> +<ol start="3" style="list-style-type: decimal"> +<li>use distribution of current file</li> +</ol> +<div class="sourceCode" id="cb60"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="co"># empirical</span></span> +<span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span 
class="va">file_path</span>,</span> +<span> train_type <span class="op">=</span> <span class="st">"lm"</span>,</span> +<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span> +<span> maxlen <span class="op">=</span> <span class="fl">6</span>,</span> +<span> vocabulary <span class="op">=</span> <span class="va">vocabulary_2</span>,</span> +<span> output_format <span class="op">=</span> <span class="st">"target_right"</span>,</span> +<span> ambiguous_nuc <span class="op">=</span> <span class="st">"empirical"</span><span class="op">)</span> </span> +<span></span> +<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span> +<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span> </span> +<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary_2</span></span> +<span><span class="va">x</span> <span class="co"># fifth row is the character distribution of the file</span></span></code></pre></div> +<pre><code><span><span class="co">## a b c d f g</span></span> +<span><span class="co">## [1,] 1.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000</span></span> +<span><span class="co">## [2,] 0.00000000 1.00000000 0.00000000 0.00000000 0.00000000 0.00000000</span></span> +<span><span class="co">## [3,] 0.00000000 0.00000000 1.00000000 0.00000000 0.00000000 0.00000000</span></span> +<span><span class="co">## [4,] 0.00000000 0.00000000 0.00000000 1.00000000 0.00000000 0.00000000</span></span> +<span><span class="co">## [5,] 0.09090909 0.09090909 0.09090909 0.09090909 0.09090909 
0.09090909</span></span> +<span><span class="co">## [6,] 0.00000000 0.00000000 0.00000000 0.00000000 1.00000000 0.00000000</span></span> +<span><span class="co">## h i</span></span> +<span><span class="co">## [1,] 0.00000000 0.0000000</span></span> +<span><span class="co">## [2,] 0.00000000 0.0000000</span></span> +<span><span class="co">## [3,] 0.00000000 0.0000000</span></span> +<span><span class="co">## [4,] 0.00000000 0.0000000</span></span> +<span><span class="co">## [5,] 0.09090909 0.3636364</span></span> +<span><span class="co">## [6,] 0.00000000 0.0000000</span></span></code></pre> +<ol start="4" style="list-style-type: decimal"> +<li>discard</li> +</ol> +<div class="sourceCode" id="cb62"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="co"># discard</span></span> +<span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span> +<span> train_type <span class="op">=</span> <span class="st">"lm"</span>,</span> +<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span> +<span> maxlen <span class="op">=</span> <span class="fl">6</span>,</span> +<span> vocabulary <span class="op">=</span> <span class="va">vocabulary_2</span>,</span> +<span> output_format <span class="op">=</span> <span class="st">"target_right"</span>,</span> +<span> ambiguous_nuc <span class="op">=</span> <span class="st">"discard"</span><span class="op">)</span> </span> +<span></span> +<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span> +<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>,,<span 
class="op">]</span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary_2</span></span> +<span><span class="va">x</span> <span class="co"># first sample with only characters from vocabulary is fghiii|i</span></span></code></pre></div> +<pre><code><span><span class="co">## a b c d f g h i</span></span> +<span><span class="co">## [1,] 0 0 0 0 1 0 0 0</span></span> +<span><span class="co">## [2,] 0 0 0 0 0 1 0 0</span></span> +<span><span class="co">## [3,] 0 0 0 0 0 0 1 0</span></span> +<span><span class="co">## [4,] 0 0 0 0 0 0 0 1</span></span> +<span><span class="co">## [5,] 0 0 0 0 0 0 0 1</span></span> +<span><span class="co">## [6,] 0 0 0 0 0 0 0 1</span></span></code></pre> +</div> +<div class="section level3"> +<h3 id="proportion_per_seq">proportion_per_seq<a class="anchor" aria-label="anchor" href="#proportion_per_seq"></a> +</h3> +<p>The <code>proportion_per_seq</code> argument gives the option to use +a random subset instead of the full sequence.</p> +<div class="sourceCode" id="cb64"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="fu"><a href="https://rdrr.io/r/base/cat.html" class="external-link">cat</a></span><span class="op">(</span><span class="st">"sequence is "</span>, <span class="fu"><a href="https://rdrr.io/r/base/nchar.html" class="external-link">nchar</a></span><span class="op">(</span><span class="va">sequence</span><span class="op">)</span>, <span class="st">"characters long \n"</span><span class="op">)</span></span></code></pre></div> +<pre><code><span><span class="co">## sequence is 12 characters long</span></span></code></pre> +<div class="sourceCode" id="cb66"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a 
href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span> +<span> train_type <span class="op">=</span> <span class="st">"lm"</span>,</span> +<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span> +<span> maxlen <span class="op">=</span> <span class="fl">5</span>,</span> +<span> seed <span class="op">=</span> <span class="fl">1</span>,</span> +<span> vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span> +<span> output_format <span class="op">=</span> <span class="st">"target_right"</span>,</span> +<span> <span class="co"># take random subsequence using 50% of sequence </span></span> +<span> proportion_per_seq <span class="op">=</span> <span class="fl">0.5</span><span class="op">)</span></span> +<span></span> +<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span> +<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>, , <span class="op">]</span></span> +<span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span> 
+<span><span class="va">x</span> <span class="co"># abcde</span></span></code></pre></div> +<pre><code><span><span class="co">## a b c d e f g h i</span></span> +<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [2,] 0 1 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [3,] 0 0 1 0 0 0 0 0 0</span></span> +<span><span class="co">## [4,] 0 0 0 1 0 0 0 0 0</span></span> +<span><span class="co">## [5,] 0 0 0 0 1 0 0 0 0</span></span></code></pre> +<div class="sourceCode" id="cb68"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">y</span> <span class="co"># f</span></span></code></pre></div> +<pre><code><span><span class="co">## a b c d e f g h i</span></span> +<span><span class="co">## [1,] 0 0 0 0 0 1 0 0 0</span></span></code></pre> +</div> +<div class="section level3"> +<h3 id="file_limit">file_limit<a class="anchor" aria-label="anchor" href="#file_limit"></a> +</h3> +<p>Integer or NULL. If an integer, only the specified number of randomly +sampled files is used for training.</p> +</div> +<div class="section level3"> +<h3 id="delete_used_files">delete_used_files<a class="anchor" aria-label="anchor" href="#delete_used_files"></a> +</h3> +<p>If <code>TRUE</code>, delete a file once it has been used. 
Only applies for rds files.</p> +<div class="sourceCode" id="cb70"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">x</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/array.html" class="external-link">array</a></span><span class="op">(</span><span class="fl">0</span>, dim <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="fl">1</span>,<span class="fl">5</span>,<span class="fl">4</span><span class="op">)</span><span class="op">)</span></span> +<span><span class="va">y</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/matrix.html" class="external-link">matrix</a></span><span class="op">(</span><span class="fl">0</span>, ncol <span class="op">=</span> <span class="fl">1</span><span class="op">)</span></span> +<span><span class="va">rds_path</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span>fileext <span class="op">=</span> <span class="st">".rds"</span><span class="op">)</span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/readRDS.html" class="external-link">saveRDS</a></span><span class="op">(</span><span class="fu"><a href="https://rdrr.io/r/base/list.html" class="external-link">list</a></span><span class="op">(</span><span class="va">x</span>, <span class="va">y</span><span class="op">)</span>, <span class="va">rds_path</span><span class="op">)</span></span> +<span></span> +<span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">rds_path</span>,</span> +<span> delete_used_files <span class="op">=</span> <span class="cn">TRUE</span>,</span> +<span> train_type <span 
class="op">=</span> <span class="st">"label_rds"</span>,</span> +<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span> +<span> maxlen <span class="op">=</span> <span class="fl">5</span><span class="op">)</span></span> +<span></span> +<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/files.html" class="external-link">file.exists</a></span><span class="op">(</span><span class="va">rds_path</span><span class="op">)</span></span></code></pre></div> +<pre><code><span><span class="co">## [1] FALSE</span></span></code></pre> +<div class="sourceCode" id="cb72"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="co"># z <- gen()</span></span> +<span><span class="co"># When calling the generator again, it will wait until it finds a file again from the files listed in </span></span> +<span><span class="co"># the initial `path` argument. Can be used if another process(es) create rds files.</span></span></code></pre></div> +</div> +<div class="section level3"> +<h3 id="max_samples">max_samples<a class="anchor" aria-label="anchor" href="#max_samples"></a> +</h3> +<p>Only use fixed number of samples per file. Randomly choose which +samples to use. 
(If <code>random_sampling = FALSE</code>, samples are +consecutive.)</p> +<div class="sourceCode" id="cb73"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span> +<span> train_type <span class="op">=</span> <span class="st">"lm"</span>,</span> +<span> batch_size <span class="op">=</span> <span class="fl">2</span>,</span> +<span> maxlen <span class="op">=</span> <span class="fl">5</span>,</span> +<span> step <span class="op">=</span> <span class="fl">1</span>,</span> +<span> seed <span class="op">=</span> <span class="fl">3</span>,</span> +<span> vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span> +<span> output_format <span class="op">=</span> <span class="st">"target_right"</span>,</span> +<span> max_samples <span class="op">=</span> <span class="fl">2</span><span class="op">)</span></span> +<span></span> +<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span> +<span><span class="va">x1</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>, , <span class="op">]</span></span> +<span><span class="va">x2</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">2</span>, , <span class="op">]</span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x1</span><span class="op">)</span> <span 
class="op"><-</span> <span class="va">vocabulary</span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x2</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span> +<span><span class="va">x1</span> <span class="co"># efghi</span></span></code></pre></div> +<pre><code><span><span class="co">## a b c d e f g h i</span></span> +<span><span class="co">## [1,] 0 0 0 0 1 0 0 0 0</span></span> +<span><span class="co">## [2,] 0 0 0 0 0 1 0 0 0</span></span> +<span><span class="co">## [3,] 0 0 0 0 0 0 1 0 0</span></span> +<span><span class="co">## [4,] 0 0 0 0 0 0 0 1 0</span></span> +<span><span class="co">## [5,] 0 0 0 0 0 0 0 0 1</span></span></code></pre> +<div class="sourceCode" id="cb75"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">x2</span> <span class="co"># fghii</span></span></code></pre></div> +<pre><code><span><span class="co">## a b c d e f g h i</span></span> +<span><span class="co">## [1,] 0 0 0 0 0 1 0 0 0</span></span> +<span><span class="co">## [2,] 0 0 0 0 0 0 1 0 0</span></span> +<span><span class="co">## [3,] 0 0 0 0 0 0 0 1 0</span></span> +<span><span class="co">## [4,] 0 0 0 0 0 0 0 0 1</span></span> +<span><span class="co">## [5,] 0 0 0 0 0 0 0 0 1</span></span></code></pre> +</div> +<div class="section level3"> +<h3 id="random_sampling">random_sampling<a class="anchor" aria-label="anchor" href="#random_sampling"></a> +</h3> +<p>If you use <code>max_samples</code>, the generator will randomly choose a +subset from all possible samples, but those samples are consecutive. 
+With <code>random_sampling = TRUE</code>, samples are completely +random.</p> +<div class="sourceCode" id="cb77"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span> +<span> train_type <span class="op">=</span> <span class="st">"lm"</span>,</span> +<span> batch_size <span class="op">=</span> <span class="fl">2</span>,</span> +<span> maxlen <span class="op">=</span> <span class="fl">5</span>,</span> +<span> seed <span class="op">=</span> <span class="fl">66</span>,</span> +<span> random_sampling <span class="op">=</span> <span class="cn">TRUE</span>,</span> +<span> vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span> +<span> output_format <span class="op">=</span> <span class="st">"target_right"</span>,</span> +<span> max_samples <span class="op">=</span> <span class="fl">2</span><span class="op">)</span></span> +<span></span> +<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span> +<span><span class="va">x1</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>, , <span class="op">]</span></span> +<span><span class="va">x2</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">2</span>, , <span class="op">]</span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x1</span><span class="op">)</span> 
<span class="op"><-</span> <span class="va">vocabulary</span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x2</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span> +<span><span class="va">x1</span> <span class="co"># ghiii</span></span></code></pre></div> +<pre><code><span><span class="co">## a b c d e f g h i</span></span> +<span><span class="co">## [1,] 0 0 0 0 0 0 1 0 0</span></span> +<span><span class="co">## [2,] 0 0 0 0 0 0 0 1 0</span></span> +<span><span class="co">## [3,] 0 0 0 0 0 0 0 0 1</span></span> +<span><span class="co">## [4,] 0 0 0 0 0 0 0 0 1</span></span> +<span><span class="co">## [5,] 0 0 0 0 0 0 0 0 1</span></span></code></pre> +<div class="sourceCode" id="cb79"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">x2</span> <span class="co"># efghi</span></span></code></pre></div> +<pre><code><span><span class="co">## a b c d e f g h i</span></span> +<span><span class="co">## [1,] 0 0 0 0 1 0 0 0 0</span></span> +<span><span class="co">## [2,] 0 0 0 0 0 1 0 0 0</span></span> +<span><span class="co">## [3,] 0 0 0 0 0 0 1 0 0</span></span> +<span><span class="co">## [4,] 0 0 0 0 0 0 0 1 0</span></span> +<span><span class="co">## [5,] 0 0 0 0 0 0 0 0 1</span></span></code></pre> +</div> +<div class="section level3"> +<h3 id="target_len">target_len<a class="anchor" aria-label="anchor" href="#target_len"></a> +</h3> +<p>Target length for language model.</p> +<div class="sourceCode" id="cb81"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span> +<span> train_type <span class="op">=</span> <span 
class="st">"lm"</span>,</span> +<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span> +<span> target_len <span class="op">=</span> <span class="fl">3</span>, </span> +<span> maxlen <span class="op">=</span> <span class="fl">5</span>,</span> +<span> vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span> +<span> output_format <span class="op">=</span> <span class="st">"target_right"</span><span class="op">)</span></span> +<span></span> +<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span> +<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>, , <span class="op">]</span></span> +<span><span class="va">y1</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span><span class="op">[</span> , <span class="fl">1</span>, <span class="op">]</span></span> +<span><span class="va">y2</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span><span class="op">[</span> , <span class="fl">2</span>, <span class="op">]</span></span> +<span><span class="va">y3</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span><span class="op">[</span> , <span class="fl">3</span>, <span class="op">]</span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span> +<span><span 
class="fu"><a href="https://rdrr.io/r/base/names.html" class="external-link">names</a></span><span class="op">(</span><span class="va">y1</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/names.html" class="external-link">names</a></span><span class="op">(</span><span class="va">y2</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/names.html" class="external-link">names</a></span><span class="op">(</span><span class="va">y3</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span> +<span><span class="va">x</span> <span class="co"># abcde</span></span></code></pre></div> +<pre><code><span><span class="co">## a b c d e f g h i</span></span> +<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [2,] 0 1 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [3,] 0 0 1 0 0 0 0 0 0</span></span> +<span><span class="co">## [4,] 0 0 0 1 0 0 0 0 0</span></span> +<span><span class="co">## [5,] 0 0 0 0 1 0 0 0 0</span></span></code></pre> +<div class="sourceCode" id="cb83"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">y1</span> <span class="co"># f</span></span></code></pre></div> +<pre><code><span><span class="co">## a b c d e f g h i </span></span> +<span><span class="co">## 0 0 0 0 0 1 0 0 0</span></span></code></pre> +<div class="sourceCode" id="cb85"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">y2</span> <span class="co"># g</span></span></code></pre></div> +<pre><code><span><span class="co">## a b c d e f g h i </span></span> +<span><span class="co">## 0 0 0 0 0 0 1 0 0</span></span></code></pre> +<div class="sourceCode" id="cb87"><pre class="downlit sourceCode r"> +<code class="sourceCode 
R"><span><span class="va">y3</span> <span class="co"># h</span></span></code></pre></div> +<pre><code><span><span class="co">## a b c d e f g h i </span></span> +<span><span class="co">## 0 0 0 0 0 0 0 1 0</span></span></code></pre> +</div> +<div class="section level3"> +<h3 id="n_gram-n_gram_stride">n_gram / n_gram_stride<a class="anchor" aria-label="anchor" href="#n_gram-n_gram_stride"></a> +</h3> +<p>Instead of encoding the language model target character by character, +combine n characters into one target. <code>n_gram_stride</code> sets the +step size between consecutive n-grams.</p> +<div class="sourceCode" id="cb89"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span> +<span> train_type <span class="op">=</span> <span class="st">"lm"</span>,</span> +<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span> +<span> target_len <span class="op">=</span> <span class="fl">6</span>, </span> +<span> n_gram <span class="op">=</span> <span class="fl">3</span>,</span> +<span> n_gram_stride <span class="op">=</span> <span class="fl">3</span>,</span> +<span> maxlen <span class="op">=</span> <span class="fl">3</span>,</span> +<span> vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span> +<span> output_format <span class="op">=</span> <span class="st">"target_right"</span><span class="op">)</span></span> +<span></span> +<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span> +<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span></span> +<span><span class="va">y1</span> <span 
class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span><span class="op">[</span> , <span class="fl">1</span>, <span class="op">]</span></span> +<span><span class="va">y2</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span><span class="op">[</span> , <span class="fl">2</span>, <span class="op">]</span></span> +<span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/dim.html" class="external-link">dim</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span><span class="op">[</span><span class="fl">3</span><span class="op">]</span> <span class="op">==</span> <span class="fu"><a href="https://rdrr.io/r/base/length.html" class="external-link">length</a></span><span class="op">(</span><span class="va">vocabulary</span><span class="op">)</span><span class="op">^</span><span class="fl">3</span></span></code></pre></div> +<pre><code><span><span class="co">## [1] TRUE</span></span></code></pre> +<div class="sourceCode" id="cb91"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="co"># x = abc as 3-gram</span></span> +<span><span class="co"># y1 = def as 3-gram</span></span> +<span><span class="co"># y2 = ghi as 3-gram</span></span></code></pre></div> +</div> +<div class="section level3"> +<h3 id="add_noise">add_noise<a class="anchor" aria-label="anchor" href="#add_noise"></a> +</h3> +<p>Add noise to input. Must be a list that specifies noise distribution +or NULL (no noise). List contains arguments <code>noise_type</code>: +either <code>"normal"</code> or <code>"uniform"</code>. 
Optional +arguments <code>sd</code> or <code>mean</code> if +<code>noise_type</code> is <code>"normal"</code> (default is +<code>sd=1</code> and <code>mean=0</code>) or <code>min</code>, +<code>max</code> if <code>noise_type</code> is <code>"uniform"</code> +(default is <code>min=0</code>, <code>max=1</code>).</p> +<div class="sourceCode" id="cb92"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">file_path</span>,</span> +<span> train_type <span class="op">=</span> <span class="st">"lm"</span>,</span> +<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span> +<span> add_noise <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/list.html" class="external-link">list</a></span><span class="op">(</span>noise_type <span class="op">=</span> <span class="st">"normal"</span>, mean <span class="op">=</span> <span class="fl">0</span>, sd <span class="op">=</span> <span class="fl">0.01</span><span class="op">)</span>,</span> +<span> maxlen <span class="op">=</span> <span class="fl">5</span>,</span> +<span> vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span> +<span> output_format <span class="op">=</span> <span class="st">"target_right"</span><span class="op">)</span></span> +<span></span> +<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span> +<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>, , <span class="op">]</span></span> +<span><span class="va">y</span> <span class="op"><-</span> <span 
class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span></span> +<span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/Round.html" class="external-link">round</a></span><span class="op">(</span><span class="va">x</span>, <span class="fl">3</span><span class="op">)</span> <span class="co"># abcde + noise</span></span></code></pre></div> +<pre><code><span><span class="co">## a b c d e f g h i</span></span> +<span><span class="co">## [1,] 0.994 0.005 -0.006 0.008 0.006 0.014 -0.004 0.007 -0.001</span></span> +<span><span class="co">## [2,] 0.002 1.007 -0.022 0.006 -0.001 -0.001 -0.001 0.006 0.009</span></span> +<span><span class="co">## [3,] -0.008 0.006 1.011 0.009 -0.002 0.004 0.011 -0.007 0.004</span></span> +<span><span class="co">## [4,] 0.016 -0.003 0.000 1.008 -0.015 -0.001 0.008 -0.007 -0.006</span></span> +<span><span class="co">## [5,] 0.003 0.015 0.000 0.001 0.995 -0.014 -0.002 0.004 0.003</span></span></code></pre> +<div class="sourceCode" id="cb94"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">y</span> <span class="co"># f</span></span></code></pre></div> +<pre><code><span><span class="co">## a b c d e f</span></span> +<span><span class="co">## [1,] -0.008204684 0.003898432 0.009438362 -0.01989352 0.004179416 0.9958501</span></span> +<span><span class="co">## g h i</span></span> +<span><span class="co">## [1,] -0.002533617 
0.007685329 -0.01129363</span></span></code></pre> +</div> +<div class="section level3"> +<h3 id="proportion_entries">proportion_entries<a class="anchor" aria-label="anchor" href="#proportion_entries"></a> +</h3> +<p>If a fasta file has multiple entries, you can randomly choose a +subset. For example, if the file has 6 entries and +<code>proportion_entries = 0.5</code> the generator will randomly choose +only 3 of the entries.</p> +</div> +<div class="section level3"> +<h3 id="shuffle_file_order">shuffle_file_order<a class="anchor" aria-label="anchor" href="#shuffle_file_order"></a> +</h3> +<p>Shuffle file order before iterating through files. Order gets +reshuffled after every iteration.</p> +</div> +<div class="section level3"> +<h3 id="shuffle_input">shuffle_input<a class="anchor" aria-label="anchor" href="#shuffle_input"></a> +</h3> +<p>Whether to shuffle fasta entries if fasta file has multiple +entries.</p> +</div> +<div class="section level3"> +<h3 id="reverse_complement">reverse_complement<a class="anchor" aria-label="anchor" href="#reverse_complement"></a> +</h3> +<p>If <code>TRUE</code>, randomly decide for every batch whether to use the +original sequence or its reverse complement. Only implemented for <code>ACGT</code> +vocabulary.</p> +</div> +<div class="section level3"> +<h3 id="sample_by_file_size">sample_by_file_size<a class="anchor" aria-label="anchor" href="#sample_by_file_size"></a> +</h3> +<p>Randomly choose new file by sampling according to file size (bigger +files more likely).</p> +</div> +<div class="section level3"> +<h3 id="concat_seq">concat_seq<a class="anchor" aria-label="anchor" href="#concat_seq"></a> +</h3> +<p>Character string or <code>NULL</code>. If not <code>NULL</code> all +entries from file get concatenated to one sequence with +<code>concat_seq</code> string between them. 
Use +<code>concat_seq = ""</code> if you don’t want to add a new token.</p> +<div class="sourceCode" id="cb96"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">df</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span>Sequence <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"AC"</span>, <span class="st">"AG"</span>, <span class="st">"AT"</span><span class="op">)</span>, Header <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/paste.html" class="external-link">paste0</a></span><span class="op">(</span><span class="st">"header"</span>, <span class="fl">1</span><span class="op">:</span><span class="fl">3</span><span class="op">)</span><span class="op">)</span></span> +<span><span class="va">fasta_path</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span>fileext <span class="op">=</span> <span class="st">".fasta"</span><span class="op">)</span></span> +<span><span class="va">fasta_file</span> <span class="op"><-</span> <span class="fu">microseq</span><span class="fu">::</span><span class="fu"><a href="https://rdrr.io/pkg/microseq/man/readFasta.html" class="external-link">writeFasta</a></span><span class="op">(</span><span class="va">df</span>, <span class="va">fasta_path</span><span class="op">)</span></span> +<span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">fasta_path</span>,</span> +<span> train_type <span class="op">=</span> <span class="st">"lm"</span>,</span> +<span> batch_size <span class="op">=</span> <span 
class="fl">1</span>,</span> +<span> maxlen <span class="op">=</span> <span class="fl">9</span>,</span> +<span> vocabulary <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"A"</span>, <span class="st">"C"</span>, <span class="st">"G"</span>, <span class="st">"T"</span>, <span class="st">"Z"</span><span class="op">)</span>,</span> +<span> concat_seq <span class="op">=</span> <span class="st">"ZZ"</span>,</span> +<span> output_format <span class="op">=</span> <span class="st">"target_right"</span><span class="op">)</span></span> +<span></span> +<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span> +<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>, , <span class="op">]</span></span> +<span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span></span> +<span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"A"</span>, <span class="st">"C"</span>, <span class="st">"G"</span>, <span class="st">"T"</span>, <span class="st">"Z"</span><span class="op">)</span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span 
class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"A"</span>, <span class="st">"C"</span>, <span class="st">"G"</span>, <span class="st">"T"</span>, <span class="st">"Z"</span><span class="op">)</span></span> +<span><span class="va">x</span> <span class="co"># ACZZAGZZA</span></span></code></pre></div> +<pre><code><span><span class="co">## A C G T Z</span></span> +<span><span class="co">## [1,] 1 0 0 0 0</span></span> +<span><span class="co">## [2,] 0 1 0 0 0</span></span> +<span><span class="co">## [3,] 0 0 0 0 1</span></span> +<span><span class="co">## [4,] 0 0 0 0 1</span></span> +<span><span class="co">## [5,] 1 0 0 0 0</span></span> +<span><span class="co">## [6,] 0 0 1 0 0</span></span> +<span><span class="co">## [7,] 0 0 0 0 1</span></span> +<span><span class="co">## [8,] 0 0 0 0 1</span></span> +<span><span class="co">## [9,] 1 0 0 0 0</span></span></code></pre> +<div class="sourceCode" id="cb98"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">y</span> <span class="co"># T</span></span></code></pre></div> +<pre><code><span><span class="co">## A C G T Z</span></span> +<span><span class="co">## [1,] 0 0 0 1 0</span></span></code></pre> +</div> +<div class="section level3"> +<h3 id="set_learning">set_learning<a class="anchor" aria-label="anchor" href="#set_learning"></a> +</h3> +<p>Use this when you want to assign one label to a set of samples. Only implemented +for <code>train_type = "label_folder"</code>. Input is a list with the +following parameters:</p> +<ul> +<li> +<code>samples_per_target</code>: how many samples to use for one +target</li> +<li> +<code>maxlen</code>: length of one sample</li> +<li> +<code>reshape_mode</code>: <code>"time_dist", "multi_input"</code> +or <code>"concat"</code>. 
+<ul> +<li>If <code>reshape_mode = "multi_input"</code>, generator will produce +<code>samples_per_target</code> separate inputs, each of length +<code>maxlen</code>.</li> +<li>If <code>reshape_mode = "time_dist"</code>, generator will produce a +4D input array. The dimensions correspond to +<code>(batch_size, samples_per_target, maxlen, length(vocabulary))</code>. +</li> +<li>If <code>reshape_mode</code> is <code>"concat"</code>, generator +will concatenate <code>samples_per_target</code> sequences of length +<code>maxlen</code> to one long sequence.</li> +</ul> +</li> +<li>If <code>reshape_mode = "concat"</code>, there is an additional +<code>buffer_len</code> argument, which adds a new token between the concatenated +samples. +<ul> +<li>If <code>buffer_len</code> is an integer, the sub-sequences are +interspersed with <code>buffer_len</code> rows. The input length is +(<code>maxlen</code> * <code>samples_per_target</code>) + +<code>buffer_len</code> * (<code>samples_per_target</code> - 1)</li> +</ul> +</li> +</ul> +<div class="sourceCode" id="cb100"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="co"># create data for second label</span></span> +<span><span class="va">df</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span>Sequence <span class="op">=</span> <span class="st">"AABAACAADAAE"</span>, Header <span class="op">=</span> <span class="st">"header_1"</span><span class="op">)</span></span> +<span><span class="va">file_path_2</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span>fileext <span class="op">=</span> <span class="st">".fasta"</span><span class="op">)</span></span> +<span><span class="va">fasta_file</span> <span class="op"><-</span> <span class="fu">microseq</span><span class="fu">::</span><span 
class="fu"><a href="https://rdrr.io/pkg/microseq/man/readFasta.html" class="external-link">writeFasta</a></span><span class="op">(</span><span class="va">df</span>, <span class="va">file_path_2</span><span class="op">)</span></span> +<span></span> +<span><span class="co"># multi_input </span></span> +<span><span class="va">set_learning</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/list.html" class="external-link">list</a></span><span class="op">(</span>reshape_mode <span class="op">=</span> <span class="st">"multi_input"</span>,</span> +<span> maxlen <span class="op">=</span> <span class="fl">4</span>,</span> +<span> samples_per_target <span class="op">=</span> <span class="fl">3</span><span class="op">)</span></span> +<span></span> +<span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="va">file_path</span>, <span class="va">file_path_2</span><span class="op">)</span>, <span class="co"># path has length 2 => 2 classes</span></span> +<span> train_type <span class="op">=</span> <span class="st">"label_folder"</span>,</span> +<span> batch_size <span class="op">=</span> <span class="fl">2</span>,</span> +<span> maxlen <span class="op">=</span> <span class="fl">4</span>,</span> +<span> step <span class="op">=</span> <span class="fl">1</span>, </span> +<span> vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span> +<span> set_learning <span class="op">=</span> <span class="va">set_learning</span><span class="op">)</span></span> +<span></span> +<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span> +<span><span class="va">x</span> <span 
class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span></span> +<span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/length.html" class="external-link">length</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="co"># 3 samples per target</span></span></code></pre></div> +<pre><code><span><span class="co">## [1] 3</span></span></code></pre> +<div class="sourceCode" id="cb102"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">x_1_1</span> <span class="op"><-</span> <span class="va">x</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>, , <span class="op">]</span></span> +<span><span class="va">x_1_1</span> <span class="co"># abcd</span></span></code></pre></div> +<pre><code><span><span class="co">## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]</span></span> +<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [2,] 0 1 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [3,] 0 0 1 0 0 0 0 0 0</span></span> +<span><span class="co">## [4,] 0 0 0 1 0 0 0 0 0</span></span></code></pre> +<div class="sourceCode" id="cb104"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">x_1_2</span> <span class="op"><-</span> <span class="va">x</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>, , <span class="op">]</span></span> +<span><span class="va">x_1_2</span> <span class="co"># bcde</span></span></code></pre></div> 
+<pre><code><span><span class="co">## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]</span></span> +<span><span class="co">## [1,] 0 1 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [2,] 0 0 1 0 0 0 0 0 0</span></span> +<span><span class="co">## [3,] 0 0 0 1 0 0 0 0 0</span></span> +<span><span class="co">## [4,] 0 0 0 0 1 0 0 0 0</span></span></code></pre> +<div class="sourceCode" id="cb106"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">x_1_3</span> <span class="op"><-</span> <span class="va">x</span><span class="op">[[</span><span class="fl">3</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>, , <span class="op">]</span></span> +<span><span class="va">x_1_3</span> <span class="co"># cdef</span></span></code></pre></div> +<pre><code><span><span class="co">## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]</span></span> +<span><span class="co">## [1,] 0 0 1 0 0 0 0 0 0</span></span> +<span><span class="co">## [2,] 0 0 0 1 0 0 0 0 0</span></span> +<span><span class="co">## [3,] 0 0 0 0 1 0 0 0 0</span></span> +<span><span class="co">## [4,] 0 0 0 0 0 1 0 0 0</span></span></code></pre> +<div class="sourceCode" id="cb108"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">x_2_1</span> <span class="op"><-</span> <span class="va">x</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">2</span>, , <span class="op">]</span></span> +<span><span class="va">x_2_1</span> <span class="co"># aaba</span></span></code></pre></div> +<pre><code><span><span class="co">## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]</span></span> +<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [2,] 1 0 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [3,] 0 1 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [4,] 1 0 0 0 
0 0 0 0 0</span></span></code></pre> +<div class="sourceCode" id="cb110"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">x_2_2</span> <span class="op"><-</span> <span class="va">x</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">2</span>, , <span class="op">]</span></span> +<span><span class="va">x_2_2</span> <span class="co"># abaa</span></span></code></pre></div> +<pre><code><span><span class="co">## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]</span></span> +<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [2,] 0 1 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [3,] 1 0 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [4,] 1 0 0 0 0 0 0 0 0</span></span></code></pre> +<div class="sourceCode" id="cb112"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">x_2_3</span> <span class="op"><-</span> <span class="va">x</span><span class="op">[[</span><span class="fl">3</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">2</span>, , <span class="op">]</span></span> +<span><span class="va">x_2_3</span> <span class="co"># baac</span></span></code></pre></div> +<pre><code><span><span class="co">## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]</span></span> +<span><span class="co">## [1,] 0 1 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [2,] 1 0 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [3,] 1 0 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [4,] 0 0 1 0 0 0 0 0 0</span></span></code></pre> +<div class="sourceCode" id="cb114"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span 
class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"label_1"</span>, <span class="st">"label_2"</span><span class="op">)</span></span> +<span><span class="va">y</span> </span></code></pre></div> +<pre><code><span><span class="co">## label_1 label_2</span></span> +<span><span class="co">## [1,] 1 0</span></span> +<span><span class="co">## [2,] 0 1</span></span></code></pre> +<div class="sourceCode" id="cb116"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="co"># concat </span></span> +<span><span class="va">set_learning</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/list.html" class="external-link">list</a></span><span class="op">(</span>reshape_mode <span class="op">=</span> <span class="st">"concat"</span>,</span> +<span> maxlen <span class="op">=</span> <span class="fl">4</span>,</span> +<span> samples_per_target <span class="op">=</span> <span class="fl">3</span><span class="op">)</span></span> +<span></span> +<span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="va">file_path</span>, <span class="va">file_path_2</span><span class="op">)</span>, <span class="co"># path has length 2 => 2 classes</span></span> +<span> train_type <span class="op">=</span> <span class="st">"label_folder"</span>,</span> +<span> batch_size <span class="op">=</span> <span class="fl">2</span>,</span> +<span> maxlen <span class="op">=</span> <span class="fl">4</span>,</span> +<span> step <span class="op">=</span> <span class="fl">2</span>, </span> +<span> vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span> +<span> 
set_learning <span class="op">=</span> <span class="va">set_learning</span><span class="op">)</span></span> +<span></span> +<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span> +<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span></span> +<span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/dim.html" class="external-link">dim</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> </span></code></pre></div> +<pre><code><span><span class="co">## [1] 2 12 9</span></span></code></pre> +<div class="sourceCode" id="cb118"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">x_1</span> <span class="op"><-</span> <span class="va">x</span><span class="op">[</span><span class="fl">1</span>, , <span class="op">]</span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x_1</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span> +<span><span class="va">x_1</span> <span class="co"># abcd | cdef | efgh</span></span></code></pre></div> +<pre><code><span><span class="co">## a b c d e f g h i</span></span> +<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [2,] 0 1 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [3,] 0 0 1 0 0 0 0 0 0</span></span> +<span><span class="co">## [4,] 0 0 0 1 0 0 0 0 0</span></span> +<span><span class="co">## [5,] 0 0 1 0 0 0 0 0 0</span></span> +<span><span class="co">## [6,] 0 0 
0 1 0 0 0 0 0</span></span> +<span><span class="co">## [7,] 0 0 0 0 1 0 0 0 0</span></span> +<span><span class="co">## [8,] 0 0 0 0 0 1 0 0 0</span></span> +<span><span class="co">## [9,] 0 0 0 0 1 0 0 0 0</span></span> +<span><span class="co">## [10,] 0 0 0 0 0 1 0 0 0</span></span> +<span><span class="co">## [11,] 0 0 0 0 0 0 1 0 0</span></span> +<span><span class="co">## [12,] 0 0 0 0 0 0 0 1 0</span></span></code></pre> +<div class="sourceCode" id="cb120"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">x_2</span> <span class="op"><-</span> <span class="va">x</span><span class="op">[</span><span class="fl">2</span>, , <span class="op">]</span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x_2</span><span class="op">)</span> <span class="op"><-</span> <span class="va">vocabulary</span></span> +<span><span class="va">x_2</span> <span class="co"># aaba | baac | acaa</span></span></code></pre></div> +<pre><code><span><span class="co">## a b c d e f g h i</span></span> +<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [2,] 1 0 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [3,] 0 1 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [4,] 1 0 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [5,] 0 1 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [6,] 1 0 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [7,] 1 0 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [8,] 0 0 1 0 0 0 0 0 0</span></span> +<span><span class="co">## [9,] 1 0 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [10,] 0 0 1 0 0 0 0 0 0</span></span> +<span><span class="co">## [11,] 1 0 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [12,] 1 0 0 0 0 0 0 0 0</span></span></code></pre> +<div class="sourceCode" id="cb122"><pre class="downlit sourceCode r"> +<code 
class="sourceCode R"><span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"label_1"</span>, <span class="st">"label_2"</span><span class="op">)</span></span> +<span><span class="va">y</span> </span></code></pre></div> +<pre><code><span><span class="co">## label_1 label_2</span></span> +<span><span class="co">## [1,] 1 0</span></span> +<span><span class="co">## [2,] 0 1</span></span></code></pre> +</div> +<div class="section level3"> +<h3 id="use_quality_score">use_quality_score<a class="anchor" aria-label="anchor" href="#use_quality_score"></a> +</h3> +<p>If <code>TRUE</code>, instead of one-hot encoding, use quality score +of fastq file.</p> +<div class="sourceCode" id="cb124"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">df</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span>Sequence <span class="op">=</span> <span class="st">"ACAGAT"</span>, Header <span class="op">=</span> <span class="st">"header_1"</span>, Quality <span class="op">=</span> <span class="st">"!#*=?I"</span><span class="op">)</span></span> +<span><span class="va">fastq_path</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span>fileext <span class="op">=</span> <span class="st">".fastq"</span><span class="op">)</span></span> +<span><span class="va">fastq_file</span> <span class="op"><-</span> <span class="fu">microseq</span><span class="fu">::</span><span class="fu"><a href="https://rdrr.io/pkg/microseq/man/readFastq.html" 
class="external-link">writeFastq</a></span><span class="op">(</span><span class="va">df</span>, <span class="va">fastq_path</span><span class="op">)</span></span> +<span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">fastq_path</span>,</span> +<span> train_type <span class="op">=</span> <span class="st">"lm"</span>,</span> +<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span> +<span> maxlen <span class="op">=</span> <span class="fl">5</span>,</span> +<span> format <span class="op">=</span> <span class="st">"fastq"</span>,</span> +<span> vocabulary <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"A"</span>, <span class="st">"C"</span>, <span class="st">"G"</span>, <span class="st">"T"</span><span class="op">)</span>,</span> +<span> use_quality_score <span class="op">=</span> <span class="cn">TRUE</span>,</span> +<span> output_format <span class="op">=</span> <span class="st">"target_right"</span><span class="op">)</span></span> +<span></span> +<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span> +<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>, , <span class="op">]</span></span> +<span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span></span> +<span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" 
class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"A"</span>, <span class="st">"C"</span>, <span class="st">"G"</span>, <span class="st">"T"</span><span class="op">)</span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"A"</span>, <span class="st">"C"</span>, <span class="st">"G"</span>, <span class="st">"T"</span><span class="op">)</span></span> +<span><span class="va">x</span> <span class="co"># ACAGA</span></span></code></pre></div> +<pre><code><span><span class="co">## A C G T</span></span> +<span><span class="co">## [1,] 0.0000000000 0.3333333333 0.3333333333 0.3333333333</span></span> +<span><span class="co">## [2,] 0.2103191148 0.3690426555 0.2103191148 0.2103191148</span></span> +<span><span class="co">## [3,] 0.8741074588 0.0419641804 0.0419641804 0.0419641804</span></span> +<span><span class="co">## [4,] 0.0005282977 0.0005282977 0.9984151068 0.0005282977</span></span> +<span><span class="co">## [5,] 0.9990000000 0.0003333333 0.0003333333 0.0003333333</span></span></code></pre> +<div class="sourceCode" id="cb126"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">y</span> <span class="co"># T</span></span></code></pre></div> +<pre><code><span><span class="co">## A C G T</span></span> +<span><span class="co">## [1,] 3.333333e-05 3.333333e-05 3.333333e-05 0.9999</span></span></code></pre> +</div> +<div class="section level3"> +<h3 id="use_coverage">use_coverage<a class="anchor" aria-label="anchor" 
href="#use_coverage"></a> +</h3> +<p>Integer or <code>NULL</code>. If not <code>NULL</code>, use coverage +as encoding rather than one-hot encoding. Coverage information must be +contained in fasta header: there must be a string “cov_n” in the header, +where n is some integer.</p> +<div class="sourceCode" id="cb128"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">df</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span>Sequence <span class="op">=</span> <span class="st">"ACAGAT"</span>, Header <span class="op">=</span> <span class="st">"header_1_cov_8"</span><span class="op">)</span></span> +<span><span class="va">fasta_path</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span>fileext <span class="op">=</span> <span class="st">".fasta"</span><span class="op">)</span></span> +<span><span class="va">fasta_file</span> <span class="op"><-</span> <span class="fu">microseq</span><span class="fu">::</span><span class="fu"><a href="https://rdrr.io/pkg/microseq/man/readFasta.html" class="external-link">writeFasta</a></span><span class="op">(</span><span class="va">df</span>, <span class="va">fasta_path</span><span class="op">)</span></span> +<span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">fasta_path</span>,</span> +<span> train_type <span class="op">=</span> <span class="st">"lm"</span>,</span> +<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span> +<span> maxlen <span class="op">=</span> <span class="fl">5</span>,</span> +<span> vocabulary <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" 
class="external-link">c</a></span><span class="op">(</span><span class="st">"A"</span>, <span class="st">"C"</span>, <span class="st">"G"</span>, <span class="st">"T"</span><span class="op">)</span>,</span> +<span> use_coverage <span class="op">=</span> <span class="fl">25</span>,</span> +<span> output_format <span class="op">=</span> <span class="st">"target_right"</span><span class="op">)</span></span> +<span></span> +<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span> +<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[</span><span class="fl">1</span>, , <span class="op">]</span></span> +<span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span></span> +<span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"A"</span>, <span class="st">"C"</span>, <span class="st">"G"</span>, <span class="st">"T"</span><span class="op">)</span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"A"</span>, <span class="st">"C"</span>, <span class="st">"G"</span>, <span class="st">"T"</span><span 
class="op">)</span></span> +<span><span class="va">x</span> <span class="co"># ACAGA; 0.32 = 8/25</span></span></code></pre></div> +<pre><code><span><span class="co">## A C G T</span></span> +<span><span class="co">## [1,] 0.32 0.00 0.00 0</span></span> +<span><span class="co">## [2,] 0.00 0.32 0.00 0</span></span> +<span><span class="co">## [3,] 0.32 0.00 0.00 0</span></span> +<span><span class="co">## [4,] 0.00 0.00 0.32 0</span></span> +<span><span class="co">## [5,] 0.32 0.00 0.00 0</span></span></code></pre> +<div class="sourceCode" id="cb130"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">y</span> <span class="co"># T</span></span></code></pre></div> +<pre><code><span><span class="co">## A C G T</span></span> +<span><span class="co">## [1,] 0 0 0 0.32</span></span></code></pre> +</div> +<div class="section level3"> +<h3 id="added_label_path">added_label_path<a class="anchor" aria-label="anchor" href="#added_label_path"></a> +</h3> +<p>It is possible to feed a network additional information associated with +a sequence. This information needs to be in a csv file. 
If all sequences +in one file share the same label, the csv file should have one column +named “file” that lists the file names, with the additional input in the remaining columns.</p> +<p>We may add some additional input to our dummy data:</p> +<div class="sourceCode" id="cb132"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">file</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="fu"><a href="https://rdrr.io/r/base/basename.html" class="external-link">basename</a></span><span class="op">(</span><span class="va">file_path</span><span class="op">)</span>, <span class="st">"some_file_name.fasta"</span><span class="op">)</span></span> +<span><span class="va">df</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span>file <span class="op">=</span> <span class="va">file</span>,</span> +<span> label_1 <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="fl">0</span>, <span class="fl">1</span><span class="op">)</span>, label_2 <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="fl">1</span>, <span class="fl">0</span><span class="op">)</span>, label_3 <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="fl">1</span>, <span class="fl">0</span><span class="op">)</span><span class="op">)</span></span> +<span><span class="va">df</span></span></code></pre></div> +<pre><code><span><span class="co">## file label_1 label_2 label_3</span></span> +<span><span class="co">## 1 a.fasta 0 1 1</span></span> +<span><span class="co">## 2 some_file_name.fasta 1 0 0</span></span></code></pre> +<div 
class="sourceCode" id="cb134"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="fu"><a href="https://rdrr.io/r/utils/write.table.html" class="external-link">write.csv</a></span><span class="op">(</span>x <span class="op">=</span> <span class="va">df</span>, file <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/file.path.html" class="external-link">file.path</a></span><span class="op">(</span><span class="va">dir_path</span>, <span class="st">"add_input.csv"</span><span class="op">)</span>, row.names <span class="op">=</span> <span class="cn">FALSE</span><span class="op">)</span></span></code></pre></div> +<p>If we add the path to the csv file, the generator will map additional +input to sequences:</p> +<div class="sourceCode" id="cb135"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">dir_path</span>,</span> +<span> train_type <span class="op">=</span> <span class="st">"lm"</span>, </span> +<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span> +<span> maxlen <span class="op">=</span> <span class="fl">5</span>,</span> +<span> output_format <span class="op">=</span> <span class="st">"target_right"</span>,</span> +<span> vocabulary <span class="op">=</span> <span class="va">vocabulary</span>,</span> +<span> added_label_path <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/file.path.html" class="external-link">file.path</a></span><span class="op">(</span><span class="va">dir_path</span>, <span class="st">"add_input.csv"</span><span class="op">)</span>,</span> +<span> add_input_as_seq <span class="op">=</span> <span class="cn">FALSE</span><span class="op">)</span> <span class="co"># don't treat added input as sequence</span></span> 
+<span> </span> +<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span> +<span><span class="va">added_label_input</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span></span> +<span><span class="va">added_label_input</span></span></code></pre></div> +<pre><code><span><span class="co">## [,1] [,2] [,3]</span></span> +<span><span class="co">## [1,] 0 1 1</span></span></code></pre> +<div class="sourceCode" id="cb137"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span></span> +<span><span class="va">x</span><span class="op">[</span><span class="fl">1</span>, , <span class="op">]</span></span></code></pre></div> +<pre><code><span><span class="co">## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]</span></span> +<span><span class="co">## [1,] 1 0 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [2,] 0 1 0 0 0 0 0 0 0</span></span> +<span><span class="co">## [3,] 0 0 1 0 0 0 0 0 0</span></span> +<span><span class="co">## [4,] 0 0 0 1 0 0 0 0 0</span></span> +<span><span class="co">## [5,] 0 0 0 0 1 0 0 0 0</span></span></code></pre> +<div class="sourceCode" id="cb139"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span> </span> +<span><span class="va">y</span></span></code></pre></div> 
+<pre><code><span><span class="co">## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]</span></span> +<span><span class="co">## [1,] 0 0 0 0 0 1 0 0 0</span></span></code></pre> +<p>If we want to train a network with additional labels, we have to add +an additional input layer.</p> +<div class="sourceCode" id="cb141"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">model</span> <span class="op"><-</span> <span class="fu"><a href="../reference/create_model_lstm_cnn.html">create_model_lstm_cnn</a></span><span class="op">(</span></span> +<span> maxlen <span class="op">=</span> <span class="fl">5</span>,</span> +<span> layer_lstm <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="fl">8</span>, <span class="fl">8</span><span class="op">)</span>,</span> +<span> layer_dense <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="fl">4</span><span class="op">)</span>,</span> +<span> label_input <span class="op">=</span> <span class="fl">3</span> <span class="co"># additional input vector has length 3</span></span> +<span><span class="op">)</span></span></code></pre></div> +<pre><code><span><span class="co">## Model: "model"</span></span> +<span><span class="co">## __________________________________________________________________________________________________</span></span> +<span><span class="co">## Layer (type) Output Shape Param # Connected to </span></span> +<span><span class="co">## ==================================================================================================</span></span> +<span><span class="co">## input_1 (InputLayer) [(None, 5, 4)] 0 [] </span></span> +<span><span class="co">## </span></span> +<span><span class="co">## lstm (LSTM) (None, 5, 8) 416 ['input_1[0][0]'] </span></span> +<span><span class="co">## 
</span></span> +<span><span class="co">## input_2 (InputLayer) [(None, 3)] 0 [] </span></span> +<span><span class="co">## </span></span> +<span><span class="co">## lstm_1 (LSTM) (None, 8) 544 ['lstm[0][0]'] </span></span> +<span><span class="co">## </span></span> +<span><span class="co">## concatenate (Concatenate) (None, 11) 0 ['input_2[0][0]', </span></span> +<span><span class="co">## 'lstm_1[0][0]'] </span></span> +<span><span class="co">## </span></span> +<span><span class="co">## dense (Dense) (None, 4) 48 ['concatenate[0][0]'] </span></span> +<span><span class="co">## </span></span> +<span><span class="co">## ==================================================================================================</span></span> +<span><span class="co">## Total params: 1008 (3.94 KB)</span></span> +<span><span class="co">## Trainable params: 1008 (3.94 KB)</span></span> +<span><span class="co">## Non-trainable params: 0 (0.00 Byte)</span></span> +<span><span class="co">## __________________________________________________________________________________________________</span></span></code></pre> +<div class="sourceCode" id="cb143"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="co"># train_model(train_type = "lm", </span></span> +<span><span class="co"># model = model,</span></span> +<span><span class="co"># path = file.path(dir_path, "train_files_1"),</span></span> +<span><span class="co"># path_val = file.path(dir_path, "validation_files_1"),</span></span> +<span><span class="co"># added_label_path = file.path(dir_path, "add_input.csv"),</span></span> +<span><span class="co"># steps_per_epoch = 5,</span></span> +<span><span class="co"># batch_size = 8,</span></span> +<span><span class="co"># epochs = 2)</span></span></code></pre></div> +</div> +<div class="section level3"> +<h3 id="return_int">return_int<a class="anchor" aria-label="anchor" href="#return_int"></a> +</h3> +<p>Whether to return integer encoding rather than one-hot 
encoding.</p> +<div class="sourceCode" id="cb144"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">df</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span>Sequence <span class="op">=</span> <span class="st">"ATCGC"</span>, Header <span class="op">=</span> <span class="st">"seq_1"</span><span class="op">)</span></span> +<span><span class="va">fasta_path</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span>fileext <span class="op">=</span> <span class="st">".fasta"</span><span class="op">)</span></span> +<span><span class="va">fasta_file</span> <span class="op"><-</span> <span class="fu">microseq</span><span class="fu">::</span><span class="fu"><a href="https://rdrr.io/pkg/microseq/man/readFasta.html" class="external-link">writeFasta</a></span><span class="op">(</span><span class="va">df</span>, <span class="va">fasta_path</span><span class="op">)</span></span> +<span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">fasta_path</span>,</span> +<span> train_type <span class="op">=</span> <span class="st">"lm"</span>,</span> +<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span> +<span> return_int <span class="op">=</span> <span class="cn">TRUE</span>,</span> +<span> padding <span class="op">=</span> <span class="cn">TRUE</span>,</span> +<span> maxlen <span class="op">=</span> <span class="fl">8</span>,</span> +<span> vocabulary <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"A"</span>, <span class="st">"C"</span>, 
<span class="st">"G"</span>, <span class="st">"T"</span><span class="op">)</span>,</span> +<span> output_format <span class="op">=</span> <span class="st">"target_right"</span><span class="op">)</span></span> +<span></span> +<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span> +<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span></span> +<span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"pad"</span>, <span class="st">"pad"</span>, <span class="st">"pad"</span>, <span class="st">"pad"</span>, <span class="st">"A"</span>, <span class="st">"T"</span>, <span class="st">"C"</span>, <span class="st">"G"</span><span class="op">)</span></span> +<span><span class="va">x</span></span></code></pre></div> +<pre><code><span><span class="co">## pad pad pad pad A T C G</span></span> +<span><span class="co">## [1,] 0 0 0 0 1 4 2 3</span></span></code></pre> +<div class="sourceCode" id="cb146"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op"><-</span> <span class="st">"C"</span></span> +<span><span class="va">y</span></span></code></pre></div> +<pre><code><span><span 
class="co">## C</span></span> +<span><span class="co">## [1,] 2</span></span></code></pre> +<p><code>return_int</code> can also be combined with n-gram encoding:</p> +<div class="sourceCode" id="cb148"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">df</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span>Sequence <span class="op">=</span> <span class="st">"AAACCCTTT"</span>, Header <span class="op">=</span> <span class="st">"seq_1"</span><span class="op">)</span></span> +<span><span class="va">fasta_path</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span>fileext <span class="op">=</span> <span class="st">".fasta"</span><span class="op">)</span></span> +<span><span class="va">fasta_file</span> <span class="op"><-</span> <span class="fu">microseq</span><span class="fu">::</span><span class="fu"><a href="https://rdrr.io/pkg/microseq/man/readFasta.html" class="external-link">writeFasta</a></span><span class="op">(</span><span class="va">df</span>, <span class="va">fasta_path</span><span class="op">)</span></span> +<span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">fasta_path</span>,</span> +<span> train_type <span class="op">=</span> <span class="st">"lm"</span>,</span> +<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span> +<span> n_gram <span class="op">=</span> <span class="fl">3</span>,</span> +<span> n_gram_stride <span class="op">=</span> <span class="fl">3</span>,</span> +<span> return_int <span class="op">=</span> <span class="cn">TRUE</span>,</span> +<span> maxlen <span class="op">=</span> <span class="fl">6</span>,</span> +<span> 
target_len <span class="op">=</span> <span class="fl">3</span>,</span> +<span> vocabulary <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"A"</span>, <span class="st">"C"</span>, <span class="st">"G"</span>, <span class="st">"T"</span><span class="op">)</span>,</span> +<span> output_format <span class="op">=</span> <span class="st">"target_right"</span><span class="op">)</span></span> +<span></span> +<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span> +<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span></span> +<span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span></span> +<span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"AAA"</span>, <span class="st">"CCC"</span><span class="op">)</span></span> +<span><span class="va">x</span></span></code></pre></div> +<pre><code><span><span class="co">## AAA CCC</span></span> +<span><span class="co">## [1,] 1 22</span></span></code></pre> +<div class="sourceCode" id="cb150"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="fu"><a href="https://rdrr.io/r/base/colnames.html" class="external-link">colnames</a></span><span class="op">(</span><span class="va">y</span><span class="op">)</span> <span class="op"><-</span> <span 
class="st">"TTT"</span></span> +<span><span class="va">y</span></span></code></pre></div> +<pre><code><span><span class="co">## TTT</span></span> +<span><span class="co">## [1,] 64</span></span></code></pre> +</div> +<div class="section level3"> +<h3 id="reshape_xy">reshape_xy<a class="anchor" aria-label="anchor" href="#reshape_xy"></a> +</h3> +<p>Apply some function to the output of a generator call.</p> +<div class="sourceCode" id="cb152"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">df</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span>Sequence <span class="op">=</span> <span class="st">"AAAATTTT"</span>, Header <span class="op">=</span> <span class="st">"header_1"</span><span class="op">)</span></span> +<span><span class="va">fasta_path</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span>fileext <span class="op">=</span> <span class="st">".fasta"</span><span class="op">)</span></span> +<span><span class="va">fasta_file</span> <span class="op"><-</span> <span class="fu">microseq</span><span class="fu">::</span><span class="fu"><a href="https://rdrr.io/pkg/microseq/man/readFasta.html" class="external-link">writeFasta</a></span><span class="op">(</span><span class="va">df</span>, <span class="va">fasta_path</span><span class="op">)</span></span> +<span><span class="va">fx</span> <span class="op"><-</span> <span class="kw">function</span><span class="op">(</span><span class="va">x</span> <span class="op">=</span> <span class="cn">NULL</span>, <span class="va">y</span> <span class="op">=</span> <span class="cn">NULL</span><span class="op">)</span> <span class="op">{</span></span> +<span> <span class="kw"><a href="https://rdrr.io/r/base/function.html" class="external-link">return</a></span><span 
class="op">(</span><span class="va">x</span> <span class="op">-</span> <span class="fl">1</span><span class="op">)</span></span> +<span><span class="op">}</span></span> +<span><span class="va">fy</span> <span class="op"><-</span> <span class="kw">function</span><span class="op">(</span><span class="va">x</span> <span class="op">=</span> <span class="cn">NULL</span>, <span class="va">y</span> <span class="op">=</span> <span class="cn">NULL</span><span class="op">)</span> <span class="op">{</span></span> +<span> <span class="kw"><a href="https://rdrr.io/r/base/function.html" class="external-link">return</a></span><span class="op">(</span><span class="fu"><a href="https://rdrr.io/r/base/Log.html" class="external-link">exp</a></span><span class="op">(</span><span class="va">y</span> <span class="op">*</span> <span class="fl">5</span><span class="op">)</span><span class="op">)</span></span> +<span><span class="op">}</span></span> +<span></span> +<span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">fasta_path</span>,</span> +<span> reshape_xy <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/list.html" class="external-link">list</a></span><span class="op">(</span>x <span class="op">=</span> <span class="va">fx</span>, y <span class="op">=</span> <span class="va">fy</span><span class="op">)</span>,</span> +<span> train_type <span class="op">=</span> <span class="st">"label_folder"</span>,</span> +<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span> +<span> maxlen <span class="op">=</span> <span class="fl">8</span><span class="op">)</span></span> +<span></span> +<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span> +<span><span class="va">x</span> <span class="op"><-</span> 
<span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span></span> +<span><span class="va">x</span><span class="op">[</span><span class="fl">1</span>,,<span class="op">]</span></span></code></pre></div> +<pre><code><span><span class="co">## [,1] [,2] [,3] [,4]</span></span> +<span><span class="co">## [1,] 0 -1 -1 -1</span></span> +<span><span class="co">## [2,] 0 -1 -1 -1</span></span> +<span><span class="co">## [3,] 0 -1 -1 -1</span></span> +<span><span class="co">## [4,] 0 -1 -1 -1</span></span> +<span><span class="co">## [5,] -1 -1 -1 0</span></span> +<span><span class="co">## [6,] -1 -1 -1 0</span></span> +<span><span class="co">## [7,] -1 -1 -1 0</span></span> +<span><span class="co">## [8,] -1 -1 -1 0</span></span></code></pre> +<div class="sourceCode" id="cb154"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span></span> +<span><span class="va">y</span></span></code></pre></div> +<pre><code><span><span class="co">## [,1]</span></span> +<span><span class="co">## [1,] 148.4132</span></span></code></pre> +</div> +<div class="section level3"> +<h3 id="masked_lm">masked_lm<a class="anchor" aria-label="anchor" href="#masked_lm"></a> +</h3> +<p>Masks some parts of the input sequence. 
Can be used for training +BERT-like models.</p> +<div class="sourceCode" id="cb156"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">nt_seq</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/rep.html" class="external-link">rep</a></span><span class="op">(</span><span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"A"</span>, <span class="st">"C"</span>, <span class="st">"G"</span>, <span class="st">"T"</span><span class="op">)</span>, each <span class="op">=</span> <span class="fl">25</span><span class="op">)</span> <span class="op"><a href="../reference/pipe.html">%>%</a></span> <span class="fu"><a href="https://rdrr.io/r/base/paste.html" class="external-link">paste</a></span><span class="op">(</span>collapse <span class="op">=</span> <span class="st">""</span><span class="op">)</span></span> +<span><span class="va">df</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span>Sequence <span class="op">=</span> <span class="va">nt_seq</span>, Header <span class="op">=</span> <span class="st">"seq_1"</span><span class="op">)</span></span> +<span><span class="va">fasta_path</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span>fileext <span class="op">=</span> <span class="st">".fasta"</span><span class="op">)</span></span> +<span><span class="va">fasta_file</span> <span class="op"><-</span> <span class="fu">microseq</span><span class="fu">::</span><span class="fu"><a href="https://rdrr.io/pkg/microseq/man/readFasta.html" class="external-link">writeFasta</a></span><span class="op">(</span><span class="va">df</span>, <span class="va">fasta_path</span><span class="op">)</span></span> +<span><span 
class="va">masked_lm</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/list.html" class="external-link">list</a></span><span class="op">(</span>mask_rate <span class="op">=</span> <span class="fl">0.10</span>, <span class="co"># replace 10% of input with special mask token</span></span> +<span> random_rate <span class="op">=</span> <span class="fl">0.025</span>, <span class="co"># set 2.5% of input to random value</span></span> +<span> identity_rate <span class="op">=</span> <span class="fl">0.05</span>, <span class="co"># leave 5% unchanged</span></span> +<span> include_sw <span class="op">=</span> <span class="cn">TRUE</span><span class="op">)</span> <span class="co"># 0,1 matrix showing where masking was applied</span></span> +<span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">fasta_path</span>,</span> +<span> train_type <span class="op">=</span> <span class="st">"masked_lm"</span>,</span> +<span> masked_lm <span class="op">=</span> <span class="va">masked_lm</span>,</span> +<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span> +<span> n_gram <span class="op">=</span> <span class="fl">1</span>,</span> +<span> n_gram_stride <span class="op">=</span> <span class="fl">1</span>,</span> +<span> return_int <span class="op">=</span> <span class="cn">TRUE</span>,</span> +<span> maxlen <span class="op">=</span> <span class="fl">100</span>,</span> +<span> vocabulary <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"A"</span>, <span class="st">"C"</span>, <span class="st">"G"</span>, <span class="st">"T"</span><span class="op">)</span><span class="op">)</span></span> +<span></span> +<span><span class="va">z</span> <span class="op"><-</span> <span 
class="fu">gen</span><span class="op">(</span><span class="op">)</span></span> +<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span></span> +<span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span></span> +<span><span class="va">sw</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">3</span><span class="op">]</span><span class="op">]</span></span> +<span><span class="va">df</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span>x <span class="op">=</span> <span class="va">x</span><span class="op">[</span><span class="fl">1</span>, <span class="op">]</span>, y <span class="op">=</span> <span class="va">y</span><span class="op">[</span><span class="fl">1</span>, <span class="op">]</span>, sw <span class="op">=</span> <span class="va">sw</span><span class="op">[</span><span class="fl">1</span>, <span class="op">]</span><span class="op">)</span></span> +<span><span class="fu"><a href="https://rdrr.io/r/utils/head.html" class="external-link">head</a></span><span class="op">(</span><span class="va">df</span><span class="op">)</span></span></code></pre></div> +<pre><code><span><span class="co">## x y sw</span></span> +<span><span class="co">## 1 5 1 1</span></span> +<span><span class="co">## 2 1 1 0</span></span> +<span><span class="co">## 3 1 1 0</span></span> +<span><span class="co">## 4 1 1 0</span></span> +<span><span class="co">## 5 1 1 0</span></span> +<span><span class="co">## 6 1 1 0</span></span></code></pre> +<p>Whenever the sw (sample weight) column is 0, the x and y columns are +identical. 
Let’s look at rows where sw is 1:</p> +<div class="sourceCode" id="cb158"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">df</span> <span class="op"><a href="../reference/pipe.html">%>%</a></span> <span class="fu">dplyr</span><span class="fu">::</span><span class="fu"><a href="https://dplyr.tidyverse.org/reference/filter.html" class="external-link">filter</a></span><span class="op">(</span><span class="va">sw</span> <span class="op">==</span> <span class="fl">1</span><span class="op">)</span></span></code></pre></div> +<pre><code><span><span class="co">## x y sw</span></span> +<span><span class="co">## 1 5 1 1</span></span> +<span><span class="co">## 2 1 1 1</span></span> +<span><span class="co">## 3 1 1 1</span></span> +<span><span class="co">## 4 5 1 1</span></span> +<span><span class="co">## 5 5 2 1</span></span> +<span><span class="co">## 6 5 2 1</span></span> +<span><span class="co">## 7 5 2 1</span></span> +<span><span class="co">## 8 3 3 1</span></span> +<span><span class="co">## 9 2 3 1</span></span> +<span><span class="co">## 10 3 3 1</span></span> +<span><span class="co">## 11 5 3 1</span></span> +<span><span class="co">## 12 5 3 1</span></span> +<span><span class="co">## 13 4 4 1</span></span> +<span><span class="co">## 14 5 4 1</span></span> +<span><span class="co">## 15 4 4 1</span></span> +<span><span class="co">## 16 5 4 1</span></span> +<span><span class="co">## 17 5 4 1</span></span> +<span><span class="co">## 18 4 4 1</span></span></code></pre> +<p>Here 5 is the mask token; its value is always the size of the vocabulary + +1.</p> +<div class="sourceCode" id="cb160"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">df</span> <span class="op"><a href="../reference/pipe.html">%>%</a></span> <span class="fu">dplyr</span><span class="fu">::</span><span class="fu"><a href="https://dplyr.tidyverse.org/reference/filter.html" class="external-link">filter</a></span><span 
class="op">(</span><span class="va">sw</span> <span class="op">==</span> <span class="fl">1</span> <span class="op">&</span> <span class="va">x</span> <span class="op">==</span> <span class="fl">5</span><span class="op">)</span> <span class="co"># 10% masked part</span></span></code></pre></div> +<pre><code><span><span class="co">## x y sw</span></span> +<span><span class="co">## 1 5 1 1</span></span> +<span><span class="co">## 2 5 1 1</span></span> +<span><span class="co">## 3 5 2 1</span></span> +<span><span class="co">## 4 5 2 1</span></span> +<span><span class="co">## 5 5 2 1</span></span> +<span><span class="co">## 6 5 3 1</span></span> +<span><span class="co">## 7 5 3 1</span></span> +<span><span class="co">## 8 5 4 1</span></span> +<span><span class="co">## 9 5 4 1</span></span> +<span><span class="co">## 10 5 4 1</span></span></code></pre> +<div class="sourceCode" id="cb162"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">df</span> <span class="op"><a href="../reference/pipe.html">%>%</a></span> <span class="fu">dplyr</span><span class="fu">::</span><span class="fu"><a href="https://dplyr.tidyverse.org/reference/filter.html" class="external-link">filter</a></span><span class="op">(</span><span class="va">sw</span> <span class="op">==</span> <span class="fl">1</span> <span class="op">&</span> <span class="va">x</span> <span class="op">!=</span> <span class="fl">5</span><span class="op">)</span> <span class="co"># 5% identity part and 2.5% random part (can randomly be the true value)</span></span></code></pre></div> +<pre><code><span><span class="co">## x y sw</span></span> +<span><span class="co">## 1 1 1 1</span></span> +<span><span class="co">## 2 1 1 1</span></span> +<span><span class="co">## 3 3 3 1</span></span> +<span><span class="co">## 4 2 3 1</span></span> +<span><span class="co">## 5 3 3 1</span></span> +<span><span class="co">## 6 4 4 1</span></span> +<span><span class="co">## 7 4 4 1</span></span> 
+<span><span class="co">## 8 4 4 1</span></span></code></pre> +<p>Can be combined with n-gram encoding and masking of fixed block +size:</p> +<div class="sourceCode" id="cb164"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">nt_seq</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/rep.html" class="external-link">rep</a></span><span class="op">(</span><span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"A"</span>, <span class="st">"C"</span>, <span class="st">"G"</span>, <span class="st">"T"</span><span class="op">)</span>, each <span class="op">=</span> <span class="fl">25</span><span class="op">)</span> <span class="op"><a href="../reference/pipe.html">%>%</a></span> <span class="fu"><a href="https://rdrr.io/r/base/paste.html" class="external-link">paste</a></span><span class="op">(</span>collapse <span class="op">=</span> <span class="st">""</span><span class="op">)</span></span> +<span><span class="va">df</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span>Sequence <span class="op">=</span> <span class="va">nt_seq</span>, Header <span class="op">=</span> <span class="st">"seq_1"</span><span class="op">)</span></span> +<span><span class="va">fasta_path</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span>fileext <span class="op">=</span> <span class="st">".fasta"</span><span class="op">)</span></span> +<span><span class="va">fasta_file</span> <span class="op"><-</span> <span class="fu">microseq</span><span class="fu">::</span><span class="fu"><a href="https://rdrr.io/pkg/microseq/man/readFasta.html" class="external-link">writeFasta</a></span><span class="op">(</span><span 
class="va">df</span>, <span class="va">fasta_path</span><span class="op">)</span></span> +<span><span class="va">masked_lm</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/list.html" class="external-link">list</a></span><span class="op">(</span>mask_rate <span class="op">=</span> <span class="fl">0.10</span>, <span class="co"># replace 10% of input with special mask token</span></span> +<span> random_rate <span class="op">=</span> <span class="fl">0.05</span>, <span class="co"># set 5% of input to random value</span></span> +<span> identity_rate <span class="op">=</span> <span class="fl">0.05</span>, <span class="co"># leave 5% unchanged</span></span> +<span> include_sw <span class="op">=</span> <span class="cn">TRUE</span>, <span class="co"># 0,1 matrix showing where masking was applied</span></span> +<span> block_len <span class="op">=</span> <span class="fl">3</span><span class="op">)</span> <span class="co"># always mask at least 3 tokens in a row </span></span> +<span><span class="va">gen</span> <span class="op"><-</span> <span class="fu"><a href="../reference/get_generator.html">get_generator</a></span><span class="op">(</span>path <span class="op">=</span> <span class="va">fasta_path</span>,</span> +<span> train_type <span class="op">=</span> <span class="st">"masked_lm"</span>,</span> +<span> masked_lm <span class="op">=</span> <span class="va">masked_lm</span>,</span> +<span> batch_size <span class="op">=</span> <span class="fl">1</span>,</span> +<span> n_gram <span class="op">=</span> <span class="fl">3</span>,</span> +<span> seed <span class="op">=</span> <span class="fl">12</span>,</span> +<span> n_gram_stride <span class="op">=</span> <span class="fl">1</span>,</span> +<span> return_int <span class="op">=</span> <span class="cn">TRUE</span>,</span> +<span> maxlen <span class="op">=</span> <span class="fl">100</span>,</span> +<span> vocabulary <span class="op">=</span> <span class="fu"><a 
href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"A"</span>, <span class="st">"C"</span>, <span class="st">"G"</span>, <span class="st">"T"</span><span class="op">)</span><span class="op">)</span></span> +<span></span> +<span><span class="va">z</span> <span class="op"><-</span> <span class="fu">gen</span><span class="op">(</span><span class="op">)</span></span> +<span><span class="va">x</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">1</span><span class="op">]</span><span class="op">]</span></span> +<span><span class="va">y</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">2</span><span class="op">]</span><span class="op">]</span></span> +<span><span class="va">sw</span> <span class="op"><-</span> <span class="va">z</span><span class="op">[[</span><span class="fl">3</span><span class="op">]</span><span class="op">]</span></span> +<span><span class="va">df</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span>x <span class="op">=</span> <span class="va">x</span><span class="op">[</span><span class="fl">1</span>, <span class="op">]</span>, y <span class="op">=</span> <span class="va">y</span><span class="op">[</span><span class="fl">1</span>, <span class="op">]</span>, sw <span class="op">=</span> <span class="va">sw</span><span class="op">[</span><span class="fl">1</span>, <span class="op">]</span>, position <span class="op">=</span> <span class="fl">1</span><span class="op">:</span><span class="fu"><a href="https://rdrr.io/r/base/nrow.html" class="external-link">ncol</a></span><span class="op">(</span><span class="va">x</span><span class="op">)</span><span class="op">)</span></span> +<span><span class="fu"><a href="https://rdrr.io/r/utils/head.html" 
class="external-link">head</a></span><span class="op">(</span><span class="va">df</span><span class="op">)</span></span></code></pre></div> +<pre><code><span><span class="co">## x y sw position</span></span> +<span><span class="co">## 1 1 1 0 1</span></span> +<span><span class="co">## 2 1 1 0 2</span></span> +<span><span class="co">## 3 1 1 0 3</span></span> +<span><span class="co">## 4 39 1 1 4</span></span> +<span><span class="co">## 5 48 1 1 5</span></span> +<span><span class="co">## 6 13 1 1 6</span></span></code></pre> +<div class="sourceCode" id="cb166"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="fu"><a href="https://rdrr.io/r/utils/head.html" class="external-link">tail</a></span><span class="op">(</span><span class="va">df</span><span class="op">)</span></span></code></pre></div> +<pre><code><span><span class="co">## x y sw position</span></span> +<span><span class="co">## 93 65 64 1 93</span></span> +<span><span class="co">## 94 64 64 0 94</span></span> +<span><span class="co">## 95 64 64 0 95</span></span> +<span><span class="co">## 96 64 64 0 96</span></span> +<span><span class="co">## 97 64 64 0 97</span></span> +<span><span class="co">## 98 64 64 0 98</span></span></code></pre> +<p>We can check that sample weights appear only in blocks.</p> +<div class="sourceCode" id="cb168"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="fu"><a href="https://rdrr.io/r/base/which.html" class="external-link">which</a></span><span class="op">(</span><span class="va">sw</span> <span class="op">==</span> <span class="fl">1</span><span class="op">)</span></span></code></pre></div> +<pre><code><span><span class="co">## [1] 4 5 6 13 14 15 22 23 24 40 41 42 52 53 54 79 80 81 82 83 84 91 92 93</span></span></code></pre> +<p>Here 65 is the mask token (4^3 + 1 = size of the vocabulary + 1).</p> +<div class="sourceCode" id="cb170"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span 
class="va">df</span> <span class="op"><a href="../reference/pipe.html">%>%</a></span> <span class="fu">dplyr</span><span class="fu">::</span><span class="fu"><a href="https://dplyr.tidyverse.org/reference/filter.html" class="external-link">filter</a></span><span class="op">(</span><span class="va">sw</span> <span class="op">==</span> <span class="fl">1</span> <span class="op">&</span> <span class="va">x</span> <span class="op">==</span> <span class="fl">65</span><span class="op">)</span> <span class="co"># 10% masked part</span></span></code></pre></div> +<pre><code><span><span class="co">## x y sw position</span></span> +<span><span class="co">## 1 65 22 1 40</span></span> +<span><span class="co">## 2 65 22 1 41</span></span> +<span><span class="co">## 3 65 22 1 42</span></span> +<span><span class="co">## 4 65 64 1 79</span></span> +<span><span class="co">## 5 65 64 1 80</span></span> +<span><span class="co">## 6 65 64 1 81</span></span> +<span><span class="co">## 7 65 64 1 82</span></span> +<span><span class="co">## 8 65 64 1 83</span></span> +<span><span class="co">## 9 65 64 1 84</span></span> +<span><span class="co">## 10 65 64 1 91</span></span> +<span><span class="co">## 11 65 64 1 92</span></span> +<span><span class="co">## 12 65 64 1 93</span></span></code></pre> +<div class="sourceCode" id="cb172"><pre class="downlit sourceCode r"> +<code class="sourceCode R"><span><span class="va">df</span> <span class="op"><a href="../reference/pipe.html">%>%</a></span> <span class="fu">dplyr</span><span class="fu">::</span><span class="fu"><a href="https://dplyr.tidyverse.org/reference/filter.html" class="external-link">filter</a></span><span class="op">(</span><span class="va">sw</span> <span class="op">==</span> <span class="fl">1</span> <span class="op">&</span> <span class="va">x</span> <span class="op">!=</span> <span class="fl">65</span><span class="op">)</span> <span class="co"># 5% identity part and 5% random part (can randomly be the true 
value)</span></span></code></pre></div> +<pre><code><span><span class="co">## x y sw position</span></span> +<span><span class="co">## 1 39 1 1 4</span></span> +<span><span class="co">## 2 48 1 1 5</span></span> +<span><span class="co">## 3 13 1 1 6</span></span> +<span><span class="co">## 4 1 1 1 13</span></span> +<span><span class="co">## 5 1 1 1 14</span></span> +<span><span class="co">## 6 1 1 1 15</span></span> +<span><span class="co">## 7 1 1 1 22</span></span> +<span><span class="co">## 8 1 1 1 23</span></span> +<span><span class="co">## 9 2 2 1 24</span></span> +<span><span class="co">## 10 56 43 1 52</span></span> +<span><span class="co">## 11 4 43 1 53</span></span> +<span><span class="co">## 12 24 43 1 54</span></span></code></pre> +</div> +</div> + </main><aside class="col-md-3"><nav id="toc"><h2>On this page</h2> + </nav></aside> +</div> + + + + <footer><div class="pkgdown-footer-left"> + <p>Developed by Philipp Münch, René Mreches, Martin Binder, Hüseyin Anil Gündüz, Xiao-Yin To, Alice McHardy.</p> +</div> + +<div class="pkgdown-footer-right"> + <p>Site built with <a href="https://pkgdown.r-lib.org/" class="external-link">pkgdown</a> 2.0.9.</p> +</div> + + </footer> +</div> + + + + + + </body> +</html>