|
a |
|
b/docs/articles/getting_started.html |
|
|
1 |
<!DOCTYPE html> |
|
|
2 |
<!-- Generated by pkgdown: do not edit by hand --><html lang="en"> |
|
|
3 |
<head> |
|
|
4 |
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"> |
|
|
5 |
<meta charset="utf-8"> |
|
|
6 |
<meta http-equiv="X-UA-Compatible" content="IE=edge"> |
|
|
7 |
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"> |
|
|
8 |
<meta name="description" content="deepG"> |
|
|
9 |
<title>Getting started • deepG</title> |
|
|
10 |
<!-- favicons --><link rel="icon" type="image/png" sizes="16x16" href="../favicon-16x16.png"> |
|
|
11 |
<link rel="icon" type="image/png" sizes="32x32" href="../favicon-32x32.png"> |
|
|
12 |
<link rel="apple-touch-icon" type="image/png" sizes="180x180" href="../apple-touch-icon.png"> |
|
|
13 |
<link rel="apple-touch-icon" type="image/png" sizes="120x120" href="../apple-touch-icon-120x120.png"> |
|
|
14 |
<link rel="apple-touch-icon" type="image/png" sizes="76x76" href="../apple-touch-icon-76x76.png"> |
|
|
15 |
<link rel="apple-touch-icon" type="image/png" sizes="60x60" href="../apple-touch-icon-60x60.png"> |
|
|
16 |
<script src="../deps/jquery-3.6.0/jquery-3.6.0.min.js"></script><meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"> |
|
|
17 |
<link href="../deps/bootstrap-5.3.1/bootstrap.min.css" rel="stylesheet"> |
|
|
18 |
<script src="../deps/bootstrap-5.3.1/bootstrap.bundle.min.js"></script><!-- Font Awesome icons --><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/all.min.css" integrity="sha256-mmgLkCYLUQbXn0B1SRqzHar6dCnv9oZFPEC1g1cwlkk=" crossorigin="anonymous"> |
|
|
19 |
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/v4-shims.min.css" integrity="sha256-wZjR52fzng1pJHwx4aV2AO3yyTOXrcDW7jBpJtTwVxw=" crossorigin="anonymous"> |
|
|
20 |
<!-- bootstrap-toc --><script src="https://cdn.jsdelivr.net/gh/afeld/bootstrap-toc@v1.0.1/dist/bootstrap-toc.min.js" integrity="sha256-4veVQbu7//Lk5TSmc7YV48MxtMy98e26cf5MrgZYnwo=" crossorigin="anonymous"></script><!-- headroom.js --><script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/headroom.min.js" integrity="sha256-AsUX4SJE1+yuDu5+mAVzJbuYNPHj/WroHuZ8Ir/CkE0=" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/jQuery.headroom.min.js" integrity="sha256-ZX/yNShbjqsohH1k95liqY9Gd8uOiE1S4vZc+9KQ1K4=" crossorigin="anonymous"></script><!-- clipboard.js --><script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.11/clipboard.min.js" integrity="sha512-7O5pXpc0oCRrxk8RUfDYFgn0nO1t+jLuIOQdOMRp4APB7uZ4vSjspzp5y6YDtDs4VzUSTbWzBFZ/LKJhnyFOKw==" crossorigin="anonymous" referrerpolicy="no-referrer"></script><!-- search --><script src="https://cdnjs.cloudflare.com/ajax/libs/fuse.js/6.4.6/fuse.js" integrity="sha512-zv6Ywkjyktsohkbp9bb45V6tEMoWhzFzXis+LrMehmJZZSys19Yxf1dopHx7WzIKxr5tK2dVcYmaCk2uqdjF4A==" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/autocomplete.js/0.38.0/autocomplete.jquery.min.js" integrity="sha512-GU9ayf+66Xx2TmpxqJpliWbT5PiGYxpaG8rfnBEk1LL8l1KGkRShhngwdXK1UgqhAzWpZHSiYPc09/NwDQIGyg==" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/mark.js/8.11.1/mark.min.js" integrity="sha512-5CYOlHXGh6QpOFA/TeTylKLWfB3ftPsde7AnmhuitiTX4K5SqCLBeKro6sPS8ilsz1Q4NRx3v8Ko2IBiszzdww==" crossorigin="anonymous"></script><!-- pkgdown --><script src="../pkgdown.js"></script><meta property="og:title" content="Getting started"> |
|
|
21 |
<meta property="og:description" content="deepG"> |
|
|
22 |
<meta property="og:image" content="https://genomenet.github.io/deepG/logo.png"> |
|
|
23 |
<!-- mathjax --><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js" integrity="sha256-nvJJv9wWKEm88qvoQl9ekL2J+k/RWIsaSScxxlsrv8k=" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/config/TeX-AMS-MML_HTMLorMML.js" integrity="sha256-84DKXVJXs0/F8OTMzX4UR909+jtl4G7SPypPavF+GfA=" crossorigin="anonymous"></script><!--[if lt IE 9]> |
|
|
24 |
<script src="https://oss.maxcdn.com/html5shiv/3.7.3/html5shiv.min.js"></script> |
|
|
25 |
<script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script> |
|
|
26 |
<![endif]--> |
|
|
27 |
</head> |
|
|
28 |
<body> |
|
|
29 |
<a href="#main" class="visually-hidden-focusable">Skip to contents</a> |
|
|
30 |
|
|
|
31 |
|
|
|
32 |
<nav class="navbar fixed-top navbar-light navbar-expand-lg bg-light" data-bs-theme="light"><div class="container"> |
|
|
33 |
|
|
|
34 |
<a class="navbar-brand me-2" href="../index.html">deepG</a> |
|
|
35 |
|
|
|
36 |
<small class="nav-text text-default me-auto" data-bs-toggle="tooltip" data-bs-placement="bottom" title="Released version">0.3.0</small> |
|
|
37 |
|
|
|
38 |
|
|
|
39 |
<button class="navbar-toggler" type="button" data-bs-toggle="collapse" data-bs-target="#navbar" aria-controls="navbar" aria-expanded="false" aria-label="Toggle navigation"> |
|
|
40 |
<span class="navbar-toggler-icon"></span> |
|
|
41 |
</button> |
|
|
42 |
|
|
|
43 |
<div id="navbar" class="collapse navbar-collapse ms-3"> |
|
|
44 |
<ul class="navbar-nav me-auto"> |
|
|
45 |
<li class="nav-item"> |
|
|
46 |
<a class="nav-link" href="../reference/index.html"> |
|
|
47 |
<span class="fa fa fa fa-file-alt"></span> |
|
|
48 |
|
|
|
49 |
Reference |
|
|
50 |
</a> |
|
|
51 |
</li> |
|
|
52 |
<li class="nav-item dropdown"> |
|
|
53 |
<a href="#" class="nav-link dropdown-toggle" data-bs-toggle="dropdown" role="button" aria-expanded="false" aria-haspopup="true" id="dropdown-notebooks">Notebooks</a> |
|
|
54 |
<div class="dropdown-menu" aria-labelledby="dropdown-notebooks"> |
|
|
55 |
<a class="external-link dropdown-item" href="https://colab.research.google.com/drive/175jIdXcDcgPUvaBo2rH2Lupbpjnp5O7G?usp=sharing">deepG tutorial</a> |
|
|
56 |
<a class="external-link dropdown-item" href="https://colab.research.google.com/drive/1Eolc0koMNM1zkuO4XyVM58ImeF1BpRiH?usp=sharing">Read-length level: Human contamination</a> |
|
|
57 |
<a class="external-link dropdown-item" href="https://colab.research.google.com/drive/1yiXSwFafXpMLHaov9iBTQLIDZ6bK1zYX?usp=sharing">Locus level: CRISPR detection</a> |
|
|
58 |
<a class="external-link dropdown-item" href="https://colab.research.google.com/drive/1G7bOFEX87cZNrM2tdRtTdkrZn5fM__g0?usp=sharing">Gene level: 16S rRNA detection</a> |
|
|
59 |
<a class="external-link dropdown-item" href="https://colab.research.google.com/drive/1BCggL-tfQF136YeJ8cKKi-zoBEDMgkNh?usp=sharing">Genome level: Bacterial morphology (Sporulation)</a> |
|
|
60 |
<a class="external-link dropdown-item" href="https://colab.research.google.com/drive/10xpRzGd3JeBAbqQYSCxzQUMctt01sx9D?usp=sharing">Full metagenome level: Colorectal cancer prediction</a> |
|
|
61 |
<a class="external-link dropdown-item" href="https://colab.research.google.com/drive/1kyYK7IU7GSfdpDzO_a8U3_qD4i3zTu6w?usp=sharing">BERT with deepG</a> |
|
|
62 |
</div> |
|
|
63 |
</li> |
|
|
64 |
<li class="active nav-item dropdown"> |
|
|
65 |
<a href="#" class="nav-link dropdown-toggle" data-bs-toggle="dropdown" role="button" aria-expanded="false" aria-haspopup="true" id="dropdown-tutorials">Tutorials</a> |
|
|
66 |
<div class="dropdown-menu" aria-labelledby="dropdown-tutorials"> |
|
|
67 |
<a class="dropdown-item" href="../articles/getting_started.html">Getting Started</a> |
|
|
68 |
<a class="dropdown-item" href="../articles/training_types.html">Training types</a> |
|
|
69 |
<a class="dropdown-item" href="../articles/data_generator.html">Data generator</a> |
|
|
70 |
<a class="dropdown-item" href="../articles/using_tb.html">Using tensorboard</a> |
|
|
71 |
<a class="dropdown-item" href="../articles/integrated_gradient.html">Integrated Gradient</a> |
|
|
72 |
</div> |
|
|
73 |
</li> |
|
|
74 |
</ul> |
|
|
75 |
<form class="form-inline my-2 my-lg-0" role="search"> |
|
|
76 |
<input type="search" class="form-control me-sm-2" aria-label="Search" name="search-input" data-search-index="../search.json" id="search-input" placeholder="Search for" autocomplete="off"> |
|
|
77 |
</form> |
|
|
78 |
|
|
|
79 |
<ul class="navbar-nav"> |
|
|
80 |
<li class="nav-item"> |
|
|
81 |
<a class="external-link nav-link" href="https://github.com/GenomeNet/deepG/" aria-label="github"> |
|
|
82 |
<span class="fab fa fab fa-github fa-lg"></span> |
|
|
83 |
|
|
|
84 |
</a> |
|
|
85 |
</li> |
|
|
86 |
</ul> |
|
|
87 |
</div> |
|
|
88 |
|
|
|
89 |
|
|
|
90 |
</div> |
|
|
91 |
</nav><div class="container template-article"> |
|
|
92 |
|
|
|
93 |
|
|
|
94 |
|
|
|
95 |
|
|
|
96 |
<div class="row"> |
|
|
97 |
<main id="main" class="col-md-9"><div class="page-header"> |
|
|
98 |
<img src="../logo.png" class="logo" alt=""><h1>Getting started</h1> |
|
|
99 |
|
|
|
100 |
|
|
|
101 |
<small class="dont-index">Source: <a href="https://github.com/GenomeNet/deepG/blob/HEAD/vignettes/getting_started.Rmd" class="external-link"><code>vignettes/getting_started.Rmd</code></a></small> |
|
|
102 |
<div class="d-none name"><code>getting_started.Rmd</code></div> |
|
|
103 |
</div> |
|
|
104 |
|
|
|
105 |
|
|
|
106 |
|
|
|
107 |
<div class="sourceCode" id="cb1"><pre class="downlit sourceCode r"> |
|
|
108 |
<code class="sourceCode R"><span><span class="co">#devtools::install_github("GenomeNet/deepG")</span></span> |
|
|
109 |
<span><span class="kw"><a href="https://rdrr.io/r/base/library.html" class="external-link">library</a></span><span class="op">(</span><span class="va"><a href="https://github.com/GenomeNet/deepG" class="external-link">deepG</a></span><span class="op">)</span></span> |
|
|
110 |
<span><span class="kw"><a href="https://rdrr.io/r/base/library.html" class="external-link">library</a></span><span class="op">(</span><span class="va"><a href="https://magrittr.tidyverse.org" class="external-link">magrittr</a></span><span class="op">)</span></span></code></pre></div> |
|
|
111 |
<style type="text/css"> |
|
|
112 |
mark.in { |
|
|
113 |
background-color: CornflowerBlue; |
|
|
114 |
} |
|
|
115 |
|
|
|
116 |
mark.out { |
|
|
117 |
background-color: IndianRed; |
|
|
118 |
} |
|
|
119 |
|
|
|
120 |
</style> |
|
|
121 |
<div class="section level2"> |
|
|
122 |
<h2 id="introduction">Introduction<a class="anchor" aria-label="anchor" href="#introduction"></a> |
|
|
123 |
</h2> |
|
|
124 |
<p>The goal of the deepG package is to speed up the development of |
|
|
125 |
bioinformatics tools for sequence classification, homology detection |
|
|
126 |
and other bioinformatics tasks. The package offers several functions |
|
|
127 |
for</p> |
|
|
128 |
<ul> |
|
|
129 |
<li>Data (pre-) processing</li> |
|
|
130 |
<li>Deep learning architectures</li> |
|
|
131 |
<li>Model training</li> |
|
|
132 |
<li>Model evaluation</li> |
|
|
133 |
<li>Visualizing training progress</li> |
|
|
134 |
</ul> |
|
|
135 |
<div class="section level3"> |
|
|
136 |
<h3 id="create-dummy-data">Create dummy data<a class="anchor" aria-label="anchor" href="#create-dummy-data"></a> |
|
|
137 |
</h3> |
|
|
138 |
<p>We create two simple dummy training and validation data sets. Both |
|
|
139 |
consist of random <tt>ACGT</tt> sequences but the first category has a |
|
|
140 |
probability of 40% each for drawing <tt>G</tt> or <tt>C</tt> and the |
|
|
141 |
second has equal probability for each nucleotide (first category has |
|
|
142 |
around 80% <tt>GC</tt> content and second one around 50%).</p> |
|
|
143 |
<div class="sourceCode" id="cb2"><pre class="downlit sourceCode r"> |
|
|
144 |
<code class="sourceCode R"><span><span class="fu"><a href="https://rdrr.io/r/base/Random.html" class="external-link">set.seed</a></span><span class="op">(</span><span class="fl">123</span><span class="op">)</span></span> |
|
|
145 |
<span><span class="va">vocabulary</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"A"</span>, <span class="st">"C"</span>, <span class="st">"G"</span>, <span class="st">"T"</span><span class="op">)</span></span> |
|
|
146 |
<span></span> |
|
|
147 |
<span><span class="va">data_type</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"train_1"</span>, <span class="st">"train_2"</span>, <span class="st">"val_1"</span>, <span class="st">"val_2"</span><span class="op">)</span></span> |
|
|
148 |
<span></span> |
|
|
149 |
<span><span class="kw">for</span> <span class="op">(</span><span class="va">i</span> <span class="kw">in</span> <span class="fl">1</span><span class="op">:</span><span class="fu"><a href="https://rdrr.io/r/base/length.html" class="external-link">length</a></span><span class="op">(</span><span class="va">data_type</span><span class="op">)</span><span class="op">)</span> <span class="op">{</span></span> |
|
|
150 |
<span> </span> |
|
|
151 |
<span> <span class="va">temp_file</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempfile</a></span><span class="op">(</span><span class="op">)</span></span> |
|
|
152 |
<span> <span class="fu"><a href="https://rdrr.io/r/base/assign.html" class="external-link">assign</a></span><span class="op">(</span><span class="fu"><a href="https://rdrr.io/r/base/paste.html" class="external-link">paste0</a></span><span class="op">(</span><span class="va">data_type</span><span class="op">[</span><span class="va">i</span><span class="op">]</span>, <span class="st">"_dir"</span><span class="op">)</span>, <span class="va">temp_file</span><span class="op">)</span></span> |
|
|
153 |
<span> <span class="fu"><a href="https://rdrr.io/r/base/files2.html" class="external-link">dir.create</a></span><span class="op">(</span><span class="va">temp_file</span><span class="op">)</span></span> |
|
|
154 |
<span> </span> |
|
|
155 |
<span> <span class="kw">if</span> <span class="op">(</span><span class="va">i</span> <span class="op"><a href="https://rdrr.io/r/base/Arithmetic.html" class="external-link">%%</a></span> <span class="fl">2</span> <span class="op">==</span> <span class="fl">1</span><span class="op">)</span> <span class="op">{</span></span> |
|
|
156 |
<span> <span class="va">header</span> <span class="op"><-</span> <span class="st">"high_gc"</span></span> |
|
|
157 |
<span> <span class="va">prob</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="fl">0.1</span>, <span class="fl">0.4</span>, <span class="fl">0.4</span>, <span class="fl">0.1</span><span class="op">)</span></span> |
|
|
158 |
<span> <span class="op">}</span> <span class="kw">else</span> <span class="op">{</span></span> |
|
|
159 |
<span> <span class="va">header</span> <span class="op"><-</span> <span class="st">"equal_dist"</span></span> |
|
|
160 |
<span> <span class="va">prob</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/rep.html" class="external-link">rep</a></span><span class="op">(</span><span class="fl">0.25</span>, <span class="fl">4</span><span class="op">)</span></span> |
|
|
161 |
<span> <span class="op">}</span></span> |
|
|
162 |
<span> </span> |
|
|
163 |
<span> <span class="va">fasta_name_start</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/paste.html" class="external-link">paste0</a></span><span class="op">(</span><span class="va">header</span>, <span class="st">"_"</span>, <span class="va">data_type</span><span class="op">[</span><span class="va">i</span><span class="op">]</span>, <span class="st">"file"</span><span class="op">)</span></span> |
|
|
164 |
<span> </span> |
|
|
165 |
<span> <span class="fu"><a href="../reference/create_dummy_data.html">create_dummy_data</a></span><span class="op">(</span>file_path <span class="op">=</span> <span class="va">temp_file</span>,</span> |
|
|
166 |
<span> num_files <span class="op">=</span> <span class="fl">1</span>,</span> |
|
|
167 |
<span> seq_length <span class="op">=</span> <span class="fl">10000</span>, </span> |
|
|
168 |
<span> num_seq <span class="op">=</span> <span class="fl">1</span>,</span> |
|
|
169 |
<span> header <span class="op">=</span> <span class="va">header</span>,</span> |
|
|
170 |
<span> prob <span class="op">=</span> <span class="va">prob</span>,</span> |
|
|
171 |
<span> fasta_name_start <span class="op">=</span> <span class="va">fasta_name_start</span>,</span> |
|
|
172 |
<span> vocabulary <span class="op">=</span> <span class="va">vocabulary</span><span class="op">)</span></span> |
|
|
173 |
<span> </span> |
|
|
174 |
<span><span class="op">}</span></span></code></pre></div> |
|
|
175 |
</div> |
|
|
176 |
<div class="section level3"> |
|
|
177 |
<h3 id="training">Training<a class="anchor" aria-label="anchor" href="#training"></a> |
|
|
178 |
</h3> |
|
|
179 |
<p>We can now train a model that can differentiate between the two |
|
|
180 |
categories. First, we can create our network architecture. We take an |
|
|
181 |
input size of 50 nucleotides. The model has one LSTM layer with 16 cells |
|
|
182 |
and two dense layers with 8 and 2 neurons.</p> |
|
|
183 |
<div class="sourceCode" id="cb3"><pre class="downlit sourceCode r"> |
|
|
184 |
<code class="sourceCode R"><span><span class="va">maxlen</span> <span class="op"><-</span> <span class="fl">50</span></span> |
|
|
185 |
<span><span class="va">model</span> <span class="op"><-</span> <span class="fu"><a href="../reference/create_model_lstm_cnn.html">create_model_lstm_cnn</a></span><span class="op">(</span>maxlen <span class="op">=</span> <span class="va">maxlen</span>,</span> |
|
|
186 |
<span> layer_lstm <span class="op">=</span> <span class="fl">16</span>,</span> |
|
|
187 |
<span> layer_dense <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="fl">8</span>, <span class="fl">2</span><span class="op">)</span><span class="op">)</span></span></code></pre></div> |
|
|
188 |
<pre><code><span><span class="co">## Model: "model"</span></span> |
|
|
189 |
<span><span class="co">## _________________________________________________________________</span></span> |
|
|
190 |
<span><span class="co">## Layer (type) Output Shape Param # </span></span> |
|
|
191 |
<span><span class="co">## =================================================================</span></span> |
|
|
192 |
<span><span class="co">## input_1 (InputLayer) [(None, 50, 4)] 0 </span></span> |
|
|
193 |
<span><span class="co">## </span></span> |
|
|
194 |
<span><span class="co">## lstm (LSTM) (None, 16) 1344 </span></span> |
|
|
195 |
<span><span class="co">## </span></span> |
|
|
196 |
<span><span class="co">## dense (Dense) (None, 8) 136 </span></span> |
|
|
197 |
<span><span class="co">## </span></span> |
|
|
198 |
<span><span class="co">## dense_1 (Dense) (None, 2) 18 </span></span> |
|
|
199 |
<span><span class="co">## </span></span> |
|
|
200 |
<span><span class="co">## =================================================================</span></span> |
|
|
201 |
<span><span class="co">## Total params: 1498 (5.85 KB)</span></span> |
|
|
202 |
<span><span class="co">## Trainable params: 1498 (5.85 KB)</span></span> |
|
|
203 |
<span><span class="co">## Non-trainable params: 0 (0.00 Byte)</span></span> |
|
|
204 |
<span><span class="co">## _________________________________________________________________</span></span></code></pre> |
|
|
205 |
<p>Next we can train the model using the <code>train_model</code> |
|
|
206 |
function. The function will internally build a data generator for |
|
|
207 |
training.</p> |
|
|
208 |
<div class="sourceCode" id="cb5"><pre class="downlit sourceCode r"> |
|
|
209 |
<code class="sourceCode R"><span><span class="va">hist</span> <span class="op"><-</span> <span class="fu"><a href="../reference/train_model.html">train_model</a></span><span class="op">(</span><span class="va">model</span>,</span> |
|
|
210 |
<span> train_type <span class="op">=</span> <span class="st">"label_folder"</span>,</span> |
|
|
211 |
<span> run_name <span class="op">=</span> <span class="st">"gc_model_1"</span>,</span> |
|
|
212 |
<span> path <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="va">train_1_dir</span>, <span class="va">train_2_dir</span><span class="op">)</span>,</span> |
|
|
213 |
<span> path_val <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="va">val_1_dir</span>, <span class="va">val_2_dir</span><span class="op">)</span>,</span> |
|
|
214 |
<span> epochs <span class="op">=</span> <span class="fl">4</span>,</span> |
|
|
215 |
<span> steps_per_epoch <span class="op">=</span> <span class="fl">25</span>, <span class="co"># one epoch = 25 batches</span></span> |
|
|
216 |
<span> batch_size <span class="op">=</span> <span class="fl">64</span>,</span> |
|
|
217 |
<span> step <span class="op">=</span> <span class="fl">50</span>, <span class="co"># take a sample every 50 nt</span></span> |
|
|
218 |
<span> vocabulary_label <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"high_gc"</span>, <span class="st">"equal_dist"</span><span class="op">)</span><span class="op">)</span></span></code></pre></div> |
|
|
219 |
<pre><code><span><span class="co">## Epoch 1/4</span></span> |
|
|
220 |
<span><span class="co">## 1/25 [>.............................] - ETA: 21s - loss: 0.7058 - acc: 0.5938 4/25 [===>..........................] - ETA: 0s - loss: 0.7028 - acc: 0.5430 7/25 [=======>......................] - ETA: 0s - loss: 0.7013 - acc: 0.533510/25 [===========>..................] - ETA: 0s - loss: 0.6976 - acc: 0.539114/25 [===============>..............] - ETA: 0s - loss: 0.6935 - acc: 0.559217/25 [===================>..........] - ETA: 0s - loss: 0.6900 - acc: 0.577221/25 [========================>.....] - ETA: 0s - loss: 0.6860 - acc: 0.607124/25 [===========================>..] - ETA: 0s - loss: 0.6821 - acc: 0.624325/25 [==============================] - 2s 31ms/step - loss: 0.6813 - acc: 0.6256 - val_loss: 0.6511 - val_acc: 0.7563 - lr: 0.0010</span></span> |
|
|
221 |
<span><span class="co">## Epoch 2/4</span></span> |
|
|
222 |
<span><span class="co">## 1/25 [>.............................] - ETA: 0s - loss: 0.6379 - acc: 0.8281 4/25 [===>..........................] - ETA: 0s - loss: 0.6423 - acc: 0.7617 8/25 [========>.....................] - ETA: 0s - loss: 0.6340 - acc: 0.785212/25 [=============>................] - ETA: 0s - loss: 0.6228 - acc: 0.789116/25 [==================>...........] - ETA: 0s - loss: 0.6086 - acc: 0.805720/25 [=======================>......] - ETA: 0s - loss: 0.5892 - acc: 0.825824/25 [===========================>..] - ETA: 0s - loss: 0.5650 - acc: 0.850925/25 [==============================] - 1s 21ms/step - loss: 0.5590 - acc: 0.8556 - val_loss: 0.3910 - val_acc: 0.9719 - lr: 0.0010</span></span> |
|
|
223 |
<span><span class="co">## Epoch 3/4</span></span> |
|
|
224 |
<span><span class="co">## 1/25 [>.............................] - ETA: 0s - loss: 0.3548 - acc: 1.0000 4/25 [===>..........................] - ETA: 0s - loss: 0.3463 - acc: 0.9883 8/25 [========>.....................] - ETA: 0s - loss: 0.3230 - acc: 0.976611/25 [============>.................] - ETA: 0s - loss: 0.3052 - acc: 0.975915/25 [=================>............] - ETA: 0s - loss: 0.2893 - acc: 0.970817/25 [===================>..........] - ETA: 0s - loss: 0.2792 - acc: 0.970621/25 [========================>.....] - ETA: 0s - loss: 0.2665 - acc: 0.969524/25 [===========================>..] - ETA: 0s - loss: 0.2547 - acc: 0.971425/25 [==============================] - 1s 21ms/step - loss: 0.2533 - acc: 0.9706 - val_loss: 0.1765 - val_acc: 0.9719 - lr: 0.0010</span></span> |
|
|
225 |
<span><span class="co">## Epoch 4/4</span></span> |
|
|
226 |
<span><span class="co">## 1/25 [>.............................] - ETA: 0s - loss: 0.1369 - acc: 1.0000 4/25 [===>..........................] - ETA: 0s - loss: 0.1456 - acc: 0.9922 7/25 [=======>......................] - ETA: 0s - loss: 0.1494 - acc: 0.986610/25 [===========>..................] - ETA: 0s - loss: 0.1425 - acc: 0.987514/25 [===============>..............] - ETA: 0s - loss: 0.1376 - acc: 0.986617/25 [===================>..........] - ETA: 0s - loss: 0.1315 - acc: 0.987121/25 [========================>.....] - ETA: 0s - loss: 0.1259 - acc: 0.986625/25 [==============================] - ETA: 0s - loss: 0.1225 - acc: 0.985025/25 [==============================] - 1s 21ms/step - loss: 0.1225 - acc: 0.9850 - val_loss: 0.0992 - val_acc: 0.9812 - lr: 0.0010</span></span></code></pre> |
|
|
227 |
<pre><code><span><span class="co">## Training done.</span></span></code></pre> |
|
|
228 |
<div class="sourceCode" id="cb8"><pre class="downlit sourceCode r"> |
|
|
229 |
<code class="sourceCode R"><span><span class="fu"><a href="https://rdrr.io/r/graphics/plot.default.html" class="external-link">plot</a></span><span class="op">(</span><span class="va">hist</span><span class="op">)</span></span></code></pre></div> |
|
|
230 |
<p><img src="getting_started_files/figure-html/unnamed-chunk-7-1.png" alt="Plot of the training history stored in hist" width="700"></p> |
|
|
231 |
</div> |
|
|
232 |
<div class="section level3"> |
|
|
233 |
<h3 id="evaluation">Evaluation<a class="anchor" aria-label="anchor" href="#evaluation"></a> |
|
|
234 |
</h3> |
|
|
235 |
<p>We can now evaluate the trained model on all the validation data.</p> |
|
|
236 |
<div class="sourceCode" id="cb9"><pre class="downlit sourceCode r"> |
|
|
237 |
<code class="sourceCode R"><span><span class="va">eval</span> <span class="op"><-</span> <span class="fu"><a href="../reference/evaluate_model.html">evaluate_model</a></span><span class="op">(</span>path_input <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="va">val_1_dir</span>, <span class="va">val_2_dir</span><span class="op">)</span>,</span> |
|
|
238 |
<span> model <span class="op">=</span> <span class="va">model</span>,</span> |
|
|
239 |
<span> batch_size <span class="op">=</span> <span class="fl">100</span>,</span> |
|
|
240 |
<span> step <span class="op">=</span> <span class="fl">25</span>, <span class="co"># take a sample every 25 nt </span></span> |
|
|
241 |
<span> vocabulary_label <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/list.html" class="external-link">list</a></span><span class="op">(</span><span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"high_gc"</span>, <span class="st">"equal_dist"</span><span class="op">)</span><span class="op">)</span>,</span> |
|
|
242 |
<span> mode <span class="op">=</span> <span class="st">"label_folder"</span>,</span> |
|
|
243 |
<span> evaluate_all_files <span class="op">=</span> <span class="cn">TRUE</span>,</span> |
|
|
244 |
<span> verbose <span class="op">=</span> <span class="cn">FALSE</span>,</span> |
|
|
245 |
<span> auc <span class="op">=</span> <span class="cn">TRUE</span>,</span> |
|
|
246 |
<span> auprc <span class="op">=</span> <span class="cn">TRUE</span><span class="op">)</span></span></code></pre></div> |
|
|
247 |
<pre><code><span><span class="co">## Evaluate 399 samples for class high_gc.</span></span> |
|
|
248 |
<span><span class="co">## Evaluate 399 samples for class equal_dist.</span></span></code></pre> |
|
|
249 |
<div class="sourceCode" id="cb11"><pre class="downlit sourceCode r"> |
|
|
250 |
<code class="sourceCode R"><span><span class="va">eval</span></span></code></pre></div> |
|
|
251 |
<pre><code><span><span class="co">## [[1]]</span></span> |
|
|
252 |
<span><span class="co">## [[1]]$confusion_matrix</span></span> |
|
|
253 |
<span><span class="co">## Truth</span></span> |
|
|
254 |
<span><span class="co">## Prediction high_gc equal_dist</span></span> |
|
|
255 |
<span><span class="co">## high_gc 383 5</span></span> |
|
|
256 |
<span><span class="co">## equal_dist 16 394</span></span> |
|
|
257 |
<span><span class="co">## </span></span> |
|
|
258 |
<span><span class="co">## [[1]]$accuracy</span></span> |
|
|
259 |
<span><span class="co">## [1] 0.9736842</span></span> |
|
|
260 |
<span><span class="co">## </span></span> |
|
|
261 |
<span><span class="co">## [[1]]$categorical_crossentropy_loss</span></span> |
|
|
262 |
<span><span class="co">## [1] 0.1157783</span></span> |
|
|
263 |
<span><span class="co">## </span></span> |
|
|
264 |
<span><span class="co">## [[1]]$AUC</span></span> |
|
|
265 |
<span><span class="co">## [1] 0.9968593</span></span> |
|
|
266 |
<span><span class="co">## </span></span> |
|
|
267 |
<span><span class="co">## [[1]]$AUPRC</span></span> |
|
|
268 |
<span><span class="co">## [1] 0.9968503</span></span></code></pre> |
|
|
269 |
<p>We can check where our model made mistakes for the sequence with high |
|
|
270 |
GC content.</p> |
|
|
271 |
<div class="sourceCode" id="cb13"><pre class="downlit sourceCode r"> |
|
|
272 |
<code class="sourceCode R"><span><span class="va">high_gc_file</span> <span class="op"><-</span> <span class="fu">microseq</span><span class="fu">::</span><span class="fu"><a href="https://rdrr.io/pkg/microseq/man/readFasta.html" class="external-link">readFasta</a></span><span class="op">(</span><span class="fu"><a href="https://rdrr.io/r/base/list.files.html" class="external-link">list.files</a></span><span class="op">(</span><span class="va">val_1_dir</span>, full.names <span class="op">=</span> <span class="cn">TRUE</span><span class="op">)</span><span class="op">[</span><span class="fl">1</span><span class="op">]</span><span class="op">)</span></span> |
|
|
273 |
<span><span class="va">high_gc_seq</span> <span class="op"><-</span> <span class="va">high_gc_file</span><span class="op">$</span><span class="va">Sequence</span></span> |
|
|
274 |
<span></span> |
|
|
275 |
<span><span class="va">pred_high_gc</span> <span class="op"><-</span> <span class="fu"><a href="../reference/predict_model.html">predict_model</a></span><span class="op">(</span>model <span class="op">=</span> <span class="va">model</span>, </span> |
|
|
276 |
<span> sequence <span class="op">=</span> <span class="va">high_gc_seq</span>,</span> |
|
|
277 |
<span> filename <span class="op">=</span> <span class="cn">NULL</span>, </span> |
|
|
278 |
<span> step <span class="op">=</span> <span class="fl">25</span>,</span> |
|
|
279 |
<span> batch_size <span class="op">=</span> <span class="fl">512</span>,</span> |
|
|
280 |
<span> verbose <span class="op">=</span> <span class="cn">TRUE</span>,</span> |
|
|
281 |
<span> return_states <span class="op">=</span> <span class="cn">TRUE</span>,</span> |
|
|
282 |
<span> mode <span class="op">=</span> <span class="st">"label"</span><span class="op">)</span></span></code></pre></div> |
|
|
283 |
<pre><code><span><span class="co">## layer_name not specified. Using layer dense_1</span></span></code></pre> |
|
|
284 |
<pre><code><span><span class="co">## Computing output for model at layer dense_1 </span></span> |
|
|
285 |
<span><span class="co">## Model: "model_1"</span></span> |
|
|
286 |
<span><span class="co">## ________________________________________________________________________________</span></span> |
|
|
287 |
<span><span class="co">## Layer (type) Output Shape Param # </span></span> |
|
|
288 |
<span><span class="co">## ================================================================================</span></span> |
|
|
289 |
<span><span class="co">## input_1 (InputLayer) [(None, 50, 4)] 0 </span></span> |
|
|
290 |
<span><span class="co">## lstm (LSTM) (None, 16) 1344 </span></span> |
|
|
291 |
<span><span class="co">## dense (Dense) (None, 8) 136 </span></span> |
|
|
292 |
<span><span class="co">## dense_1 (Dense) (None, 2) 18 </span></span> |
|
|
293 |
<span><span class="co">## ================================================================================</span></span> |
|
|
294 |
<span><span class="co">## Total params: 1498 (5.85 KB)</span></span> |
|
|
295 |
<span><span class="co">## Trainable params: 1498 (5.85 KB)</span></span> |
|
|
296 |
<span><span class="co">## Non-trainable params: 0 (0.00 Byte)</span></span> |
|
|
297 |
<span><span class="co">## ________________________________________________________________________________</span></span></code></pre> |
|
|
298 |
<div class="sourceCode" id="cb16"><pre class="downlit sourceCode r"> |
|
|
299 |
<code class="sourceCode R"><span><span class="va">pred_df</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/cbind.html" class="external-link">cbind</a></span><span class="op">(</span><span class="va">pred_high_gc</span><span class="op">$</span><span class="va">states</span>, <span class="va">pred_high_gc</span><span class="op">$</span><span class="va">sample_end_position</span><span class="op">)</span> <span class="op"><a href="../reference/pipe.html">%>%</a></span> </span> |
|
|
300 |
<span> <span class="fu"><a href="https://rdrr.io/r/base/as.data.frame.html" class="external-link">as.data.frame</a></span><span class="op">(</span><span class="op">)</span></span> |
|
|
301 |
<span><span class="fu"><a href="https://rdrr.io/r/base/names.html" class="external-link">names</a></span><span class="op">(</span><span class="va">pred_df</span><span class="op">)</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"high_gc_conf"</span>, <span class="st">"equal_dist_conf"</span>, <span class="st">"sample_end_position"</span><span class="op">)</span></span> |
|
|
302 |
<span><span class="fu"><a href="https://rdrr.io/r/utils/head.html" class="external-link">head</a></span><span class="op">(</span><span class="va">pred_df</span><span class="op">)</span></span></code></pre></div> |
|
|
303 |
<pre><code><span><span class="co">## high_gc_conf equal_dist_conf sample_end_position</span></span> |
|
|
304 |
<span><span class="co">## 1 0.9330443 0.06695572 50</span></span> |
|
|
305 |
<span><span class="co">## 2 0.9602452 0.03975480 75</span></span> |
|
|
306 |
<span><span class="co">## 3 0.9642879 0.03571207 100</span></span> |
|
|
307 |
<span><span class="co">## 4 0.9596730 0.04032708 125</span></span> |
|
|
308 |
<span><span class="co">## 5 0.9617251 0.03827484 150</span></span> |
|
|
309 |
<span><span class="co">## 6 0.9666333 0.03336672 175</span></span></code></pre> |
|
|
310 |
<div class="sourceCode" id="cb18"><pre class="downlit sourceCode r"> |
|
|
311 |
<code class="sourceCode R"><span><span class="va">wrong_pred</span> <span class="op"><-</span> <span class="va">pred_df</span> <span class="op"><a href="../reference/pipe.html">%>%</a></span> <span class="fu">dplyr</span><span class="fu">::</span><span class="fu"><a href="https://dplyr.tidyverse.org/reference/filter.html" class="external-link">filter</a></span><span class="op">(</span><span class="va">high_gc_conf</span> <span class="op"><</span> <span class="fl">0.5</span><span class="op">)</span></span> |
|
|
312 |
<span><span class="va">wrong_pred</span></span></code></pre></div> |
|
|
313 |
<pre><code><span><span class="co">## high_gc_conf equal_dist_conf sample_end_position</span></span> |
|
|
314 |
<span><span class="co">## 1 0.13769490 0.8623052 675</span></span> |
|
|
315 |
<span><span class="co">## 2 0.08829107 0.9117089 800</span></span> |
|
|
316 |
<span><span class="co">## 3 0.15268661 0.8473134 1150</span></span> |
|
|
317 |
<span><span class="co">## 4 0.10348237 0.8965176 1475</span></span> |
|
|
318 |
<span><span class="co">## 5 0.10325063 0.8967494 1950</span></span> |
|
|
319 |
<span><span class="co">## 6 0.08819685 0.9118031 2700</span></span> |
|
|
320 |
<span><span class="co">## 7 0.08648270 0.9135173 3000</span></span> |
|
|
321 |
<span><span class="co">## 8 0.08025692 0.9197431 3700</span></span> |
|
|
322 |
<span><span class="co">## 9 0.10362279 0.8963772 4675</span></span> |
|
|
323 |
<span><span class="co">## 10 0.21332431 0.7866758 7550</span></span> |
|
|
324 |
<span><span class="co">## 11 0.14129026 0.8587097 7875</span></span> |
|
|
325 |
<span><span class="co">## 12 0.37184438 0.6281556 8200</span></span> |
|
|
326 |
<span><span class="co">## 13 0.07818010 0.9218199 8225</span></span> |
|
|
327 |
<span><span class="co">## 14 0.24804193 0.7519581 8425</span></span> |
|
|
328 |
<span><span class="co">## 15 0.30556497 0.6944351 9900</span></span> |
|
|
329 |
<span><span class="co">## 16 0.10370088 0.8962991 9925</span></span></code></pre> |
|
|
330 |
<div class="sourceCode" id="cb20"><pre class="downlit sourceCode r"> |
|
|
331 |
<code class="sourceCode R"><span><span class="kw">if</span> <span class="op">(</span><span class="fu"><a href="https://rdrr.io/r/base/nrow.html" class="external-link">nrow</a></span><span class="op">(</span><span class="va">wrong_pred</span><span class="op">)</span> <span class="op">==</span> <span class="fl">0</span><span class="op">)</span> <span class="op">{</span></span> |
|
|
332 |
<span> <span class="fu"><a href="https://rdrr.io/r/base/print.html" class="external-link">print</a></span><span class="op">(</span><span class="st">"All predictions for high GC content class correct"</span><span class="op">)</span></span> |
|
|
333 |
<span><span class="op">}</span> <span class="kw">else</span> <span class="op">{</span></span> |
|
|
334 |
<span> </span> |
|
|
335 |
<span> <span class="co"># extract samples where model was wrong</span></span> |
|
|
336 |
<span> <span class="va">wrong_pred_seq</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/vector.html" class="external-link">vector</a></span><span class="op">(</span><span class="st">"character"</span>, <span class="fu"><a href="https://rdrr.io/r/base/nrow.html" class="external-link">nrow</a></span><span class="op">(</span><span class="va">wrong_pred</span><span class="op">)</span><span class="op">)</span></span> |
|
|
337 |
<span> <span class="kw">for</span> <span class="op">(</span><span class="va">i</span> <span class="kw">in</span> <span class="fl">1</span><span class="op">:</span><span class="fu"><a href="https://rdrr.io/r/base/length.html" class="external-link">length</a></span><span class="op">(</span><span class="va">wrong_pred_seq</span><span class="op">)</span><span class="op">)</span> <span class="op">{</span></span> |
|
|
338 |
<span> <span class="va">sample_end</span> <span class="op"><-</span> <span class="va">wrong_pred</span><span class="op">$</span><span class="va">sample_end_position</span><span class="op">[</span><span class="va">i</span><span class="op">]</span></span> |
|
|
339 |
<span> <span class="va">sample_start</span> <span class="op"><-</span> <span class="va">sample_end</span> <span class="op">-</span> <span class="va">maxlen</span> <span class="op">+</span> <span class="fl">1</span></span> |
|
|
340 |
<span> <span class="va">wrong_pred_seq</span><span class="op">[</span><span class="va">i</span><span class="op">]</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/substr.html" class="external-link">substr</a></span><span class="op">(</span><span class="va">high_gc_seq</span>, <span class="va">sample_start</span>, <span class="va">sample_end</span><span class="op">)</span></span> |
|
|
341 |
<span> <span class="op">}</span></span> |
|
|
342 |
<span> </span> |
|
|
343 |
<span> <span class="va">wrong_pred_seq</span></span> |
|
|
344 |
<span><span class="op">}</span></span></code></pre></div> |
|
|
345 |
<pre><code><span><span class="co">## [1] "CTTAGAGACCTCGCCGCCACCGCCCGAGGTTCCGCTCCGGCGTCCCGCGG"</span></span> |
|
|
346 |
<span><span class="co">## [2] "CCCACTTCGTGTCTATGCCGGACACGCCTCGATAGGCGCAGGCGATGGGC"</span></span> |
|
|
347 |
<span><span class="co">## [3] "ACAGGAGAGACCCTCGGTTGCCGGCGACGCCGTGTCGTTGGTAGGCCCAC"</span></span> |
|
|
348 |
<span><span class="co">## [4] "GATAGCTCCACACCCACCTCAGCGTCCCGGGCCGCCGGCGTTCCGCCTGC"</span></span> |
|
|
349 |
<span><span class="co">## [5] "GCCCAACAAGGACGGTGAACTCCCCCGGGTACGGAAGAGGGTATGGCCGC"</span></span> |
|
|
350 |
<span><span class="co">## [6] "AGGAGTCCTCCTAGAGCTCATGGGTTGAGACGTGCCTCGACGCCCGACCT"</span></span> |
|
|
351 |
<span><span class="co">## [7] "CCCATTAGACCGTCCTGGCGGACACCCGTACGGGTGAGACCCTCCGGGTC"</span></span> |
|
|
352 |
<span><span class="co">## [8] "TGCTTATCATGGCCGCCCTGATGACGTGTCAGGGGGAGGACTGAGCGGGG"</span></span> |
|
|
353 |
<span><span class="co">## [9] "ATCCCGCATTCGCCGACGTCTCCACAGGAGGATCAGCGGGTCCGGGGCGA"</span></span> |
|
|
354 |
<span><span class="co">## [10] "TTTGCGCCCCCTAAGGCACAGCCGCGACCCCAGGTTGGGAACCGCCGAAC"</span></span> |
|
|
355 |
<span><span class="co">## [11] "CTACGGAACGTGGCTCCGAGCATCGGCGCATCGGCATGTGTCTGCCGGCG"</span></span> |
|
|
356 |
<span><span class="co">## [12] "GTCGGGCGGAGCGCCACCACCGAGGGGCGGGCCCTTCAATTCTATAAGCG"</span></span> |
|
|
357 |
<span><span class="co">## [13] "GGCGGGCCCTTCAATTCTATAAGCGACGCCGCCCTTGTCTGACGCTGGGC"</span></span> |
|
|
358 |
<span><span class="co">## [14] "CACCCTATGTAGCCCCCTGCCTCGCCGGCCAGCCTGGGCTGATCGGGGCC"</span></span> |
|
|
359 |
<span><span class="co">## [15] "TGGCCGTCGCGCTCCGGAGCCGTCACACCGGCGTACCTGTTATAAAGTCG"</span></span> |
|
|
360 |
<span><span class="co">## [16] "CACCGGCGTACCTGTTATAAAGTCGCCCGCGCTCCCCCGGGCGCACCACG"</span></span></code></pre> |
|
|
361 |
<p>We can check the nucleotide distribution of those sequences:</p> |
|
|
362 |
<div class="sourceCode" id="cb22"><pre class="downlit sourceCode r"> |
|
|
363 |
<code class="sourceCode R"><span><span class="va">l</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/list.html" class="external-link">list</a></span><span class="op">(</span><span class="op">)</span></span> |
|
|
364 |
<span><span class="kw">for</span> <span class="op">(</span><span class="va">i</span> <span class="kw">in</span> <span class="fl">1</span><span class="op">:</span><span class="fu"><a href="https://rdrr.io/r/base/length.html" class="external-link">length</a></span><span class="op">(</span><span class="va">wrong_pred_seq</span><span class="op">)</span><span class="op">)</span> <span class="op">{</span></span> |
|
|
365 |
<span> <span class="va">l</span><span class="op">[[</span><span class="va">i</span><span class="op">]</span><span class="op">]</span> <span class="op"><-</span> <span class="fu">stringr</span><span class="fu">::</span><span class="fu"><a href="https://stringr.tidyverse.org/reference/str_split.html" class="external-link">str_split</a></span><span class="op">(</span><span class="va">wrong_pred_seq</span><span class="op">[</span><span class="va">i</span><span class="op">]</span>, <span class="st">""</span><span class="op">)</span> <span class="op"><a href="../reference/pipe.html">%>%</a></span> <span class="fu"><a href="https://rdrr.io/r/base/table.html" class="external-link">table</a></span><span class="op">(</span><span class="op">)</span> <span class="op"><a href="../reference/pipe.html">%>%</a></span> <span class="fu"><a href="https://rdrr.io/r/base/proportions.html" class="external-link">prop.table</a></span><span class="op">(</span><span class="op">)</span> <span class="op"><a href="../reference/pipe.html">%>%</a></span> <span class="fu"><a href="https://rdrr.io/r/base/t.html" class="external-link">t</a></span><span class="op">(</span><span class="op">)</span> <span class="op"><a href="../reference/pipe.html">%>%</a></span> <span class="fu"><a href="https://rdrr.io/r/base/matrix.html" class="external-link">as.matrix</a></span><span class="op">(</span><span class="op">)</span></span> |
|
|
366 |
<span><span class="op">}</span></span> |
|
|
367 |
<span><span class="va">dist_matrix</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/do.call.html" class="external-link">do.call</a></span><span class="op">(</span><span class="va">rbind</span>, <span class="va">l</span><span class="op">)</span></span> |
|
|
368 |
<span><span class="va">dist_matrix</span></span></code></pre></div> |
|
|
369 |
<pre><code><span><span class="co">## A C G T</span></span> |
|
|
370 |
<span><span class="co">## [1,] 0.10 0.46 0.30 0.14</span></span> |
|
|
371 |
<span><span class="co">## [2,] 0.16 0.34 0.32 0.18</span></span> |
|
|
372 |
<span><span class="co">## [3,] 0.16 0.32 0.36 0.16</span></span> |
|
|
373 |
<span><span class="co">## [4,] 0.12 0.48 0.26 0.14</span></span> |
|
|
374 |
<span><span class="co">## [5,] 0.24 0.30 0.36 0.10</span></span> |
|
|
375 |
<span><span class="co">## [6,] 0.18 0.32 0.30 0.20</span></span> |
|
|
376 |
<span><span class="co">## [7,] 0.16 0.38 0.30 0.16</span></span> |
|
|
377 |
<span><span class="co">## [8,] 0.16 0.22 0.42 0.20</span></span> |
|
|
378 |
<span><span class="co">## [9,] 0.18 0.34 0.34 0.14</span></span> |
|
|
379 |
<span><span class="co">## [10,] 0.20 0.40 0.28 0.12</span></span> |
|
|
380 |
<span><span class="co">## [11,] 0.14 0.32 0.36 0.18</span></span> |
|
|
381 |
<span><span class="co">## [12,] 0.18 0.32 0.36 0.14</span></span> |
|
|
382 |
<span><span class="co">## [13,] 0.14 0.34 0.30 0.22</span></span> |
|
|
383 |
<span><span class="co">## [14,] 0.10 0.44 0.30 0.16</span></span> |
|
|
384 |
<span><span class="co">## [15,] 0.16 0.34 0.30 0.20</span></span> |
|
|
385 |
<span><span class="co">## [16,] 0.16 0.44 0.26 0.14</span></span></code></pre> |
|
|
386 |
<div class="sourceCode" id="cb24"><pre class="downlit sourceCode r"> |
|
|
387 |
<code class="sourceCode R"><span><span class="va">df</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/data.frame.html" class="external-link">data.frame</a></span><span class="op">(</span>distribution <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/vector.html" class="external-link">as.vector</a></span><span class="op">(</span><span class="va">dist_matrix</span><span class="op">)</span>,</span> |
|
|
388 |
<span> nt <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/factor.html" class="external-link">factor</a></span><span class="op">(</span><span class="fu"><a href="https://rdrr.io/r/base/rep.html" class="external-link">rep</a></span><span class="op">(</span><span class="va">vocabulary</span>, each <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/nrow.html" class="external-link">nrow</a></span><span class="op">(</span><span class="va">dist_matrix</span><span class="op">)</span><span class="op">)</span><span class="op">)</span>,</span> |
|
|
389 |
<span> sample_id <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/rep.html" class="external-link">rep</a></span><span class="op">(</span><span class="fl">1</span><span class="op">:</span><span class="fu"><a href="https://rdrr.io/r/base/nrow.html" class="external-link">nrow</a></span><span class="op">(</span><span class="va">dist_matrix</span><span class="op">)</span>, <span class="fl">4</span><span class="op">)</span><span class="op">)</span></span> |
|
|
390 |
<span></span> |
|
|
391 |
<span><span class="fu">ggplot</span><span class="op">(</span><span class="va">df</span>, <span class="fu">aes</span><span class="op">(</span>fill<span class="op">=</span><span class="va">nt</span>, y<span class="op">=</span><span class="va">distribution</span>, x<span class="op">=</span><span class="va">nt</span><span class="op">)</span><span class="op">)</span> <span class="op">+</span> </span> |
|
|
392 |
<span> <span class="fu">geom_bar</span><span class="op">(</span>position<span class="op">=</span><span class="st">"dodge"</span>, stat<span class="op">=</span><span class="st">"identity"</span><span class="op">)</span> <span class="op">+</span> <span class="fu">facet_wrap</span><span class="op">(</span><span class="op">~</span><span class="va">sample_id</span><span class="op">)</span></span></code></pre></div> |
|
|
393 |
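<p><img src="getting_started_files/figure-html/unnamed-chunk-10-1.png" alt="Bar charts of the nucleotide distribution (A, C, G, T) for each misclassified sample, one panel per sample" width="700"></p> |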
|
|
394 |
<p>Finally, we may want to aggregate all predictions we made for the |
|
|
395 |
sequence. We can do this using the <code>summarize_states</code> |
|
|
396 |
function. The function returns the mean confidence, the maximum |
|
|
397 |
prediction and the vote percentages (percentage of predictions per |
|
|
398 |
class).</p> |
|
|
399 |
<div class="sourceCode" id="cb25"><pre class="downlit sourceCode r"> |
|
|
400 |
<code class="sourceCode R"><span><span class="va">label_names</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"high_gc"</span>, <span class="st">"equal_dist"</span><span class="op">)</span></span> |
|
|
401 |
<span><span class="va">pred_summary</span> <span class="op"><-</span> <span class="fu"><a href="../reference/summarize_states.html">summarize_states</a></span><span class="op">(</span>label_names <span class="op">=</span> <span class="va">label_names</span>, df <span class="op">=</span> <span class="va">pred_df</span><span class="op">[</span>, <span class="fl">1</span><span class="op">:</span><span class="fl">2</span><span class="op">]</span><span class="op">)</span></span> |
|
|
402 |
<span><span class="fu"><a href="https://rdrr.io/r/base/print.html" class="external-link">print</a></span><span class="op">(</span><span class="va">pred_summary</span><span class="op">)</span></span></code></pre></div> |
|
|
403 |
<pre><code><span><span class="co">## file_name mean_conf_high_gc mean_conf_equal_dist max_conf_high_gc</span></span> |
|
|
404 |
<span><span class="co">## <lgcl> <num> <num> <num></span></span> |
|
|
405 |
<span><span class="co">## 1: NA 0.9148641 0.08513589 0.9714182</span></span> |
|
|
406 |
<span><span class="co">## max_conf_equal_dist vote_perc_high_gc vote_perc_equal_dist mean_prediction</span></span> |
|
|
407 |
<span><span class="co">## <num> <num> <num> <char></span></span> |
|
|
408 |
<span><span class="co">## 1: 0.9218199 0.9598997 0.04010025 high_gc</span></span> |
|
|
409 |
<span><span class="co">## max_prediction vote_prediction num_prediction</span></span> |
|
|
410 |
<span><span class="co">## <char> <char> <int></span></span> |
|
|
411 |
<span><span class="co">## 1: high_gc high_gc 399</span></span></code></pre> |
|
|
412 |
</div> |
|
|
413 |
</div> |
|
|
414 |
</main><aside class="col-md-3"><nav id="toc"><h2>On this page</h2> |
|
|
415 |
</nav></aside> |
|
|
416 |
</div> |
|
|
417 |
|
|
|
418 |
|
|
|
419 |
|
|
|
420 |
<footer><div class="pkgdown-footer-left"> |
|
|
421 |
<p>Developed by Philipp Münch, René Mreches, Martin Binder, Hüseyin Anil Gündüz, Xiao-Yin To, Alice McHardy.</p> |
|
|
422 |
</div> |
|
|
423 |
|
|
|
424 |
<div class="pkgdown-footer-right"> |
|
|
425 |
<p>Site built with <a href="https://pkgdown.r-lib.org/" class="external-link">pkgdown</a> 2.0.9.</p> |
|
|
426 |
</div> |
|
|
427 |
|
|
|
428 |
</footer> |
|
|
429 |
</div> |
|
|
430 |
|
|
|
431 |
|
|
|
432 |
|
|
|
433 |
|
|
|
434 |
|
|
|
435 |
</body> |
|
|
436 |
</html> |