---
title: "Getting started"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Getting started}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, echo=FALSE, warning=FALSE, message=FALSE}
if (!reticulate::py_module_available("tensorflow")) {
  knitr::opts_chunk$set(eval = FALSE)
} else {
  knitr::opts_chunk$set(eval = TRUE)
}
```

```{r, message=FALSE}
library(deepG)
library(keras)
library(magrittr)
library(ggplot2)
```

```{r, echo=FALSE, warning=FALSE, message=FALSE}
options(rmarkdown.html_vignette.check_title = FALSE)
```

```{css, echo=FALSE}
mark.in {
  background-color: CornflowerBlue;
}

mark.out {
  background-color: IndianRed;
}
```

## Introduction

The goal of the deepG package is to speed up the development of bioinformatics tools for sequence classification, homology detection and other bioinformatics tasks.
The package offers several functions for

+ Data (pre-)processing
+ Deep learning architectures
+ Model training
+ Model evaluation
+ Visualizing training progress

### Create dummy data

We create two simple dummy training and validation data sets. Both consist of random <tt>ACGT</tt> sequences, but in the first category <tt>G</tt> and <tt>C</tt> are each drawn with a probability of 40%, while the second category uses equal probabilities for all four nucleotides. The first category therefore has around 80% <tt>GC</tt> content and the second around 50%.

```{r warning = FALSE}
set.seed(123)
vocabulary <- c("A", "C", "G", "T")

data_type <- c("train_1", "train_2", "val_1", "val_2")

for (i in seq_along(data_type)) {
  
  # create a directory for each data set and store its path in a
  # variable called train_1_dir, train_2_dir, val_1_dir or val_2_dir
  temp_file <- tempfile()
  assign(paste0(data_type[i], "_dir"), temp_file)
  dir.create(temp_file)
  
  # odd entries use the GC-rich distribution, even entries the uniform one
  if (i %% 2 == 1) {
    header <- "high_gc"
    prob <- c(0.1, 0.4, 0.4, 0.1)
  } else {
    header <- "equal_dist"
    prob <- rep(0.25, 4)
  }
  
  fasta_name_start <- paste0(header, "_", data_type[i], "file")
  
  create_dummy_data(file_path = temp_file,
                    num_files = 1,
                    seq_length = 10000,
                    num_seq = 1,
                    header = header,
                    prob = prob,
                    fasta_name_start = fasta_name_start,
                    vocabulary = vocabulary)
  
}
```
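
As a quick sanity check (plain base R, not specific to deepG), we can confirm the GC content implied by the two probability vectors; probabilities are given in the order A, C, G, T:

```{r}
prob_high_gc <- c(0.1, 0.4, 0.4, 0.1)
prob_equal <- rep(0.25, 4)
# expected GC content = probability of drawing C plus probability of drawing G
sum(prob_high_gc[c(2, 3)]) # 0.8 for the first category
sum(prob_equal[c(2, 3)])   # 0.5 for the second category
```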

### Training

We can now train a model that differentiates between the two categories. First, we create our network architecture. We use an input size of 50 nucleotides. The model has one LSTM layer with 16 cells and two dense layers with 8 and 2 neurons.

```{r warning = FALSE}
maxlen <- 50
model <- create_model_lstm_cnn(maxlen = maxlen,
                               layer_lstm = 16,
                               layer_dense = c(8, 2))
```
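
To double-check the architecture, we can print the usual Keras model summary (a quick check, assuming `create_model_lstm_cnn` returns a standard Keras model object):

```{r warning = FALSE}
# print layer types, output shapes and parameter counts
summary(model)
```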

Next, we can train the model using the `train_model` function, which internally builds a data generator for training.

```{r warning = FALSE}
hist <- train_model(model,
                    train_type = "label_folder",
                    run_name = "gc_model_1",
                    path = c(train_1_dir, train_2_dir),
                    path_val = c(val_1_dir, val_2_dir),
                    epochs = 4,
                    steps_per_epoch = 25, # one epoch = 25 batches
                    batch_size = 64,
                    step = 50, # take a sample every 50 nt
                    vocabulary_label = c("high_gc", "equal_dist"))

plot(hist)
```
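
Beyond the plot, we can inspect the raw per-epoch metric values directly; assuming `hist` is a standard Keras history object, they live in its `metrics` field:

```{r warning = FALSE}
# named list with per-epoch loss and accuracy for training and validation
str(hist$metrics)
```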

### Evaluation

We can now evaluate the trained model on all the validation data.

```{r warning = FALSE}
eval <- evaluate_model(path_input = c(val_1_dir, val_2_dir),
                       model = model,
                       batch_size = 100,
                       step = 25, # take a sample every 25 nt
                       vocabulary_label = list(c("high_gc", "equal_dist")),
                       mode = "label_folder",
                       evaluate_all_files = TRUE,
                       verbose = FALSE,
                       auc = TRUE,
                       auprc = TRUE)
eval
```

We can check where our model made mistakes for the sequence with high GC content.

```{r warning = FALSE}
high_gc_file <- microseq::readFasta(list.files(val_1_dir, full.names = TRUE)[1])
high_gc_seq <- high_gc_file$Sequence

pred_high_gc <- predict_model(model = model,
                              sequence = high_gc_seq,
                              filename = NULL,
                              step = 25,
                              batch_size = 512,
                              verbose = TRUE,
                              return_states = TRUE,
                              mode = "label")

pred_df <- cbind(pred_high_gc$states, pred_high_gc$sample_end_position) %>%
  as.data.frame()
names(pred_df) <- c("high_gc_conf", "equal_dist_conf", "sample_end_position")
head(pred_df)

wrong_pred <- pred_df %>% dplyr::filter(high_gc_conf < 0.5)
wrong_pred

if (nrow(wrong_pred) == 0) {
  print("All predictions for high GC content class correct")
} else {
  
  # extract samples where model was wrong
  wrong_pred_seq <- vector("character", nrow(wrong_pred))
  for (i in seq_along(wrong_pred_seq)) {
    sample_end <- wrong_pred$sample_end_position[i]
    sample_start <- sample_end - maxlen + 1
    wrong_pred_seq[i] <- substr(high_gc_seq, sample_start, sample_end)
  }
  
  wrong_pred_seq
}
```
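
We can also compute the overall fraction of correctly classified samples from `pred_df` (a simple base R check, using the same 0.5 threshold as above):

```{r warning = FALSE}
# proportion of samples whose high-GC confidence is at least 0.5
mean(pred_df$high_gc_conf >= 0.5)
```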

We can check the nucleotide distribution of those sequences.

```{r warning = FALSE}
# assumes at least one misclassified sample from the previous chunk
l <- list()
for (i in seq_along(wrong_pred_seq)) {
  l[[i]] <- stringr::str_split(wrong_pred_seq[i], "") %>%
    table() %>% prop.table() %>% t() %>% as.matrix()
}
dist_matrix <- do.call(rbind, l)
dist_matrix

df <- data.frame(distribution = as.vector(dist_matrix),
                 nt = factor(rep(vocabulary, each = nrow(dist_matrix))),
                 sample_id = rep(1:nrow(dist_matrix), 4))

ggplot(df, aes(fill = nt, y = distribution, x = nt)) +
  geom_bar(position = "dodge", stat = "identity") + facet_wrap(~sample_id)
```
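
The same matrix gives us the GC content of each misclassified sample directly (a hypothetical follow-up, assuming every sample contains at least one <tt>C</tt> and one <tt>G</tt> so that both columns exist in `dist_matrix`):

```{r warning = FALSE}
# per-sample GC content = proportion of C plus proportion of G
rowSums(dist_matrix[, c("C", "G"), drop = FALSE])
```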

Finally, we may want to aggregate all the predictions we made for the sequence.
We can do this using the `summarize_states` function. The function returns
the mean confidence, the maximum prediction and the vote percentages (the percentage of predictions per class).

```{r warning = FALSE}
label_names <- c("high_gc", "equal_dist")
pred_summary <- summarize_states(label_names = label_names, df = pred_df[, 1:2])
print(pred_summary)
```
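
To see what the vote percentages correspond to, we can recompute them manually (a sketch of our reading of the "vote" aggregation, where each sample votes for its highest-confidence class):

```{r warning = FALSE}
# class with the highest confidence per sample
votes <- apply(pred_df[, 1:2], 1, which.max)
# fraction of votes per class
prop.table(table(factor(label_names[votes], levels = label_names)))
```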