Switch to side-by-side view

--- a
+++ b/vignettes/getting_started.Rmd
@@ -0,0 +1,206 @@
+---
+title: "Getting started"
+output: rmarkdown::html_vignette
+vignette: >
+  %\VignetteIndexEntry{Getting started}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+---
+
+```{r, echo=FALSE, warning=FALSE, message=FALSE}
+
+if (!reticulate::py_module_available("tensorflow")) {
+  knitr::opts_chunk$set(eval = FALSE)
+} else {
+  knitr::opts_chunk$set(eval = TRUE)
+}
+```
+
+```{r, message=FALSE}
+library(deepG)
+library(keras)
+library(magrittr)
+library(ggplot2)
+```
+
+```{r, echo=FALSE, warning=FALSE, message=FALSE}
+options(rmarkdown.html_vignette.check_title = FALSE)
+```
+
+
+
+```{css, echo=FALSE}
+mark.in {
+  background-color: CornflowerBlue;
+}
+
+mark.out {
+  background-color: IndianRed;
+}
+
+```
+
+## Introduction 
+
+The goal of the deepG package is to speed up the development of bioinformatical tools for sequence classification, homology detection and other bioinformatical tasks. 
+The package offers several functions for
+
++ Data (pre-) processing
++ Deep learning architectures
++ Model training 
++ Model evaluation
++ Visualizing training progress
+
+
+### Create dummy data
+
+We create two simple dummy training and validation data sets. Both consist of random <tt>ACGT</tt> sequences but the first category has 
+a probability of 40% each for drawing <tt>G</tt> or <tt>C</tt> and the second has equal probability for each nucleotide (first category has around 80% <tt>GC</tt> content and second one around 50%).   
+
+```{r warning = FALSE}
+set.seed(123)
+vocabulary <- c("A", "C", "G", "T")
+
+data_type <- c("train_1", "train_2", "val_1", "val_2")
+
+for (i in 1:length(data_type)) {
+  
+  temp_file <- tempfile()
+  assign(paste0(data_type[i], "_dir"), temp_file)
+  dir.create(temp_file)
+  
+  if (i %% 2 == 1) {
+    header <- "high_gc"
+    prob <- c(0.1, 0.4, 0.4, 0.1)
+  } else {
+    header <- "equal_dist"
+    prob <- rep(0.25, 4)
+  }
+  
+  fasta_name_start <- paste0(header, "_", data_type[i], "file")
+  
+  create_dummy_data(file_path = temp_file,
+                    num_files = 1,
+                    seq_length = 10000, 
+                    num_seq = 1,
+                    header = header,
+                    prob = prob,
+                    fasta_name_start = fasta_name_start,
+                    vocabulary = vocabulary)
+  
+}
+```
+
+### Training
+
+We can now train a model that can differentiate between the two categories. First, we can create our network architecture. We take an input size of 50 nucleotides. The model has one lstm layer with 16 cells and two dense layers with 8 and 2 neurons.
+
+```{r warning = FALSE}
+maxlen <- 50
+model <- create_model_lstm_cnn(maxlen = maxlen,
+                               layer_lstm = 16,
+                               layer_dense = c(8, 2))
+```
+
+Next we can train the model using the `train_model` function. Function will internally build a data generator for training. 
+
+```{r warning = FALSE}
+hist <- train_model(model,
+                    train_type = "label_folder",
+                    run_name = "gc_model_1",
+                    path = c(train_1_dir, train_2_dir),
+                    path_val = c(val_1_dir, val_2_dir),
+                    epochs = 4,
+                    steps_per_epoch = 25, # one epoch = 25 batches
+                    batch_size = 64,
+                    step = 50, # take a sample every 50 nt
+                    vocabulary_label = c("high_gc", "equal_dist"))
+                    
+plot(hist)
+```
+
+### Evaluation 
+
+We can now evaluate the trained model on all the validation data
+
+```{r warning = FALSE}
+eval <- evaluate_model(path_input = c(val_1_dir, val_2_dir),
+                       model = model,
+                       batch_size = 100,
+                       step = 25, # take a sample every 25 nt 
+                       vocabulary_label = list(c("high_gc", "equal_dist")),
+                       mode = "label_folder",
+                       evaluate_all_files = TRUE,
+                       verbose = FALSE,
+                       auc = TRUE,
+                       auprc = TRUE)
+eval
+```
+
+We can check where our model made mistakes for the sequence with high GC content.
+
+```{r warning = FALSE}
+high_gc_file <- microseq::readFasta(list.files(val_1_dir, full.names = TRUE)[1])
+high_gc_seq <- high_gc_file$Sequence
+
+pred_high_gc <- predict_model(model = model, 
+                              sequence = high_gc_seq,
+                              filename = NULL, 
+                              step = 25,
+                              batch_size = 512,
+                              verbose = TRUE,
+                              return_states = TRUE,
+                              mode = "label")
+
+pred_df <- cbind(pred_high_gc$states, pred_high_gc$sample_end_position) %>% 
+  as.data.frame()
+names(pred_df) <- c("high_gc_conf", "equal_dist_conf", "sample_end_position")
+head(pred_df)
+
+wrong_pred <- pred_df %>% dplyr::filter(high_gc_conf < 0.5)
+wrong_pred
+
+if (nrow(wrong_pred) == 0) {
+  print("All predictions for high GC content class correct")
+} else {
+  
+  # extract samples where model was wrong
+  wrong_pred_seq <- vector("character", nrow(wrong_pred))
+  for (i in 1:length(wrong_pred_seq)) {
+    sample_end <- wrong_pred$sample_end_position[i]
+    sample_start <- sample_end - maxlen + 1
+    wrong_pred_seq[i] <- substr(high_gc_seq, sample_start, sample_end)
+  }
+  
+  wrong_pred_seq
+}
+```
+
+We can check the nucleotide distribution of those sequences
+
+```{r warning = FALSE}
+l <- list()
+for (i in 1:length(wrong_pred_seq)) {
+  l[[i]] <- stringr::str_split(wrong_pred_seq[i], "") %>% table() %>% prop.table() %>% t() %>% as.matrix()
+}
+dist_matrix <- do.call(rbind, l)
+dist_matrix
+
+df <- data.frame(distribution = as.vector(dist_matrix),
+                 nt = factor(rep(vocabulary, each = nrow(dist_matrix))),
+                 sample_id = rep(1:nrow(dist_matrix), 4))
+
+ggplot(df, aes(fill=nt, y=distribution, x=nt)) + 
+    geom_bar(position="dodge", stat="identity")  + facet_wrap(~sample_id)
+```
+  
+Finally, we may want to aggregate all predictions, we made for the sequence.
+We can do this using the `summarize_states` function. The function returns 
+the mean confidence, the maximum prediction and the vote percentages (percentage of predictions per class).
+
+```{r warning = FALSE}
+label_names <- c("high_gc", "equal_dist")
+pred_summary <- summarize_states(label_names = label_names, df = pred_df[, 1:2])
+print(pred_summary)
+```
+