Introduction
The goal of the deepG package is to speed up the development of bioinformatics tools for sequence classification, homology detection and other bioinformatics tasks. The package offers several functions for:
- Data (pre-) processing
- Deep learning architectures
- Model training
- Model evaluation
- Visualizing training progress
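The code below assumes that, besides deepG itself, the magrittr pipe and ggplot2 are attached (microseq and stringr are only called with explicit namespaces later on):
library(deepG)
library(magrittr) # provides the %>% pipe used below
library(ggplot2)  # used to plot nucleotide distributions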
Create dummy data
We create two simple dummy training and validation data sets. Both consist of random ACGT sequences, but in the first category G and C are each drawn with a probability of 40%, while the second category uses equal probabilities for all four nucleotides (so the first category has around 80% GC content and the second around 50%).
set.seed(123)
vocabulary <- c("A", "C", "G", "T")
data_type <- c("train_1", "train_2", "val_1", "val_2")
for (i in 1:length(data_type)) {
  # create a separate directory for each data set and store its path
  # in train_1_dir, train_2_dir, val_1_dir or val_2_dir
  temp_file <- tempfile()
  assign(paste0(data_type[i], "_dir"), temp_file)
  dir.create(temp_file)
  # odd iterations create the GC-rich class, even iterations the uniform class
  if (i %% 2 == 1) {
    header <- "high_gc"
    prob <- c(0.1, 0.4, 0.4, 0.1) # probabilities for A, C, G, T
  } else {
    header <- "equal_dist"
    prob <- rep(0.25, 4)
  }
  fasta_name_start <- paste0(header, "_", data_type[i], "file")
  create_dummy_data(file_path = temp_file,
                    num_files = 1,
                    seq_length = 10000,
                    num_seq = 1,
                    header = header,
                    prob = prob,
                    fasta_name_start = fasta_name_start,
                    vocabulary = vocabulary)
}
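As a quick sanity check, we can read back one of the generated FASTA files and confirm its GC content; this is just an illustrative snippet using microseq (which is also used in the evaluation section below):
# read the single training file of the high-GC class and compute its GC content
train_fasta <- microseq::readFasta(list.files(train_1_dir, full.names = TRUE)[1])
nt <- strsplit(train_fasta$Sequence[1], "")[[1]]
mean(nt %in% c("G", "C")) # should be close to 0.8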
Training
We can now train a model that can differentiate between the two categories. First, we create the network architecture, taking an input size of 50 nucleotides. The model has one LSTM layer with 16 cells and two dense layers with 8 and 2 neurons.
maxlen <- 50
model <- create_model_lstm_cnn(maxlen = maxlen,
                               layer_lstm = 16,
                               layer_dense = c(8, 2))
## Model: "model"
## _________________________________________________________________
## Layer (type) Output Shape Param #
## =================================================================
## input_1 (InputLayer) [(None, 50, 4)] 0
##
## lstm (LSTM) (None, 16) 1344
##
## dense (Dense) (None, 8) 136
##
## dense_1 (Dense) (None, 2) 18
##
## =================================================================
## Total params: 1498 (5.85 KB)
## Trainable params: 1498 (5.85 KB)
## Non-trainable params: 0 (0.00 Byte)
## _________________________________________________________________
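The input shape (None, 50, 4) reflects that each sample is a window of 50 nucleotides, one-hot encoded over the four-letter vocabulary. The following base-R sketch illustrates this encoding (it is not deepG's internal generator code, just an illustration):
one_hot <- function(seq, vocabulary = c("A", "C", "G", "T")) {
  nt <- strsplit(seq, "")[[1]]
  m <- matrix(0L, nrow = length(nt), ncol = length(vocabulary),
              dimnames = list(NULL, vocabulary))
  m[cbind(seq_along(nt), match(nt, vocabulary))] <- 1L
  m
}
one_hot("ACGT") # 4 x 4 matrix with a single 1 per row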
Next, we can train the model using the train_model function. The function internally builds a data generator for training.
hist <- train_model(model,
                    train_type = "label_folder",
                    run_name = "gc_model_1",
                    path = c(train_1_dir, train_2_dir),
                    path_val = c(val_1_dir, val_2_dir),
                    epochs = 4,
                    steps_per_epoch = 25, # one epoch = 25 batches
                    batch_size = 64,
                    step = 50, # take a sample every 50 nt
                    vocabulary_label = c("high_gc", "equal_dist"))
## Epoch 1/4
## 25/25 [==============================] - 2s 31ms/step - loss: 0.6813 - acc: 0.6256 - val_loss: 0.6511 - val_acc: 0.7563 - lr: 0.0010
## Epoch 2/4
## 25/25 [==============================] - 1s 21ms/step - loss: 0.5590 - acc: 0.8556 - val_loss: 0.3910 - val_acc: 0.9719 - lr: 0.0010
## Epoch 3/4
## 25/25 [==============================] - 1s 21ms/step - loss: 0.2533 - acc: 0.9706 - val_loss: 0.1765 - val_acc: 0.9719 - lr: 0.0010
## Epoch 4/4
## 25/25 [==============================] - 1s 21ms/step - loss: 0.1225 - acc: 0.9850 - val_loss: 0.0992 - val_acc: 0.9812 - lr: 0.0010
## Training done.
plot(hist)
Evaluation
We can now evaluate the trained model on all the validation data.
eval <- evaluate_model(path_input = c(val_1_dir, val_2_dir),
                       model = model,
                       batch_size = 100,
                       step = 25, # take a sample every 25 nt
                       vocabulary_label = list(c("high_gc", "equal_dist")),
                       mode = "label_folder",
                       evaluate_all_files = TRUE,
                       verbose = FALSE,
                       auc = TRUE,
                       auprc = TRUE)
## Evaluate 399 samples for class high_gc.
## Evaluate 399 samples for class equal_dist.
eval
## [[1]]
## [[1]]$confusion_matrix
## Truth
## Prediction high_gc equal_dist
## high_gc 383 5
## equal_dist 16 394
##
## [[1]]$accuracy
## [1] 0.9736842
##
## [[1]]$categorical_crossentropy_loss
## [1] 0.1157783
##
## [[1]]$AUC
## [1] 0.9968593
##
## [[1]]$AUPRC
## [1] 0.9968503
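The reported accuracy can be reproduced directly from the confusion matrix (assuming eval[[1]]$confusion_matrix is the 2 x 2 table printed above):
cm <- eval[[1]]$confusion_matrix
sum(diag(cm)) / sum(cm) # (383 + 394) / 798 = 0.9736842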
We can check where our model made mistakes for the sequence with high GC content.
high_gc_file <- microseq::readFasta(list.files(val_1_dir, full.names = TRUE)[1])
high_gc_seq <- high_gc_file$Sequence
pred_high_gc <- predict_model(model = model,
                              sequence = high_gc_seq,
                              filename = NULL,
                              step = 25,
                              batch_size = 512,
                              verbose = TRUE,
                              return_states = TRUE,
                              mode = "label")
## layer_name not specified. Using layer dense_1
## Computing output for model at layer dense_1
## Model: "model_1"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## input_1 (InputLayer) [(None, 50, 4)] 0
## lstm (LSTM) (None, 16) 1344
## dense (Dense) (None, 8) 136
## dense_1 (Dense) (None, 2) 18
## ================================================================================
## Total params: 1498 (5.85 KB)
## Trainable params: 1498 (5.85 KB)
## Non-trainable params: 0 (0.00 Byte)
## ________________________________________________________________________________
pred_df <- cbind(pred_high_gc$states, pred_high_gc$sample_end_position) %>%
  as.data.frame()
names(pred_df) <- c("high_gc_conf", "equal_dist_conf", "sample_end_position")
head(pred_df)
## high_gc_conf equal_dist_conf sample_end_position
## 1 0.9330443 0.06695572 50
## 2 0.9602452 0.03975480 75
## 3 0.9642879 0.03571207 100
## 4 0.9596730 0.04032708 125
## 5 0.9617251 0.03827484 150
## 6 0.9666333 0.03336672 175
Misclassified windows are those where the model assigns less than 50% confidence to the true high_gc class; we extract those rows first.
wrong_pred <- pred_df %>% dplyr::filter(high_gc_conf < 0.5)
wrong_pred
## high_gc_conf equal_dist_conf sample_end_position
## 1 0.13769490 0.8623052 675
## 2 0.08829107 0.9117089 800
## 3 0.15268661 0.8473134 1150
## 4 0.10348237 0.8965176 1475
## 5 0.10325063 0.8967494 1950
## 6 0.08819685 0.9118031 2700
## 7 0.08648270 0.9135173 3000
## 8 0.08025692 0.9197431 3700
## 9 0.10362279 0.8963772 4675
## 10 0.21332431 0.7866758 7550
## 11 0.14129026 0.8587097 7875
## 12 0.37184438 0.6281556 8200
## 13 0.07818010 0.9218199 8225
## 14 0.24804193 0.7519581 8425
## 15 0.30556497 0.6944351 9900
## 16 0.10370088 0.8962991 9925
if (nrow(wrong_pred) == 0) {
  print("All predictions for high GC content class correct")
} else {
  # extract samples where model was wrong
  wrong_pred_seq <- vector("character", nrow(wrong_pred))
  for (i in 1:length(wrong_pred_seq)) {
    sample_end <- wrong_pred$sample_end_position[i]
    sample_start <- sample_end - maxlen + 1
    wrong_pred_seq[i] <- substr(high_gc_seq, sample_start, sample_end)
  }
  wrong_pred_seq
}
## [1] "CTTAGAGACCTCGCCGCCACCGCCCGAGGTTCCGCTCCGGCGTCCCGCGG"
## [2] "CCCACTTCGTGTCTATGCCGGACACGCCTCGATAGGCGCAGGCGATGGGC"
## [3] "ACAGGAGAGACCCTCGGTTGCCGGCGACGCCGTGTCGTTGGTAGGCCCAC"
## [4] "GATAGCTCCACACCCACCTCAGCGTCCCGGGCCGCCGGCGTTCCGCCTGC"
## [5] "GCCCAACAAGGACGGTGAACTCCCCCGGGTACGGAAGAGGGTATGGCCGC"
## [6] "AGGAGTCCTCCTAGAGCTCATGGGTTGAGACGTGCCTCGACGCCCGACCT"
## [7] "CCCATTAGACCGTCCTGGCGGACACCCGTACGGGTGAGACCCTCCGGGTC"
## [8] "TGCTTATCATGGCCGCCCTGATGACGTGTCAGGGGGAGGACTGAGCGGGG"
## [9] "ATCCCGCATTCGCCGACGTCTCCACAGGAGGATCAGCGGGTCCGGGGCGA"
## [10] "TTTGCGCCCCCTAAGGCACAGCCGCGACCCCAGGTTGGGAACCGCCGAAC"
## [11] "CTACGGAACGTGGCTCCGAGCATCGGCGCATCGGCATGTGTCTGCCGGCG"
## [12] "GTCGGGCGGAGCGCCACCACCGAGGGGCGGGCCCTTCAATTCTATAAGCG"
## [13] "GGCGGGCCCTTCAATTCTATAAGCGACGCCGCCCTTGTCTGACGCTGGGC"
## [14] "CACCCTATGTAGCCCCCTGCCTCGCCGGCCAGCCTGGGCTGATCGGGGCC"
## [15] "TGGCCGTCGCGCTCCGGAGCCGTCACACCGGCGTACCTGTTATAAAGTCG"
## [16] "CACCGGCGTACCTGTTATAAAGTCGCCCGCGCTCCCCCGGGCGCACCACG"
We can check the nucleotide distribution of those sequences.
l <- list()
for (i in 1:length(wrong_pred_seq)) {
  # proportion of each nucleotide in the misclassified window
  l[[i]] <- stringr::str_split(wrong_pred_seq[i], "") %>% table() %>%
    prop.table() %>% t() %>% as.matrix()
}
dist_matrix <- do.call(rbind, l)
dist_matrix
## A C G T
## [1,] 0.10 0.46 0.30 0.14
## [2,] 0.16 0.34 0.32 0.18
## [3,] 0.16 0.32 0.36 0.16
## [4,] 0.12 0.48 0.26 0.14
## [5,] 0.24 0.30 0.36 0.10
## [6,] 0.18 0.32 0.30 0.20
## [7,] 0.16 0.38 0.30 0.16
## [8,] 0.16 0.22 0.42 0.20
## [9,] 0.18 0.34 0.34 0.14
## [10,] 0.20 0.40 0.28 0.12
## [11,] 0.14 0.32 0.36 0.18
## [12,] 0.18 0.32 0.36 0.14
## [13,] 0.14 0.34 0.30 0.22
## [14,] 0.10 0.44 0.30 0.16
## [15,] 0.16 0.34 0.30 0.20
## [16,] 0.16 0.44 0.26 0.14
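Each row of dist_matrix is one misclassified window, so its GC fraction is simply the sum of the C and G columns; all of these windows fall below the 80% GC content expected for this class:
gc_frac <- rowSums(dist_matrix[, c("C", "G")])
summary(gc_frac)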
df <- data.frame(distribution = as.vector(dist_matrix),
                 nt = factor(rep(vocabulary, each = nrow(dist_matrix))),
                 sample_id = rep(1:nrow(dist_matrix), 4))

ggplot(df, aes(fill = nt, y = distribution, x = nt)) +
  geom_bar(position = "dodge", stat = "identity") +
  facet_wrap(~sample_id)
Finally, we may want to aggregate all the predictions we made for the sequence. We can do this using the summarize_states function. The function returns the mean confidence, the maximum confidence and the vote percentages (percentage of predictions per class), along with the corresponding class prediction for each aggregation.
label_names <- c("high_gc", "equal_dist")
pred_summary <- summarize_states(label_names = label_names, df = pred_df[, 1:2])
print(pred_summary)
## file_name mean_conf_high_gc mean_conf_equal_dist max_conf_high_gc
## <lgcl> <num> <num> <num>
## 1: NA 0.9148641 0.08513589 0.9714182
## max_conf_equal_dist vote_perc_high_gc vote_perc_equal_dist mean_prediction
## <num> <num> <num> <char>
## 1: 0.9218199 0.9598997 0.04010025 high_gc
## max_prediction vote_prediction num_prediction
## <char> <char> <int>
## 1: high_gc high_gc 399
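The vote percentages can be reproduced from pred_df: each window votes for the class with the higher confidence, so vote_perc_high_gc is the fraction of windows where the high_gc confidence exceeds the equal_dist confidence, and the mean confidences are simple column means:
mean(pred_df$high_gc_conf > pred_df$equal_dist_conf) # 0.9598997
colMeans(pred_df[, 1:2]) # mean confidence per class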