--- a
+++ b/man/create_model_transformer.Rd
@@ -0,0 +1,115 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_model_transformer.R
+\name{create_model_transformer}
+\alias{create_model_transformer}
+\title{Create transformer model}
+\usage{
+create_model_transformer(
+  maxlen,
+  vocabulary_size = 4,
+  embed_dim = 64,
+  pos_encoding = "embedding",
+  head_size = 4L,
+  num_heads = 5L,
+  ff_dim = 8,
+  dropout = 0,
+  n = 10000,
+  layer_dense = 2,
+  dropout_dense = NULL,
+  flatten_method = "flatten",
+  last_layer_activation = "softmax",
+  loss_fn = "categorical_crossentropy",
+  solver = "adam",
+  learning_rate = 0.01,
+  label_noise_matrix = NULL,
+  bal_acc = FALSE,
+  f1_metric = FALSE,
+  auc_metric = FALSE,
+  label_smoothing = 0,
+  verbose = TRUE,
+  model_seed = NULL,
+  mixed_precision = FALSE,
+  mirrored_strategy = NULL
+)
+}
+\arguments{
+\item{maxlen}{Length of predictor sequence.}
+
+\item{vocabulary_size}{Number of unique characters in vocabulary.}
+
+\item{embed_dim}{Dimension for token embedding. No embedding if set to 0. Should be used when input is an integer sequence
+rather than one-hot encoded.}
+
+\item{pos_encoding}{Either \code{"sinusoid"} or \code{"embedding"}. How to add positional information.
+If \code{"sinusoid"}, will add sine waves of different frequencies to the input (see the sketch in the examples).
+If \code{"embedding"}, model learns positional embedding.}
+
+\item{head_size}{Dimensions of attention key.}
+
+\item{num_heads}{Number of attention heads.}
+
+\item{ff_dim}{Units of first dense layer after attention blocks.}
+
+\item{dropout}{Vector of dropout rates after attention block(s).}
+
+\item{n}{Frequency of sine waves for positional encoding. Only applied if \code{pos_encoding = "sinusoid"}.}
+
+\item{layer_dense}{Vector specifying number of neurons per dense layer after the last attention block.}
+
+\item{dropout_dense}{Dropout for dense layers.}
+
+\item{flatten_method}{How to process output of last attention block. Can be \code{"max_ch_first"}, \code{"max_ch_last"}, \code{"average_ch_first"},
+\code{"average_ch_last"}, \code{"both_ch_first"}, \code{"both_ch_last"}, \code{"all"}, \code{"none"} or \code{"flatten"}.
+If \code{"average_ch_last"} / \code{"max_ch_last"} or \code{"average_ch_first"} / \code{"max_ch_first"}, will apply global average/max pooling;
+the \verb{_ch_first} / \verb{_ch_last} suffix decides along which axis. \code{"both_ch_first"} / \code{"both_ch_last"} use max and average together. \code{"all"} uses all 4
+global pooling options together. If \code{"flatten"}, will flatten output after last attention block. If \code{"none"}, no flattening is applied.}
+
+\item{last_layer_activation}{Activation function of output layer(s). For example \code{"sigmoid"} or \code{"softmax"}.}
+
+\item{loss_fn}{Either \code{"categorical_crossentropy"} or \code{"binary_crossentropy"}. If \code{label_noise_matrix} given, will use custom \code{"noisy_loss"}.}
+
+\item{solver}{Optimization method; options are \code{"adam"}, \code{"adagrad"}, \code{"rmsprop"} or \code{"sgd"}.}
+
+\item{learning_rate}{Learning rate for optimizer.}
+
+\item{label_noise_matrix}{Matrix of label noise. Each row stands for one class and the columns for the percentage of labels in that class.
+If the first class contains 5 percent wrong labels and the second class no noise, then
+
+\code{label_noise_matrix <- matrix(c(0.95, 0.05, 0, 1), nrow = 2, byrow = TRUE)} (see the examples).}
+
+\item{bal_acc}{Whether to add balanced accuracy.}
+
+\item{f1_metric}{Whether to add F1 metric.}
+
+\item{auc_metric}{Whether to add AUC metric.}
+
+\item{label_smoothing}{Float in [0, 1]. If 0, no smoothing is applied. If > 0, the loss is computed between the predicted
+labels and a smoothed version of the true labels, where the smoothing squeezes the labels towards 0.5.
+The closer the argument is to 1, the more the labels get smoothed (see the examples).}
+
+\item{verbose}{Boolean. Whether to print the model summary.}
+
+\item{model_seed}{Set seed for model parameters in tensorflow if not \code{NULL}.}
+
+\item{mixed_precision}{Whether to use mixed precision (\url{https://www.tensorflow.org/guide/mixed_precision}).}
+
+\item{mirrored_strategy}{Whether to use distributed mirrored strategy. If \code{NULL}, will use a mirrored strategy only if more than one GPU is available.}
+}
+\value{
+A keras model implementing the transformer architecture.
+}
+\description{
+Creates a transformer network for classification. The model can consist of several stacked attention blocks.
+}
+\examples{
+
+maxlen <- 50
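+
+# A minimal sketch (not the package implementation, just an illustration) of
+# the "sinusoid" positional encoding: sine/cosine waves whose frequencies are
+# scaled by the `n` argument (default 10000) and added to the input.
+sinusoid_encoding <- function(maxlen, embed_dim, n = 10000) {
+  P <- matrix(0, nrow = maxlen, ncol = embed_dim)
+  for (pos in seq_len(maxlen)) {
+    for (i in seq(0, embed_dim - 2, by = 2)) {
+      angle <- (pos - 1) / n^(i / embed_dim)
+      P[pos, i + 1] <- sin(angle)   # even dimensions: sine
+      P[pos, i + 2] <- cos(angle)   # odd dimensions: cosine
+    }
+  }
+  P
+}
+pe <- sinusoid_encoding(maxlen, embed_dim = 64)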
+\donttest{
+library(keras)
+model <- create_model_transformer(maxlen = maxlen,
+                                  head_size = c(10, 12),
+                                  num_heads = c(7, 8),
+                                  ff_dim = c(5, 9),
+                                  dropout = c(0.3, 0.5))
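+
+# Hedged sketch of `label_noise_matrix` (assumes two output classes: 5 percent
+# of class-1 labels are flipped, class 2 is clean); per the docs above, the
+# model then uses the custom "noisy_loss".
+noise <- matrix(c(0.95, 0.05, 0, 1), nrow = 2, byrow = TRUE)
+model_noisy <- create_model_transformer(maxlen = maxlen,
+                                        layer_dense = 2,
+                                        label_noise_matrix = noise)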
+}
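+\donttest{
+# A further hedged sketch: binary output with label smoothing and global
+# average pooling; parameter values are illustrative, not recommendations.
+model_bin <- create_model_transformer(maxlen = maxlen,
+                                      flatten_method = "average_ch_last",
+                                      layer_dense = 1,
+                                      last_layer_activation = "sigmoid",
+                                      loss_fn = "binary_crossentropy",
+                                      label_smoothing = 0.1)
+}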
+}