--- a
+++ b/R/data.R
@@ -0,0 +1,29 @@
+#' CRISPR data
+#' 
+#' Example training dataset consisting of a sequence of nucleotides of CRISPR loci.
+#' Filtered for unambiguous characters; contains only characters in the vocabulary
+#' \{A,C,G,T\}.
+#' Can be loaded to workspace via `data(crispr_sample)`.
+#' @format Large character of 442.41 kB
+#' @usage data(crispr_sample)
+#' @references \url{https://github.com/philippmuench}
+"crispr_sample"
+
+#' Parenthesis data
+#' 
+#' Training dataset of synthetic parenthesis language.
+#' Can be loaded to workspace via `data(parenthesis)`.
+#' @format Large character of 1.00 MB
+#' @usage data(parenthesis)
+#' @references \url{https://github.com/philippmuench}
+"parenthesis"
+
+#' E. coli subset
+#' 
+#' Subset of the E. coli genome for evaluation.
+#' Can be loaded to workspace via `data(ecoli_small)`.
+#' @format Large character of 326.73 kB
+#' @usage data(ecoli_small)
+#' @references \url{https://www.science.org/doi/10.1126/science.277.5331.1453}
+"ecoli_small"
+