--- a
+++ b/man/get_class_weight.Rd
@@ -0,0 +1,89 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/preprocess.R
+\name{get_class_weight}
+\alias{get_class_weight}
+\title{Estimate frequency of different classes}
+\usage{
+get_class_weight(
+  path,
+  vocabulary_label = NULL,
+  format = "fasta",
+  file_proportion = 1,
+  train_type = "label_folder",
+  named_list = FALSE,
+  csv_path = NULL
+)
+}
+\arguments{
+\item{path}{Path to training data. If \code{train_type} is \code{label_folder}, should be a vector or list
+where each entry corresponds to a class (list elements can be directories and/or individual files). If \code{train_type} is not \code{label_folder},
+can be a single directory or file or a list of directories and/or files.}
+
+\item{vocabulary_label}{Character vector of possible targets. Targets outside \code{vocabulary_label} will get discarded.}
+
+\item{format}{File format, either \code{"fasta"} or \code{"fastq"}.}
+
+\item{file_proportion}{Proportion of files to randomly sample for estimating class distributions.}
+
+\item{train_type}{Either \code{"lm"}, \code{"lm_rds"}, \code{"masked_lm"} for language model; \code{"label_header"}, \code{"label_folder"}, \code{"label_csv"}, \code{"label_rds"} for classification or \code{"dummy_gen"}.
+\itemize{
+\item Language model is trained to predict character(s) in a sequence. \cr
+\item \code{"label_header"}/\code{"label_folder"}/\code{"label_csv"} are trained to predict a corresponding class given a sequence as input.
+\item If \code{"label_header"}, class will be read from fasta headers.
+\item If \code{"label_folder"}, class will be read from folder, i.e. all files in one folder must belong to the same class.
+\item If \code{"label_csv"}, targets are read from a csv file. This file should have one column named "file". The targets then correspond to the entries in that row (except for the "file"
+column). Example: if we are currently working with a file called "a.fasta" and the corresponding label is "label_1", there should be a row like this in our csv file: \tabular{lll}{
+   file \tab label_1 \tab label_2 \cr
+   "a.fasta" \tab 1 \tab 0 \cr
+}
+
+
+\item If \code{"label_rds"}, generator will iterate over a set of .rds files, each containing a list of input and target tensors. Not implemented for models
+with multiple inputs.
+\item If \code{"lm_rds"}, generator will iterate over set of .rds files and will split tensor according to \code{target_len} argument
+(targets are last \code{target_len} nucleotides of each sequence).
+\item  If \code{"dummy_gen"}, generator creates random data once and repeatedly feeds these to model.
+\item  If \code{"masked_lm"}, generator masks some parts of the input. See \code{masked_lm} argument for details.
+}}
+
+\item{named_list}{Whether to give class weight list names \verb{"0", "1", ...} or not.}
+
+\item{csv_path}{If \code{train_type = "label_csv"}, path to csv file containing labels.}
+}
+\value{
+A list of numeric values (class weights).
+}
+\description{
+Count the number of nucleotides for each class and use these counts as an estimate of the class distribution.
+Outputs a list of class relations. Can be used as input for the \code{class_weight} argument of the \code{\link{train_model}} function.
+}
+\examples{
+
+# create dummy data
+path_1 <- tempfile()
+path_2 <- tempfile()
+
+for (current_path in c(path_1, path_2)) {
+  
+  dir.create(current_path)
+  # create twice as much data for first class
+  num_files <- ifelse(current_path == path_1, 6, 3)
+  create_dummy_data(file_path = current_path,
+                    num_files = num_files,
+                    seq_length = 10,
+                    num_seq = 5,
+                    vocabulary = c("a", "c", "g", "t"))
+}
+
+
+class_weight <- get_class_weight(
+  path = c(path_1, path_2),
+  vocabulary_label = c("A", "B"),
+  format = "fasta",
+  file_proportion = 1,
+  train_type = "label_folder",
+  csv_path = NULL)
+
+class_weight
+
+}