deepG / Git / [1c0e03] /man/get_class

Models:
MarcoTheBlack/
deepG
Downloads: 2
[1c0e03]: / man / get_class_weight.Rd
History
Download this file
90 lines (74 with data), 3.8 kB

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/preprocess.R
\name{get_class_weight}
\alias{get_class_weight}
\title{Estimate frequency of different classes}
\usage{
get_class_weight(
  path,
  vocabulary_label = NULL,
  format = "fasta",
  file_proportion = 1,
  train_type = "label_folder",
  named_list = FALSE,
  csv_path = NULL
)
}
\arguments{
\item{path}{Path to training data. If \code{train_type} is \code{label_folder}, should be a vector or list
where each entry corresponds to a class (list elements can be directories and/or individual files). If \code{train_type} is not \code{label_folder},
can be a single directory or file or a list of directories and/or files.}

\item{vocabulary_label}{Character vector of possible targets. Targets outside \code{vocabulary_label} will get discarded.}

\item{format}{File format, either \code{"fasta"} or \code{"fastq"}.}

\item{file_proportion}{Proportion of files to randomly sample for estimating class distributions.}

\item{train_type}{Either \code{"lm"}, \code{"lm_rds"}, \code{"masked_lm"} for language model; \code{"label_header"}, \code{"label_folder"}, \code{"label_csv"}, \code{"label_rds"} for classification or \code{"dummy_gen"}.
\itemize{
\item Language model is trained to predict character(s) in a sequence. \cr
\item \code{"label_header"}/\code{"label_folder"}/\code{"label_csv"} are trained to predict a corresponding class given a sequence as input.
\item If \code{"label_header"}, class will be read from fasta headers.
\item If \code{"label_folder"}, class will be read from folder, i.e. all files in one folder must belong to the same class.
\item If \code{"label_csv"}, targets are read from a csv file. This file should have one column named "file". The targets then correspond to the entries in that row (except the "file"
column). Example: if we are currently working with a file called "a.fasta" and the corresponding label is "label_1", there should be a row in our csv file like this: \tabular{lll}{
   file \tab label_1 \tab label_2 \cr
   "a.fasta" \tab 1 \tab 0 \cr
}


\item If \code{"label_rds"}, generator will iterate over a set of .rds files, each containing a list of input and target tensors. Not implemented for models
with multiple inputs.
\item If \code{"lm_rds"}, generator will iterate over set of .rds files and will split tensor according to \code{target_len} argument
(targets are last \code{target_len} nucleotides of each sequence).
\item  If \code{"dummy_gen"}, generator creates random data once and repeatedly feeds these to model.
\item  If \code{"masked_lm"}, generator masks some parts of the input. See \code{masked_lm} argument for details.
}}

\item{named_list}{Whether to give class weight list names \verb{"0", "1", ...} or not.}

\item{csv_path}{If \code{train_type = "label_csv"}, path to csv file containing labels.}
}
\value{
A list of numeric values (class weights).
}
\description{
Counts the number of nucleotides for each class and uses these counts as an estimate of the relative class distribution.
Outputs a list of class relations. Can be used as input for the \code{class_weight} argument of the \code{\link{train_model}} function.
}
\examples{

# Create dummy data: one temporary folder per class.
path_1 <- tempfile()
path_2 <- tempfile()

for (current_path in c(path_1, path_2)) {
  dir.create(current_path)
  # Create twice as many files for the first class as for the second.
  # The condition is a scalar, so plain if/else is used instead of ifelse().
  num_files <- if (current_path == path_1) 6 else 3
  create_dummy_data(file_path = current_path,
                    num_files = num_files,
                    seq_length = 10,
                    num_seq = 5,
                    vocabulary = c("a", "c", "g", "t"))
}

# Estimate class weights; with train_type = "label_folder" each entry of
# path corresponds to one class.
class_weight <- get_class_weight(
  path = c(path_1, path_2),
  vocabulary_label = c("A", "B"),
  format = "fasta",
  file_proportion = 1,
  train_type = "label_folder",
  csv_path = NULL)

class_weight

# Remove the temporary folders created for this example.
unlink(c(path_1, path_2), recursive = TRUE)

}