Diff of /man/get_class_weight.Rd [000000] .. [409433]

Switch to unified view

a b/man/get_class_weight.Rd
1
% Generated by roxygen2: do not edit by hand
2
% Please edit documentation in R/preprocess.R
3
\name{get_class_weight}
4
\alias{get_class_weight}
5
\title{Estimate frequency of different classes}
6
\usage{
7
get_class_weight(
8
  path,
9
  vocabulary_label = NULL,
10
  format = "fasta",
11
  file_proportion = 1,
12
  train_type = "label_folder",
13
  named_list = FALSE,
14
  csv_path = NULL
15
)
16
}
17
\arguments{
18
\item{path}{Path to training data. If \code{train_type} is \code{label_folder}, should be a vector or list
19
where each entry corresponds to a class (list elements can be directories and/or individual files). If \code{train_type} is not \code{label_folder},
20
can be a single directory or file or a list of directories and/or files.}
21
22
\item{vocabulary_label}{Character vector of possible targets. Targets outside \code{vocabulary_label} will get discarded.}
23
24
\item{format}{File format, either \code{"fasta"} or \code{"fastq"}.}
25
26
\item{file_proportion}{Proportion of files to randomly sample for estimating class distributions.}
27
28
\item{train_type}{Either \code{"lm"}, \code{"lm_rds"}, \code{"masked_lm"} for language model; \code{"label_header"}, \code{"label_folder"}, \code{"label_csv"}, \code{"label_rds"} for classification or \code{"dummy_gen"}.
29
\itemize{
30
\item Language model is trained to predict character(s) in a sequence. \cr
31
\item \code{"label_header"}/\code{"label_folder"}/\code{"label_csv"} are trained to predict a corresponding class given a sequence as input.
32
\item If \code{"label_header"}, class will be read from fasta headers.
33
\item If \code{"label_folder"}, class will be read from folder, i.e. all files in one folder must belong to the same class.
34
\item If \code{"label_csv"}, targets are read from a csv file. This file should have one column named "file". The targets then correspond to entries in that row (except "file"
35
column). Example: if we are currently working with a file called "a.fasta" and corresponding label is "label_1", there should be a row in our csv file\tabular{lll}{
36
   file \tab label_1 \tab label_2 \cr
37
   "a.fasta" \tab 1 \tab 0 \cr
38
}
39
40
41
\item If \code{"label_rds"}, generator will iterate over a set of .rds files, each containing a list of input and target tensors. Not implemented for models
42
with multiple inputs.
43
\item If \code{"lm_rds"}, generator will iterate over set of .rds files and will split tensor according to \code{target_len} argument
44
(targets are last \code{target_len} nucleotides of each sequence).
45
\item  If \code{"dummy_gen"}, generator creates random data once and repeatedly feeds these to model.
46
\item  If \code{"masked_lm"}, generator masks some parts of the input. See \code{masked_lm} argument for details.
47
}}
48
49
\item{named_list}{Whether to give class weight list names \verb{"0", "1", ...} or not.}
50
51
\item{csv_path}{If \code{train_type = "label_csv"}, path to csv file containing labels.}
52
}
53
\value{
54
A list of numeric values (class weights).
55
}
56
\description{
57
Count the number of nucleotides for each class and use the counts to estimate the relative class distribution.
58
Outputs a list of class relations. Can be used as input for the \code{class_weight} argument of the \code{\link{train_model}} function.
59
}
60
\examples{
61
62
# create dummy data
63
path_1 <- tempfile()
64
path_2 <- tempfile()
65
66
for (current_path in c(path_1, path_2)) {
67
  
68
  dir.create(current_path)
69
  # create twice as much data for first class
70
  num_files <- ifelse(current_path == path_1, 6, 3)
71
  create_dummy_data(file_path = current_path,
72
                    num_files = num_files,
73
                    seq_length = 10,
74
                    num_seq = 5,
75
                    vocabulary = c("a", "c", "g", "t"))
76
}
77
78
79
class_weight <- get_class_weight(
80
  path = c(path_1, path_2),
81
  vocabulary_label = c("A", "B"),
82
  format = "fasta",
83
  file_proportion = 1,
84
  train_type = "label_folder",
85
  csv_path = NULL)
86
87
class_weight
88
89
}