|
a |
|
b/man/get_class_weight.Rd |
|
|
1 |
% Generated by roxygen2: do not edit by hand |
|
|
2 |
% Please edit documentation in R/preprocess.R |
|
|
3 |
\name{get_class_weight} |
|
|
4 |
\alias{get_class_weight} |
|
|
5 |
\title{Estimate frequency of different classes} |
|
|
6 |
\usage{ |
|
|
7 |
get_class_weight( |
|
|
8 |
path, |
|
|
9 |
vocabulary_label = NULL, |
|
|
10 |
format = "fasta", |
|
|
11 |
file_proportion = 1, |
|
|
12 |
train_type = "label_folder", |
|
|
13 |
named_list = FALSE, |
|
|
14 |
csv_path = NULL |
|
|
15 |
) |
|
|
16 |
} |
|
|
17 |
\arguments{ |
|
|
18 |
\item{path}{Path to training data. If \code{train_type} is \code{label_folder}, should be a vector or list |
|
|
19 |
where each entry corresponds to a class (list elements can be directories and/or individual files). If \code{train_type} is not \code{label_folder}, |
|
|
20 |
can be a single directory or file or a list of directories and/or files.} |
|
|
21 |
|
|
|
22 |
\item{vocabulary_label}{Character vector of possible targets. Targets outside \code{vocabulary_label} will get discarded.} |
|
|
23 |
|
|
|
24 |
\item{format}{File format, either \code{"fasta"} or \code{"fastq"}.} |
|
|
25 |
|
|
|
26 |
\item{file_proportion}{Proportion of files to randomly sample for estimating class distributions.} |
|
|
27 |
|
|
|
28 |
\item{train_type}{Either \code{"lm"}, \code{"lm_rds"}, \code{"masked_lm"} for language model; \code{"label_header"}, \code{"label_folder"}, \code{"label_csv"}, \code{"label_rds"} for classification or \code{"dummy_gen"}. |
|
|
29 |
\itemize{ |
|
|
30 |
\item Language model is trained to predict character(s) in a sequence. \cr |
|
|
31 |
\item \code{"label_header"}/\code{"label_folder"}/\code{"label_csv"} are trained to predict a corresponding class given a sequence as input. |
|
|
32 |
\item If \code{"label_header"}, class will be read from fasta headers. |
|
|
33 |
\item If \code{"label_folder"}, class will be read from folder, i.e. all files in one folder must belong to the same class. |
|
|
34 |
\item If \code{"label_csv"}, targets are read from a csv file. This file should have one column named "file". The targets then correspond to entries in that row (except "file" |
|
|
35 |
column). Example: if we are currently working with a file called "a.fasta" and corresponding label is "label_1", there should be a row in our csv file\tabular{lll}{ |
|
|
36 |
file \tab label_1 \tab label_2 \cr |
|
|
37 |
"a.fasta" \tab 1 \tab 0 \cr |
|
|
38 |
} |
|
|
39 |
|
|
|
40 |
|
|
|
41 |
\item If \code{"label_rds"}, generator will iterate over a set of .rds files, each containing a list of input and target tensors. Not implemented for models |
|
|
42 |
with multiple inputs. |
|
|
43 |
\item If \code{"lm_rds"}, generator will iterate over set of .rds files and will split tensor according to \code{target_len} argument |
|
|
44 |
(targets are last \code{target_len} nucleotides of each sequence). |
|
|
45 |
\item If \code{"dummy_gen"}, generator creates random data once and repeatedly feeds these to model. |
|
|
46 |
\item If \code{"masked_lm"}, generator masks some parts of the input. See \code{masked_lm} argument for details. |
|
|
47 |
}} |
|
|
48 |
|
|
|
49 |
\item{named_list}{Whether to give class weight list names \verb{"0", "1", ...} or not.} |
|
|
50 |
|
|
|
51 |
\item{csv_path}{If \code{train_type = "label_csv"}, path to csv file containing labels.} |
|
|
52 |
} |
|
|
53 |
\value{ |
|
|
54 |
A list of numeric values (class weights). |
|
|
55 |
} |
|
|
56 |
\description{ |
|
|
57 |
Count the number of nucleotides for each class and use these counts to estimate the relative class distribution. |
|
|
58 |
Outputs a list of class relations. Can be used as input for the \code{class_weight} argument of the \code{\link{train_model}} function. |
|
|
59 |
} |
|
|
60 |
\examples{ |
|
|
61 |
|
|
|
62 |
# create dummy data |
|
|
63 |
path_1 <- tempfile() |
|
|
64 |
path_2 <- tempfile() |
|
|
65 |
|
|
|
66 |
for (current_path in c(path_1, path_2)) { |
|
|
67 |
|
|
|
68 |
dir.create(current_path) |
|
|
69 |
# create twice as much data for first class |
|
|
70 |
num_files <- ifelse(current_path == path_1, 6, 3) |
|
|
71 |
create_dummy_data(file_path = current_path, |
|
|
72 |
num_files = num_files, |
|
|
73 |
seq_length = 10, |
|
|
74 |
num_seq = 5, |
|
|
75 |
vocabulary = c("a", "c", "g", "t")) |
|
|
76 |
} |
|
|
77 |
|
|
|
78 |
|
|
|
79 |
class_weight <- get_class_weight( |
|
|
80 |
path = c(path_1, path_2), |
|
|
81 |
vocabulary_label = c("A", "B"), |
|
|
82 |
format = "fasta", |
|
|
83 |
file_proportion = 1, |
|
|
84 |
train_type = "label_folder", |
|
|
85 |
csv_path = NULL) |
|
|
86 |
|
|
|
87 |
class_weight |
|
|
88 |
|
|
|
89 |
} |