[a4ee51]: / R / IntegratedLearnerFromMAE.R

Download this file

219 lines (210 with data), 9.7 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
#' MultiAssayExperiment wrapper for IntegratedLearner
#'
#' This function offers a simple interface to the \code{IntegratedLearner()}
#' from a \code{\link{MultiAssayExperiment}} object. Instead of wrangling the
#' data, the function does it for the user and feeds it to
#' \code{IntegratedLearner()}.
#'
#' @param mae \code{\link{MultiAssayExperiment}}
#'
#' @param experiment \code{Character vector} or \code{Integer vector}. Specifies
#' experiments from \code{mae}.
#'
#' @param assay.type \code{Character vector}. Specified the name of assay for
#' each experiment. The length must be equal to \code{experiment}.
#'
#' @param outcome.col \code{Character scalar}. Specifies the name of column
#' from \code{colData} that includes the outcome variable.
#'
#' @param valid.col \code{Character scalar}. Specifies the name of column
#' from \code{colData} that includes the information on which sample belongs to
#' validation set. The variable must be \code{logical}. (Default: \code{NULL})
#'
#' @param ... additional arguments passed to \code{IntegratedLearner()}
#' \itemize{
#' \item \code{na.rm}: \code{Logical scalar}. Should features with missing
#' values be dropped? (Default: \code{FALSE})
#' }
#'
#' @return A \code{SuperLearner} object containing the trained model fits.
#'
#' @export
IntegratedLearnerFromMAE <- function(
mae, experiment, assay.type, outcome.col, valid.col = NULL, ...){
############################### INPUT CHECK ################################
# MultiAssayExperiment and mia are required to run this function. However,
# they are not direct dependency of the package; they are only needed when
# this function is run. tidyr is needed for tidyr::pivot_wider
.require_package("MultiAssayExperiment")
.require_package("mia")
.require_package("tidyr")
# The object must be MAE
if( !is(mae, "MultiAssayExperiment") ){
stop("'mae' must be MultiAssayExperiment.", call. = FALSE)
}
# Experiments inside MAR must be SE objects
if( !all(vapply(MultiAssayExperiment::experiments(mae), function(x)
is(x, "SummarizedExperiment"), FUN.VALUE = logical(1))) ){
stop("Experiments in 'mae' must be SummarizedExperiment objects.",
call. = FALSE)
}
# experiment must be either integers specifying the index of experiments or
# characters specifying the names of experiments
is_index <- mia:::.is_integer(experiment) &&
all(experiment <= length(experiments(mae)))
is_character <- is.character(experiment) && all(experiment %in% names(mae))
if( !(is_index || is_character) ){
stop("'experiment' must be a vector specifying either names or ",
"indices of experiments.", call. = FALSE)
}
# assay.type must be a vector of character values. At this point, we do not
# check if the assays can be found. Moreover, there must be as many
# assay.types as there are experiments.
if( !(is.character(assay.type) &&
length(assay.type) == length(experiment)) ){
stop("'assay.type' must be a character vector specifying assay for ",
"each experiment.", call. = FALSE)
}
# outcome.col and valid.col must be character values but we dot check
# here whether they can be found since they can be inside experiments.
if( !mia:::.is_a_string(outcome.col) ){
stop("'outcome.col' must be a single character value.")
}
if( !(is.null(valid.col) || mia:::.is_a_string(valid.col)) ){
stop("'valid.col' must be a single character value.")
}
############################# INPUT CHECK END ##############################
# Subset the MAE
mae <- mae[ , , experiment]
# Get data from experiments as a single large table
data <- .get_data_from_MAE(
mae, experiment, assay.type, outcome.col, valid.col)
# Divide data into feature table, and metadata
data <- .wrangle_data(data, outcome.col, valid.col, ...)
# Fit the model
fit <- do.call(IntegratedLearner, data)
return(fit)
}
################################ HELP FUNCTIONS ################################
# This function fetches data from single experiments and create a single table.
.get_data_from_MAE <- function(
mae, experiment, assay.type, outcome.col, valid.col){
# Loop through experiments
feature_table <- lapply(seq_len(length(experiment)), function(i){
# Get experiment and assay name for this experiment
exp <- experiment[[i]]
assay_name <- assay.type[[i]]
# Get SE object
tse <- MultiAssayExperiment::getWithColData(mae, exp, verbose = FALSE)
# Check if the assay name is correct
mia:::.check_assay_present(assay_name, tse)
# Outcome column must be found from colData
if( !(mia:::.is_a_string(outcome.col) &&
outcome.col %in% colnames(colData(tse))) ){
stop("'outcome.col' must specify a column from colData.",
call. = FALSE)
}
if( !(is.null(valid.col) || (mia:::.is_a_string(valid.col) &&
valid.col %in% colnames(colData(tse)))) ){
stop("'valid.col' must be NULL or specify a column from colData.",
call. = FALSE)
}
# Also check that the values in validation set are correct
if( !is.null(valid.col) && !is.logical(tse[[valid.col]]) ){
stop("'valid.col' must specify a column that has boolean values.",
call. = FALSE)
}
# Melt data to single table
df <- mia::meltSE(
tse, assay.type = assay_name,
row.name = "featureID", col.name = "subjectID",
add.col = c(outcome.col, valid.col))
# Replace the assay name with general name
colnames(df)[ colnames(df) == assay_name ] <- "value"
# Add omic type
df[["featureType"]] <- as.character(exp)
return(df)
})
# Combine data
feature_table <- do.call(rbind, feature_table)
# Convert outcome to numeric
feature_table[[outcome.col]] <- as.numeric(
as.factor(feature_table[[outcome.col]]))
return(feature_table)
}
# This function gets single table as input and outputs an arguments ready for
# IntegratedLearner()
.wrangle_data <- function(
feature_table, outcome.col, valid.col, na.rm = FALSE, ...){
#
if( !mia:::.is_a_bool(na.rm) ){
stop("'na.rm' must be TRUE or FALSE.", call. = FALSE)
}
#
# Divide data into metadata and feature table
sample_metadata <- feature_table[
, c("subjectID", outcome.col, valid.col), drop = FALSE]
sample_metadata <- sample_metadata[ !duplicated(sample_metadata), ]
feature_metadata <- feature_table[
, c("featureID", "featureType"), drop = FALSE]
feature_metadata <- feature_metadata[ !duplicated(feature_metadata), ]
feature_table <- feature_table[
, c("featureID", "subjectID", "value"), drop = FALSE]
# Rename outcome column to Y which is require by IntegratedLearner
colnames(sample_metadata)[ colnames(sample_metadata) == outcome.col ] <- "Y"
# Convert feature table into wide format where rows represent features
# and column samples
feature_table <- tidyr::pivot_wider(
feature_table, names_from = "subjectID", values_from = "value")
# Convert datasets into data.frame and add rownames
sample_metadata <- sample_metadata |> as.data.frame()
rownames(sample_metadata) <- sample_metadata[["subjectID"]]
feature_metadata <- feature_metadata |> as.data.frame()
rownames(feature_metadata) <- feature_metadata[["featureID"]]
feature_table <- feature_table |> as.data.frame()
rownames(feature_table) <- feature_table[["featureID"]]
# Feature table must have only numeric values
feature_table[["featureID"]] <- NULL
# Check if there are missing values. Drop them if specified. SuperLearner
# does not support missing values.
if( any(is.na(feature_table)) && na.rm ){
keep <- complete.cases(feature_table)
feature_table <- feature_table[keep, ]
feature_metadata <- feature_metadata[keep, ]
# Give error or warning depending on if there are still features left
# after removing those that are not complete.
FUN <- if( sum(keep) == 0 ) stop else warning
FUN(sum(!keep), "/", length(keep),
" features dropped due to missing data.", call. = FALSE)
}
# If user specified validation set, divide data into training and validation
# datasets
if( !is.null(valid.col) ){
# Divide feature table
feature_table_valid <- feature_table[
, sample_metadata[[valid.col]], drop = FALSE]
feature_table <- feature_table[
, !sample_metadata[[valid.col]], drop = FALSE]
# Divide sample metadata
sample_metadata_valid <- sample_metadata[sample_metadata[[valid.col]], ]
sample_metadata <- sample_metadata[!sample_metadata[[valid.col]], ]
# Create an argument list
args <- list(
feature_table = feature_table,
sample_metadata = sample_metadata,
feature_metadata = feature_metadata,
feature_table_valid = feature_table_valid,
sample_metadata_valid = sample_metadata_valid
)
} else{
# Create an argument list
args <- list(
feature_table = feature_table,
sample_metadata = sample_metadata,
feature_metadata = feature_metadata
)
}
# Add those arguments that are passed with "..."
args <- c(args, list(...))
return(args)
}