|
a |
|
b/R/IntegratedLearnerFromMAE.R |
|
|
1 |
#' MultiAssayExperiment wrapper for IntegratedLearner |
|
|
2 |
#' |
|
|
3 |
#' This function offers a simple interface to the \code{IntegratedLearner()} |
|
|
4 |
#' from a \code{\link{MultiAssayExperiment}} object. Instead of wrangling the |
|
|
5 |
#' data, the function does it for the user and feeds it to |
|
|
6 |
#' \code{IntegratedLearner()}. |
|
|
7 |
#' |
|
|
8 |
#' @param mae \code{\link{MultiAssayExperiment}} |
|
|
9 |
#' |
|
|
10 |
#' @param experiment \code{Character vector} or \code{Integer vector}. Specifies |
|
|
11 |
#' experiments from \code{mae}. |
|
|
12 |
#' |
|
|
13 |
#' @param assay.type \code{Character vector}. Specified the name of assay for |
|
|
14 |
#' each experiment. The length must be equal to \code{experiment}. |
|
|
15 |
#' |
|
|
16 |
#' @param outcome.col \code{Character scalar}. Specifies the name of column |
|
|
17 |
#' from \code{colData} that includes the outcome variable. |
|
|
18 |
#' |
|
|
19 |
#' @param valid.col \code{Character scalar}. Specifies the name of column |
|
|
20 |
#' from \code{colData} that includes the information on which sample belongs to |
|
|
21 |
#' validation set. The variable must be \code{logical}. (Default: \code{NULL}) |
|
|
22 |
#' |
|
|
23 |
#' @param ... additional arguments passed to \code{IntegratedLearner()} |
|
|
24 |
#' \itemize{ |
|
|
25 |
#' \item \code{na.rm}: \code{Logical scalar}. Should features with missing |
|
|
26 |
#' values be dropped? (Default: \code{FALSE}) |
|
|
27 |
#' } |
|
|
28 |
#' |
|
|
29 |
#' @return A \code{SuperLearner} object containing the trained model fits. |
|
|
30 |
#' |
|
|
31 |
#' @export |
|
|
32 |
|
|
|
33 |
IntegratedLearnerFromMAE <- function( |
|
|
34 |
mae, experiment, assay.type, outcome.col, valid.col = NULL, ...){ |
|
|
35 |
############################### INPUT CHECK ################################ |
|
|
36 |
# MultiAssayExperiment and mia are required to run this function. However, |
|
|
37 |
# they are not direct dependency of the package; they are only needed when |
|
|
38 |
# this function is run. tidyr is needed for tidyr::pivot_wider |
|
|
39 |
.require_package("MultiAssayExperiment") |
|
|
40 |
.require_package("mia") |
|
|
41 |
.require_package("tidyr") |
|
|
42 |
# The object must be MAE |
|
|
43 |
if( !is(mae, "MultiAssayExperiment") ){ |
|
|
44 |
stop("'mae' must be MultiAssayExperiment.", call. = FALSE) |
|
|
45 |
} |
|
|
46 |
# Experiments inside MAR must be SE objects |
|
|
47 |
if( !all(vapply(MultiAssayExperiment::experiments(mae), function(x) |
|
|
48 |
is(x, "SummarizedExperiment"), FUN.VALUE = logical(1))) ){ |
|
|
49 |
stop("Experiments in 'mae' must be SummarizedExperiment objects.", |
|
|
50 |
call. = FALSE) |
|
|
51 |
} |
|
|
52 |
# experiment must be either integers specifying the index of experiments or |
|
|
53 |
# characters specifying the names of experiments |
|
|
54 |
is_index <- mia:::.is_integer(experiment) && |
|
|
55 |
all(experiment <= length(experiments(mae))) |
|
|
56 |
is_character <- is.character(experiment) && all(experiment %in% names(mae)) |
|
|
57 |
if( !(is_index || is_character) ){ |
|
|
58 |
stop("'experiment' must be a vector specifying either names or ", |
|
|
59 |
"indices of experiments.", call. = FALSE) |
|
|
60 |
} |
|
|
61 |
# assay.type must be a vector of character values. At this point, we do not |
|
|
62 |
# check if the assays can be found. Moreover, there must be as many |
|
|
63 |
# assay.types as there are experiments. |
|
|
64 |
if( !(is.character(assay.type) && |
|
|
65 |
length(assay.type) == length(experiment)) ){ |
|
|
66 |
stop("'assay.type' must be a character vector specifying assay for ", |
|
|
67 |
"each experiment.", call. = FALSE) |
|
|
68 |
} |
|
|
69 |
# outcome.col and valid.col must be character values but we dot check |
|
|
70 |
# here whether they can be found since they can be inside experiments. |
|
|
71 |
if( !mia:::.is_a_string(outcome.col) ){ |
|
|
72 |
stop("'outcome.col' must be a single character value.") |
|
|
73 |
} |
|
|
74 |
if( !(is.null(valid.col) || mia:::.is_a_string(valid.col)) ){ |
|
|
75 |
stop("'valid.col' must be a single character value.") |
|
|
76 |
} |
|
|
77 |
############################# INPUT CHECK END ############################## |
|
|
78 |
# Subset the MAE |
|
|
79 |
mae <- mae[ , , experiment] |
|
|
80 |
# Get data from experiments as a single large table |
|
|
81 |
data <- .get_data_from_MAE( |
|
|
82 |
mae, experiment, assay.type, outcome.col, valid.col) |
|
|
83 |
# Divide data into feature table, and metadata |
|
|
84 |
data <- .wrangle_data(data, outcome.col, valid.col, ...) |
|
|
85 |
# Fit the model |
|
|
86 |
fit <- do.call(IntegratedLearner, data) |
|
|
87 |
return(fit) |
|
|
88 |
} |
|
|
89 |
|
|
|
90 |
################################ HELP FUNCTIONS ################################ |
|
|
91 |
|
|
|
92 |
# This function fetches data from single experiments and create a single table. |
|
|
93 |
.get_data_from_MAE <- function( |
|
|
94 |
mae, experiment, assay.type, outcome.col, valid.col){ |
|
|
95 |
# Loop through experiments |
|
|
96 |
feature_table <- lapply(seq_len(length(experiment)), function(i){ |
|
|
97 |
# Get experiment and assay name for this experiment |
|
|
98 |
exp <- experiment[[i]] |
|
|
99 |
assay_name <- assay.type[[i]] |
|
|
100 |
# Get SE object |
|
|
101 |
tse <- MultiAssayExperiment::getWithColData(mae, exp, verbose = FALSE) |
|
|
102 |
# Check if the assay name is correct |
|
|
103 |
mia:::.check_assay_present(assay_name, tse) |
|
|
104 |
# Outcome column must be found from colData |
|
|
105 |
if( !(mia:::.is_a_string(outcome.col) && |
|
|
106 |
outcome.col %in% colnames(colData(tse))) ){ |
|
|
107 |
stop("'outcome.col' must specify a column from colData.", |
|
|
108 |
call. = FALSE) |
|
|
109 |
} |
|
|
110 |
if( !(is.null(valid.col) || (mia:::.is_a_string(valid.col) && |
|
|
111 |
valid.col %in% colnames(colData(tse)))) ){ |
|
|
112 |
stop("'valid.col' must be NULL or specify a column from colData.", |
|
|
113 |
call. = FALSE) |
|
|
114 |
} |
|
|
115 |
# Also check that the values in validation set are correct |
|
|
116 |
if( !is.null(valid.col) && !is.logical(tse[[valid.col]]) ){ |
|
|
117 |
stop("'valid.col' must specify a column that has boolean values.", |
|
|
118 |
call. = FALSE) |
|
|
119 |
} |
|
|
120 |
# Melt data to single table |
|
|
121 |
df <- mia::meltSE( |
|
|
122 |
tse, assay.type = assay_name, |
|
|
123 |
row.name = "featureID", col.name = "subjectID", |
|
|
124 |
add.col = c(outcome.col, valid.col)) |
|
|
125 |
# Replace the assay name with general name |
|
|
126 |
colnames(df)[ colnames(df) == assay_name ] <- "value" |
|
|
127 |
# Add omic type |
|
|
128 |
df[["featureType"]] <- as.character(exp) |
|
|
129 |
return(df) |
|
|
130 |
}) |
|
|
131 |
# Combine data |
|
|
132 |
feature_table <- do.call(rbind, feature_table) |
|
|
133 |
# Convert outcome to numeric |
|
|
134 |
feature_table[[outcome.col]] <- as.numeric( |
|
|
135 |
as.factor(feature_table[[outcome.col]])) |
|
|
136 |
return(feature_table) |
|
|
137 |
} |
|
|
138 |
|
|
|
139 |
# This function gets single table as input and outputs an arguments ready for |
|
|
140 |
# IntegratedLearner() |
|
|
141 |
.wrangle_data <- function( |
|
|
142 |
feature_table, outcome.col, valid.col, na.rm = FALSE, ...){ |
|
|
143 |
# |
|
|
144 |
if( !mia:::.is_a_bool(na.rm) ){ |
|
|
145 |
stop("'na.rm' must be TRUE or FALSE.", call. = FALSE) |
|
|
146 |
} |
|
|
147 |
# |
|
|
148 |
# Divide data into metadata and feature table |
|
|
149 |
sample_metadata <- feature_table[ |
|
|
150 |
, c("subjectID", outcome.col, valid.col), drop = FALSE] |
|
|
151 |
sample_metadata <- sample_metadata[ !duplicated(sample_metadata), ] |
|
|
152 |
feature_metadata <- feature_table[ |
|
|
153 |
, c("featureID", "featureType"), drop = FALSE] |
|
|
154 |
feature_metadata <- feature_metadata[ !duplicated(feature_metadata), ] |
|
|
155 |
feature_table <- feature_table[ |
|
|
156 |
, c("featureID", "subjectID", "value"), drop = FALSE] |
|
|
157 |
|
|
|
158 |
# Rename outcome column to Y which is require by IntegratedLearner |
|
|
159 |
colnames(sample_metadata)[ colnames(sample_metadata) == outcome.col ] <- "Y" |
|
|
160 |
# Convert feature table into wide format where rows represent features |
|
|
161 |
# and column samples |
|
|
162 |
feature_table <- tidyr::pivot_wider( |
|
|
163 |
feature_table, names_from = "subjectID", values_from = "value") |
|
|
164 |
|
|
|
165 |
# Convert datasets into data.frame and add rownames |
|
|
166 |
sample_metadata <- sample_metadata |> as.data.frame() |
|
|
167 |
rownames(sample_metadata) <- sample_metadata[["subjectID"]] |
|
|
168 |
feature_metadata <- feature_metadata |> as.data.frame() |
|
|
169 |
rownames(feature_metadata) <- feature_metadata[["featureID"]] |
|
|
170 |
feature_table <- feature_table |> as.data.frame() |
|
|
171 |
rownames(feature_table) <- feature_table[["featureID"]] |
|
|
172 |
# Feature table must have only numeric values |
|
|
173 |
feature_table[["featureID"]] <- NULL |
|
|
174 |
|
|
|
175 |
# Check if there are missing values. Drop them if specified. SuperLearner |
|
|
176 |
# does not support missing values. |
|
|
177 |
if( any(is.na(feature_table)) && na.rm ){ |
|
|
178 |
keep <- complete.cases(feature_table) |
|
|
179 |
feature_table <- feature_table[keep, ] |
|
|
180 |
feature_metadata <- feature_metadata[keep, ] |
|
|
181 |
# Give error or warning depending on if there are still features left |
|
|
182 |
# after removing those that are not complete. |
|
|
183 |
FUN <- if( sum(keep) == 0 ) stop else warning |
|
|
184 |
FUN(sum(!keep), "/", length(keep), |
|
|
185 |
" features dropped due to missing data.", call. = FALSE) |
|
|
186 |
} |
|
|
187 |
|
|
|
188 |
# If user specified validation set, divide data into training and validation |
|
|
189 |
# datasets |
|
|
190 |
if( !is.null(valid.col) ){ |
|
|
191 |
# Divide feature table |
|
|
192 |
feature_table_valid <- feature_table[ |
|
|
193 |
, sample_metadata[[valid.col]], drop = FALSE] |
|
|
194 |
feature_table <- feature_table[ |
|
|
195 |
, !sample_metadata[[valid.col]], drop = FALSE] |
|
|
196 |
# Divide sample metadata |
|
|
197 |
sample_metadata_valid <- sample_metadata[sample_metadata[[valid.col]], ] |
|
|
198 |
sample_metadata <- sample_metadata[!sample_metadata[[valid.col]], ] |
|
|
199 |
# Create an argument list |
|
|
200 |
args <- list( |
|
|
201 |
feature_table = feature_table, |
|
|
202 |
sample_metadata = sample_metadata, |
|
|
203 |
feature_metadata = feature_metadata, |
|
|
204 |
feature_table_valid = feature_table_valid, |
|
|
205 |
sample_metadata_valid = sample_metadata_valid |
|
|
206 |
) |
|
|
207 |
} else{ |
|
|
208 |
# Create an argument list |
|
|
209 |
args <- list( |
|
|
210 |
feature_table = feature_table, |
|
|
211 |
sample_metadata = sample_metadata, |
|
|
212 |
feature_metadata = feature_metadata |
|
|
213 |
) |
|
|
214 |
} |
|
|
215 |
# Add those arguments that are passed with "..." |
|
|
216 |
args <- c(args, list(...)) |
|
|
217 |
return(args) |
|
|
218 |
} |