--- a +++ b/R/IntegratedLearnerFromMAE.R @@ -0,0 +1,218 @@ +#' MultiAssayExperiment wrapper for IntegratedLearner +#' +#' This function offers a simple interface to the \code{IntegratedLearner()} +#' from a \code{\link{MultiAssayExperiment}} object. Instead of wrangling the +#' data, the function does it for the user and feeds it to +#' \code{IntegratedLearner()}. +#' +#' @param mae \code{\link{MultiAssayExperiment}} +#' +#' @param experiment \code{Character vector} or \code{Integer vector}. Specifies +#' experiments from \code{mae}. +#' +#' @param assay.type \code{Character vector}. Specified the name of assay for +#' each experiment. The length must be equal to \code{experiment}. +#' +#' @param outcome.col \code{Character scalar}. Specifies the name of column +#' from \code{colData} that includes the outcome variable. +#' +#' @param valid.col \code{Character scalar}. Specifies the name of column +#' from \code{colData} that includes the information on which sample belongs to +#' validation set. The variable must be \code{logical}. (Default: \code{NULL}) +#' +#' @param ... additional arguments passed to \code{IntegratedLearner()} +#' \itemize{ +#' \item \code{na.rm}: \code{Logical scalar}. Should features with missing +#' values be dropped? (Default: \code{FALSE}) +#' } +#' +#' @return A \code{SuperLearner} object containing the trained model fits. +#' +#' @export + +IntegratedLearnerFromMAE <- function( + mae, experiment, assay.type, outcome.col, valid.col = NULL, ...){ + ############################### INPUT CHECK ################################ + # MultiAssayExperiment and mia are required to run this function. However, + # they are not direct dependency of the package; they are only needed when + # this function is run. tidyr is needed for tidyr::pivot_wider + .require_package("MultiAssayExperiment") + .require_package("mia") + .require_package("tidyr") + # The object must be MAE + if( !is(mae, "MultiAssayExperiment") ){ + stop("'mae' must be MultiAssayExperiment.", call. = FALSE) + } + # Experiments inside MAR must be SE objects + if( !all(vapply(MultiAssayExperiment::experiments(mae), function(x) + is(x, "SummarizedExperiment"), FUN.VALUE = logical(1))) ){ + stop("Experiments in 'mae' must be SummarizedExperiment objects.", + call. = FALSE) + } + # experiment must be either integers specifying the index of experiments or + # characters specifying the names of experiments + is_index <- mia:::.is_integer(experiment) && + all(experiment <= length(experiments(mae))) + is_character <- is.character(experiment) && all(experiment %in% names(mae)) + if( !(is_index || is_character) ){ + stop("'experiment' must be a vector specifying either names or ", + "indices of experiments.", call. = FALSE) + } + # assay.type must be a vector of character values. At this point, we do not + # check if the assays can be found. Moreover, there must be as many + # assay.types as there are experiments. + if( !(is.character(assay.type) && + length(assay.type) == length(experiment)) ){ + stop("'assay.type' must be a character vector specifying assay for ", + "each experiment.", call. = FALSE) + } + # outcome.col and valid.col must be character values but we dot check + # here whether they can be found since they can be inside experiments. + if( !mia:::.is_a_string(outcome.col) ){ + stop("'outcome.col' must be a single character value.") + } + if( !(is.null(valid.col) || mia:::.is_a_string(valid.col)) ){ + stop("'valid.col' must be a single character value.") + } + ############################# INPUT CHECK END ############################## + # Subset the MAE + mae <- mae[ , , experiment] + # Get data from experiments as a single large table + data <- .get_data_from_MAE( + mae, experiment, assay.type, outcome.col, valid.col) + # Divide data into feature table, and metadata + data <- .wrangle_data(data, outcome.col, valid.col, ...) + # Fit the model + fit <- do.call(IntegratedLearner, data) + return(fit) +} + +################################ HELP FUNCTIONS ################################ + +# This function fetches data from single experiments and create a single table. +.get_data_from_MAE <- function( + mae, experiment, assay.type, outcome.col, valid.col){ + # Loop through experiments + feature_table <- lapply(seq_len(length(experiment)), function(i){ + # Get experiment and assay name for this experiment + exp <- experiment[[i]] + assay_name <- assay.type[[i]] + # Get SE object + tse <- MultiAssayExperiment::getWithColData(mae, exp, verbose = FALSE) + # Check if the assay name is correct + mia:::.check_assay_present(assay_name, tse) + # Outcome column must be found from colData + if( !(mia:::.is_a_string(outcome.col) && + outcome.col %in% colnames(colData(tse))) ){ + stop("'outcome.col' must specify a column from colData.", + call. = FALSE) + } + if( !(is.null(valid.col) || (mia:::.is_a_string(valid.col) && + valid.col %in% colnames(colData(tse)))) ){ + stop("'valid.col' must be NULL or specify a column from colData.", + call. = FALSE) + } + # Also check that the values in validation set are correct + if( !is.null(valid.col) && !is.logical(tse[[valid.col]]) ){ + stop("'valid.col' must specify a column that has boolean values.", + call. = FALSE) + } + # Melt data to single table + df <- mia::meltSE( + tse, assay.type = assay_name, + row.name = "featureID", col.name = "subjectID", + add.col = c(outcome.col, valid.col)) + # Replace the assay name with general name + colnames(df)[ colnames(df) == assay_name ] <- "value" + # Add omic type + df[["featureType"]] <- as.character(exp) + return(df) + }) + # Combine data + feature_table <- do.call(rbind, feature_table) + # Convert outcome to numeric + feature_table[[outcome.col]] <- as.numeric( + as.factor(feature_table[[outcome.col]])) + return(feature_table) +} + +# This function gets single table as input and outputs an arguments ready for +# IntegratedLearner() +.wrangle_data <- function( + feature_table, outcome.col, valid.col, na.rm = FALSE, ...){ + # + if( !mia:::.is_a_bool(na.rm) ){ + stop("'na.rm' must be TRUE or FALSE.", call. = FALSE) + } + # + # Divide data into metadata and feature table + sample_metadata <- feature_table[ + , c("subjectID", outcome.col, valid.col), drop = FALSE] + sample_metadata <- sample_metadata[ !duplicated(sample_metadata), ] + feature_metadata <- feature_table[ + , c("featureID", "featureType"), drop = FALSE] + feature_metadata <- feature_metadata[ !duplicated(feature_metadata), ] + feature_table <- feature_table[ + , c("featureID", "subjectID", "value"), drop = FALSE] + + # Rename outcome column to Y which is require by IntegratedLearner + colnames(sample_metadata)[ colnames(sample_metadata) == outcome.col ] <- "Y" + # Convert feature table into wide format where rows represent features + # and column samples + feature_table <- tidyr::pivot_wider( + feature_table, names_from = "subjectID", values_from = "value") + + # Convert datasets into data.frame and add rownames + sample_metadata <- sample_metadata |> as.data.frame() + rownames(sample_metadata) <- sample_metadata[["subjectID"]] + feature_metadata <- feature_metadata |> as.data.frame() + rownames(feature_metadata) <- feature_metadata[["featureID"]] + feature_table <- feature_table |> as.data.frame() + rownames(feature_table) <- feature_table[["featureID"]] + # Feature table must have only numeric values + feature_table[["featureID"]] <- NULL + + # Check if there are missing values. Drop them if specified. SuperLearner + # does not support missing values. + if( any(is.na(feature_table)) && na.rm ){ + keep <- complete.cases(feature_table) + feature_table <- feature_table[keep, ] + feature_metadata <- feature_metadata[keep, ] + # Give error or warning depending on if there are still features left + # after removing those that are not complete. + FUN <- if( sum(keep) == 0 ) stop else warning + FUN(sum(!keep), "/", length(keep), + " features dropped due to missing data.", call. = FALSE) + } + + # If user specified validation set, divide data into training and validation + # datasets + if( !is.null(valid.col) ){ + # Divide feature table + feature_table_valid <- feature_table[ + , sample_metadata[[valid.col]], drop = FALSE] + feature_table <- feature_table[ + , !sample_metadata[[valid.col]], drop = FALSE] + # Divide sample metadata + sample_metadata_valid <- sample_metadata[sample_metadata[[valid.col]], ] + sample_metadata <- sample_metadata[!sample_metadata[[valid.col]], ] + # Create an argument list + args <- list( + feature_table = feature_table, + sample_metadata = sample_metadata, + feature_metadata = feature_metadata, + feature_table_valid = feature_table_valid, + sample_metadata_valid = sample_metadata_valid + ) + } else{ + # Create an argument list + args <- list( + feature_table = feature_table, + sample_metadata = sample_metadata, + feature_metadata = feature_metadata + ) + } + # Add those arguments that are passed with "..." + args <- c(args, list(...)) + return(args) +}