IntegratedLearner / Git / Diff of /R/IntegratedLearnerFromMAE.R

Models:
AlyssaS/
IntegratedLearner
Downloads: 1
Diff of /R/IntegratedLearnerFromMAE.R [000000] .. [a4ee51]
Switch to side-by-side view

--- a
+++ b/R/IntegratedLearnerFromMAE.R
@@ -0,0 +1,218 @@
+#' MultiAssayExperiment wrapper for IntegratedLearner
+#' 
+#' This function offers a simple interface to the \code{IntegratedLearner()}
+#' from a \code{\link{MultiAssayExperiment}} object. Instead of wrangling the
+#' data, the function does it for the user and feeds it to
+#' \code{IntegratedLearner()}.
+#'
+#' @param mae \code{\link{MultiAssayExperiment}}
+#' 
+#' @param experiment \code{Character vector} or \code{Integer vector}. Specifies
+#' experiments from \code{mae}.
+#' 
+#' @param assay.type \code{Character vector}. Specified the name of assay for
+#' each experiment. The length must be equal to \code{experiment}.
+#' 
+#' @param outcome.col \code{Character scalar}. Specifies the name of column
+#' from \code{colData} that includes the outcome variable.
+#' 
+#' @param valid.col \code{Character scalar}. Specifies the name of column
+#' from \code{colData} that includes the information on which sample belongs to
+#' validation set. The variable must be \code{logical}. (Default: \code{NULL})
+#' 
+#' @param ... additional arguments passed to \code{IntegratedLearner()}
+#' \itemize{
+#'   \item \code{na.rm}: \code{Logical scalar}. Should features with missing
+#'   values be dropped? (Default: \code{FALSE})
+#' }
+#' 
+#' @return A \code{SuperLearner} object containing the trained model fits.
+#' 
+#' @export
+
+IntegratedLearnerFromMAE <- function(
+        mae, experiment, assay.type, outcome.col, valid.col = NULL, ...){
+    ############################### INPUT CHECK ################################
+    # MultiAssayExperiment and mia are required to run this function. However,
+    # they are not direct dependency of the package; they are only needed when
+    # this function is run. tidyr is needed for tidyr::pivot_wider
+    .require_package("MultiAssayExperiment")
+    .require_package("mia")
+    .require_package("tidyr")
+    # The object must be MAE
+    if( !is(mae, "MultiAssayExperiment") ){
+        stop("'mae' must be MultiAssayExperiment.", call. = FALSE)
+    }
+    # Experiments inside MAR must be SE objects
+    if( !all(vapply(MultiAssayExperiment::experiments(mae), function(x)
+        is(x, "SummarizedExperiment"), FUN.VALUE = logical(1))) ){
+        stop("Experiments in 'mae' must be SummarizedExperiment objects.",
+             call. = FALSE)
+    }
+    # experiment must be either integers specifying the index of experiments or
+    # characters specifying the names of experiments
+    is_index <- mia:::.is_integer(experiment) &&
+        all(experiment <= length(experiments(mae)))
+    is_character <- is.character(experiment) && all(experiment %in% names(mae))
+    if( !(is_index || is_character) ){
+        stop("'experiment' must be a vector specifying either names or ",
+             "indices of experiments.", call. = FALSE)
+    }
+    # assay.type must be a vector of character values. At this point, we do not
+    # check if the assays can be found. Moreover, there must be as many
+    # assay.types as there are experiments.
+    if( !(is.character(assay.type) &&
+          length(assay.type) == length(experiment)) ){
+        stop("'assay.type' must be a character vector specifying assay for ",
+             "each experiment.", call. = FALSE)
+    }
+    # outcome.col and valid.col must be character values but we dot check
+    # here whether they can be found since they can be inside experiments.
+    if( !mia:::.is_a_string(outcome.col) ){
+        stop("'outcome.col' must be a single character value.")
+    }
+    if( !(is.null(valid.col) || mia:::.is_a_string(valid.col)) ){
+        stop("'valid.col' must be a single character value.")
+    }
+    ############################# INPUT CHECK END ##############################
+    # Subset the MAE
+    mae <- mae[ , , experiment]
+    # Get data from experiments as a single large table
+    data <- .get_data_from_MAE(
+        mae, experiment, assay.type, outcome.col, valid.col)
+    # Divide data into feature table, and metadata
+    data <- .wrangle_data(data, outcome.col, valid.col, ...)
+    # Fit the model
+    fit <- do.call(IntegratedLearner, data)
+    return(fit)
+}
+
+################################ HELP FUNCTIONS ################################
+
+# This function fetches data from single experiments and create a single table.
+.get_data_from_MAE <- function(
+        mae, experiment, assay.type, outcome.col, valid.col){
+    # Loop through experiments
+    feature_table <- lapply(seq_len(length(experiment)), function(i){
+        # Get experiment and assay name for this experiment
+        exp <- experiment[[i]]
+        assay_name <- assay.type[[i]]
+        # Get SE object
+        tse <- MultiAssayExperiment::getWithColData(mae, exp, verbose = FALSE)
+        # Check if the assay name is correct
+        mia:::.check_assay_present(assay_name, tse)
+        # Outcome column must be found from colData
+        if( !(mia:::.is_a_string(outcome.col) &&
+              outcome.col %in% colnames(colData(tse))) ){
+            stop("'outcome.col' must specify a column from colData.",
+                 call. = FALSE)
+        }
+        if( !(is.null(valid.col) || (mia:::.is_a_string(valid.col) &&
+                                     valid.col %in% colnames(colData(tse)))) ){
+            stop("'valid.col' must be NULL or specify a column from colData.",
+                 call. = FALSE)
+        }
+        # Also check that the values in validation set are correct
+        if( !is.null(valid.col) && !is.logical(tse[[valid.col]]) ){
+            stop("'valid.col' must specify a column that has boolean values.",
+                 call. = FALSE)
+        }
+        # Melt data to single table
+        df <- mia::meltSE(
+            tse, assay.type = assay_name,
+            row.name = "featureID", col.name = "subjectID",
+            add.col = c(outcome.col, valid.col))
+        # Replace the assay name with general name
+        colnames(df)[ colnames(df) == assay_name ] <- "value"
+        # Add omic type
+        df[["featureType"]] <- as.character(exp)
+        return(df)
+    })
+    # Combine data
+    feature_table <- do.call(rbind, feature_table)
+    # Convert outcome to numeric
+    feature_table[[outcome.col]] <- as.numeric(
+        as.factor(feature_table[[outcome.col]]))
+    return(feature_table)
+}
+
+# This function gets single table as input and outputs an arguments ready for
+# IntegratedLearner()
+.wrangle_data <- function(
+        feature_table, outcome.col, valid.col, na.rm = FALSE, ...){
+    #
+    if( !mia:::.is_a_bool(na.rm) ){
+        stop("'na.rm' must be TRUE or FALSE.", call. = FALSE)
+    }
+    #
+    # Divide data into metadata and feature table
+    sample_metadata <- feature_table[
+        , c("subjectID", outcome.col, valid.col), drop = FALSE]
+    sample_metadata <- sample_metadata[ !duplicated(sample_metadata), ]
+    feature_metadata <- feature_table[
+        , c("featureID", "featureType"), drop = FALSE]
+    feature_metadata <- feature_metadata[ !duplicated(feature_metadata), ]
+    feature_table <- feature_table[
+        , c("featureID", "subjectID", "value"), drop = FALSE]
+    
+    # Rename outcome column to Y which is require by IntegratedLearner
+    colnames(sample_metadata)[ colnames(sample_metadata) == outcome.col ] <- "Y"
+    # Convert feature table into wide format where rows represent features
+    # and column samples
+    feature_table <- tidyr::pivot_wider(
+        feature_table, names_from = "subjectID", values_from = "value")
+    
+    # Convert datasets into data.frame and add rownames
+    sample_metadata <- sample_metadata |> as.data.frame()
+    rownames(sample_metadata) <- sample_metadata[["subjectID"]]
+    feature_metadata <- feature_metadata |> as.data.frame()
+    rownames(feature_metadata) <- feature_metadata[["featureID"]]
+    feature_table <- feature_table |> as.data.frame()
+    rownames(feature_table) <- feature_table[["featureID"]]
+    # Feature table must have only numeric values
+    feature_table[["featureID"]] <- NULL
+    
+    # Check if there are missing values. Drop them if specified. SuperLearner
+    # does not support missing values.
+    if( any(is.na(feature_table)) && na.rm ){
+        keep <- complete.cases(feature_table)
+        feature_table <- feature_table[keep, ]
+        feature_metadata <- feature_metadata[keep, ]
+        # Give error or warning depending on if there are still features left
+        # after removing those that are not complete.
+        FUN <- if( sum(keep) == 0 ) stop else warning
+        FUN(sum(!keep), "/", length(keep),
+            " features dropped due to missing data.", call. = FALSE)
+    }
+    
+    # If user specified validation set, divide data into training and validation
+    # datasets
+    if( !is.null(valid.col) ){
+        # Divide feature table
+        feature_table_valid <- feature_table[
+            , sample_metadata[[valid.col]], drop = FALSE]
+        feature_table <- feature_table[
+            , !sample_metadata[[valid.col]], drop = FALSE]
+        # Divide sample metadata
+        sample_metadata_valid <- sample_metadata[sample_metadata[[valid.col]], ]
+        sample_metadata <- sample_metadata[!sample_metadata[[valid.col]], ]
+        # Create an argument list
+        args <- list(
+            feature_table = feature_table,
+            sample_metadata = sample_metadata,
+            feature_metadata = feature_metadata,
+            feature_table_valid = feature_table_valid,
+            sample_metadata_valid = sample_metadata_valid
+        )
+    } else{
+        # Create an argument list
+        args <- list(
+            feature_table = feature_table,
+            sample_metadata = sample_metadata,
+            feature_metadata = feature_metadata
+        )
+    }
+    # Add those arguments that are passed with "..."
+    args <- c(args, list(...))
+    return(args)
+}