--- a +++ b/R/feature_processing.R @@ -0,0 +1,688 @@ +#' Wrapper to extract features (nodes) from given pathways. +#' +#' Function to extract the features (nodes) from a given list of pathways. These +#' pathways have to be compiled with the +#' \code{\link[graphite:pathways]{pathways}} function. Features can only be +#' extracted for \'proteins\' or \'metabolites\'. Features will by default be +#' mapped to gene symbols. +#' +#' @param pathway A pathway created with \code{\link[graphite:pathways]{pathways}} command. +#' @param which Mode to extract the features, either \'proteins\' or +#' \'metabolites\'. +#' @param org String specifying the organism, which is necessary for featureID +#' mapping. Default: human +#' @param returntype String that specifies the returning ID type. Default: +#' SYMBOL Options (genes/proteins): SYMBOL, ENTREZID, UNIPROT, ENSEMBL, REFSEQ +#' Options (metabolites): HMDB, CAS, DTXCID, DTXSID, SID, CID, ChEBI, KEGG, Drugbank +#' +#' @return Feature list with gene symbols (genes/proteins) or CHEBI IDs +#' (metabolites) +#' +#' @examples +#' pathways <- graphite::pathways("hsapiens", "kegg")[[1]] +#' getFeatures(pathways) +#' \donttest{ +#' pathways <- graphite::pathways("mmusculus", "kegg")[[1]] +#' getFeatures(pathways, which = "metabolites", org = "mmusculus", returntype = "HMDB") +#' +#' pathways <- graphite::pathways("mmusculus", "kegg")[[1]] +#' getFeatures(pathways, which = "proteins", org = "mmusculus", returntype = "SYMBOL") +#' } +#' +#' @importFrom graphite nodes +#' +#' @export +getFeatures <- function(pathway, which = "proteins", org = "hsapiens", returntype = "SYMBOL") { + if (which != "proteins" && which != "metabolites") { + stop("Only 'proteins' and 'metabolites' are supported for 'which'.", + call. = FALSE + ) + } + + org <- tolower(org) + + ## check for the correct organism + if (!(org %in% getOrganisms())) { + stop("Please insert a correct organism name! Use getOrganisms() + to see all supported organisms.", + call. = FALSE + ) + } + + ## extract the features (genes/proteins/metabolites) from the pathway nodes. + features <- graphite::nodes(pathway, which = which) + + if (length(features) == 0) { + return(list()) + } + + if (which == "proteins") { + ## extract the keytype of the ID format + kt <- gsub(":.*", "", features[1]) + mapped <- gsub("[A-Z]+:", "", features) + + ## get the mapping from the keytype to the user-specific type + mapped <- getGeneMapping( + features = mapped, keytype = kt, + org = org, returntype = returntype + ) + } + + ## its special for metabolites, because sometimes there are different + ## identifiers used in the same pathway + if (which == "metabolites") { + chebi <- mapIDType( + features = features, keytype = "CHEBI", + maptype = "ChEBI", returntype = returntype + ) + + kegg <- mapIDType( + features = features, keytype = "KEGGCOMP", + maptype = "KEGG", returntype = returntype + ) + + pubchem <- mapIDType( + features = features, keytype = "PUBCHEM", + maptype = "CID", returntype = returntype + ) + + cas <- mapIDType( + features = features, keytype = "CAS", + maptype = "CAS", returntype = returntype + ) + + hmdb <- mapIDType( + features = features, keytype = "HMDB", + maptype = "HMDB", returntype = returntype + ) + + mapped <- c(chebi, kegg, pubchem, cas, hmdb) + } + + return(mapped) +} + + + +#' Mapping between pathway encoded genes/proteins and different ID formats. +#' +#' Function to retrieve the gene/protein identifier mapping. Ongoing from +#' genes/proteins retrieved from pathway definitions, which often include two or +#' more ID formats or a format that is not present in your omics measurement, +#' this function maps those IDs to a given format. Depending on the organism, +#' additional packages have to be installed. +#' +#' @param features List of identifiers to be mapped. +#' @param keytype String specifying the ID type, e.g., "ENTREZID" or "UNIPROT". +#' @param org String that defines the organism. Default: hsapiens +#' Options: see \code{\link[multiGSEA]{getOrganisms}} +#' @param returntype String that specifies the returning ID type. Default: +#' SYMBOL, Options: SYMBOL, ENTREZID, UNIPROT, ENSEMBL, REFSEQ +#' +#' @return List containing mapped gene/protein IDs. +#' +#' @examples +#' features <- graphite::nodes(graphite::pathways("hsapiens", "kegg")[[1]]) +#' features <- gsub("ENTREZID:", "", features) +#' keytype <- "ENTREZID" +#' getGeneMapping(features, keytype) +#' +#' getGeneMapping(features, keytype, returntype = "UNIPROT") +#' \donttest{ +#' features <- graphite::nodes(graphite::pathways("rnorvegicus", "reactome")[[1]]) +#' features <- gsub("UNIPROT:", "", features) +#' getGeneMapping(features, keytype = "UNIPROT", org = "rnorvegicus") +#' +#' getGeneMapping(features, +#' keytype = "UNIPROT", +#' org = "rnorvegicus", +#' returntype = "ENSEMBL" +#' ) +#' } +#' +#' @importFrom AnnotationDbi select +#' +#' @export +getGeneMapping <- function(features, keytype, org = "hsapiens", returntype = "SYMBOL") { + org <- tolower(org) + + ## check for the correct organism + if (!(org %in% getOrganisms())) { + stop("Please insert a correct organism name! Use getOrganisms() + to see all supported organisms.", + call. = FALSE + ) + } + + db <- getIDMappingDatabase(org) + + supportedIDs <- c("SYMBOL", "ENTREZID", "UNIPROT", "ENSEMBL", "REFSEQ") + if (org != "dmelanogaster" && !returntype %in% supportedIDs) { + stop("Insert one of the following IDs to be returned (returntype): + SYMBOL, ENTREZID, UNIPROT, ENSEMBL, REFSEQ.", + call. = FALSE + ) + } + + supportedIDs <- c(supportedIDs, "FLYBASE", "FLYBASECG") + if (org == "dmelanogaster" && !returntype %in% supportedIDs) { + stop("Insert one of the following IDs to be returned (returntype): + SYMBOL, ENTREZID, UNIPROT, ENSEMBL, REFSEQ, FLYBASE, FLYBASECG.", + call. = FALSE + ) + } + + ## design the columns field such that we create a triple ID mapping between + ## ENTREZIDs, UNIPROT, and gene symbols + if (keytype == "UNIPROT") { + col <- unique(c("SYMBOL", "ENTREZID", returntype)) + } else { + col <- unique(c("SYMBOL", "UNIPROT", returntype)) + } + + ## run the actual mapping of IDS and return a list of the user-given type + map <- tryCatch( + { + map <- AnnotationDbi::select(db, + keys = features, + columns = col, keytype = keytype + ) + m <- match(unique(map[[keytype]]), map[[keytype]]) + map <- map[m, ] + map[[returntype]][!is.na(map[[returntype]])] + }, + error = function(cond) { + return(list()) + } + ) + + return(map) +} + + +#' Mapping between pathway encoded metabolites and different metabolite ID +#' formats. +#' +#' Function to retrieve the metabolite identifier mapping. Ongoing from +#' metabolites retrieved from pathway definitions, which often include two or +#' more ID formats, this function maps those IDs to a given format. The complete +#' mapping table based on \href{https://comptox.epa.gov/dashboard}{Comptox +#' Dashboard}, \href{https://pubchem.ncbi.nlm.nih.gov/}{PubChem}, +#' \href{https://hmdb.ca/}{HMDB}, and \href{https://www.ebi.ac.uk/chebi}{ChEBI} +#' is provided in the AnnotationHub package metaboliteIDmapping. +#' +#' @param features List of identifiers to be mapped. +#' @param keytype String specifying the ID type, e.g., "ChEBI" or "KEGGCOMP". +#' +#' @param returntype String that specifies the returning ID type. +#' Default: HMDB +#' Options: HMDB, CAS, DTXCID, DTXSID, SID, CID, ChEBI, KEGG, Drugbank +#' +#' @return List containing mapped gene/protein IDs. +#' +#' @examples +#' features <- graphite::nodes(graphite::pathways("hsapiens", "kegg")[[1]], which = "metabolites") +#' features <- gsub("KEGGCOMP:", "", features) +#' keytype <- "KEGG" +#' +#' getMetaboliteMapping(features, keytype) +#' +#' getMetaboliteMapping(features, keytype = "KEGG", returntype = "CID") +#' +#' @importFrom dplyr pull filter distinct +#' @importFrom metaboliteIDmapping metabolitesMapping +#' @importFrom magrittr %>% +#' +#' @export +getMetaboliteMapping <- function(features, keytype, returntype = "HMDB") { + ## check for the correct metabolite mapping format + supportedIDs <- c( + "HMDB", "ChEBI", "KEGG", "CAS", "DTXCID", + "DTXSID", "SID", "CID", "Drugbank" + ) + if (!returntype %in% supportedIDs) { + stop("Insert one of the following IDs to be returned (returntype): + HMDB, CAS, ChEBI, KEGG, SID, CID, DTXCID, DTXSID, Drugbank, Name", + call. = FALSE + ) + } + + ## load the mapping table which is deposited in the + ## metaboliteIDmapping package. + if (!requireNamespace("metaboliteIDmapping", quietly = TRUE)) { + stop("The necessary package metaboliteIDmapping is not installed.", + call. = FALSE + ) + } + + + ## run the actual mapping of IDS and return a list of the user-given type + map <- tryCatch( + { + ## to speed up the mapping, we need to subest the whole + ## metabolitesIDmapping table in the first place to contain + ## only thoses entries that match the given feature list + SUBmappingTable <- metaboliteIDmapping::metabolitesMapping %>% + dplyr::select(!!as.name(keytype), !!as.name(returntype)) %>% + dplyr::filter(!!as.name(keytype) %in% unique(features)) %>% + dplyr::distinct() + colnames(SUBmappingTable) <- c("Original", "Mapped") + + SUBmappingTable %>% dplyr::pull("Mapped") + }, + error = function(cond) { + return( + rep("NA", length(features)) + ) + } + ) + + map <- map[!is.na(map)] + return(map) +} + + + +#' Collect feature mapping for user given databases and omics layer. +#' +#' The functions makes use of the graphite R package to collect pathways from +#' user specified databases. Depending on the omics layer specified, the +#' function extracts either annotated genes/proteins (for transcriptome, +#' proteome layer) or metabolites (for metabolite layer). The data structure +#' that is returned is mandatory to calculate the multi-omics pathway +#' enrichment. +#' +#' @param dbs List of databases that should be queried for pathways. Default: +#' all available databases +#' @param layer List of omics layer that should be addressed. Default: all three +#' layer (transcriptome, proteome, metabolome) +#' @param returnTranscriptome String specifying the returned gene ID format. +#' Default: SYMBOL Options: SYMBOL, ENTREZID, UNIPROT, ENSEMBL, REFSEQ +#' @param returnProteome String specifying the returned protein ID format. +#' Default: SYMBOL Options: SYMBOL, ENTREZID, UNIPROT, ENSEMBL, REFSEQ +#' @param returnMetabolome String specifying the returned metabolite ID format. +#' Default: HMDB Options: HMDB, CAS, DTXCID, DTXSID, SID, CID, ChEBI, KEGG, Drugbank +#' @param organism String specifying the organism of interest. This has direct +#' influence on the available pathway databases. Default: "hsapiens" +#' Options: see \code{\link[multiGSEA]{getOrganisms}} +#' @param useLocal Boolean to use local pathway/feature descriptions. In case +#' useLocal is set to FALSE, pathway definitions and feature extraction +#' will be recalculated. This could take several minutes depending on the +#' database used. Pathbank, for example, contains nearly 50000 pathway +#' definition that have to be re-mapped. useLocal has no effect when +#' pathway definitions are retrieved for the first time. Default: TRUE +#' +#' @return Nested list with extracted and mapped pathway features. +#' +#' @examples +#' +#' getMultiOmicsFeatures( +#' dbs = c("kegg"), +#' layer = c("transcriptome", "proteome"), +#' organism = "hsapiens" +#' ) +#' \donttest{ +#' getMultiOmicsFeatures( +#' dbs = c("kegg", "reactome"), +#' layer = c("transcriptome", "metabolome"), +#' organism = "mmusculus" +#' ) +#' +#' getMultiOmicsFeatures( +#' dbs = c("reactome"), +#' layer = c("proteome"), +#' organism = "rnorvegicus", +#' returnProteome = "ENTREZID" +#' ) +#' } +#' @importFrom graphite pathwayDatabases pathways +#' @importFrom magrittr %>% +#' @importFrom dplyr filter pull +#' @importFrom rlang .data +#' +#' @export +getMultiOmicsFeatures <- function(dbs = c("all"), layer = c("all"), + returnTranscriptome = "SYMBOL", + returnProteome = "SYMBOL", + returnMetabolome = "HMDB", + organism = "hsapiens", + useLocal = TRUE) { + layers <- c("all", "metabolome", "proteome", "transcriptome") + organism <- tolower(organism) + + returnTranscriptome <- toupper(returnTranscriptome) + returnProteome <- toupper(returnProteome) + returnMetabolome <- toupper(returnMetabolome) + if(returnMetabolome == "CHEBI") returnMetabolome <- "ChEBI" + if(returnMetabolome == "DRUGBANK") returnMetabolome <- "Drugbank" + + ## check for the correct transcriptome mapping format + supportedIDs <- c("SYMBOL", "ENTREZID", "UNIPROT", "ENSEMBL", "REFSEQ") + if (!returnTranscriptome %in% supportedIDs) { + stop("Insert one of the following IDs to be returned (returnTranscriptome): + SYMBOL, ENTREZID, UNIPROT, ENSEMBL, REFSEQ", + call. = FALSE + ) + } + + ## check for the correct proteome mapping format + if (!returnProteome %in% supportedIDs) { + stop("Insert one of the following IDs to be returned (returnProteome): + SYMBOL, ENTREZID, UNIPROT, ENSEMBL, REFSEQ", + call. = FALSE + ) + } + + ## check for the correct metabolite mapping format + supportedIDs <- c( + "HMDB", "ChEBI", "KEGG", "CAS", "DTXCID", + "DTXSID", "SID", "CID", "Drugbank" + ) + if (!returnMetabolome %in% supportedIDs) { + stop("Insert one of the following IDs to be returned (returnMetabolome): + HMDB, CAS, ChEBI, KEGG, SID, CID, DTXCID, DTXSID, Drugbank", + call. = FALSE + ) + } + + ## check if the given organism is supported + if (!(organism %in% getOrganisms())) { + stop("You entered an organism that is not supported! + Use getOrganisms() to get a list of all suported organisms.", + call. = FALSE + ) + } + + if (sum(tolower(layer) %in% layers) != length(layer)) { + stop("You entered wrong input for the omics layer specification. + Options are: all, transcriptome, proteome, metabolome, or a combination thereof.", + call. = FALSE + ) + } + + pDBs <- graphite::pathwayDatabases() + dbs0 <- pDBs %>% + dplyr::filter(.data$species == organism) %>% + dplyr::pull(.data$database) + databases <- c("all", as.vector(dbs0)) + + if (sum(tolower(dbs) %in% databases) != length(dbs)) { + stop(paste0("You entered wrong input for the omics layer specification. + Options are: ", paste(databases, collapse = " "), " or a combination thereof."), + call. = FALSE + ) + } + + if ("all" %in% dbs) dbs <- as.vector(dbs0) + + if ("all" %in% layer) { + layer <- c("transcriptome", "proteome", "metabolome") + } + + pathways <- lapply(dbs, function(x) { + graphite::pathways(organism, x) + }) + names(pathways) <- dbs + + features <- list() + if ("transcriptome" %in% layer) { + features$transcriptome <- getMappedFeatures( + pathways = pathways, + organism = organism, + returnID = returnTranscriptome, + useLocal = useLocal + ) + + ## adapt for duplicated pathways + names(features$transcriptome) <- rename_duplicates(names(features$transcriptome)) + } + + if ("proteome" %in% layer) { + if ("transcriptome" %in% layer && returnProteome == returnTranscriptome) { + features$proteome <- features$transcriptome + } else { + features$proteome <- getMappedFeatures( + pathways = pathways, + organism = organism, + returnID = returnProteome, + useLocal = useLocal + ) + + ## adapt for duplicated pathways + names(features$proteome) <- rename_duplicates(names(features$proteome)) + } + } + + + if ("metabolome" %in% layer) { + features$metabolome <- getMappedFeatures( + pathways = pathways, + organism = organism, + returnID = returnMetabolome, + which = "metabolites", + useLocal = useLocal + ) + + ## adapt for duplicated pathways + names(features$metabolome) <- rename_duplicates(names(features$metabolome)) + } + + return(features) +} + + + +#' Wrapper to get feature mappings. +#' +#' Feature mappings will be used from hard disk in case they have been +#' mapped before and `useLocal` is not set to be FALSE. +#' In other cases, a feature extraction will be done and the results are +#' stored for a following occasion. +#' +#' @param pathways List of pathway definitions. +#' @param returnID String specifying the returned ID format. +#' @param organism String defining the organism of analysis. +#' @param which Mode to extract the features, either \'proteins\' or +#' \'metabolites\'. +#' @param useLocal Boolean specifying whether or not to use the local +#' preprocessed mapping. +#' +#' @return List of mapped features for an omics layer. +getMappedFeatures <- function(pathways, returnID = "SYMBOL", organism = "hsapiens", which = "proteins", useLocal = TRUE) { + feat <- unlist(lapply(names(pathways), function(db) { + ap <- archivePath(paste0(organism, "_", db, "_", returnID)) + + if (file.exists(ap) && useLocal) { + loadLocal(ap) + } else { + tmp <- lapply(pathways[[db]], function(p) { + getFeatures( + pathway = p, org = organism, + which = which, + returntype = returnID + ) + }) + + header <- rep(paste0("(", toupper(db), ") "), length(pathways[[db]])) + names(tmp) <- paste0(header, names(tmp)) + saveRDS(tmp, file = ap) + tmp + } + }), recursive = FALSE) + + return(feat) +} + + + +#' Helper function to map only a subset of metabolite IDs +#' +#' This helper function becomes necessary since there are sometimes multiple ID +#' formats used in a single pathway definition. +#' +#' @param features List of metabolite feature IDs of the pathway. +#' @param keytype String specifying the ID format in pathway definition. +#' @param maptype String specifying the corresponding ID format in multiGSEA. +#' @param returntype String identifying the ID type that should be mapped. +#' +#' @return List of mapped metabolite IDs. +mapIDType <- function(features, keytype = "CHEBI", maptype = "ChEBI", returntype = "HMDB") { + mapped <- c() + ids <- gsub(paste0(keytype, ":"), "", features[grep(keytype, features)]) + + if (returntype != maptype) { + mapped <- getMetaboliteMapping( + features = ids, + keytype = maptype, + returntype = returntype + ) + } else { + mapped <- ids + } + + return(mapped) +} + + + +#' Get list of supported organisms +#' +#' Get a list of organisms that are covered in our workflow through a supporting +#' `AnnotationDBi` package. Without such a package we would not be able to map +#' transcript and protein identifier between different formats. All the +#' organisms that are listed here have at lest kegg and or reactome pathway +#' annotation that can be queried by `graphite`. +#' +#' @return List of supported organisms +#' +#' @examples +#' getOrganisms() +#' @export +getOrganisms <- function() { + orglist <- c( + "hsapiens", "rnorvegicus", "mmusculus", "sscrofa", + "btaurus", "celegans", "dmelanogaster", "drerio", + "ggallus", "xlaevis", "cfamiliaris" + ) + + return(orglist) +} + + + +#' Get the correct ID mapping database +#' +#' Check by means of the given organism name if the required `AnnotationDbi` +#' package is installed. Select the ID mapping table based on the organism name +#' and return it. +#' +#' @param organism String that defines the organism. +#' +#' @return AnnotationDbi database for ID mapping. +getIDMappingDatabase <- function(organism) { + map <- c( + hsapiens = "org.Hs.eg.db", rnorvegicus = "org.Rn.eg.db", + mmusculus = "org.Mm.eg.db", sscrofa = "org.Ss.eg.db", + btaurus = "org.Bt.eg.db", celegans = "org.Ce.eg.db", + dmelanogaster = "org.Dm.eg.db", drerio = "org.Dr.eg.db", + ggallus = "org.Gg.eg.db", xlaevis = "org.Xl.eg.db", + cfamiliaris = "org.Cf.eg.db" + ) + + stopifnot(organism %in% names(map)) + pkg <- map[[organism]] + + if (!requireNamespace(pkg, quietly = TRUE)) { + stop(paste0("The necessary package ", pkg, " is not installed."), + call. = FALSE + ) + } + + return(get(pkg, envir = getNamespace(pkg))) +} + + + +#' Pre-rank features prior to calculating enrichment scores. +#' +#' Rank features based on the direction of their fold change and their magnitude +#' implicated through their assigned p-value. +#' +#' @param logFC Vector containing the log-transformed fold changes of features. +#' @param pvalues Vector containing the p-values associated with those logFCs. +#' @param base Integer specifying the base of the logarithm. Default: 10 +#' +#' @return Vector of pre-ranked features, still unsorted +#' +#' @examples +#' logFC <- rnorm(10) +#' pvalues <- runif(10) +#' rankFeatures(logFC, pvalues) +#' +#' @export +rankFeatures <- function(logFC, pvalues, base = 10) { + return(sign(logFC) * -log(pvalues, base = base)) +} + + + +#' Create an empty data structure for measured omics features +#' +#' This function creates a data structure of nested but empty lists. One list +#' for each omics layer. By default all three supported omics layer are used to +#' create a data structures with three empty sublists: transcriptome, proteome, +#' and metabolome. +#' +#' @param layer List specifying the omics layer which should be created +#' +#' @return List with length(layer) empty sublists +#' +#' @examples +#' initOmicsDataStructure() +#' initOmicsDataStructure(c("transcriptome", "proteome")) +#' initOmicsDataStructure(c("Transcriptome", "Metabolome")) +#' @export +initOmicsDataStructure <- function(layer = c("transcriptome", "proteome", "metabolome")) { + l <- c() + layer <- tolower(layer) + if (length(grep("trans*", layer)) > 0) l <- c(l, "transcriptome") + if (length(grep("prote*", layer)) > 0) l <- c(l, "proteome") + if (length(grep("metabo*", layer)) > 0) l <- c(l, "metabolome") + + if (length(l) != length(layer)) { + stop("Not all your omics layer could be mapped to + 'transcriptome', 'proteome', or 'metabolome'. ", + call. = TRUE + ) + } + + empty_structure <- rep(list(list()), length(l)) + names(empty_structure) <- l + + return(empty_structure) +} + + + +#' Helper function to get all different metabolite ID formats +#' +#' This helper function extracts all used ID formats in all pathways +#' and returns a nested list for each pathway database. +#' +#' @param pathways List of pathway databases and their pathway definition. +#' +#' @return List of metabolite ID formats. +#' +#' @importFrom graphite nodes +getMetaboliteIDformats <- function(pathways) { + n1 <- lapply(names(pathways), function(dbs) { + n2 <- lapply(pathways[[dbs]], function(p) { + unique(gsub(":.*", "", graphite::nodes(p, which = "metabolites"))) + }) + + unique(unlist(n2)) + }) + + names(n1) <- names(pathways) + return(n1) +}