--- a +++ b/R/fetch.R @@ -0,0 +1,315 @@ +#' Fetch Data from UCSC Xena Hosts +#' +#' When you want to query just data for several genes/samples from UCSC Xena datasets, a better way +#' is to use these `fetch_` functions instead of downloading a whole dataset. Details about functions +#' please see the following sections. +#' +#' There are three primary data types: dense matrix (samples by probes (or say identifiers)), +#' sparse (sample, position, variant), and segmented (sample, position, value). +#' +#' Dense matrices can be genotypic or phenotypic, it is a sample-by-identifiers matrix. +#' Phenotypic matrices have associated field metadata (descriptive names, codes, etc.). +#' Genotypic matricies may have an associated probeMap, which maps probes to genomic locations. +#' If a matrix has hugo probeMap, the probes themselves are gene names. Otherwise, a probeMap is +#' used to map a gene location to a set of probes. +#' +#' @param host a UCSC Xena host, like "https://toil.xenahubs.net". +#' All available hosts can be printed by [xena_default_hosts()]. +#' @param dataset a UCSC Xena dataset, like "tcga_RSEM_gene_tpm". +#' All available datasets can be printed by running `XenaData$XenaDatasets` or +#' obtained from [UCSC Xena datapages](https://xenabrowser.net/datapages/). +#' @param identifiers Identifiers could be probe (like "ENSG00000000419.12"), +#' gene (like "TP53") etc.. If it is `NULL`, all identifiers in the dataset will be used. +#' @param samples ID of samples, like "TCGA-02-0047-01". +#' If it is `NULL`, all samples in the dataset will be used. However, it is better to download +#' the whole datasets if you query many samples and genes. +#' @param check if `TRUE`, check whether specified `identifiers` and `samples` exist the dataset +#' (all failed items will be filtered out). However, if `FALSE`, the code is much faster. +#' @param use_probeMap if `TRUE`, will check if the dataset has ProbeMap firstly. +#' When the dataset you want to query has a identifier-to-gene mapping, identifiers can be +#' gene symbols even the identifiers of dataset are probes or others. +#' @param time_limit time limit for getting response in seconds. +#' @return a `matirx` or character vector or a `list`. +#' @examples +#' library(UCSCXenaTools) +#' +#' host <- "https://toil.xenahubs.net" +#' dataset <- "tcga_RSEM_gene_tpm" +#' samples <- c("TCGA-02-0047-01", "TCGA-02-0055-01", "TCGA-02-2483-01", "TCGA-02-2485-01") +#' probes <- c("ENSG00000282740.1", "ENSG00000000005.5", "ENSG00000000419.12") +#' genes <- c("TP53", "RB1", "PIK3CA") +#' +#' \donttest{ +#' # Fetch samples +#' fetch_dataset_samples(host, dataset, 2) +#' # Fetch identifiers +#' fetch_dataset_identifiers(host, dataset) +#' # Fetch expression value by probes +#' fetch_dense_values(host, dataset, probes, samples, check = FALSE) +#' # Fetch expression value by gene symbol (if the dataset has probeMap) +#' has_probeMap(host, dataset) +#' fetch_dense_values(host, dataset, genes, samples, check = FALSE, use_probeMap = TRUE) +#' } +#' @export +fetch <- function(host, dataset) { + message("This function is used to build consistent documentation.") + message("Please use a function starts with fetch_") +} + +check_hiplot <- function(host) { + use_hiplot <- getOption("use_hiplot", default = FALSE) + if (use_hiplot) { + # Check website status + use_hiplot <- tryCatch( + { + httr::http_error(host) + TRUE + }, + error = function(e) { + message("The hiplot server may down, we will not use it for now.") + FALSE + } + ) + } + if (use_hiplot) { + if (!grepl("hiplot", host)) { + message("Use hiplot server (China) for mirrored data hubs (set 'options(use_hiplot = FALSE)' to disable it)") + host2 <- as.character(.xena_mirror_map_rv[host]) + if (!is.na(host2)) { + host <- host2 + } + } + } + return(host) +} + +#' @describeIn fetch fetches values from a dense matrix. +#' @export +fetch_dense_values <- function(host, dataset, identifiers = NULL, samples = NULL, + check = TRUE, use_probeMap = FALSE, time_limit = 30) { + stopifnot( + length(host) == 1, length(dataset) == 1, + is.character(host), is.character(dataset), + is.logical(check), is.logical(use_probeMap) + ) + .attach_this() + + host <- check_hiplot(host) + if (check) { + # obtain all samples + all_samples <- fetch_dataset_samples(host, dataset) + # obtain all identifiers + use_probe <- use_probeMap && has_probeMap(host, dataset) + if (use_probe) { + message("-> Obtaining gene symbols from probeMap...") + url <- has_probeMap(host, dataset, return_url = TRUE) + all_identifiers <- use_cache(url, op = "readr::read_tsv(url, + col_types = readr::cols() + )[[2]]") + } else { + all_identifiers <- fetch_dataset_identifiers(host, dataset) + } + + message("-> Checking identifiers...") + if (is.null(identifiers)) { + identifiers <- all_identifiers + } else { + if (!is.character(identifiers)) stop("Bad type for identifiers.") + if (!all(identifiers %in% all_identifiers)) { + which_in <- identifiers %in% all_identifiers + message("The following identifiers have been removed from host ", host, " dataset ", dataset) + print(identifiers[!which_in]) + identifiers <- identifiers[which_in] + if (length(identifiers) == 0) { + stop("Bad identifiers, no one left, check input?") + } + } + } + message("-> Done.") + + message("-> Checking samples...") + if (is.null(samples)) { + samples <- all_samples + } else { + if (!is.character(samples)) stop("Bad type for samples.") + if (!all(samples %in% all_samples)) { + which_in <- samples %in% all_samples + message("The following samples have been removed from host ", host, " dataset ", dataset) + print(samples[!which_in]) + samples <- samples[which_in] + if (length(samples) == 0) { + stop("Bad samples, no one left, check input?") + } + } + } + message("-> Done.") + } else { + if (is.null(samples)) { + # obtain all samples + samples <- fetch_dataset_samples(host, dataset) + } + if (is.null(identifiers)) { + # obtain all identifiers + identifiers <- fetch_dataset_identifiers(host, dataset) + } + } + + + if (length(samples) == 1) { + samples <- as.list(samples) + } + if (length(identifiers) == 1) { + identifiers <- as.list(identifiers) + } + + if (use_probeMap) { + message("-> Checking if the dataset has probeMap...") + if (has_probeMap(host, dataset)) { + message("-> Done. ProbeMap is found.") + + t_start = Sys.time() + while (as.numeric(Sys.time() - t_start) < time_limit) { + res <- tryCatch( + { + .p_dataset_gene_probe_avg(host, dataset, samples, identifiers) + }, + error = function(e) { + message("-> Query faild. Retrying...") + list(has_error = TRUE, error_info = e) + } + ) + if (is.data.frame(res)) { + break() + } + Sys.sleep(1) + } + + if (!is.data.frame(res)) { + stop(paste( + "The response times out and still returns an error", + res$error_info$message, + sep = "\n" + )) + } + + if (!is.null(unlist(res[["scores"]]))) { + res <- t(sapply(res[["scores"]], base::rbind)) + rownames(res) <- identifiers + colnames(res) <- samples + return(res) + } + } + message("-> Done. No probeMap found or error happened, use old way...") + } + + t_start = Sys.time() + while (as.numeric(Sys.time() - t_start) < time_limit) { + res <- tryCatch( + { + .p_dataset_fetch(host, dataset, samples, identifiers) + }, + error = function(e) { + message("-> Query faild. Retrying...") + list(has_error = TRUE, error_info = e) + } + ) + if (is.atomic(res)) { + break() + } + Sys.sleep(1) + } + + if (!is.atomic(res)) { + stop(paste( + "The response times out and still returns an error", + res$error_info$message, + sep = "\n" + )) + } + + message("-> Query done.") + rownames(res) <- identifiers + colnames(res) <- samples + res +} + +#' @describeIn fetch fetches values from a sparse `data.frame`. +#' @param genes gene names. +#' @export +fetch_sparse_values <- function(host, dataset, genes, samples = NULL, + time_limit = 30) { + # fetch_sparse_values("https://ucscpublic.xenahubs.net", "ccle/CCLE_DepMap_18Q2_maf_20180502", c("TP53", "KRAS")) -> mm + stopifnot( + length(host) == 1, length(dataset) == 1, + is.character(host), is.character(dataset) + ) + .attach_this() + + host <- check_hiplot(host) + if (is.null(samples)) { + samples <- fetch_dataset_samples(host, dataset) + } + + t_start = Sys.time() + while (as.numeric(Sys.time() - t_start) < time_limit) { + res <- tryCatch( + { + .p_sparse_data(host, dataset, samples, genes) + }, + error = function(e) { + message("-> Query faild. Retrying...") + list(has_error = TRUE, error_info = e) + } + ) + if (is.list(res)) { + break() + } + Sys.sleep(1) + } + + res +} + +#' @describeIn fetch fetches samples from a dataset +#' @param limit number of samples, if `NULL`, return all samples. +#' @export +fetch_dataset_samples <- function(host, dataset, limit = NULL) { + .attach_this() + host <- check_hiplot(host) + if (is.null(limit)) limit = -1 + .p_dataset_samples(host = host, dataset = dataset, limit = limit) +} + +#' @describeIn fetch fetches identifies from a dataset. +#' @export +fetch_dataset_identifiers <- function(host, dataset) { + .attach_this() + host <- check_hiplot(host) + .p_dataset_field(host = host, dataset = dataset) +} + +#' @describeIn fetch checks if a dataset has ProbeMap. +#' @param return_url if `TRUE`, returns the info of probeMap +#' instead of a logical value when the result exists. +#' @export +has_probeMap <- function(host, dataset, return_url = FALSE) { + .attach_this() + if (! host %in% .xena_mirror_map) { + host <- .xena_mirror_map[host] + } + df <- base::subset(UCSCXenaTools::XenaData, XenaHosts == host & XenaDatasets == dataset) + rv <- !is.na(df[["ProbeMap"]]) + if (rv && return_url) { + paste0( + df$XenaHosts, "/download/", + df[["ProbeMap"]] + ) + } else rv +} + +utils::globalVariables( + c( + ".p_dataset_fetch", ".p_dataset_field", ".p_dataset_gene_probe_avg", + ".p_dataset_samples", ".p_sparse_data" + ) +)