--- a +++ b/R/XenaDownload.R @@ -0,0 +1,142 @@ +##' Download Datasets from UCSC Xena Hubs +##' +##' Avaliable datasets list: <https://xenabrowser.net/datapages/> +##' +##' @author Shixiang Wang <w_shixiang@163.com> +##' @param xquery a tibble object generated by [XenaQuery] function. +##' @param destdir specify a location to store download data. Default is system temp directory. +##' @param download_probeMap if `TRUE`, also download ProbeMap data, which used for id mapping. +##' @param trans_slash logical, default is `FALSE`. If `TRUE`, transform slash '/' in dataset id +##' to '__'. This option is for backwards compatibility. +##' @param max_try time limit to try downloading the data. +##' @param force logical. if `TRUE`, force to download data no matter whether files exist. +##' Default is `FALSE`. +##' @param ... other argument to `download.file` function +##' @return a `tibble` +##' @export +##' @importFrom utils download.file +##' @importFrom dplyr filter +##' @examples +##' \dontrun{ +##' xe = XenaGenerate(subset = XenaHostNames == "tcgaHub") +##' hosts(xe) +##' xe_query = XenaQuery(xe) +##' xe_download = XenaDownload(xe_query) +##' } + +XenaDownload <- function(xquery, + destdir = tempdir(), + download_probeMap = FALSE, + trans_slash = FALSE, + force = FALSE, + max_try = 3L, + ...) { + stopifnot(is.data.frame(xquery), c("url") %in% names(xquery), is.logical(download_probeMap)) + + if (download_probeMap) { + xquery_probe <- UCSCXenaTools::XenaData %>% + dplyr::filter(XenaDatasets %in% xquery$datasets) %>% + XenaGenerate() %>% + XenaQueryProbeMap() + xquery <- dplyr::bind_rows(xquery, xquery_probe) + } + + if (trans_slash) { + xquery$fileNames <- gsub( + pattern = "/", + replacement = "__", + x = xquery$datasets + ) + } else { + xquery$fileNames <- xquery$datasets + } + + xquery$fileNames <- ifelse(grepl("\\.gz", xquery$url), + paste0(xquery$fileNames, ".gz"), + xquery$fileNames + ) + # destdir = paste0(destdir,"/") + xquery$destfiles <- file.path(destdir, xquery$fileNames) + + if (!dir.exists(destdir)) { + dir.create(destdir, recursive = TRUE, showWarnings = FALSE) + } + + message("All downloaded files will under directory ", destdir, ".") + if (!trans_slash) { + dir_names <- dirname(xquery$destfiles) + message("The 'trans_slash' option is FALSE, keep same directory structure as Xena.") + message("Creating directories for datasets...") + for (i in dir_names) { + dir.create(i, recursive = TRUE, showWarnings = FALSE) + } + } + + # Make sure the order is right + xquery <- dplyr::select(xquery, c("hosts", "datasets", "url", "fileNames", "destfiles"), dplyr::everything()) + download_dataset <- function(x) { # nocov start + tryCatch( + { + if (!file.exists(x[5]) | force) { + message("Downloading ", x[4]) + download.file2(x[3], destfile = x[5], max_try = max_try, ...) + } else { + message(x[5], ", the file has been download!") + } + }, + error = function(e) { + message( + "Can not find file", + x[4], + ", this file maybe not compressed." + ) + x[3] <- gsub(pattern = "\\.gz$", "", x[3]) + x[4] <- gsub(pattern = "\\.gz$", "", x[4]) + x[5] <- gsub(pattern = "\\.gz$", "", x[5]) + message("Try downloading file", x[4], "...") + tryCatch( + { + download.file2(x[3], destfile = x[5], max_try = max_try, ...) + }, + error = function(e) { + message("Your network is bad (try again) or the data source is invalid (report to the developer).") + invisible(NULL) + } + ) + } + ) + } + + apply(xquery, 1, download_dataset) # nocov end + + if (trans_slash) { + message( + "Note file names inherit from names in datasets column\n and '/' all changed to '__'." + ) + } + + invisible(xquery) +} + +download.file2 <- function(url, destfile, + max_try = 3L, + ...) { + Sys.sleep(0.01) + tryCatch( + { + if (abs(max_try - 4L) > 1) { + message("==> Trying #", abs(max_try - 4L)) + } + download.file(url, destfile, ...) + }, + error = function(e) { + if (max_try == 1) { + message("Tried 3 times but failed, please check your internet connection!") + invisible(NULL) + } else { + download.file2(url, destfile, max_try = max_try - 1L, ...) + } + } + ) + +}