|
a |
|
b/R/XenaDownload.R |
|
|
1 |
##' Download Datasets from UCSC Xena Hubs |
|
|
2 |
##' |
|
|
3 |
##' Avaliable datasets list: <https://xenabrowser.net/datapages/> |
|
|
4 |
##' |
|
|
5 |
##' @author Shixiang Wang <w_shixiang@163.com> |
|
|
6 |
##' @param xquery a tibble object generated by [XenaQuery] function. |
|
|
7 |
##' @param destdir specify a location to store download data. Default is system temp directory. |
|
|
8 |
##' @param download_probeMap if `TRUE`, also download ProbeMap data, which used for id mapping. |
|
|
9 |
##' @param trans_slash logical, default is `FALSE`. If `TRUE`, transform slash '/' in dataset id |
|
|
10 |
##' to '__'. This option is for backwards compatibility. |
|
|
11 |
##' @param max_try time limit to try downloading the data. |
|
|
12 |
##' @param force logical. if `TRUE`, force to download data no matter whether files exist. |
|
|
13 |
##' Default is `FALSE`. |
|
|
14 |
##' @param ... other argument to `download.file` function |
|
|
15 |
##' @return a `tibble` |
|
|
16 |
##' @export |
|
|
17 |
##' @importFrom utils download.file |
|
|
18 |
##' @importFrom dplyr filter |
|
|
19 |
##' @examples |
|
|
20 |
##' \dontrun{ |
|
|
21 |
##' xe = XenaGenerate(subset = XenaHostNames == "tcgaHub") |
|
|
22 |
##' hosts(xe) |
|
|
23 |
##' xe_query = XenaQuery(xe) |
|
|
24 |
##' xe_download = XenaDownload(xe_query) |
|
|
25 |
##' } |
|
|
26 |
|
|
|
27 |
XenaDownload <- function(xquery, |
|
|
28 |
destdir = tempdir(), |
|
|
29 |
download_probeMap = FALSE, |
|
|
30 |
trans_slash = FALSE, |
|
|
31 |
force = FALSE, |
|
|
32 |
max_try = 3L, |
|
|
33 |
...) { |
|
|
34 |
stopifnot(is.data.frame(xquery), c("url") %in% names(xquery), is.logical(download_probeMap)) |
|
|
35 |
|
|
|
36 |
if (download_probeMap) { |
|
|
37 |
xquery_probe <- UCSCXenaTools::XenaData %>% |
|
|
38 |
dplyr::filter(XenaDatasets %in% xquery$datasets) %>% |
|
|
39 |
XenaGenerate() %>% |
|
|
40 |
XenaQueryProbeMap() |
|
|
41 |
xquery <- dplyr::bind_rows(xquery, xquery_probe) |
|
|
42 |
} |
|
|
43 |
|
|
|
44 |
if (trans_slash) { |
|
|
45 |
xquery$fileNames <- gsub( |
|
|
46 |
pattern = "/", |
|
|
47 |
replacement = "__", |
|
|
48 |
x = xquery$datasets |
|
|
49 |
) |
|
|
50 |
} else { |
|
|
51 |
xquery$fileNames <- xquery$datasets |
|
|
52 |
} |
|
|
53 |
|
|
|
54 |
xquery$fileNames <- ifelse(grepl("\\.gz", xquery$url), |
|
|
55 |
paste0(xquery$fileNames, ".gz"), |
|
|
56 |
xquery$fileNames |
|
|
57 |
) |
|
|
58 |
# destdir = paste0(destdir,"/") |
|
|
59 |
xquery$destfiles <- file.path(destdir, xquery$fileNames) |
|
|
60 |
|
|
|
61 |
if (!dir.exists(destdir)) { |
|
|
62 |
dir.create(destdir, recursive = TRUE, showWarnings = FALSE) |
|
|
63 |
} |
|
|
64 |
|
|
|
65 |
message("All downloaded files will under directory ", destdir, ".") |
|
|
66 |
if (!trans_slash) { |
|
|
67 |
dir_names <- dirname(xquery$destfiles) |
|
|
68 |
message("The 'trans_slash' option is FALSE, keep same directory structure as Xena.") |
|
|
69 |
message("Creating directories for datasets...") |
|
|
70 |
for (i in dir_names) { |
|
|
71 |
dir.create(i, recursive = TRUE, showWarnings = FALSE) |
|
|
72 |
} |
|
|
73 |
} |
|
|
74 |
|
|
|
75 |
# Make sure the order is right |
|
|
76 |
xquery <- dplyr::select(xquery, c("hosts", "datasets", "url", "fileNames", "destfiles"), dplyr::everything()) |
|
|
77 |
download_dataset <- function(x) { # nocov start |
|
|
78 |
tryCatch( |
|
|
79 |
{ |
|
|
80 |
if (!file.exists(x[5]) | force) { |
|
|
81 |
message("Downloading ", x[4]) |
|
|
82 |
download.file2(x[3], destfile = x[5], max_try = max_try, ...) |
|
|
83 |
} else { |
|
|
84 |
message(x[5], ", the file has been download!") |
|
|
85 |
} |
|
|
86 |
}, |
|
|
87 |
error = function(e) { |
|
|
88 |
message( |
|
|
89 |
"Can not find file", |
|
|
90 |
x[4], |
|
|
91 |
", this file maybe not compressed." |
|
|
92 |
) |
|
|
93 |
x[3] <- gsub(pattern = "\\.gz$", "", x[3]) |
|
|
94 |
x[4] <- gsub(pattern = "\\.gz$", "", x[4]) |
|
|
95 |
x[5] <- gsub(pattern = "\\.gz$", "", x[5]) |
|
|
96 |
message("Try downloading file", x[4], "...") |
|
|
97 |
tryCatch( |
|
|
98 |
{ |
|
|
99 |
download.file2(x[3], destfile = x[5], max_try = max_try, ...) |
|
|
100 |
}, |
|
|
101 |
error = function(e) { |
|
|
102 |
message("Your network is bad (try again) or the data source is invalid (report to the developer).") |
|
|
103 |
invisible(NULL) |
|
|
104 |
} |
|
|
105 |
) |
|
|
106 |
} |
|
|
107 |
) |
|
|
108 |
} |
|
|
109 |
|
|
|
110 |
apply(xquery, 1, download_dataset) # nocov end |
|
|
111 |
|
|
|
112 |
if (trans_slash) { |
|
|
113 |
message( |
|
|
114 |
"Note file names inherit from names in datasets column\n and '/' all changed to '__'." |
|
|
115 |
) |
|
|
116 |
} |
|
|
117 |
|
|
|
118 |
invisible(xquery) |
|
|
119 |
} |
|
|
120 |
|
|
|
121 |
download.file2 <- function(url, destfile, |
|
|
122 |
max_try = 3L, |
|
|
123 |
...) { |
|
|
124 |
Sys.sleep(0.01) |
|
|
125 |
tryCatch( |
|
|
126 |
{ |
|
|
127 |
if (abs(max_try - 4L) > 1) { |
|
|
128 |
message("==> Trying #", abs(max_try - 4L)) |
|
|
129 |
} |
|
|
130 |
download.file(url, destfile, ...) |
|
|
131 |
}, |
|
|
132 |
error = function(e) { |
|
|
133 |
if (max_try == 1) { |
|
|
134 |
message("Tried 3 times but failed, please check your internet connection!") |
|
|
135 |
invisible(NULL) |
|
|
136 |
} else { |
|
|
137 |
download.file2(url, destfile, max_try = max_try - 1L, ...) |
|
|
138 |
} |
|
|
139 |
} |
|
|
140 |
) |
|
|
141 |
|
|
|
142 |
} |