|
a |
|
b/R/XenaHub-class.R |
|
|
1 |
# FUN: Set Class & operate XenaHub object directly |
|
|
2 |
|
|
|
3 |
##' Class XenaHub |
|
|
4 |
##' @description a S4 class to represent UCSC Xena Data Hubs |
|
|
5 |
##' @slot hosts hosts of data hubs |
|
|
6 |
##' @slot cohorts cohorts of data hubs |
|
|
7 |
##' @slot datasets datasets of data hubs |
|
|
8 |
##' @importFrom methods new |
|
|
9 |
##' @export |
|
|
10 |
.XenaHub <- setClass( |
|
|
11 |
"XenaHub", |
|
|
12 |
representation = representation( |
|
|
13 |
hosts = "character", |
|
|
14 |
cohorts = "character", |
|
|
15 |
datasets = "character" |
|
|
16 |
) |
|
|
17 |
) |
|
|
18 |
|
|
|
19 |
setMethod("show", "XenaHub", function(object) { |
|
|
20 |
showsome <- function(label, x) { |
|
|
21 |
len <- length(x) |
|
|
22 |
if (len > 6) { |
|
|
23 |
x <- c(head(x, 3), "...", tail(x, 2)) |
|
|
24 |
} |
|
|
25 |
cat(label, |
|
|
26 |
"() (", |
|
|
27 |
len, |
|
|
28 |
" total):", |
|
|
29 |
"\n ", |
|
|
30 |
paste0(x, collapse = "\n "), |
|
|
31 |
"\n", |
|
|
32 |
sep = "" |
|
|
33 |
) |
|
|
34 |
} |
|
|
35 |
cat("class:", class(object), "\n") |
|
|
36 |
cat("hosts():", |
|
|
37 |
"\n ", paste0(hosts(object), collapse = "\n "), |
|
|
38 |
"\n", |
|
|
39 |
sep = "" |
|
|
40 |
) |
|
|
41 |
showsome("cohorts", cohorts(object)) |
|
|
42 |
showsome("datasets", datasets(object)) |
|
|
43 |
}) |
|
|
44 |
|
|
|
45 |
|
|
|
46 |
##' @title UCSC Xena Default Hosts |
|
|
47 |
##' @description Return Xena default hosts |
|
|
48 |
##' @return A character vector include current defalut hosts |
|
|
49 |
##' @author Shixiang Wang <w_shixiang@163.com> |
|
|
50 |
##' @seealso [UCSCXenaTools::XenaHub()] |
|
|
51 |
##' @export |
|
|
52 |
xena_default_hosts <- function() { |
|
|
53 |
c( |
|
|
54 |
"https://ucscpublic.xenahubs.net", |
|
|
55 |
"https://tcga.xenahubs.net", |
|
|
56 |
"https://gdc.xenahubs.net", |
|
|
57 |
"https://gdcV18.xenahubs.net", |
|
|
58 |
"https://icgc.xenahubs.net", |
|
|
59 |
"https://toil.xenahubs.net", |
|
|
60 |
"https://pancanatlas.xenahubs.net", |
|
|
61 |
"https://xena.treehouse.gi.ucsc.edu:443", |
|
|
62 |
"https://pcawg.xenahubs.net", |
|
|
63 |
"https://atacseq.xenahubs.net", |
|
|
64 |
"https://singlecellnew.xenahubs.net", |
|
|
65 |
"https://kidsfirst.xenahubs.net", |
|
|
66 |
"https://tdi.xenahubs.net" |
|
|
67 |
) |
|
|
68 |
} |
|
|
69 |
|
|
|
70 |
.xena_hosts <- c( |
|
|
71 |
"publicHub", |
|
|
72 |
"tcgaHub", |
|
|
73 |
"gdcHub", |
|
|
74 |
"gdcHubV18", |
|
|
75 |
"icgcHub", |
|
|
76 |
"toilHub", |
|
|
77 |
"pancanAtlasHub", |
|
|
78 |
"treehouseHub", |
|
|
79 |
"pcawgHub", |
|
|
80 |
"atacseqHub", |
|
|
81 |
"singlecellHub", |
|
|
82 |
"kidsfirstHub", |
|
|
83 |
"tdiHub" |
|
|
84 |
) |
|
|
85 |
|
|
|
86 |
names(.xena_hosts) <- xena_default_hosts() |
|
|
87 |
|
|
|
88 |
# Add Hiplot mirror url |
|
|
89 |
# Still use UCSC Xena URL if it is not available |
|
|
90 |
.xena_hosts_hiplot <- .xena_hosts |
|
|
91 |
names(.xena_hosts_hiplot) <- c( |
|
|
92 |
"https://xena-ucscpublic.hiplot.com.cn", |
|
|
93 |
"https://xena-tcga.hiplot.com.cn", |
|
|
94 |
"https://xena-gdc.hiplot.com.cn", |
|
|
95 |
"https://xena-icgc.hiplot.com.cn", |
|
|
96 |
"https://xena-toil.hiplot.com.cn", |
|
|
97 |
"https://xena-pancanatlas.hiplot.com.cn", |
|
|
98 |
"https://xena.treehouse.gi.ucsc.edu:443", #! |
|
|
99 |
"https://xena-pcawg.hiplot.com.cn", |
|
|
100 |
"https://xena-atacseq.hiplot.com.cn", |
|
|
101 |
"https://singlecellnew.xenahubs.net", #! |
|
|
102 |
"https://kidsfirst.xenahubs.net" #! |
|
|
103 |
#"https://tdi.xenahubs.net" #! |
|
|
104 |
) |
|
|
105 |
# Map hiplot to ucsc |
|
|
106 |
.xena_mirror_map <- names(.xena_hosts) |
|
|
107 |
names(.xena_mirror_map) <- names(.xena_hosts_hiplot) |
|
|
108 |
# Map ucsc to hiplot |
|
|
109 |
.xena_mirror_map_rv <- names(.xena_hosts_hiplot) |
|
|
110 |
names(.xena_mirror_map_rv) <- names(.xena_hosts) |
|
|
111 |
|
|
|
112 |
##' Generate a XenaHub Object |
|
|
113 |
##' |
|
|
114 |
##' It is used to generate original |
|
|
115 |
##' `XenaHub` object according to hosts, cohorts, datasets or hostName. |
|
|
116 |
##' If these arguments not specified, all hosts and corresponding datasets |
|
|
117 |
##' will be returned as a `XenaHub` object. All datasets can be found |
|
|
118 |
##' at <https://xenabrowser.net/datapages/>. |
|
|
119 |
##' |
|
|
120 |
##' |
|
|
121 |
##' @param hosts a character vector specify UCSC Xena hosts, all available hosts can be |
|
|
122 |
##' found by `xena_default_hosts()` function. `hostName` is a more recommend option. |
|
|
123 |
##' @param cohorts default is empty character vector, all cohorts will be returned. |
|
|
124 |
##' @param datasets default is empty character vector, all datasets will be returned. |
|
|
125 |
##' @param hostName name of host, available options can be accessed by `.xena_hosts` |
|
|
126 |
##' This is an easier option for user than `hosts` option. Note, this option |
|
|
127 |
##' will overlap `hosts`. |
|
|
128 |
##' @return a [XenaHub] object |
|
|
129 |
##' @author Shixiang Wang <w_shixiang@163.com> |
|
|
130 |
##' @export |
|
|
131 |
##' @importFrom httr stop_for_status POST content |
|
|
132 |
##' @importFrom utils head tail |
|
|
133 |
##' @examples |
|
|
134 |
##' \dontrun{ |
|
|
135 |
##' #1 query all hosts, cohorts and datasets |
|
|
136 |
##' xe = XenaHub() |
|
|
137 |
##' xe |
|
|
138 |
##' #2 query only TCGA hosts |
|
|
139 |
##' xe = XenaHub(hostName = "tcgaHub") |
|
|
140 |
##' xe |
|
|
141 |
##' hosts(xe) # get hosts |
|
|
142 |
##' cohorts(xe) # get cohorts |
|
|
143 |
##' datasets(xe) # get datasets |
|
|
144 |
##' samples(xe) # get samples |
|
|
145 |
##' } |
|
|
146 |
XenaHub <- function(hosts = xena_default_hosts(), |
|
|
147 |
cohorts = character(), |
|
|
148 |
datasets = character(), |
|
|
149 |
hostName = c( |
|
|
150 |
"publicHub", |
|
|
151 |
"tcgaHub", |
|
|
152 |
"gdcHub", |
|
|
153 |
"gdcHubV18", |
|
|
154 |
"icgcHub", |
|
|
155 |
"toilHub", |
|
|
156 |
"pancanAtlasHub", |
|
|
157 |
"treehouseHub", |
|
|
158 |
"pcawgHub", |
|
|
159 |
"atacseqHub", |
|
|
160 |
"singlecellHub", |
|
|
161 |
"kidsfirstHub", |
|
|
162 |
"tdiHub" |
|
|
163 |
)) { |
|
|
164 |
stopifnot( |
|
|
165 |
is.character(hosts), |
|
|
166 |
is.character(cohorts), |
|
|
167 |
is.character(datasets) |
|
|
168 |
) |
|
|
169 |
|
|
|
170 |
hostName <- unique(hostName) |
|
|
171 |
|
|
|
172 |
if (length(hostName) != length(.xena_hosts) & |
|
|
173 |
all( |
|
|
174 |
hostName %in% .xena_hosts |
|
|
175 |
)) { |
|
|
176 |
.temp <- names(.xena_hosts) |
|
|
177 |
names(.temp) <- .xena_hosts |
|
|
178 |
hostNames <- .temp %>% |
|
|
179 |
as.data.frame() %>% |
|
|
180 |
t() %>% |
|
|
181 |
as.data.frame() |
|
|
182 |
rm(.temp) |
|
|
183 |
|
|
|
184 |
hosts <- as.character(hostNames[, hostName]) |
|
|
185 |
} else if (!all(hostName %in% .xena_hosts)) { |
|
|
186 |
stop("Bad hostName, please check") |
|
|
187 |
} |
|
|
188 |
|
|
|
189 |
if (is.null(names(hosts))) { |
|
|
190 |
names(hosts) <- hosts |
|
|
191 |
} |
|
|
192 |
|
|
|
193 |
hosts0 <- hosts |
|
|
194 |
hosts <- Filter(.host_is_alive, hosts) |
|
|
195 |
if (length(hosts) == 0L) { # nocov start |
|
|
196 |
stop( |
|
|
197 |
"\n no hosts responding:", |
|
|
198 |
"\n ", |
|
|
199 |
paste0(hosts0, collapse = "\n ") |
|
|
200 |
) |
|
|
201 |
} # nocov end |
|
|
202 |
|
|
|
203 |
all_cohorts <- unlist(.host_cohorts(hosts), use.names = FALSE) |
|
|
204 |
if (length(cohorts) == 0L) { |
|
|
205 |
cohorts <- all_cohorts |
|
|
206 |
} else { |
|
|
207 |
hosts <- hosts[.cohort_datasets_count(hosts, cohorts) != 0L] |
|
|
208 |
} |
|
|
209 |
|
|
|
210 |
all_datasets <- unlist(.cohort_datasets(hosts, cohorts), |
|
|
211 |
use.names = FALSE |
|
|
212 |
) |
|
|
213 |
if (length(datasets) == 0L) { |
|
|
214 |
datasets <- all_datasets |
|
|
215 |
} else { |
|
|
216 |
if (!all(datasets %in% all_datasets)) { # nocov start |
|
|
217 |
bad_dataset <- datasets[!datasets %in% all_datasets] |
|
|
218 |
message("Following datasets are not in datasets of hosts, ignore them...") |
|
|
219 |
message(bad_dataset) |
|
|
220 |
} # nocov end |
|
|
221 |
datasets <- all_datasets[all_datasets %in% datasets] |
|
|
222 |
} |
|
|
223 |
|
|
|
224 |
|
|
|
225 |
.XenaHub( |
|
|
226 |
hosts = hosts, |
|
|
227 |
cohorts = cohorts, |
|
|
228 |
datasets = datasets |
|
|
229 |
) |
|
|
230 |
} |
|
|
231 |
|
|
|
232 |
##' Get or Update Newest Data Information of UCSC Xena Data Hubs |
|
|
233 |
##' @param saveTolocal logical. Whether save to local R package data directory for permanent use |
|
|
234 |
##' or Not. |
|
|
235 |
##' @return a `data.frame` contains all datasets information of Xena. |
|
|
236 |
##' @author Shixiang Wang <w_shixiang@163.com> |
|
|
237 |
##' @export |
|
|
238 |
##' @examples |
|
|
239 |
##' \dontrun{ |
|
|
240 |
##' XenaDataUpdate() |
|
|
241 |
##' XenaDataUpdate(saveTolocal = TRUE) |
|
|
242 |
##' } |
|
|
243 |
XenaDataUpdate <- function(saveTolocal = TRUE) { # nocov start |
|
|
244 |
# .p_all_cohorts(list(unique(XenaData$XenaHosts)[10]), exclude = list(NULL)) |
|
|
245 |
# .p_dataset_list(list(XenaData$XenaHosts[1]), list(XenaData$XenaCohorts[1])) |
|
|
246 |
message("Disable hiplot firstly.") |
|
|
247 |
options(use_hiplot = FALSE) |
|
|
248 |
try_query = function(h, max_try = 3L) { |
|
|
249 |
Sys.sleep(0.1) |
|
|
250 |
tryCatch( |
|
|
251 |
{ |
|
|
252 |
message("==> Trying #", abs(max_try - 4L)) |
|
|
253 |
.p_all_cohorts(list(h), exclude = list(NULL)) |
|
|
254 |
}, |
|
|
255 |
error = function(e) { |
|
|
256 |
if (max_try == 1) { |
|
|
257 |
warning("Tried 3 times but failed, this hub may down or please check URL or your internet connection!", immediate. = TRUE) |
|
|
258 |
return(NULL) |
|
|
259 |
} else { |
|
|
260 |
try_query(h, max_try - 1L) |
|
|
261 |
} |
|
|
262 |
} |
|
|
263 |
) |
|
|
264 |
} |
|
|
265 |
|
|
|
266 |
query_host = function(h) { |
|
|
267 |
message("==> Searching cohorts for host ", h, "...") |
|
|
268 |
|
|
|
269 |
chs <- try_query(h, max_try = 3L) |
|
|
270 |
if (is.null(chs)) { |
|
|
271 |
return(NULL) |
|
|
272 |
} |
|
|
273 |
chs <- setdiff(chs, "(unassigned)") |
|
|
274 |
message("===> #", length(chs), " cohorts found.") |
|
|
275 |
message("===> Querying datasets info...") |
|
|
276 |
zz <- lapply(chs, function(x, h) { |
|
|
277 |
.p_dataset_list(list(h), list(x)) |
|
|
278 |
}, h = h) %>% |
|
|
279 |
stats::setNames(chs) %>% |
|
|
280 |
dplyr::bind_rows(.id = "XenaCohorts") |
|
|
281 |
message("===> #", nrow(zz), " datasets found.") |
|
|
282 |
message("==> Done for host ", h, "...") |
|
|
283 |
zz |
|
|
284 |
} |
|
|
285 |
|
|
|
286 |
message("=> Obtaining info from UCSC Xena hubs...") |
|
|
287 |
XenaInfo <- lapply(names(.xena_hosts), query_host) %>% |
|
|
288 |
stats::setNames(names(.xena_hosts)) %>% |
|
|
289 |
dplyr::bind_rows(.id = "XenaHosts") |
|
|
290 |
|
|
|
291 |
message("=> Done for obtaining.") |
|
|
292 |
message("=> Parsing datasets metadata...") |
|
|
293 |
|
|
|
294 |
XenaInfo <- XenaInfo %>% |
|
|
295 |
dplyr::rename( |
|
|
296 |
XenaDatasets = .data$name, |
|
|
297 |
SampleCount = .data$count, |
|
|
298 |
DataSubtype = .data$datasubtype, |
|
|
299 |
Type = .data$type, |
|
|
300 |
LongTitle = .data$longtitle, |
|
|
301 |
ProbeMap = .data$probemap |
|
|
302 |
) %>% |
|
|
303 |
dplyr::mutate(XenaHostNames = .xena_hosts[.data$XenaHosts]) |
|
|
304 |
|
|
|
305 |
j_data <- lapply(XenaInfo$text, function(x) { |
|
|
306 |
# decode metadata from json format |
|
|
307 |
# note json data may have different elements for |
|
|
308 |
# different cohort datasets |
|
|
309 |
# more work need to be done here |
|
|
310 |
# |
|
|
311 |
# tt$text contains metadata for dataset |
|
|
312 |
# tt$pmtext contains metadata for probemap |
|
|
313 |
json_df <- jsonlite::parse_json(x) |
|
|
314 |
dplyr::tibble( |
|
|
315 |
Citation = json_df[["citation"]] %||% NA, |
|
|
316 |
Label = json_df[["label"]] %||% NA, |
|
|
317 |
Tags = .collapse_list(json_df[["tags"]]) %||% NA, |
|
|
318 |
AnatomicalOrigin = .collapse_list(json_df[["anatomical_origin"]]) %||% NA, |
|
|
319 |
SampleType = .collapse_list(json_df[["sample_type"]]) %||% NA, |
|
|
320 |
Version = json_df[["version"]] %||% NA, |
|
|
321 |
PrimaryDisease = json_df[["primary_disease"]] %||% NA, |
|
|
322 |
Platform = json_df[["platform"]] %||% NA, |
|
|
323 |
Unit = json_df[["unit"]] %||% NA |
|
|
324 |
) |
|
|
325 |
}) |
|
|
326 |
|
|
|
327 |
message("=> Done for parsing. Tidying...") |
|
|
328 |
|
|
|
329 |
tidy_data <- dplyr::bind_rows(j_data) |
|
|
330 |
XenaData <- dplyr::bind_cols(XenaInfo, tidy_data) |
|
|
331 |
XenaData <- dplyr::as_tibble(XenaData) |
|
|
332 |
|
|
|
333 |
XenaData <- XenaData %>% |
|
|
334 |
dplyr::select( |
|
|
335 |
c( |
|
|
336 |
"XenaHosts", "XenaHostNames", "XenaCohorts", "XenaDatasets", "SampleCount", |
|
|
337 |
"DataSubtype", "Label", "Type", "AnatomicalOrigin", "SampleType", |
|
|
338 |
"Tags", "ProbeMap", "LongTitle", "Citation", "Version", |
|
|
339 |
"Unit", "Platform" |
|
|
340 |
) |
|
|
341 |
) |
|
|
342 |
|
|
|
343 |
message("=> Tidying done.") |
|
|
344 |
|
|
|
345 |
if (saveTolocal) { |
|
|
346 |
message("=> Saving...") |
|
|
347 |
data_dir <- base::system.file("data", package = "UCSCXenaTools") |
|
|
348 |
if (dir.exists(data_dir)) { |
|
|
349 |
save(XenaData, file = file.path(data_dir, "XenaData.rda"), compress = "xz") |
|
|
350 |
} else { |
|
|
351 |
message("There is no data directory ", data_dir) |
|
|
352 |
message("Please check it.") |
|
|
353 |
} |
|
|
354 |
} |
|
|
355 |
message("=> Done.") |
|
|
356 |
XenaData |
|
|
357 |
} # nocov end |
|
|
358 |
|
|
|
359 |
.collapse_list <- function(x) { |
|
|
360 |
sapply(x, function(x) x) %>% paste0(collapse = ",") |
|
|
361 |
} |
|
|
362 |
|
|
|
363 |
`%||%` <- function(x, y) { |
|
|
364 |
# ifelse(is.null(x), y, x) |
|
|
365 |
if (is.null(x)) { |
|
|
366 |
y |
|
|
367 |
} else { |
|
|
368 |
x |
|
|
369 |
} |
|
|
370 |
} |
|
|
371 |
|
|
|
372 |
utils::globalVariables(c( |
|
|
373 |
".p_dataset_metadata", |
|
|
374 |
".p_all_cohorts", |
|
|
375 |
".p_dataset_list" |
|
|
376 |
)) |