[ea2224]: / R / XenaHub-class.R

Download this file

377 lines (344 with data), 10.9 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
# FUN: Set Class & operate XenaHub object directly
##' Class XenaHub
##' @description a S4 class to represent UCSC Xena Data Hubs
##' @slot hosts hosts of data hubs
##' @slot cohorts cohorts of data hubs
##' @slot datasets datasets of data hubs
##' @importFrom methods new
##' @export
.XenaHub <- setClass(
"XenaHub",
representation = representation(
hosts = "character",
cohorts = "character",
datasets = "character"
)
)
setMethod("show", "XenaHub", function(object) {
showsome <- function(label, x) {
len <- length(x)
if (len > 6) {
x <- c(head(x, 3), "...", tail(x, 2))
}
cat(label,
"() (",
len,
" total):",
"\n ",
paste0(x, collapse = "\n "),
"\n",
sep = ""
)
}
cat("class:", class(object), "\n")
cat("hosts():",
"\n ", paste0(hosts(object), collapse = "\n "),
"\n",
sep = ""
)
showsome("cohorts", cohorts(object))
showsome("datasets", datasets(object))
})
##' @title UCSC Xena Default Hosts
##' @description Return Xena default hosts
##' @return A character vector include current defalut hosts
##' @author Shixiang Wang <w_shixiang@163.com>
##' @seealso [UCSCXenaTools::XenaHub()]
##' @export
xena_default_hosts <- function() {
c(
"https://ucscpublic.xenahubs.net",
"https://tcga.xenahubs.net",
"https://gdc.xenahubs.net",
"https://gdcV18.xenahubs.net",
"https://icgc.xenahubs.net",
"https://toil.xenahubs.net",
"https://pancanatlas.xenahubs.net",
"https://xena.treehouse.gi.ucsc.edu:443",
"https://pcawg.xenahubs.net",
"https://atacseq.xenahubs.net",
"https://singlecellnew.xenahubs.net",
"https://kidsfirst.xenahubs.net",
"https://tdi.xenahubs.net"
)
}
.xena_hosts <- c(
"publicHub",
"tcgaHub",
"gdcHub",
"gdcHubV18",
"icgcHub",
"toilHub",
"pancanAtlasHub",
"treehouseHub",
"pcawgHub",
"atacseqHub",
"singlecellHub",
"kidsfirstHub",
"tdiHub"
)
names(.xena_hosts) <- xena_default_hosts()
# Add Hiplot mirror url
# Still use UCSC Xena URL if it is not available
.xena_hosts_hiplot <- .xena_hosts
names(.xena_hosts_hiplot) <- c(
"https://xena-ucscpublic.hiplot.com.cn",
"https://xena-tcga.hiplot.com.cn",
"https://xena-gdc.hiplot.com.cn",
"https://xena-icgc.hiplot.com.cn",
"https://xena-toil.hiplot.com.cn",
"https://xena-pancanatlas.hiplot.com.cn",
"https://xena.treehouse.gi.ucsc.edu:443", #!
"https://xena-pcawg.hiplot.com.cn",
"https://xena-atacseq.hiplot.com.cn",
"https://singlecellnew.xenahubs.net", #!
"https://kidsfirst.xenahubs.net" #!
#"https://tdi.xenahubs.net" #!
)
# Map hiplot to ucsc
.xena_mirror_map <- names(.xena_hosts)
names(.xena_mirror_map) <- names(.xena_hosts_hiplot)
# Map ucsc to hiplot
.xena_mirror_map_rv <- names(.xena_hosts_hiplot)
names(.xena_mirror_map_rv) <- names(.xena_hosts)
##' Generate a XenaHub Object
##'
##' It is used to generate original
##' `XenaHub` object according to hosts, cohorts, datasets or hostName.
##' If these arguments not specified, all hosts and corresponding datasets
##' will be returned as a `XenaHub` object. All datasets can be found
##' at <https://xenabrowser.net/datapages/>.
##'
##'
##' @param hosts a character vector specify UCSC Xena hosts, all available hosts can be
##' found by `xena_default_hosts()` function. `hostName` is a more recommend option.
##' @param cohorts default is empty character vector, all cohorts will be returned.
##' @param datasets default is empty character vector, all datasets will be returned.
##' @param hostName name of host, available options can be accessed by `.xena_hosts`
##' This is an easier option for user than `hosts` option. Note, this option
##' will overlap `hosts`.
##' @return a [XenaHub] object
##' @author Shixiang Wang <w_shixiang@163.com>
##' @export
##' @importFrom httr stop_for_status POST content
##' @importFrom utils head tail
##' @examples
##' \dontrun{
##' #1 query all hosts, cohorts and datasets
##' xe = XenaHub()
##' xe
##' #2 query only TCGA hosts
##' xe = XenaHub(hostName = "tcgaHub")
##' xe
##' hosts(xe) # get hosts
##' cohorts(xe) # get cohorts
##' datasets(xe) # get datasets
##' samples(xe) # get samples
##' }
XenaHub <- function(hosts = xena_default_hosts(),
cohorts = character(),
datasets = character(),
hostName = c(
"publicHub",
"tcgaHub",
"gdcHub",
"gdcHubV18",
"icgcHub",
"toilHub",
"pancanAtlasHub",
"treehouseHub",
"pcawgHub",
"atacseqHub",
"singlecellHub",
"kidsfirstHub",
"tdiHub"
)) {
stopifnot(
is.character(hosts),
is.character(cohorts),
is.character(datasets)
)
hostName <- unique(hostName)
if (length(hostName) != length(.xena_hosts) &
all(
hostName %in% .xena_hosts
)) {
.temp <- names(.xena_hosts)
names(.temp) <- .xena_hosts
hostNames <- .temp %>%
as.data.frame() %>%
t() %>%
as.data.frame()
rm(.temp)
hosts <- as.character(hostNames[, hostName])
} else if (!all(hostName %in% .xena_hosts)) {
stop("Bad hostName, please check")
}
if (is.null(names(hosts))) {
names(hosts) <- hosts
}
hosts0 <- hosts
hosts <- Filter(.host_is_alive, hosts)
if (length(hosts) == 0L) { # nocov start
stop(
"\n no hosts responding:",
"\n ",
paste0(hosts0, collapse = "\n ")
)
} # nocov end
all_cohorts <- unlist(.host_cohorts(hosts), use.names = FALSE)
if (length(cohorts) == 0L) {
cohorts <- all_cohorts
} else {
hosts <- hosts[.cohort_datasets_count(hosts, cohorts) != 0L]
}
all_datasets <- unlist(.cohort_datasets(hosts, cohorts),
use.names = FALSE
)
if (length(datasets) == 0L) {
datasets <- all_datasets
} else {
if (!all(datasets %in% all_datasets)) { # nocov start
bad_dataset <- datasets[!datasets %in% all_datasets]
message("Following datasets are not in datasets of hosts, ignore them...")
message(bad_dataset)
} # nocov end
datasets <- all_datasets[all_datasets %in% datasets]
}
.XenaHub(
hosts = hosts,
cohorts = cohorts,
datasets = datasets
)
}
##' Get or Update Newest Data Information of UCSC Xena Data Hubs
##' @param saveTolocal logical. Whether save to local R package data directory for permanent use
##' or Not.
##' @return a `data.frame` contains all datasets information of Xena.
##' @author Shixiang Wang <w_shixiang@163.com>
##' @export
##' @examples
##' \dontrun{
##' XenaDataUpdate()
##' XenaDataUpdate(saveTolocal = TRUE)
##' }
XenaDataUpdate <- function(saveTolocal = TRUE) { # nocov start
# .p_all_cohorts(list(unique(XenaData$XenaHosts)[10]), exclude = list(NULL))
# .p_dataset_list(list(XenaData$XenaHosts[1]), list(XenaData$XenaCohorts[1]))
message("Disable hiplot firstly.")
options(use_hiplot = FALSE)
try_query = function(h, max_try = 3L) {
Sys.sleep(0.1)
tryCatch(
{
message("==> Trying #", abs(max_try - 4L))
.p_all_cohorts(list(h), exclude = list(NULL))
},
error = function(e) {
if (max_try == 1) {
warning("Tried 3 times but failed, this hub may down or please check URL or your internet connection!", immediate. = TRUE)
return(NULL)
} else {
try_query(h, max_try - 1L)
}
}
)
}
query_host = function(h) {
message("==> Searching cohorts for host ", h, "...")
chs <- try_query(h, max_try = 3L)
if (is.null(chs)) {
return(NULL)
}
chs <- setdiff(chs, "(unassigned)")
message("===> #", length(chs), " cohorts found.")
message("===> Querying datasets info...")
zz <- lapply(chs, function(x, h) {
.p_dataset_list(list(h), list(x))
}, h = h) %>%
stats::setNames(chs) %>%
dplyr::bind_rows(.id = "XenaCohorts")
message("===> #", nrow(zz), " datasets found.")
message("==> Done for host ", h, "...")
zz
}
message("=> Obtaining info from UCSC Xena hubs...")
XenaInfo <- lapply(names(.xena_hosts), query_host) %>%
stats::setNames(names(.xena_hosts)) %>%
dplyr::bind_rows(.id = "XenaHosts")
message("=> Done for obtaining.")
message("=> Parsing datasets metadata...")
XenaInfo <- XenaInfo %>%
dplyr::rename(
XenaDatasets = .data$name,
SampleCount = .data$count,
DataSubtype = .data$datasubtype,
Type = .data$type,
LongTitle = .data$longtitle,
ProbeMap = .data$probemap
) %>%
dplyr::mutate(XenaHostNames = .xena_hosts[.data$XenaHosts])
j_data <- lapply(XenaInfo$text, function(x) {
# decode metadata from json format
# note json data may have different elements for
# different cohort datasets
# more work need to be done here
#
# tt$text contains metadata for dataset
# tt$pmtext contains metadata for probemap
json_df <- jsonlite::parse_json(x)
dplyr::tibble(
Citation = json_df[["citation"]] %||% NA,
Label = json_df[["label"]] %||% NA,
Tags = .collapse_list(json_df[["tags"]]) %||% NA,
AnatomicalOrigin = .collapse_list(json_df[["anatomical_origin"]]) %||% NA,
SampleType = .collapse_list(json_df[["sample_type"]]) %||% NA,
Version = json_df[["version"]] %||% NA,
PrimaryDisease = json_df[["primary_disease"]] %||% NA,
Platform = json_df[["platform"]] %||% NA,
Unit = json_df[["unit"]] %||% NA
)
})
message("=> Done for parsing. Tidying...")
tidy_data <- dplyr::bind_rows(j_data)
XenaData <- dplyr::bind_cols(XenaInfo, tidy_data)
XenaData <- dplyr::as_tibble(XenaData)
XenaData <- XenaData %>%
dplyr::select(
c(
"XenaHosts", "XenaHostNames", "XenaCohorts", "XenaDatasets", "SampleCount",
"DataSubtype", "Label", "Type", "AnatomicalOrigin", "SampleType",
"Tags", "ProbeMap", "LongTitle", "Citation", "Version",
"Unit", "Platform"
)
)
message("=> Tidying done.")
if (saveTolocal) {
message("=> Saving...")
data_dir <- base::system.file("data", package = "UCSCXenaTools")
if (dir.exists(data_dir)) {
save(XenaData, file = file.path(data_dir, "XenaData.rda"), compress = "xz")
} else {
message("There is no data directory ", data_dir)
message("Please check it.")
}
}
message("=> Done.")
XenaData
} # nocov end
.collapse_list <- function(x) {
sapply(x, function(x) x) %>% paste0(collapse = ",")
}
`%||%` <- function(x, y) {
# ifelse(is.null(x), y, x)
if (is.null(x)) {
y
} else {
x
}
}
utils::globalVariables(c(
".p_dataset_metadata",
".p_all_cohorts",
".p_dataset_list"
))