Diff of /R/XenaHub-class.R [000000] .. [0bdad5]

Switch to unified view

a b/R/XenaHub-class.R
1
# FUN: Set Class & operate XenaHub object directly
2
3
##' Class XenaHub
4
##' @description  a S4 class to represent UCSC Xena Data Hubs
5
##' @slot hosts hosts of data hubs
6
##' @slot cohorts cohorts of data hubs
7
##' @slot datasets datasets of data hubs
8
##' @importFrom methods new
9
##' @export
10
.XenaHub <- setClass(
11
  "XenaHub",
12
  representation = representation(
13
    hosts = "character",
14
    cohorts = "character",
15
    datasets = "character"
16
  )
17
)
18
19
setMethod("show", "XenaHub", function(object) {
20
  showsome <- function(label, x) {
21
    len <- length(x)
22
    if (len > 6) {
23
      x <- c(head(x, 3), "...", tail(x, 2))
24
    }
25
    cat(label,
26
      "() (",
27
      len,
28
      " total):",
29
      "\n  ",
30
      paste0(x, collapse = "\n  "),
31
      "\n",
32
      sep = ""
33
    )
34
  }
35
  cat("class:", class(object), "\n")
36
  cat("hosts():",
37
    "\n  ", paste0(hosts(object), collapse = "\n  "),
38
    "\n",
39
    sep = ""
40
  )
41
  showsome("cohorts", cohorts(object))
42
  showsome("datasets", datasets(object))
43
})
44
45
46
##' @title UCSC Xena Default Hosts
47
##' @description Return Xena default hosts
48
##' @return A character vector include current defalut hosts
49
##' @author Shixiang Wang <w_shixiang@163.com>
50
##' @seealso [UCSCXenaTools::XenaHub()]
51
##' @export
52
xena_default_hosts <- function() {
53
  c(
54
    "https://ucscpublic.xenahubs.net",
55
    "https://tcga.xenahubs.net",
56
    "https://gdc.xenahubs.net",
57
    "https://gdcV18.xenahubs.net",
58
    "https://icgc.xenahubs.net",
59
    "https://toil.xenahubs.net",
60
    "https://pancanatlas.xenahubs.net",
61
    "https://xena.treehouse.gi.ucsc.edu:443",
62
    "https://pcawg.xenahubs.net",
63
    "https://atacseq.xenahubs.net",
64
    "https://singlecellnew.xenahubs.net",
65
    "https://kidsfirst.xenahubs.net",
66
    "https://tdi.xenahubs.net"
67
  )
68
}
69
70
.xena_hosts <- c(
71
  "publicHub",
72
  "tcgaHub",
73
  "gdcHub",
74
  "gdcHubV18",
75
  "icgcHub",
76
  "toilHub",
77
  "pancanAtlasHub",
78
  "treehouseHub",
79
  "pcawgHub",
80
  "atacseqHub",
81
  "singlecellHub",
82
  "kidsfirstHub",
83
  "tdiHub"
84
)
85
86
names(.xena_hosts) <- xena_default_hosts()
87
88
# Add Hiplot mirror url
89
# Still use UCSC Xena URL if it is not available
90
.xena_hosts_hiplot <- .xena_hosts
91
names(.xena_hosts_hiplot) <- c(
92
  "https://xena-ucscpublic.hiplot.com.cn",
93
  "https://xena-tcga.hiplot.com.cn",
94
  "https://xena-gdc.hiplot.com.cn",
95
  "https://xena-icgc.hiplot.com.cn",
96
  "https://xena-toil.hiplot.com.cn",
97
  "https://xena-pancanatlas.hiplot.com.cn",
98
  "https://xena.treehouse.gi.ucsc.edu:443", #!
99
  "https://xena-pcawg.hiplot.com.cn",
100
  "https://xena-atacseq.hiplot.com.cn",
101
  "https://singlecellnew.xenahubs.net", #!
102
  "https://kidsfirst.xenahubs.net" #!
103
  #"https://tdi.xenahubs.net" #!
104
)
105
# Map hiplot to ucsc
106
.xena_mirror_map <- names(.xena_hosts)
107
names(.xena_mirror_map) <- names(.xena_hosts_hiplot)
108
# Map ucsc to hiplot
109
.xena_mirror_map_rv <- names(.xena_hosts_hiplot)
110
names(.xena_mirror_map_rv) <- names(.xena_hosts)
111
112
##' Generate a XenaHub Object
113
##'
114
##' It is used to generate original
115
##' `XenaHub` object according to hosts, cohorts, datasets or hostName.
116
##' If these arguments not specified, all hosts and corresponding datasets
117
##' will be returned as a `XenaHub` object. All datasets can be found
118
##' at <https://xenabrowser.net/datapages/>.
119
##'
120
##'
121
##' @param hosts a character vector specify UCSC Xena hosts, all available hosts can be
122
##' found by `xena_default_hosts()` function. `hostName` is a more recommend option.
123
##' @param cohorts default is empty character vector, all cohorts will be returned.
124
##' @param datasets default is empty character vector, all datasets will be returned.
125
##' @param hostName name of host, available options can be accessed by `.xena_hosts`
126
##'  This is an easier option for user than `hosts` option. Note, this option
127
##'  will overlap `hosts`.
128
##' @return a [XenaHub] object
129
##' @author Shixiang Wang <w_shixiang@163.com>
130
##' @export
131
##' @importFrom httr stop_for_status POST content
132
##' @importFrom utils head tail
133
##' @examples
134
##' \dontrun{
135
##' #1 query all hosts, cohorts and datasets
136
##' xe = XenaHub()
137
##' xe
138
##' #2 query only TCGA hosts
139
##' xe = XenaHub(hostName = "tcgaHub")
140
##' xe
141
##' hosts(xe)     # get hosts
142
##' cohorts(xe)   # get cohorts
143
##' datasets(xe)  # get datasets
144
##' samples(xe)   # get samples
145
##' }
146
XenaHub <- function(hosts = xena_default_hosts(),
147
                    cohorts = character(),
148
                    datasets = character(),
149
                    hostName = c(
150
                      "publicHub",
151
                      "tcgaHub",
152
                      "gdcHub",
153
                      "gdcHubV18",
154
                      "icgcHub",
155
                      "toilHub",
156
                      "pancanAtlasHub",
157
                      "treehouseHub",
158
                      "pcawgHub",
159
                      "atacseqHub",
160
                      "singlecellHub",
161
                      "kidsfirstHub",
162
                      "tdiHub"
163
                    )) {
164
  stopifnot(
165
    is.character(hosts),
166
    is.character(cohorts),
167
    is.character(datasets)
168
  )
169
170
  hostName <- unique(hostName)
171
172
  if (length(hostName) != length(.xena_hosts) &
173
    all(
174
      hostName %in% .xena_hosts
175
    )) {
176
    .temp <- names(.xena_hosts)
177
    names(.temp) <- .xena_hosts
178
    hostNames <- .temp %>%
179
      as.data.frame() %>%
180
      t() %>%
181
      as.data.frame()
182
    rm(.temp)
183
184
    hosts <- as.character(hostNames[, hostName])
185
  } else if (!all(hostName %in% .xena_hosts)) {
186
    stop("Bad hostName, please check")
187
  }
188
189
  if (is.null(names(hosts))) {
190
    names(hosts) <- hosts
191
  }
192
193
  hosts0 <- hosts
194
  hosts <- Filter(.host_is_alive, hosts)
195
  if (length(hosts) == 0L) { # nocov start
196
    stop(
197
      "\n  no hosts responding:",
198
      "\n    ",
199
      paste0(hosts0, collapse = "\n  ")
200
    )
201
  } # nocov end
202
203
  all_cohorts <- unlist(.host_cohorts(hosts), use.names = FALSE)
204
  if (length(cohorts) == 0L) {
205
    cohorts <- all_cohorts
206
  } else {
207
    hosts <- hosts[.cohort_datasets_count(hosts, cohorts) != 0L]
208
  }
209
210
  all_datasets <- unlist(.cohort_datasets(hosts, cohorts),
211
    use.names = FALSE
212
  )
213
  if (length(datasets) == 0L) {
214
    datasets <- all_datasets
215
  } else {
216
    if (!all(datasets %in% all_datasets)) { # nocov start
217
      bad_dataset <- datasets[!datasets %in% all_datasets]
218
      message("Following datasets are not in datasets of hosts, ignore them...")
219
      message(bad_dataset)
220
    } # nocov end
221
    datasets <- all_datasets[all_datasets %in% datasets]
222
  }
223
224
225
  .XenaHub(
226
    hosts = hosts,
227
    cohorts = cohorts,
228
    datasets = datasets
229
  )
230
}
231
232
##' Get or Update Newest Data Information of UCSC Xena Data Hubs
233
##' @param saveTolocal logical. Whether save to local R package data directory for permanent use
234
##' or Not.
235
##' @return a `data.frame` contains all datasets information of Xena.
236
##' @author Shixiang Wang <w_shixiang@163.com>
237
##' @export
238
##' @examples
239
##' \dontrun{
240
##' XenaDataUpdate()
241
##' XenaDataUpdate(saveTolocal = TRUE)
242
##' }
243
XenaDataUpdate <- function(saveTolocal = TRUE) { # nocov start
244
  # .p_all_cohorts(list(unique(XenaData$XenaHosts)[10]), exclude = list(NULL))
245
  # .p_dataset_list(list(XenaData$XenaHosts[1]), list(XenaData$XenaCohorts[1]))
246
  message("Disable hiplot firstly.")
247
  options(use_hiplot = FALSE)
248
  try_query = function(h, max_try = 3L) {
249
    Sys.sleep(0.1)
250
    tryCatch(
251
      {
252
        message("==> Trying #", abs(max_try - 4L))
253
        .p_all_cohorts(list(h), exclude = list(NULL))
254
      },
255
      error = function(e) {
256
        if (max_try == 1) {
257
          warning("Tried 3 times but failed, this hub may down or please check URL or your internet connection!", immediate. = TRUE)
258
          return(NULL)
259
        } else {
260
          try_query(h, max_try - 1L)
261
        }
262
      }
263
    )
264
  }
265
266
  query_host = function(h) {
267
      message("==> Searching cohorts for host ", h, "...")
268
269
      chs <- try_query(h, max_try = 3L)
270
      if (is.null(chs)) {
271
          return(NULL)
272
      }
273
      chs <- setdiff(chs, "(unassigned)")
274
      message("===> #", length(chs), " cohorts found.")
275
      message("===> Querying datasets info...")
276
      zz <- lapply(chs, function(x, h) {
277
          .p_dataset_list(list(h), list(x))
278
      }, h = h) %>%
279
          stats::setNames(chs) %>%
280
          dplyr::bind_rows(.id = "XenaCohorts")
281
      message("===> #", nrow(zz), " datasets found.")
282
      message("==> Done for host ", h, "...")
283
      zz
284
  }
285
286
  message("=> Obtaining info from UCSC Xena hubs...")
287
  XenaInfo <- lapply(names(.xena_hosts), query_host) %>%
288
    stats::setNames(names(.xena_hosts)) %>%
289
    dplyr::bind_rows(.id = "XenaHosts")
290
291
  message("=> Done for obtaining.")
292
  message("=> Parsing datasets metadata...")
293
294
  XenaInfo <- XenaInfo %>%
295
    dplyr::rename(
296
      XenaDatasets = .data$name,
297
      SampleCount = .data$count,
298
      DataSubtype = .data$datasubtype,
299
      Type = .data$type,
300
      LongTitle = .data$longtitle,
301
      ProbeMap = .data$probemap
302
    ) %>%
303
    dplyr::mutate(XenaHostNames = .xena_hosts[.data$XenaHosts])
304
305
  j_data <- lapply(XenaInfo$text, function(x) {
306
    # decode metadata from json format
307
    # note json data may have different elements for
308
    #                different cohort datasets
309
    # more work need to be done here
310
    #
311
    # tt$text contains metadata for dataset
312
    # tt$pmtext contains metadata for probemap
313
    json_df <- jsonlite::parse_json(x)
314
    dplyr::tibble(
315
      Citation = json_df[["citation"]] %||% NA,
316
      Label = json_df[["label"]] %||% NA,
317
      Tags = .collapse_list(json_df[["tags"]]) %||% NA,
318
      AnatomicalOrigin = .collapse_list(json_df[["anatomical_origin"]]) %||% NA,
319
      SampleType = .collapse_list(json_df[["sample_type"]]) %||% NA,
320
      Version = json_df[["version"]] %||% NA,
321
      PrimaryDisease = json_df[["primary_disease"]] %||% NA,
322
      Platform = json_df[["platform"]] %||% NA,
323
      Unit = json_df[["unit"]] %||% NA
324
    )
325
  })
326
327
  message("=> Done for parsing. Tidying...")
328
329
  tidy_data <- dplyr::bind_rows(j_data)
330
  XenaData <- dplyr::bind_cols(XenaInfo, tidy_data)
331
  XenaData <- dplyr::as_tibble(XenaData)
332
333
  XenaData <- XenaData %>%
334
    dplyr::select(
335
      c(
336
        "XenaHosts", "XenaHostNames", "XenaCohorts", "XenaDatasets", "SampleCount",
337
        "DataSubtype", "Label", "Type", "AnatomicalOrigin", "SampleType",
338
        "Tags", "ProbeMap", "LongTitle", "Citation", "Version",
339
        "Unit", "Platform"
340
      )
341
    )
342
343
  message("=> Tidying done.")
344
345
  if (saveTolocal) {
346
    message("=> Saving...")
347
    data_dir <- base::system.file("data", package = "UCSCXenaTools")
348
    if (dir.exists(data_dir)) {
349
      save(XenaData, file = file.path(data_dir, "XenaData.rda"), compress = "xz")
350
    } else {
351
      message("There is no data directory ", data_dir)
352
      message("Please check it.")
353
    }
354
  }
355
  message("=> Done.")
356
  XenaData
357
} # nocov end
358
359
.collapse_list <- function(x) {
360
  sapply(x, function(x) x) %>% paste0(collapse = ",")
361
}
362
363
`%||%` <- function(x, y) {
364
  # ifelse(is.null(x), y, x)
365
  if (is.null(x)) {
366
    y
367
  } else {
368
    x
369
  }
370
}
371
372
utils::globalVariables(c(
373
  ".p_dataset_metadata",
374
  ".p_all_cohorts",
375
  ".p_dataset_list"
376
))