[0bdad5]: / R / fetch.R

Download this file

316 lines (293 with data), 10.4 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
#' Fetch Data from UCSC Xena Hosts
#'
#' When you want to query just data for several genes/samples from UCSC Xena datasets, a better way
#' is to use these `fetch_` functions instead of downloading a whole dataset. Details about functions
#' please see the following sections.
#'
#' There are three primary data types: dense matrix (samples by probes (or say identifiers)),
#' sparse (sample, position, variant), and segmented (sample, position, value).
#'
#' Dense matrices can be genotypic or phenotypic, it is a sample-by-identifiers matrix.
#' Phenotypic matrices have associated field metadata (descriptive names, codes, etc.).
#' Genotypic matricies may have an associated probeMap, which maps probes to genomic locations.
#' If a matrix has hugo probeMap, the probes themselves are gene names. Otherwise, a probeMap is
#' used to map a gene location to a set of probes.
#'
#' @param host a UCSC Xena host, like "https://toil.xenahubs.net".
#' All available hosts can be printed by [xena_default_hosts()].
#' @param dataset a UCSC Xena dataset, like "tcga_RSEM_gene_tpm".
#' All available datasets can be printed by running `XenaData$XenaDatasets` or
#' obtained from [UCSC Xena datapages](https://xenabrowser.net/datapages/).
#' @param identifiers Identifiers could be probe (like "ENSG00000000419.12"),
#' gene (like "TP53") etc.. If it is `NULL`, all identifiers in the dataset will be used.
#' @param samples ID of samples, like "TCGA-02-0047-01".
#' If it is `NULL`, all samples in the dataset will be used. However, it is better to download
#' the whole datasets if you query many samples and genes.
#' @param check if `TRUE`, check whether specified `identifiers` and `samples` exist the dataset
#' (all failed items will be filtered out). However, if `FALSE`, the code is much faster.
#' @param use_probeMap if `TRUE`, will check if the dataset has ProbeMap firstly.
#' When the dataset you want to query has a identifier-to-gene mapping, identifiers can be
#' gene symbols even the identifiers of dataset are probes or others.
#' @param time_limit time limit for getting response in seconds.
#' @return a `matirx` or character vector or a `list`.
#' @examples
#' library(UCSCXenaTools)
#'
#' host <- "https://toil.xenahubs.net"
#' dataset <- "tcga_RSEM_gene_tpm"
#' samples <- c("TCGA-02-0047-01", "TCGA-02-0055-01", "TCGA-02-2483-01", "TCGA-02-2485-01")
#' probes <- c("ENSG00000282740.1", "ENSG00000000005.5", "ENSG00000000419.12")
#' genes <- c("TP53", "RB1", "PIK3CA")
#'
#' \donttest{
#' # Fetch samples
#' fetch_dataset_samples(host, dataset, 2)
#' # Fetch identifiers
#' fetch_dataset_identifiers(host, dataset)
#' # Fetch expression value by probes
#' fetch_dense_values(host, dataset, probes, samples, check = FALSE)
#' # Fetch expression value by gene symbol (if the dataset has probeMap)
#' has_probeMap(host, dataset)
#' fetch_dense_values(host, dataset, genes, samples, check = FALSE, use_probeMap = TRUE)
#' }
#' @export
fetch <- function(host, dataset) {
message("This function is used to build consistent documentation.")
message("Please use a function starts with fetch_")
}
check_hiplot <- function(host) {
use_hiplot <- getOption("use_hiplot", default = FALSE)
if (use_hiplot) {
# Check website status
use_hiplot <- tryCatch(
{
httr::http_error(host)
TRUE
},
error = function(e) {
message("The hiplot server may down, we will not use it for now.")
FALSE
}
)
}
if (use_hiplot) {
if (!grepl("hiplot", host)) {
message("Use hiplot server (China) for mirrored data hubs (set 'options(use_hiplot = FALSE)' to disable it)")
host2 <- as.character(.xena_mirror_map_rv[host])
if (!is.na(host2)) {
host <- host2
}
}
}
return(host)
}
#' @describeIn fetch fetches values from a dense matrix.
#' @export
fetch_dense_values <- function(host, dataset, identifiers = NULL, samples = NULL,
check = TRUE, use_probeMap = FALSE, time_limit = 30) {
stopifnot(
length(host) == 1, length(dataset) == 1,
is.character(host), is.character(dataset),
is.logical(check), is.logical(use_probeMap)
)
.attach_this()
host <- check_hiplot(host)
if (check) {
# obtain all samples
all_samples <- fetch_dataset_samples(host, dataset)
# obtain all identifiers
use_probe <- use_probeMap && has_probeMap(host, dataset)
if (use_probe) {
message("-> Obtaining gene symbols from probeMap...")
url <- has_probeMap(host, dataset, return_url = TRUE)
all_identifiers <- use_cache(url, op = "readr::read_tsv(url,
col_types = readr::cols()
)[[2]]")
} else {
all_identifiers <- fetch_dataset_identifiers(host, dataset)
}
message("-> Checking identifiers...")
if (is.null(identifiers)) {
identifiers <- all_identifiers
} else {
if (!is.character(identifiers)) stop("Bad type for identifiers.")
if (!all(identifiers %in% all_identifiers)) {
which_in <- identifiers %in% all_identifiers
message("The following identifiers have been removed from host ", host, " dataset ", dataset)
print(identifiers[!which_in])
identifiers <- identifiers[which_in]
if (length(identifiers) == 0) {
stop("Bad identifiers, no one left, check input?")
}
}
}
message("-> Done.")
message("-> Checking samples...")
if (is.null(samples)) {
samples <- all_samples
} else {
if (!is.character(samples)) stop("Bad type for samples.")
if (!all(samples %in% all_samples)) {
which_in <- samples %in% all_samples
message("The following samples have been removed from host ", host, " dataset ", dataset)
print(samples[!which_in])
samples <- samples[which_in]
if (length(samples) == 0) {
stop("Bad samples, no one left, check input?")
}
}
}
message("-> Done.")
} else {
if (is.null(samples)) {
# obtain all samples
samples <- fetch_dataset_samples(host, dataset)
}
if (is.null(identifiers)) {
# obtain all identifiers
identifiers <- fetch_dataset_identifiers(host, dataset)
}
}
if (length(samples) == 1) {
samples <- as.list(samples)
}
if (length(identifiers) == 1) {
identifiers <- as.list(identifiers)
}
if (use_probeMap) {
message("-> Checking if the dataset has probeMap...")
if (has_probeMap(host, dataset)) {
message("-> Done. ProbeMap is found.")
t_start = Sys.time()
while (as.numeric(Sys.time() - t_start) < time_limit) {
res <- tryCatch(
{
.p_dataset_gene_probe_avg(host, dataset, samples, identifiers)
},
error = function(e) {
message("-> Query faild. Retrying...")
list(has_error = TRUE, error_info = e)
}
)
if (is.data.frame(res)) {
break()
}
Sys.sleep(1)
}
if (!is.data.frame(res)) {
stop(paste(
"The response times out and still returns an error",
res$error_info$message,
sep = "\n"
))
}
if (!is.null(unlist(res[["scores"]]))) {
res <- t(sapply(res[["scores"]], base::rbind))
rownames(res) <- identifiers
colnames(res) <- samples
return(res)
}
}
message("-> Done. No probeMap found or error happened, use old way...")
}
t_start = Sys.time()
while (as.numeric(Sys.time() - t_start) < time_limit) {
res <- tryCatch(
{
.p_dataset_fetch(host, dataset, samples, identifiers)
},
error = function(e) {
message("-> Query faild. Retrying...")
list(has_error = TRUE, error_info = e)
}
)
if (is.atomic(res)) {
break()
}
Sys.sleep(1)
}
if (!is.atomic(res)) {
stop(paste(
"The response times out and still returns an error",
res$error_info$message,
sep = "\n"
))
}
message("-> Query done.")
rownames(res) <- identifiers
colnames(res) <- samples
res
}
#' @describeIn fetch fetches values from a sparse `data.frame`.
#' @param genes gene names.
#' @export
fetch_sparse_values <- function(host, dataset, genes, samples = NULL,
time_limit = 30) {
# fetch_sparse_values("https://ucscpublic.xenahubs.net", "ccle/CCLE_DepMap_18Q2_maf_20180502", c("TP53", "KRAS")) -> mm
stopifnot(
length(host) == 1, length(dataset) == 1,
is.character(host), is.character(dataset)
)
.attach_this()
host <- check_hiplot(host)
if (is.null(samples)) {
samples <- fetch_dataset_samples(host, dataset)
}
t_start = Sys.time()
while (as.numeric(Sys.time() - t_start) < time_limit) {
res <- tryCatch(
{
.p_sparse_data(host, dataset, samples, genes)
},
error = function(e) {
message("-> Query faild. Retrying...")
list(has_error = TRUE, error_info = e)
}
)
if (is.list(res)) {
break()
}
Sys.sleep(1)
}
res
}
#' @describeIn fetch fetches samples from a dataset
#' @param limit number of samples, if `NULL`, return all samples.
#' @export
fetch_dataset_samples <- function(host, dataset, limit = NULL) {
.attach_this()
host <- check_hiplot(host)
if (is.null(limit)) limit = -1
.p_dataset_samples(host = host, dataset = dataset, limit = limit)
}
#' @describeIn fetch fetches identifies from a dataset.
#' @export
fetch_dataset_identifiers <- function(host, dataset) {
.attach_this()
host <- check_hiplot(host)
.p_dataset_field(host = host, dataset = dataset)
}
#' @describeIn fetch checks if a dataset has ProbeMap.
#' @param return_url if `TRUE`, returns the info of probeMap
#' instead of a logical value when the result exists.
#' @export
has_probeMap <- function(host, dataset, return_url = FALSE) {
.attach_this()
if (! host %in% .xena_mirror_map) {
host <- .xena_mirror_map[host]
}
df <- base::subset(UCSCXenaTools::XenaData, XenaHosts == host & XenaDatasets == dataset)
rv <- !is.na(df[["ProbeMap"]])
if (rv && return_url) {
paste0(
df$XenaHosts, "/download/",
df[["ProbeMap"]]
)
} else rv
}
utils::globalVariables(
c(
".p_dataset_fetch", ".p_dataset_field", ".p_dataset_gene_probe_avg",
".p_dataset_samples", ".p_sparse_data"
)
)