--- a +++ b/R/XenaPrepare.R @@ -0,0 +1,277 @@ +##' Prepare (Load) Downloaded Datasets to R +##' +##' @author Shixiang Wang <w_shixiang@163.com> +##' @param objects a object of character vector or data.frame. If `objects` is data.frame, +##' it should be returned object of [XenaDownload] function. More easier way is +##' that objects can be character vector specify local files/directory and download urls. +##' @param objectsName specify names for elements of return object, i.e. names of list +##' @param use_chunk default is `FALSE`. If you want to select subset of original data, please set it to +##' `TRUE` and specify corresponding arguments: `chunk_size`, `select_direction`, `select_names`, +##' `callback`. +##' @param chunk_size the number of rows to include in each chunk +##' @param subset_rows logical expression indicating elements or rows to keep: +##' missing values are taken as false. `x` can be a representation of data frame +##' you wanna do subset operation. Of note, the first colname of most of datasets +##' in Xena will be set to "sample", you can use it to select rows. +##' @param select_cols expression, indicating columns to select from a data frame. +##' 'x' can be a representation of data frame you wanna do subset operation, +##' e.g. `select_cols = colnames(x)[1:3]` will keep only first to third column. +##' @param callback a function to call on each chunk, default is `NULL`, +##' this option will overvide operations of subset_rows and select_cols. +##' @param comment a character specify comment rows in files +##' @param na a character vectory specify `NA` values in files +##' @param ... other arguments transfer to `read_tsv` function or +##' `read_tsv_chunked` function (when `use_chunk` is `TRUE`) of `readr` package. +##' @return a list contains file data, which in way of tibbles +##' @export +##' @importFrom readr read_tsv +##' @importFrom readr read_tsv_chunked +##' @importFrom readr cols +##' @examples +##' \dontrun{ +##' xe = XenaGenerate(subset = XenaHostNames == "tcgaHub") +##' hosts(xe) +##' xe_query = XenaQuery(xe) +##' +##' xe_download = XenaDownload(xe_query) +##' dat = XenaPrepare(xe_download) +##' } + +XenaPrepare <- function(objects, + objectsName = NULL, + use_chunk = FALSE, + chunk_size = 100, + subset_rows = TRUE, + select_cols = TRUE, + callback = NULL, + comment = "#", + na = c("", "NA", "[Discrepancy]"), + ...) { + # objects can be url, local files/directory or xena object from xena download process + stopifnot( + is.character(objects) | + is.data.frame(objects), + is.logical(use_chunk) + ) + + subset_rows.bk <- subset_rows + select_cols.bk <- select_cols + + subset_rows <- substitute(subset_rows) + if (is.name(subset_rows)) { + subset_rows <- substitute(eval(subset_rows.bk)) + } + + select_cols <- substitute(select_cols) + if (is.name(select_cols)) { + select_cols <- substitute(eval(select_cols.bk)) + } + + # subset_rows <- substitute(subset_rows) + # select_cols <- substitute(select_cols) + + # subset_direction = match.arg(subset_direction) + + objects2 <- objects + + if (is.character(objects)) { + if (length(objects) == 0) { + stop("Please check you input!") + } + + # Is the input directory? + if (all(dir.exists(objects))) { + if (length(objects) > 1) { + stop("We do not accept multiple directories as input.") + } else { + files <- paste0(objects, "/", dir(objects)) + res <- lapply(files, function(x) { + if (use_chunk) { + if (is.null(callback)) { + f <- function(x, pos) { + subset(x, + eval(subset_rows), + select = eval(select_cols) + ) + } + } else { + f <- callback + } + + y <- readr::read_tsv_chunked( + x, + readr::DataFrameCallback$new(f), + chunk_size = chunk_size, + comment = comment, + na = na, + col_types = readr::cols() + ) + } else { + y <- readr::read_tsv( + x, + comment = comment, + na = na, + col_types = readr::cols(), + ... + ) + } + + y + }) + if (is.null(objectsName)) { + objectsName <- make.names(dir(objects)) + names(res) <- objectsName + } + } + } else if (all(file.exists(objects))) { + res <- lapply(objects, function(x) { + if (use_chunk) { + if (is.null(eval(callback))) { + f <- function(x, pos) { + subset(x, + eval(subset_rows), + select = eval(select_cols) + ) + } + } else { + f <- callback + } + + y <- readr::read_tsv_chunked( + x, + readr::DataFrameCallback$new(f), + chunk_size = chunk_size, + comment = comment, + na = na, + col_types = readr::cols() + ) + } else { + y <- readr::read_tsv( + x, + comment = comment, + na = na, + col_types = readr::cols(), + ... + ) + } + + y + }) + if (is.null(objectsName)) { + objectsName <- make.names(basename(objects)) + names(res) <- objectsName + } + if (length(res) == 1) { + res <- res[[1]] + } + } + else { + # check urls + all_right <- grepl(pattern = "http", x = objects) + + if (any(all_right)) { + objects <- objects[all_right] + if (length(objects) == 1) { + if (use_chunk) { + if (is.null(callback)) { + f <- function(x, pos) { + subset(x, + eval(subset_rows), + select = eval(select_cols) + ) + } + } else { + f <- callback + } + + res <- readr::read_tsv_chunked( + objects, + readr::DataFrameCallback$new(f), + chunk_size = chunk_size, + comment = comment, + na = na, + col_types = readr::cols() + ) + } else { + res <- readr::read_tsv( + objects, + comment = comment, + na = na, + col_types = readr::cols(), + ... + ) + } + + # res = suppressMessages(read_tsv(objects, comment=comment, na=na, ...)) + } else { + res <- lapply(objects, function(x) { + if (use_chunk) { + if (is.null(callback)) { + f <- function(x, pos) { + subset(x, + eval(subset_rows), + select = eval(select_cols) + ) + } + } else { + f <- callback + } + + y <- readr::read_tsv_chunked( + x, + readr::DataFrameCallback$new(f), + chunk_size = chunk_size, + comment = comment, + na = na, + col_types = readr::cols() + ) + } else { + y <- readr::read_tsv( + x, + comment = comment, + na = na, + col_types = readr::cols(), + ... + ) + } + + y + }) + + if (is.null(objectsName)) { + objectsName <- make.names(basename(objects)) + names(res) <- objectsName + } + } + } + all_wrong <- !all_right + if (any(all_wrong)) { + bad_urls <- objects2[all_wrong] + message("Some inputs are wrong, maybe you should check:") + print(bad_urls) + } + } + } else { + if (!"destfiles" %in% colnames(objects)) { + stop( + "Input data.frame should contain 'destfiles' column which generated by XenaDownload functions. Please check your input." + ) + } + + files <- objects$destfiles + res <- XenaPrepare( + files, + objectsName = objectsName, + use_chunk = use_chunk, + chunk_size = chunk_size, + subset_rows = subset_rows, + select_cols = select_cols, + callback = callback, + comment = comment, + na = na, + ... + ) + } + + return(res) +}