Diff of /R/XenaPrepare.R [000000] .. [0bdad5]

Switch to unified view

a b/R/XenaPrepare.R
1
##' Prepare (Load) Downloaded Datasets to R
2
##'
3
##' @author Shixiang Wang <w_shixiang@163.com>
4
##' @param objects a object of character vector or data.frame. If `objects` is data.frame,
5
##' it should be returned object of [XenaDownload] function. More easier way is
6
##' that objects can be character vector specify local files/directory and download urls.
7
##' @param objectsName specify names for elements of return object, i.e. names of list
8
##' @param use_chunk default is `FALSE`. If you want to select subset of original data, please set it to
9
##' `TRUE` and specify corresponding arguments: `chunk_size`, `select_direction`, `select_names`,
10
##' `callback`.
11
##' @param chunk_size the number of rows to include in each chunk
12
##' @param subset_rows logical expression indicating elements or rows to keep:
13
##' missing values are taken as false. `x` can be a representation of data frame
14
##' you wanna do subset operation. Of note, the first colname of most of datasets
15
##' in Xena will be set to "sample", you can use it to select rows.
16
##' @param select_cols expression, indicating columns to select from a data frame.
17
##' 'x' can be a representation of data frame you wanna do subset operation,
18
##' e.g. `select_cols = colnames(x)[1:3]` will keep only first to third column.
19
##' @param callback a function to call on each chunk, default is `NULL`,
20
##' this option will overvide operations of subset_rows and select_cols.
21
##' @param comment a character specify comment rows in files
22
##' @param na a character vectory specify `NA` values in files
23
##' @param ... other arguments transfer to `read_tsv` function or
24
##' `read_tsv_chunked` function (when `use_chunk` is `TRUE`) of `readr` package.
25
##' @return a list contains file data, which in way of tibbles
26
##' @export
27
##' @importFrom readr read_tsv
28
##' @importFrom readr read_tsv_chunked
29
##' @importFrom readr cols
30
##' @examples
31
##' \dontrun{
32
##' xe = XenaGenerate(subset = XenaHostNames == "tcgaHub")
33
##' hosts(xe)
34
##' xe_query = XenaQuery(xe)
35
##'
36
##' xe_download = XenaDownload(xe_query)
37
##' dat = XenaPrepare(xe_download)
38
##' }
39
40
XenaPrepare <- function(objects,
41
                        objectsName = NULL,
42
                        use_chunk = FALSE,
43
                        chunk_size = 100,
44
                        subset_rows = TRUE,
45
                        select_cols = TRUE,
46
                        callback = NULL,
47
                        comment = "#",
48
                        na = c("", "NA", "[Discrepancy]"),
49
                        ...) {
50
  # objects can be url, local files/directory or xena object from xena download process
51
  stopifnot(
52
    is.character(objects) |
53
      is.data.frame(objects),
54
    is.logical(use_chunk)
55
  )
56
57
  subset_rows.bk <- subset_rows
58
  select_cols.bk <- select_cols
59
60
  subset_rows <- substitute(subset_rows)
61
  if (is.name(subset_rows)) {
62
    subset_rows <- substitute(eval(subset_rows.bk))
63
  }
64
65
  select_cols <- substitute(select_cols)
66
  if (is.name(select_cols)) {
67
    select_cols <- substitute(eval(select_cols.bk))
68
  }
69
70
  # subset_rows <- substitute(subset_rows)
71
  # select_cols <- substitute(select_cols)
72
73
  #    subset_direction = match.arg(subset_direction)
74
75
  objects2 <- objects
76
77
  if (is.character(objects)) {
78
    if (length(objects) == 0) {
79
      stop("Please check you input!")
80
    }
81
82
    # Is the input directory?
83
    if (all(dir.exists(objects))) {
84
      if (length(objects) > 1) {
85
        stop("We do not accept multiple directories as input.")
86
      } else {
87
        files <- paste0(objects, "/", dir(objects))
88
        res <- lapply(files, function(x) {
89
          if (use_chunk) {
90
            if (is.null(callback)) {
91
              f <- function(x, pos) {
92
                subset(x,
93
                  eval(subset_rows),
94
                  select = eval(select_cols)
95
                )
96
              }
97
            } else {
98
              f <- callback
99
            }
100
101
            y <- readr::read_tsv_chunked(
102
              x,
103
              readr::DataFrameCallback$new(f),
104
              chunk_size = chunk_size,
105
              comment = comment,
106
              na = na,
107
              col_types = readr::cols()
108
            )
109
          } else {
110
            y <- readr::read_tsv(
111
              x,
112
              comment = comment,
113
              na = na,
114
              col_types = readr::cols(),
115
              ...
116
            )
117
          }
118
119
          y
120
        })
121
        if (is.null(objectsName)) {
122
          objectsName <- make.names(dir(objects))
123
          names(res) <- objectsName
124
        }
125
      }
126
    } else if (all(file.exists(objects))) {
127
      res <- lapply(objects, function(x) {
128
        if (use_chunk) {
129
          if (is.null(eval(callback))) {
130
            f <- function(x, pos) {
131
              subset(x,
132
                eval(subset_rows),
133
                select = eval(select_cols)
134
              )
135
            }
136
          } else {
137
            f <- callback
138
          }
139
140
          y <- readr::read_tsv_chunked(
141
            x,
142
            readr::DataFrameCallback$new(f),
143
            chunk_size = chunk_size,
144
            comment = comment,
145
            na = na,
146
            col_types = readr::cols()
147
          )
148
        } else {
149
          y <- readr::read_tsv(
150
            x,
151
            comment = comment,
152
            na = na,
153
            col_types = readr::cols(),
154
            ...
155
          )
156
        }
157
158
        y
159
      })
160
      if (is.null(objectsName)) {
161
        objectsName <- make.names(basename(objects))
162
        names(res) <- objectsName
163
      }
164
      if (length(res) == 1) {
165
        res <- res[[1]]
166
      }
167
    }
168
    else {
169
      # check urls
170
      all_right <- grepl(pattern = "http", x = objects)
171
172
      if (any(all_right)) {
173
        objects <- objects[all_right]
174
        if (length(objects) == 1) {
175
          if (use_chunk) {
176
            if (is.null(callback)) {
177
              f <- function(x, pos) {
178
                subset(x,
179
                  eval(subset_rows),
180
                  select = eval(select_cols)
181
                )
182
              }
183
            } else {
184
              f <- callback
185
            }
186
187
            res <- readr::read_tsv_chunked(
188
              objects,
189
              readr::DataFrameCallback$new(f),
190
              chunk_size = chunk_size,
191
              comment = comment,
192
              na = na,
193
              col_types = readr::cols()
194
            )
195
          } else {
196
            res <- readr::read_tsv(
197
              objects,
198
              comment = comment,
199
              na = na,
200
              col_types = readr::cols(),
201
              ...
202
            )
203
          }
204
205
          # res = suppressMessages(read_tsv(objects, comment=comment, na=na, ...))
206
        } else {
207
          res <- lapply(objects, function(x) {
208
            if (use_chunk) {
209
              if (is.null(callback)) {
210
                f <- function(x, pos) {
211
                  subset(x,
212
                    eval(subset_rows),
213
                    select = eval(select_cols)
214
                  )
215
                }
216
              } else {
217
                f <- callback
218
              }
219
220
              y <- readr::read_tsv_chunked(
221
                x,
222
                readr::DataFrameCallback$new(f),
223
                chunk_size = chunk_size,
224
                comment = comment,
225
                na = na,
226
                col_types = readr::cols()
227
              )
228
            } else {
229
              y <- readr::read_tsv(
230
                x,
231
                comment = comment,
232
                na = na,
233
                col_types = readr::cols(),
234
                ...
235
              )
236
            }
237
238
            y
239
          })
240
241
          if (is.null(objectsName)) {
242
            objectsName <- make.names(basename(objects))
243
            names(res) <- objectsName
244
          }
245
        }
246
      }
247
      all_wrong <- !all_right
248
      if (any(all_wrong)) {
249
        bad_urls <- objects2[all_wrong]
250
        message("Some inputs are wrong, maybe you should check:")
251
        print(bad_urls)
252
      }
253
    }
254
  } else {
255
    if (!"destfiles" %in% colnames(objects)) {
256
      stop(
257
        "Input data.frame should contain 'destfiles' column which generated by XenaDownload functions. Please check your input."
258
      )
259
    }
260
261
    files <- objects$destfiles
262
    res <- XenaPrepare(
263
      files,
264
      objectsName = objectsName,
265
      use_chunk = use_chunk,
266
      chunk_size = chunk_size,
267
      subset_rows = subset_rows,
268
      select_cols = select_cols,
269
      callback = callback,
270
      comment = comment,
271
      na = na,
272
      ...
273
    )
274
  }
275
276
  return(res)
277
}