|
a |
|
b/R/XenaPrepare.R |
|
|
1 |
##' Prepare (Load) Downloaded Datasets to R |
|
|
2 |
##' |
|
|
3 |
##' @author Shixiang Wang <w_shixiang@163.com> |
|
|
4 |
##' @param objects a object of character vector or data.frame. If `objects` is data.frame, |
|
|
5 |
##' it should be returned object of [XenaDownload] function. More easier way is |
|
|
6 |
##' that objects can be character vector specify local files/directory and download urls. |
|
|
7 |
##' @param objectsName specify names for elements of return object, i.e. names of list |
|
|
8 |
##' @param use_chunk default is `FALSE`. If you want to select subset of original data, please set it to |
|
|
9 |
##' `TRUE` and specify corresponding arguments: `chunk_size`, `select_direction`, `select_names`, |
|
|
10 |
##' `callback`. |
|
|
11 |
##' @param chunk_size the number of rows to include in each chunk |
|
|
12 |
##' @param subset_rows logical expression indicating elements or rows to keep: |
|
|
13 |
##' missing values are taken as false. `x` can be a representation of data frame |
|
|
14 |
##' you wanna do subset operation. Of note, the first colname of most of datasets |
|
|
15 |
##' in Xena will be set to "sample", you can use it to select rows. |
|
|
16 |
##' @param select_cols expression, indicating columns to select from a data frame. |
|
|
17 |
##' 'x' can be a representation of data frame you wanna do subset operation, |
|
|
18 |
##' e.g. `select_cols = colnames(x)[1:3]` will keep only first to third column. |
|
|
19 |
##' @param callback a function to call on each chunk, default is `NULL`, |
|
|
20 |
##' this option will overvide operations of subset_rows and select_cols. |
|
|
21 |
##' @param comment a character specify comment rows in files |
|
|
22 |
##' @param na a character vectory specify `NA` values in files |
|
|
23 |
##' @param ... other arguments transfer to `read_tsv` function or |
|
|
24 |
##' `read_tsv_chunked` function (when `use_chunk` is `TRUE`) of `readr` package. |
|
|
25 |
##' @return a list contains file data, which in way of tibbles |
|
|
26 |
##' @export |
|
|
27 |
##' @importFrom readr read_tsv |
|
|
28 |
##' @importFrom readr read_tsv_chunked |
|
|
29 |
##' @importFrom readr cols |
|
|
30 |
##' @examples |
|
|
31 |
##' \dontrun{ |
|
|
32 |
##' xe = XenaGenerate(subset = XenaHostNames == "tcgaHub") |
|
|
33 |
##' hosts(xe) |
|
|
34 |
##' xe_query = XenaQuery(xe) |
|
|
35 |
##' |
|
|
36 |
##' xe_download = XenaDownload(xe_query) |
|
|
37 |
##' dat = XenaPrepare(xe_download) |
|
|
38 |
##' } |
|
|
39 |
|
|
|
40 |
XenaPrepare <- function(objects, |
|
|
41 |
objectsName = NULL, |
|
|
42 |
use_chunk = FALSE, |
|
|
43 |
chunk_size = 100, |
|
|
44 |
subset_rows = TRUE, |
|
|
45 |
select_cols = TRUE, |
|
|
46 |
callback = NULL, |
|
|
47 |
comment = "#", |
|
|
48 |
na = c("", "NA", "[Discrepancy]"), |
|
|
49 |
...) { |
|
|
50 |
# objects can be url, local files/directory or xena object from xena download process |
|
|
51 |
stopifnot( |
|
|
52 |
is.character(objects) | |
|
|
53 |
is.data.frame(objects), |
|
|
54 |
is.logical(use_chunk) |
|
|
55 |
) |
|
|
56 |
|
|
|
57 |
subset_rows.bk <- subset_rows |
|
|
58 |
select_cols.bk <- select_cols |
|
|
59 |
|
|
|
60 |
subset_rows <- substitute(subset_rows) |
|
|
61 |
if (is.name(subset_rows)) { |
|
|
62 |
subset_rows <- substitute(eval(subset_rows.bk)) |
|
|
63 |
} |
|
|
64 |
|
|
|
65 |
select_cols <- substitute(select_cols) |
|
|
66 |
if (is.name(select_cols)) { |
|
|
67 |
select_cols <- substitute(eval(select_cols.bk)) |
|
|
68 |
} |
|
|
69 |
|
|
|
70 |
# subset_rows <- substitute(subset_rows) |
|
|
71 |
# select_cols <- substitute(select_cols) |
|
|
72 |
|
|
|
73 |
# subset_direction = match.arg(subset_direction) |
|
|
74 |
|
|
|
75 |
objects2 <- objects |
|
|
76 |
|
|
|
77 |
if (is.character(objects)) { |
|
|
78 |
if (length(objects) == 0) { |
|
|
79 |
stop("Please check you input!") |
|
|
80 |
} |
|
|
81 |
|
|
|
82 |
# Is the input directory? |
|
|
83 |
if (all(dir.exists(objects))) { |
|
|
84 |
if (length(objects) > 1) { |
|
|
85 |
stop("We do not accept multiple directories as input.") |
|
|
86 |
} else { |
|
|
87 |
files <- paste0(objects, "/", dir(objects)) |
|
|
88 |
res <- lapply(files, function(x) { |
|
|
89 |
if (use_chunk) { |
|
|
90 |
if (is.null(callback)) { |
|
|
91 |
f <- function(x, pos) { |
|
|
92 |
subset(x, |
|
|
93 |
eval(subset_rows), |
|
|
94 |
select = eval(select_cols) |
|
|
95 |
) |
|
|
96 |
} |
|
|
97 |
} else { |
|
|
98 |
f <- callback |
|
|
99 |
} |
|
|
100 |
|
|
|
101 |
y <- readr::read_tsv_chunked( |
|
|
102 |
x, |
|
|
103 |
readr::DataFrameCallback$new(f), |
|
|
104 |
chunk_size = chunk_size, |
|
|
105 |
comment = comment, |
|
|
106 |
na = na, |
|
|
107 |
col_types = readr::cols() |
|
|
108 |
) |
|
|
109 |
} else { |
|
|
110 |
y <- readr::read_tsv( |
|
|
111 |
x, |
|
|
112 |
comment = comment, |
|
|
113 |
na = na, |
|
|
114 |
col_types = readr::cols(), |
|
|
115 |
... |
|
|
116 |
) |
|
|
117 |
} |
|
|
118 |
|
|
|
119 |
y |
|
|
120 |
}) |
|
|
121 |
if (is.null(objectsName)) { |
|
|
122 |
objectsName <- make.names(dir(objects)) |
|
|
123 |
names(res) <- objectsName |
|
|
124 |
} |
|
|
125 |
} |
|
|
126 |
} else if (all(file.exists(objects))) { |
|
|
127 |
res <- lapply(objects, function(x) { |
|
|
128 |
if (use_chunk) { |
|
|
129 |
if (is.null(eval(callback))) { |
|
|
130 |
f <- function(x, pos) { |
|
|
131 |
subset(x, |
|
|
132 |
eval(subset_rows), |
|
|
133 |
select = eval(select_cols) |
|
|
134 |
) |
|
|
135 |
} |
|
|
136 |
} else { |
|
|
137 |
f <- callback |
|
|
138 |
} |
|
|
139 |
|
|
|
140 |
y <- readr::read_tsv_chunked( |
|
|
141 |
x, |
|
|
142 |
readr::DataFrameCallback$new(f), |
|
|
143 |
chunk_size = chunk_size, |
|
|
144 |
comment = comment, |
|
|
145 |
na = na, |
|
|
146 |
col_types = readr::cols() |
|
|
147 |
) |
|
|
148 |
} else { |
|
|
149 |
y <- readr::read_tsv( |
|
|
150 |
x, |
|
|
151 |
comment = comment, |
|
|
152 |
na = na, |
|
|
153 |
col_types = readr::cols(), |
|
|
154 |
... |
|
|
155 |
) |
|
|
156 |
} |
|
|
157 |
|
|
|
158 |
y |
|
|
159 |
}) |
|
|
160 |
if (is.null(objectsName)) { |
|
|
161 |
objectsName <- make.names(basename(objects)) |
|
|
162 |
names(res) <- objectsName |
|
|
163 |
} |
|
|
164 |
if (length(res) == 1) { |
|
|
165 |
res <- res[[1]] |
|
|
166 |
} |
|
|
167 |
} |
|
|
168 |
else { |
|
|
169 |
# check urls |
|
|
170 |
all_right <- grepl(pattern = "http", x = objects) |
|
|
171 |
|
|
|
172 |
if (any(all_right)) { |
|
|
173 |
objects <- objects[all_right] |
|
|
174 |
if (length(objects) == 1) { |
|
|
175 |
if (use_chunk) { |
|
|
176 |
if (is.null(callback)) { |
|
|
177 |
f <- function(x, pos) { |
|
|
178 |
subset(x, |
|
|
179 |
eval(subset_rows), |
|
|
180 |
select = eval(select_cols) |
|
|
181 |
) |
|
|
182 |
} |
|
|
183 |
} else { |
|
|
184 |
f <- callback |
|
|
185 |
} |
|
|
186 |
|
|
|
187 |
res <- readr::read_tsv_chunked( |
|
|
188 |
objects, |
|
|
189 |
readr::DataFrameCallback$new(f), |
|
|
190 |
chunk_size = chunk_size, |
|
|
191 |
comment = comment, |
|
|
192 |
na = na, |
|
|
193 |
col_types = readr::cols() |
|
|
194 |
) |
|
|
195 |
} else { |
|
|
196 |
res <- readr::read_tsv( |
|
|
197 |
objects, |
|
|
198 |
comment = comment, |
|
|
199 |
na = na, |
|
|
200 |
col_types = readr::cols(), |
|
|
201 |
... |
|
|
202 |
) |
|
|
203 |
} |
|
|
204 |
|
|
|
205 |
# res = suppressMessages(read_tsv(objects, comment=comment, na=na, ...)) |
|
|
206 |
} else { |
|
|
207 |
res <- lapply(objects, function(x) { |
|
|
208 |
if (use_chunk) { |
|
|
209 |
if (is.null(callback)) { |
|
|
210 |
f <- function(x, pos) { |
|
|
211 |
subset(x, |
|
|
212 |
eval(subset_rows), |
|
|
213 |
select = eval(select_cols) |
|
|
214 |
) |
|
|
215 |
} |
|
|
216 |
} else { |
|
|
217 |
f <- callback |
|
|
218 |
} |
|
|
219 |
|
|
|
220 |
y <- readr::read_tsv_chunked( |
|
|
221 |
x, |
|
|
222 |
readr::DataFrameCallback$new(f), |
|
|
223 |
chunk_size = chunk_size, |
|
|
224 |
comment = comment, |
|
|
225 |
na = na, |
|
|
226 |
col_types = readr::cols() |
|
|
227 |
) |
|
|
228 |
} else { |
|
|
229 |
y <- readr::read_tsv( |
|
|
230 |
x, |
|
|
231 |
comment = comment, |
|
|
232 |
na = na, |
|
|
233 |
col_types = readr::cols(), |
|
|
234 |
... |
|
|
235 |
) |
|
|
236 |
} |
|
|
237 |
|
|
|
238 |
y |
|
|
239 |
}) |
|
|
240 |
|
|
|
241 |
if (is.null(objectsName)) { |
|
|
242 |
objectsName <- make.names(basename(objects)) |
|
|
243 |
names(res) <- objectsName |
|
|
244 |
} |
|
|
245 |
} |
|
|
246 |
} |
|
|
247 |
all_wrong <- !all_right |
|
|
248 |
if (any(all_wrong)) { |
|
|
249 |
bad_urls <- objects2[all_wrong] |
|
|
250 |
message("Some inputs are wrong, maybe you should check:") |
|
|
251 |
print(bad_urls) |
|
|
252 |
} |
|
|
253 |
} |
|
|
254 |
} else { |
|
|
255 |
if (!"destfiles" %in% colnames(objects)) { |
|
|
256 |
stop( |
|
|
257 |
"Input data.frame should contain 'destfiles' column which generated by XenaDownload functions. Please check your input." |
|
|
258 |
) |
|
|
259 |
} |
|
|
260 |
|
|
|
261 |
files <- objects$destfiles |
|
|
262 |
res <- XenaPrepare( |
|
|
263 |
files, |
|
|
264 |
objectsName = objectsName, |
|
|
265 |
use_chunk = use_chunk, |
|
|
266 |
chunk_size = chunk_size, |
|
|
267 |
subset_rows = subset_rows, |
|
|
268 |
select_cols = select_cols, |
|
|
269 |
callback = callback, |
|
|
270 |
comment = comment, |
|
|
271 |
na = na, |
|
|
272 |
... |
|
|
273 |
) |
|
|
274 |
} |
|
|
275 |
|
|
|
276 |
return(res) |
|
|
277 |
} |