Diff of /R/simplify.R [000000] .. [0bdad5]

Switch to unified view

a b/R/simplify.R
1
# simplify TCGA data download workflow
2
## ------------------------------------
3
# Typical Cohorts Structure
4
5
# Given GBM as an example: <https://xenabrowser.net/datapages/?cohort=TCGA%20Glioblastoma%20(GBM)>
6
# Cohorts
7
#       Copy Number
8
#           gistic2
9
#           gistic2 thresholded
10
#       Copy Number Segments
11
#           After remove germline cnv
12
#           Before remove germline cnv
13
#       DNA Methylation
14
#           Methylation27k
15
#           Methylation450k
16
#       Exon Expression RNASeq
17
#           IlluminaHiSeq
18
#       Gene Expression Array
19
#           AffyU133a (always change)
20
#       Gene Expression RNASeq
21
#           IlluminaHiSeq
22
#           IlluminaHiSeq pancan normalized
23
#           IlluminaHiSeq percentile
24
#       miRNA Mature Strand Expression RNASeq
25
#           IlluminaHiseq
26
#       PARADIGM Pathway Activity
27
#           expression
28
#           expression (array) + CNV
29
#           expression + CNV
30
#           expression (array)
31
#       Phenotype
32
#           Phenotypes
33
#       Protein Expression RPPA
34
#           RPPA
35
#           RPPA (replicate-base normalization)
36
#       Somatic Mutation (SNPs and small INDELs)
37
#           broad
38
#           ucsc automated
39
#       Somatic non-silent mutation (gene-level)
40
#           broad
41
#           PANCAN AWG
42
#           ucsc automated
43
#       Transcription factor regulatory impact
44
#           Agilent, by RABIT
45
#           HiSeqV2, by RABIT
46
#           U133A, by RABIT
47
48
# compiler::setCompilerOptions(suppressAll = TRUE)
49
# suppress Binding Notes
50
# suppressBindingNotes <- function(variablesMentionedInNotes) {
51
#     for(variable in variablesMentionedInNotes) {
52
#         assign(variable, NULL, envir = .GlobalEnv)
53
#     }
54
# }
55
56
# suppressBindingNotes(c("XenaHostNames","XenaCohorts", "ProjectID", "DataType", "FileType"))
57
58
59
##' @title Get TCGA Common Data Sets by Project ID and Property
##' @description This is the most useful function for users to download common
##' TCGA datasets. It is similar to the `getFirehoseData` function in the
##' `RTCGAToolbox` package.
##' @details TCGA common datasets are frequently used for biological analysis.
##' To make these data easier to obtain, this function provides simple
##' options to choose datasets and behavior. All available information about
##' TCGA datasets can be accessed via `availTCGA()` and checked with `showTCGA()`.
##'
##' Datasets whose data type is "Transcription Factor Regulatory Impact",
##' "Signatures", "PARADIGM Pathway Activity" or "iCluster" are never selected
##' by this function.
##' @author Shixiang Wang <w_shixiang@163.com>
##' @inheritParams downloadTCGA
##' @param clinical logical. if `TRUE`, download clinical information. Default is `TRUE`.
##' @param download logical. if `TRUE`, download data, otherwise return a result list including data
##' information. Default is `FALSE`. You can set this to `FALSE` if you want to check what you will download or
##' use other functions provided by `UCSCXenaTools` to filter the result datasets you want to download.
##' @param forceDownload logical. if `TRUE`, force to download files no matter if they exist. Default is `FALSE`.
##' @param mRNASeq logical. if `TRUE`, download mRNASeq data. Default is `FALSE`.
##' @param mRNAArray logical. if `TRUE`, download mRNA microarray data. Default is `FALSE`.
##' @param mRNASeqType character vector. Can be one, two or three
##' in `c("normalized", "pancan normalized", "percentile")`.
##' @param miRNASeq logical. if `TRUE`, download miRNASeq data. Default is `FALSE`.
##' @param exonRNASeq logical. if `TRUE`, download exon RNASeq data. Default is `FALSE`.
##' @param RPPAArray logical. if `TRUE`, download RPPA data. Default is `FALSE`.
##' @param ReplicateBaseNormalization logical. if `TRUE`, download RPPA data by Replicate Base
##' Normalization (RBN). Default is `FALSE`.
##' @param Methylation logical. if `TRUE`, download DNA Methylation data. Default is `FALSE`.
##' @param MethylationType character vector. Can be one or two in `c("27K", "450K")`.
##' @param GeneMutation logical. if `TRUE`, download gene mutation data. Default is `FALSE`.
##' @param SomaticMutation logical. if `TRUE`, download somatic mutation data. Default is `FALSE`.
##' @param GisticCopyNumber logical. if `TRUE`, download Gistic2 Copy Number data. Default is `FALSE`.
##' @param Gistic2Threshold logical. if `TRUE`, download Threshold Gistic2 data. Default is `TRUE`.
##' @param CopyNumberSegment logical. if `TRUE`, download Copy Number Segment data. Default is `FALSE`.
##' @param RemoveGermlineCNV logical. if `TRUE`, download Copy Number Segment data which has removed
##' germline copy number variation. Default is `TRUE`.
##' @return if `download=TRUE`, return `data.frame` from `XenaDownload`,
##' otherwise return a list including `XenaHub` object and datasets information
##' @export
##' @examples
##' ###### get data, but not download
##'
##' # 1 choose project and data types you wanna download
##' getTCGAdata(project = "LUAD", mRNASeq = TRUE, mRNAArray = TRUE,
##' mRNASeqType = "normalized", miRNASeq = TRUE, exonRNASeq = TRUE,
##' RPPAArray = TRUE, Methylation = TRUE, MethylationType = "450K",
##' GeneMutation = TRUE, SomaticMutation = TRUE)
##'
##' # 2 only choose 'LUAD' and its clinical data
##' getTCGAdata(project = "LUAD")
##' \dontrun{
##' ###### download datasets
##'
##' # 3 download clinical datasets of LUAD and LUSC
##' getTCGAdata(project = c("LUAD", "LUSC"), clinical = TRUE, download = TRUE)
##'
##' # 4 download clinical, RPPA and gene mutation datasets of LUAD and LUSC
##' # getTCGAdata(project = c("LUAD", "LUSC"), clinical = TRUE, RPPAArray = TRUE, GeneMutation = TRUE)
##' }
getTCGAdata <- function(project = NULL,
                        clinical = TRUE,
                        download = FALSE,
                        forceDownload = FALSE,
                        destdir = tempdir(),
                        mRNASeq = FALSE,
                        mRNAArray = FALSE,
                        mRNASeqType = "normalized",
                        miRNASeq = FALSE,
                        exonRNASeq = FALSE,
                        RPPAArray = FALSE,
                        ReplicateBaseNormalization = FALSE,
                        Methylation = FALSE,
                        MethylationType = c("27K", "450K"),
                        GeneMutation = FALSE,
                        SomaticMutation = FALSE,
                        GisticCopyNumber = FALSE,
                        Gistic2Threshold = TRUE,
                        CopyNumberSegment = FALSE,
                        RemoveGermlineCNV = TRUE,
                        ...) {
  #----- check data type of input
  stopifnot(!is.null(project))

  # Every switch below drives a scalar `if`, so each one has to be a single,
  # non-missing logical value. Checking them one by one (instead of pooling
  # them with `c()`) catches NULL, NA and vectors, and lets the error name the
  # offending argument.
  flags <- list(
    clinical = clinical,
    mRNASeq = mRNASeq,
    mRNAArray = mRNAArray,
    miRNASeq = miRNASeq,
    exonRNASeq = exonRNASeq,
    RPPAArray = RPPAArray,
    ReplicateBaseNormalization = ReplicateBaseNormalization,
    Methylation = Methylation,
    GeneMutation = GeneMutation,
    SomaticMutation = SomaticMutation,
    GisticCopyNumber = GisticCopyNumber,
    Gistic2Threshold = Gistic2Threshold,
    CopyNumberSegment = CopyNumberSegment,
    RemoveGermlineCNV = RemoveGermlineCNV,
    download = download,
    forceDownload = forceDownload
  )
  is_flag <- vapply(
    flags,
    function(v) is.logical(v) && length(v) == 1L && !is.na(v),
    logical(1)
  )
  if (!all(is_flag)) {
    stop(
      "The following arguments must be a single TRUE or FALSE: ",
      paste(names(flags)[!is_flag], collapse = ", "),
      call. = FALSE
    )
  }

  # Project IDs accepted by this function
  projects <- c(
    "LAML", "ACC", "CHOL", "BLCA", "BRCA", "CESC", "COADREAD", "COAD",
    "UCEC", "ESCA", "FPPP", "GBM", "HNSC", "KICH", "KIRC", "KIRP",
    "DLBC", "LIHC", "LGG", "GBMLGG", "LUAD", "LUNG", "LUSC", "SKCM",
    "MESO", "UVM", "OV", "PANCAN", "PAAD", "PCPG", "PRAD", "READ",
    "SARC", "STAD", "TGCT", "THYM", "THCA", "UCS"
  )

  if (!all(project %in% projects)) {
    message("Only following Project valid:")
    print(project[project %in% projects])
    stop("Invaild Input!")
  }

  tcga_all <- .decodeDataType(Target = "tcgaHub")

  # Keep the requested projects, then drop data types this helper never serves
  res <- subset(tcga_all, ProjectID %in% project)
  res <- dplyr::filter(
    res,
    DataType != "Transcription Factor Regulatory Impact",
    DataType != "Signatures",
    DataType != "PARADIGM Pathway Activity",
    DataType != "iCluster"
  )

  # Each enabled option contributes one quosure; a disabled option contributes
  # a constant FALSE so that all of them can be OR-ed together at the end.
  quo_cli <- if (clinical) {
    dplyr::quo((FileType == "Clinical Information"))
  } else {
    dplyr::quo((FALSE))
  }

  if (mRNASeq) {
    mRNASeq_choices <- c("normalized", "pancan normalized", "percentile")
    if (!all(mRNASeqType %in% mRNASeq_choices)) {
      message("Available mRNASeqType values are:")
      print(mRNASeq_choices)
      stop("Not Vaild Input!")
    }

    RNA <- c(
      "normalized" = "IlluminaHiSeq RNASeqV2",
      "pancan normalized" = "IlluminaHiSeq RNASeqV2 pancan normalized",
      "percentile" = "IlluminaHiSeq RNASeqV2 in percentile rank"
    )
    # The batch-effects-normalized file type is always selected alongside
    RNA_select <- c(RNA[mRNASeqType], "Batch effects normalized")

    quo_RNA <- dplyr::quo((
      DataType == "Gene Expression RNASeq" & FileType %in% RNA_select
    ))
  } else {
    quo_RNA <- dplyr::quo((FALSE))
  }

  quo_RNAa <- if (mRNAArray) {
    dplyr::quo((DataType == "Gene Expression Array"))
  } else {
    dplyr::quo((FALSE))
  }

  if (miRNASeq) {
    miRNA_select <- c("IlluminaHiSeq RNASeq", "Batch effects normalized")
    quo_miRNA <- dplyr::quo((
      DataType == "miRNA Mature Strand Expression RNASeq" &
        FileType %in% miRNA_select
    ))
  } else {
    quo_miRNA <- dplyr::quo((FALSE))
  }

  quo_exon <- if (exonRNASeq) {
    dplyr::quo((
      DataType == "Exon Expression RNASeq" &
        FileType == "IlluminaHiSeq RNASeqV2"
    ))
  } else {
    dplyr::quo((FALSE))
  }

  # Have no miRNA Array? Need Check
  # if(miRNAArray){
  #
  # }
  if (RPPAArray) {
    RPPA_select <- if (ReplicateBaseNormalization) {
      "RPPA normalized by RBN"
    } else {
      "RPPA"
    }
    quo_RPPA <- dplyr::quo((
      DataType == "Protein Expression RPPA" &
        FileType %in% c(RPPA_select, "RPPA pancan normalized")
    ))
  } else {
    quo_RPPA <- dplyr::quo((FALSE))
  }

  if (Methylation) {
    if (!all(MethylationType %in% c("27K", "450K"))) {
      message("Available MethylationType values are:")
      print(c("27K", "450K"))
      stop("Not Vaild Input!")
    }

    Methy <- c("27K" = "Methylation27K", "450K" = "Methylation450K")
    Methy_select <- Methy[MethylationType]

    quo_Methy <- dplyr::quo((
      DataType == "DNA Methylation" & FileType %in% Methy_select
    ))
  } else {
    quo_Methy <- dplyr::quo((FALSE))
  }

  quo_genMutation <- if (GeneMutation) {
    dplyr::quo((
      DataType == "Gene Somatic Non-silent Mutation" &
        FileType %in% c("broad automated", "MC3 Public Version")
    ))
  } else {
    dplyr::quo((FALSE))
  }

  quo_somaticMutation <- if (SomaticMutation) {
    dplyr::quo((
      DataType == "Somatic Mutation" &
        FileType %in% c("broad automated", "MC3 Public Version")
    ))
  } else {
    dplyr::quo((FALSE))
  }

  if (GisticCopyNumber) {
    gistic_select <- if (Gistic2Threshold) {
      "Gistic2 thresholded"
    } else {
      "Gistic2"
    }
    quo_gistic <- dplyr::quo((
      DataType == "Gene Level Copy Number" & FileType == gistic_select
    ))
  } else {
    quo_gistic <- dplyr::quo((FALSE))
  }

  if (CopyNumberSegment) {
    cns_select <- if (RemoveGermlineCNV) {
      "After remove germline cnv"
    } else {
      "Before remove germline cnv"
    }
    quo_cns <- dplyr::quo((
      DataType == "Copy Number Segments" & FileType == cns_select
    ))
  } else {
    quo_cns <- dplyr::quo((FALSE))
  }

  # A dataset is kept when it satisfies at least one enabled option
  cond_select <- dplyr::quo(
    !!quo_cli |
      !!quo_RNA |
      !!quo_RNAa |
      !!quo_miRNA |
      !!quo_exon |
      !!quo_RPPA |
      !!quo_Methy |
      !!quo_genMutation |
      !!quo_somaticMutation | !!quo_gistic | !!quo_cns
  )
  res <- dplyr::filter(res, !!cond_select)

  if (download) {
    res %>%
      XenaGenerate() %>%
      XenaQuery() %>%
      XenaDownload(destdir = destdir, force = forceDownload, ...)
  } else {
    xe <- res %>% XenaGenerate()
    list(Xena = xe, DataInfo = res)
  }
}
377
378
379
380
381
##' @title Easily Download TCGA Data by Several Options
##' @description TCGA is a very useful database and here we provide this function to
##' download TCGA (including TCGA Pancan) datasets in a human-friendly way. Users who are not
##' familiar with R operation will benefit from this.
##' @details All available information about datasets of TCGA can be accessed via `availTCGA()` and
##' checked with `showTCGA()`.
##'
##' If any element of `project` is not a known project ID, or if no dataset
##' matches the combination of `project`, `data_type` and `file_type`, a message
##' is shown and `NULL` is returned invisibly without downloading anything.
##' @author Shixiang Wang <w_shixiang@163.com>
##' @param project default is `NULL`. Should be one or more of TCGA project id (character vector) provided by Xena.
##' See all available project id, please use `availTCGA("ProjectID")`.
##' @param data_type default is `NULL`. Should be a character vector specify data type.
##' See all available data types by `availTCGA("DataType")`.
##' @param file_type default is `NULL`. Should be a character vector specify file type.
##' See all available file types by `availTCGA("FileType")`.
##' @inheritParams XenaDownload
##' @return same as `XenaDownload()` function result.
##' @export
##' @examples
##' \dontrun{
##' # download RNASeq data (use UVM as example)
##' downloadTCGA(project = "UVM",
##'                  data_type = "Gene Expression RNASeq",
##'                  file_type = "IlluminaHiSeq RNASeqV2")
##' }
##' @seealso [UCSCXenaTools::XenaQuery()],
##' [UCSCXenaTools::XenaFilter()],
##' [UCSCXenaTools::XenaDownload()],
##' [UCSCXenaTools::XenaPrepare()],
##' [UCSCXenaTools::availTCGA()],
##' [UCSCXenaTools::showTCGA()]
downloadTCGA <- function(project = NULL,
                         data_type = NULL,
                         file_type = NULL,
                         destdir = tempdir(),
                         force = FALSE,
                         ...) {
  # All three selectors are mandatory even though they default to NULL
  stopifnot(
    !is.null(project),
    !is.null(data_type),
    !is.null(file_type)
  )
  tcga_all <- .decodeDataType(Target = "tcgaHub")
  tcga_projects <- unique(tcga_all$ProjectID)

  # suppress binding notes
  ProjectID <- DataType <- FileType <- NULL

  if (!all(project %in% tcga_projects)) {
    # `message()` glues its arguments together without a separator, so the
    # project vector is collapsed first to keep the IDs readable.
    message(
      paste(project, collapse = ", "),
      " are not (all) valid, please select one or more of following valid project ID:"
    )
    print(tcga_projects, quote = FALSE)
    return(invisible(NULL))
  }

  res <- tcga_all %>%
    filter(
      ProjectID %in% project,
      DataType %in% data_type,
      FileType %in% file_type
    )

  if (nrow(res) == 0) { # nocov start
    message("Find nothing about your input, please check it.")
    message("availTCGA and showTCGA function may help you.")
    return(invisible(NULL))
  } # nocov end

  res %>%
    XenaGenerate() %>%
    XenaQuery() %>%
    XenaDownload(destdir = destdir, force = force, ...)
}
455
456
##' @title Get or Check TCGA Available ProjectID, DataType and FileType
##' @param which one of `c("all", "ProjectID", "DataType", "FileType")`.
##' `"all"` (the default) returns a named list holding all three sets of unique
##' values; any other choice returns only the unique values of that field.
##' @author Shixiang Wang <w_shixiang@163.com>
##' @export
##' @examples
##' \donttest{
##' availTCGA("all")
##' }
availTCGA <- function(which = c("all", "ProjectID", "DataType", "FileType")) {
  which <- match.arg(which)

  tcga_all <- .decodeDataType(Target = "tcgaHub")
  available <- list(
    ProjectID = unique(tcga_all$ProjectID),
    DataType = unique(tcga_all$DataType),
    FileType = unique(tcga_all$FileType)
  )

  switch(which,
    all = {
      # The three sets are collected independently of each other, so a given
      # combination is not guaranteed to exist for every project.
      message(
        "Note not all projects have listed data types and file types, you can use showTCGA function to check if exist"
      )
      available
    },
    ProjectID = available$ProjectID,
    DataType = available$DataType,
    FileType = available$FileType
  )
}
494
495
##' @title Show TCGA data structure by Project ID or ALL
##' @description This can be used to check by hand whether a data type or file type exists in one or more projects.
##' @param project a character vector. Can be "all" or one or more of TCGA Project IDs.
##' @return a `data.frame` including project data structure information
##' (columns `ProjectID`, `DataType` and `FileType`), or `NULL` (with a message)
##' when nothing matches `project`.
##' @author Shixiang Wang <w_shixiang@163.com>
##' @export
##' @examples
##' \donttest{
##' showTCGA("all")
##' }
##' @seealso [UCSCXenaTools::availTCGA()]
showTCGA <- function(project = "all") {
  # suppress binding notes
  ProjectID <- DataType <- FileType <- NULL

  tcga_all <- .decodeDataType(Target = "tcgaHub")

  # `project` may hold several IDs, so the "all" shortcut must be tested as a
  # scalar: a bare `project == "all"` would hand `if` a vector condition.
  show_all <- length(project) == 1L && isTRUE(project == "all")

  if (show_all) {
    res <- tcga_all %>% select(ProjectID, DataType, FileType)
  } else {
    res <- tcga_all %>%
      filter(ProjectID %in% project) %>%
      select(ProjectID, DataType, FileType)
  }

  if (nrow(res) == 0) { # nocov start
    message("Something is wrong in your input, NULL will be returned, please check.")
    return(NULL)
  } # nocov end

  res
}
529
530
531
532
533
# Only works for TCGA
534
.decodeDataType <- function(XenaData = UCSCXenaTools::XenaData,
535
                            Target = "tcgaHub") {
536
  # This TCGA include TCGA PANCAN dataset
537
  if ("tcgaHub" %in% Target) {
538
    Target <- c(Target, "pancanAtlasHub")
539
  }
540
541
  # suppress binding notes
542
  XenaHostNames <- XenaCohorts <- NULL
543
544
  ob <- XenaData %>% filter(XenaHostNames %in% Target)
545
546
  if ("tcgaHub" %in% Target) {
547
    # decode project id
548
    ob %>% mutate(ProjectID = sub(".*\\((.*)\\)", "\\1", XenaCohorts)) -> ob
549
    # decode DataType
550
    ob %>%
551
      mutate(
552
        DataType = dplyr::case_when(
553
          grepl("Gistic2_CopyNumber_Gistic2", XenaDatasets) ~ "Gene Level Copy Number",
554
          grepl(
555
            "PANCAN_Genome_Wide_SNP_6_whitelisted.gene.xena",
556
            XenaDatasets
557
          ) ~ "Gene Level Copy Number",
558
          # pancan
559
          grepl("SNP6", XenaDatasets) ~ "Copy Number Segments",
560
          grepl(
561
            "PANCAN_Genome_Wide_SNP_6_whitelisted.xena",
562
            XenaDatasets
563
          ) ~ "Copy Number Segments",
564
          # pancan
565
          grepl("HumanMethylation", XenaDatasets) ~ "DNA Methylation",
566
          grepl("MethylMix", XenaDatasets) ~ "DNA Methylation",
567
          grepl("HiSeq.*_exon", XenaDatasets) ~ "Exon Expression RNASeq",
568
          grepl("GA_exon", XenaDatasets) ~ "Exon Expression RNASeq",
569
          grepl("GAV2_exon", XenaDatasets) ~ "Exon Expression RNASeq",
570
          grepl("AgilentG", XenaDatasets) ~ "Gene Expression Array",
571
          grepl("HT_HG-U133A", XenaDatasets) ~ "Gene Expression Array",
572
          grepl("GA$", XenaDatasets) &
573
            !grepl("RABIT", XenaDatasets) ~ "Gene Expression RNASeq",
574
          grepl("GAV2$", XenaDatasets) &
575
            !grepl("RABIT", XenaDatasets) ~ "Gene Expression RNASeq",
576
          grepl("HiSeq$", XenaDatasets) &
577
            !grepl("RABIT", XenaDatasets) ~ "Gene Expression RNASeq",
578
          grepl("HiSeqV2$", XenaDatasets) &
579
            !grepl("RABIT", XenaDatasets) ~ "Gene Expression RNASeq",
580
          grepl("HiSeqV2_PANCAN$", XenaDatasets) ~ "Gene Expression RNASeq",
581
          grepl("HiSeqV2_percentile$", XenaDatasets) ~ "Gene Expression RNASeq",
582
          grepl(
583
            "EB\\+\\+AdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.xena",
584
            XenaDatasets
585
          ) ~ "Gene Expression RNASeq",
586
          # pancan
587
          grepl("miRNA", XenaDatasets) ~ "miRNA Mature Strand Expression RNASeq",
588
          grepl(
589
            "pancanMiRs_EBadjOnProtocolPlatformWithoutRepsWithUnCorrectMiRs",
590
            XenaDatasets
591
          ) ~ "miRNA Mature Strand Expression RNASeq",
592
          # pancan
593
          grepl("Pathway_Paradigm", XenaDatasets) ~ "PARADIGM Pathway Activity",
594
          grepl("erge_merged_reals", XenaDatasets) ~ "PARADIGM Pathway Activity",
595
          # pancan
596
          grepl("clinicalMatrix", XenaDatasets) ~ "Phenotype",
597
          grepl(
598
            "Survival_SupplementalTable_S1_20171025_xena_sp",
599
            XenaDatasets
600
          ) ~ "Phenotype",
601
          # pancan
602
          grepl("Subtype_Immune_Model_Based.txt", XenaDatasets) ~ "Phenotype",
603
          # pancan
604
          grepl("TCGASubtype.20170308.tsv", XenaDatasets) ~ "Phenotype",
605
          # pancan
606
          grepl(
607
            "TCGA_phenotype_denseDataOnlyDownload.tsv",
608
            XenaDatasets
609
          ) ~ "Phenotype",
610
          # pancan
611
          grepl("gene_expression_subtype", XenaDatasets) ~ "Phenotype",
612
          # OV
613
          grepl("RPPA", XenaDatasets) ~ "Protein Expression RPPA",
614
          grepl("mutation_", XenaDatasets) &
615
            !endsWith(XenaDatasets, "gene") ~ "Somatic Mutation",
616
          grepl("mc3.v0.2.8.PUBLIC.xena", XenaDatasets) ~ "Somatic Mutation",
617
          # pancan
618
          grepl("mutation($|(.*_gene$))", XenaDatasets) ~ "Gene Somatic Non-silent Mutation",
619
          grepl(
620
            "mc3.v0.2.8.PUBLIC.nonsilentGene.xena",
621
            XenaDatasets
622
          ) ~ "Gene Somatic Non-silent Mutation",
623
          # pancan
624
          grepl("RABIT", XenaDatasets) ~ "Transcription Factor Regulatory Impact",
625
          grepl("iCluster", XenaDatasets) ~ "iCluster",
626
          grepl(
627
            "Pancan12_GenePrograms_drugTargetCanon_in_Pancan33.tsv",
628
            XenaDatasets
629
          ) ~ "Signatures",
630
          # pancan
631
          grepl("TCGA.HRD_withSampleID.txt", XenaDatasets) ~ "Signatures",
632
          # pancan
633
          grepl(
634
            "TCGA_pancancer_10852whitelistsamples_68ImmuneSigs.xena",
635
            XenaDatasets
636
          ) ~ "Signatures",
637
          # pancan
638
          grepl("StemnessScores_DNAmeth_20170210.tsv", XenaDatasets) ~ "Signatures",
639
          # pancan
640
          grepl(
641
            "StemnessScores_RNAexp_20170127.2.tsv",
642
            XenaDatasets
643
          ) ~ "Signatures" # pancan
644
        )
645
      ) -> ob
646
647
    # decode file type
648
    ob %>%
649
      mutate(
650
        FileType = dplyr::case_when(
651
          DataType == "Gene Level Copy Number" &
652
            grepl("Gistic2_all_data_by_genes", XenaDatasets) ~ "Gistic2",
653
          DataType == "Gene Level Copy Number" &
654
            grepl("Gistic2_all_thresholded.by_genes", XenaDatasets) ~ "Gistic2 thresholded",
655
          DataType == "Gene Level Copy Number" &
656
            grepl(
657
              "PANCAN_Genome_Wide_SNP_6_whitelisted.gene.xena",
658
              XenaDatasets
659
            ) ~ "Tumor copy number",
660
661
662
          DataType == "Copy Number Segments" &
663
            grepl("SNP6_genomicSegment", XenaDatasets) ~ "Before remove germline cnv",
664
          DataType == "Copy Number Segments" &
665
            grepl("SNP6_nocnv_genomicSegment", XenaDatasets) ~ "After remove germline cnv",
666
          DataType == "Copy Number Segments" &
667
            grepl(
668
              "PANCAN_Genome_Wide_SNP_6_whitelisted.xena",
669
              XenaDatasets
670
            ) ~ "After remove germline cnv",
671
672
673
          DataType == "DNA Methylation" &
674
            grepl("HumanMethylation27", XenaDatasets) ~ "Methylation27K",
675
          DataType == "DNA Methylation" &
676
            grepl("HumanMethylation450", XenaDatasets) ~ "Methylation450K",
677
          DataType == "DNA Methylation" &
678
            grepl("oneoff_TCGA_LGG_MethylMix", XenaDatasets) ~ "MethylMix",
679
680
681
          DataType == "Exon Expression RNASeq" &
682
            grepl("GA_exon", XenaDatasets) ~ "IlluminaGA RNASeq",
683
          DataType == "Exon Expression RNASeq" &
684
            grepl("GAV2_exon", XenaDatasets) ~ "IlluminaGA RNASeqV2",
685
          DataType == "Exon Expression RNASeq" &
686
            grepl("HiSeq_exon", XenaDatasets) ~ "IlluminaHiSeq RNASeq",
687
          DataType == "Exon Expression RNASeq" &
688
            grepl("HiSeqV2_exon", XenaDatasets) ~ "IlluminaHiSeq RNASeqV2",
689
690
691
          DataType == "Gene Expression Array" &
692
            grepl("AgilentG4502A", XenaDatasets) ~ "Agilent 244K Microarray",
693
          DataType == "Gene Expression Array" &
694
            grepl("HT_HG-U133A", XenaDatasets) ~ "Affymetrix U133A Microarray",
695
696
          DataType == "Gene Expression RNASeq" &
697
            endsWith(XenaDatasets, "GA") ~ "IlluminaGA RNASeq",
698
          DataType == "Gene Expression RNASeq" &
699
            endsWith(XenaDatasets, "GAV2") ~ "IlluminaGA RNASeqV2",
700
          DataType == "Gene Expression RNASeq" &
701
            endsWith(XenaDatasets, "HiSeq") ~ "IlluminaHiSeq RNASeq",
702
          DataType == "Gene Expression RNASeq" &
703
            endsWith(XenaDatasets, "HiSeqV2") ~ "IlluminaHiSeq RNASeqV2",
704
          DataType == "Gene Expression RNASeq" &
705
            endsWith(XenaDatasets, "HiSeqV2_PANCAN") ~ "IlluminaHiSeq RNASeqV2 pancan normalized",
706
          DataType == "Gene Expression RNASeq" &
707
            endsWith(XenaDatasets, "HiSeqV2_percentile") ~ "IlluminaHiSeq RNASeqV2 in percentile rank",
708
          DataType == "Gene Expression RNASeq" &
709
            grepl("AdjustPANCAN_IlluminaHiSeq_RNASeqV2", XenaDatasets) ~ "Batch effects normalized",
710
711
          DataType == "miRNA Mature Strand Expression RNASeq" &
712
            endsWith(XenaDatasets, "miRNA_GA_gene") ~ "IlluminaGA RNASeq",
713
          DataType == "miRNA Mature Strand Expression RNASeq" &
714
            endsWith(XenaDatasets, "miRNA_HiSeq_gene") ~ "IlluminaHiSeq RNASeq",
715
          DataType == "miRNA Mature Strand Expression RNASeq" &
716
            grepl(
717
              "pancanMiRs_EBadjOnProtocolPlatformWithoutRepsWithU",
718
              XenaDatasets
719
            ) ~ "Batch effects normalized",
720
721
722
          DataType == "PARADIGM Pathway Activity" &
723
            grepl("merge_merged_reals", XenaDatasets) ~ "Platform-corrected PANCAN12 dataset",
724
          DataType == "PARADIGM Pathway Activity" &
725
            endsWith(XenaDatasets, "Pathway_Paradigm_mRNA") ~ "Use only Microarray",
726
          DataType == "PARADIGM Pathway Activity" &
727
            endsWith(
728
              XenaDatasets,
729
              "Pathway_Paradigm_mRNA_And_Copy_Number"
730
            ) ~ "Use Microarray plus Copy Number",
731
          DataType == "PARADIGM Pathway Activity" &
732
            endsWith(XenaDatasets, "Pathway_Paradigm_RNASeq") ~ "Use only RNASeq",
733
          DataType == "PARADIGM Pathway Activity" &
734
            endsWith(
735
              XenaDatasets,
736
              "Pathway_Paradigm_RNASeq_And_Copy_Number"
737
            ) ~ "Use RNASeq plus Copy Number",
738
739
740
          DataType == "Phenotype" &
741
            endsWith(XenaDatasets, "clinicalMatrix") ~ "Clinical Information",
742
          DataType == "Phenotype" &
743
            grepl(
744
              "Survival_SupplementalTable_S1_20171025_xena_sp",
745
              XenaDatasets
746
            ) ~ "Clinical Information",
747
          DataType == "Phenotype" &
748
            grepl("gene_expression_subtype", XenaDatasets) ~ "Gene Expression Subtype",
749
          DataType == "Phenotype" &
750
            grepl("Subtype_Immune_Model_Based", XenaDatasets) ~ "Immune Model Based Subtype",
751
          DataType == "Phenotype" &
752
            grepl("TCGASubtype", XenaDatasets) ~ "TCGA Molecular Subtype",
753
          DataType == "Phenotype" &
754
            grepl(
755
              "TCGA_phenotype_denseDataOnlyDownload",
756
              XenaDatasets
757
            ) ~ "TCGA Sample Type and Primary Disease",
758
759
          DataType == "Protein Expression RPPA" &
760
            endsWith(XenaDatasets, "RPPA") ~ "RPPA",
761
          DataType == "Protein Expression RPPA" &
762
            endsWith(XenaDatasets, "RPPA_RBN") ~ "RPPA normalized by RBN",
763
          DataType == "Protein Expression RPPA" &
764
            grepl("TCGA-RPPA-pancan-clean", XenaDatasets) ~ "RPPA pancan normalized",
765
766
767
          DataType == "Somatic Mutation" &
768
            grepl("mc3.v0.2.8.PUBLIC.xena", XenaDatasets) ~ "MC3 Public Version",
769
          DataType == "Somatic Mutation" &
770
            endsWith(XenaDatasets, "mutation_bcgsc") ~ "bcgsc automated",
771
          DataType == "Somatic Mutation" &
772
            endsWith(XenaDatasets, "mutation_bcm") ~ "bcm automated",
773
          DataType == "Somatic Mutation" &
774
            endsWith(XenaDatasets, "mutation_bcm_solid") ~ "bcm SOLiD",
775
          DataType == "Somatic Mutation" &
776
            endsWith(XenaDatasets, "mutation_broad") ~ "broad automated",
777
          DataType == "Somatic Mutation" &
778
            endsWith(XenaDatasets, "mutation_curated_bcm") ~ "bcm curated",
779
          DataType == "Somatic Mutation" &
780
            endsWith(XenaDatasets, "mutation_curated_bcm_solid") ~ "bcm SOLiD curated",
781
          DataType == "Somatic Mutation" &
782
            endsWith(XenaDatasets, "mutation_curated_broad") ~ "broad curated",
783
          DataType == "Somatic Mutation" &
784
            endsWith(XenaDatasets, "mutation_curated_wustl") ~ "wustl curated",
785
          DataType == "Somatic Mutation" &
786
            endsWith(XenaDatasets, "mutation_ucsc_maf") ~ "ucsc automated",
787
          DataType == "Somatic Mutation" &
788
            endsWith(XenaDatasets, "mutation_wustl") ~ "wustl automated",
789
          DataType == "Somatic Mutation" &
790
            endsWith(XenaDatasets, "mutation_wustl_hiseq") ~ "wustl hiseq automated",
791
792
          DataType == "Gene Somatic Non-silent Mutation" &
793
            grepl(
794
              "mc3.v0.2.8.PUBLIC.nonsilentGene.xena",
795
              XenaDatasets
796
            ) ~ "MC3 Public Version",
797
          DataType == "Gene Somatic Non-silent Mutation" &
798
            endsWith(XenaDatasets, "mutation") ~ "PANCAN AWG analyzed",
799
          DataType == "Gene Somatic Non-silent Mutation" &
800
            endsWith(XenaDatasets, "mutation_bcgsc_gene") ~ "bsgsc automated",
801
          DataType == "Gene Somatic Non-silent Mutation" &
802
            endsWith(XenaDatasets, "mutation_bcm_gene") ~ "bcm automated",
803
          DataType == "Gene Somatic Non-silent Mutation" &
804
            endsWith(XenaDatasets, "mutation_bcm_solid_gene") ~ "bcm SOLiD",
805
          DataType == "Gene Somatic Non-silent Mutation" &
806
            endsWith(XenaDatasets, "mutation_broad_gene") ~ "broad automated",
807
          DataType == "Gene Somatic Non-silent Mutation" &
808
            endsWith(XenaDatasets, "mutation_curated_bcm_gene") ~ "bcm curated",
809
          DataType == "Gene Somatic Non-silent Mutation" &
810
            endsWith(XenaDatasets, "mutation_curated_bcm_solid_gene") ~ "bcm SOLiD curated",
811
          DataType == "Gene Somatic Non-silent Mutation" &
812
            endsWith(XenaDatasets, "mutation_curated_broad_gene") ~ "broad curated",
813
          DataType == "Gene Somatic Non-silent Mutation" &
814
            endsWith(XenaDatasets, "mutation_curated_wustl_gene") ~ "wustl curated",
815
          DataType == "Gene Somatic Non-silent Mutation" &
816
            endsWith(XenaDatasets, "mutation_ucsc_maf_gene") ~ "ucsc automated",
817
          DataType == "Gene Somatic Non-silent Mutation" &
818
            endsWith(XenaDatasets, "mutation_wustl_gene") ~ "wustl automated",
819
          DataType == "Gene Somatic Non-silent Mutation" &
820
            endsWith(XenaDatasets, "mutation_wustl_hiseq_gene") ~ "wustl hiseq automated",
821
822
823
          DataType == "Transcription Factor Regulatory Impact" &
824
            grepl("HiSeq.V2$", XenaDatasets) ~ "RABIT Use IlluminaHiSeq RNASeqV2",
825
          DataType == "Transcription Factor Regulatory Impact" &
826
            grepl("HiSeq$", XenaDatasets) ~ "RABIT Use IlluminaHiSeq RNASeq",
827
          DataType == "Transcription Factor Regulatory Impact" &
828
            grepl("GA.V2$", XenaDatasets) ~ "RABIT Use IlluminaGA RNASeqV2",
829
          DataType == "Transcription Factor Regulatory Impact" &
830
            grepl("GA$", XenaDatasets) ~ "RABIT Use IlluminaGA RNASeq",
831
          DataType == "Transcription Factor Regulatory Impact" &
832
            grepl("Agilent$", XenaDatasets) ~ "RABIT Use Agilent 244K Microarray",
833
          DataType == "Transcription Factor Regulatory Impact" &
834
            grepl("U133A$", XenaDatasets) ~ "RABIT Use Affymetrix U133A Microarray",
835
836
          DataType == "iCluster" &
837
            grepl("TCGA_PanCan33_iCluster_k28_tumor", XenaDatasets) ~ "iCluster cluster assignments",
838
          DataType == "iCluster" &
839
            grepl("lat.vars.iCluster.redo.tumor", XenaDatasets) ~ "iCluster latent variables",
840
841
          DataType == "Signatures" &
842
            grepl(
843
              "Pancan12_GenePrograms_drugTargetCanon",
844
              XenaDatasets
845
            ) ~ "Pancan Gene Programs",
846
          DataType == "Signatures" &
847
            grepl("StemnessScores_DNAmeth_", XenaDatasets) ~ "DNA methylation based StemnessScore",
848
          DataType == "Signatures" &
849
            grepl("StemnessScores_RNAexp", XenaDatasets) ~ "RNA based StemnessScore",
850
          DataType == "Signatures" &
851
            grepl(
852
              "TCGA_pancancer_10852whitelistsamples_68ImmuneSigs",
853
              XenaDatasets
854
            ) ~ "Immune Signature Scores",
855
          DataType == "Signatures" &
856
            grepl("TCGA.HRD_withSampleID.txt", XenaDatasets) ~ "Genome-wide DNA Damage Footprint HRD Score"
857
        )
858
      ) -> ob_tcga
859
  }
860
  ob_tcga
861
}
862
863
864
865
# Scratch code (kept for reference): list the unique dataset-name patterns in XenaDatasets
866
# ob1 = sub("TCGA.*/(.*)", "\\1", ob$XenaDatasets) %>% table() %>% names() -> uniqueDatasets
867
# ob1 = tibble(XenaDatasets = uniqueDatasets)
868
# grep("gene_expression_subtype", ob$XenaDatasets, value = TRUE)
869
870
871
# Register the column names that the package code refers to as bare symbols
# (non-standard evaluation), so that R CMD check does not report
# "no visible binding for global variable" notes for them.
# "XenaDatasets" is listed because the dataset-labelling conditions above use
# it unquoted next to DataType; registering a name more than once is harmless.
utils::globalVariables(c("DataType", "FileType", "ProjectID", "XenaDatasets"))