Diff of /R/amaretto_preprocess.R [000000] .. [2ab972]

Switch to unified view

a b/R/amaretto_preprocess.R
1
#' AMARETTO_Preprocess
2
#'
3
#' Wrapper code that analyzes process TCGA GISTIC (CNV) and gene expression (rna-seq or microarray) data  via one call
4
#' @param DataSetDirectories DataSetDirectories
5
#' @param BatchData BatchData
6
#' @importFrom graphics lines par title
7
#' @import grDevices
8
#' @importFrom limma  strsplit2
9
#' @importFrom MultiAssayExperiment experiments
10
#' @importFrom stats aov  prcomp  qqline  qqnorm  qqplot  rgamma
11
#' @importFrom utils data read.csv  write.table install.packages
12
#' @return result
13
#' @export
14
#'
15
#' @examples
16
#'\dontrun{
17
#' TargetDirectory <-  "Downloads"  # path to data download directory
18
#' CancerSite <- 'CHOL'
19
#' DataSetDirectories <- AMARETTO_Download(CancerSite,TargetDirectory)
20
#' ProcessedData <- AMARETTO_Preprocess(DataSetDirectories,BatchData)
21
#'}
22
AMARETTO_Preprocess <- function(DataSetDirectories = DataSetDirectories,
23
                                BatchData = BatchData) {
24
  CancerSite <- DataSetDirectories[1]
25
  MinPerBatch = 5
26
  MAEO <- readRDS(paste0(DataSetDirectories[2], "/", 
27
                         CancerSite, "_RNASeq_MAEO.rds"))
28
  query1 <- grep("RNASeq", names(experiments(MAEO)))
29
  MAEO_ge <- as.matrix(assay(MAEO[[query1]]))
30
  MAEO_ge <- apply(MAEO_ge, 2, log2)
31
  MAEO_ge[is.infinite(MAEO_ge) & MAEO_ge < 0] <- NA
32
  cat("Loading mRNA data.\n")
33
  if (!is.null(MAEO)) {
34
    MA_TCGA = Preprocess_MAdata(CancerSite = CancerSite, MAEO_ge = MAEO_ge,BatchData = BatchData)
35
  } else {
36
    stop("No RNASeq MAEO object found.\n")
37
  }
38
  cat("Loading CNV data.\n")
39
  cat("\tProcessing GISTIC output.\n")
40
  CGH_Data = TCGA_Load_GISTICdata(DataSetDirectories$CNVdirectory)
41
  CNVgenes = c(CGH_Data$AMPgenes, CGH_Data$DELgenes)
42
  CNVgenes = gsub("\\[", "", CNVgenes)
43
  CNVgenes = gsub("\\]", "", CNVgenes)
44
  CGH_Data$CGH_Data_Segmented = CGH_Data$CGH_Data_Segmented[unique(CNVgenes), 
45
                                                            ]
46
  SampleNames = colnames(CGH_Data$CGH_Data_Segmented)
47
  if (CancerSite == "LAML") {
48
    colnames(CGH_Data$CGH_Data_Segmented) = paste(SampleNames, 
49
                                                  "-03", sep = "")
50
  } else {
51
    colnames(CGH_Data$CGH_Data_Segmented) = paste(SampleNames, 
52
                                                  "-01", sep = "")
53
  }
54
  cat("\tBatch correction.\n")
55
  CNV_TCGA = TCGA_BatchCorrection_MolecularData(GEN_Data = CGH_Data$CGH_Data_Segmented,
56
                                                BatchData =  BatchData, MinInBatch = MinPerBatch)
57
  CNV_TCGA = CGH_Data$CGH_Data_Segmented
58
  Genes = rownames(CNV_TCGA)
59
  SplitGenes = strsplit2(Genes, "\\|")
60
  rownames(CNV_TCGA) = SplitGenes[, 1]
61
  CNV_TCGA = TCGA_GENERIC_MergeData(unique(rownames(CNV_TCGA)), 
62
                                    CNV_TCGA)
63
  AllCancers = c("BLCA", "BRCA", "CESC", "CHOL", 
64
                 "COAD", "ESCA", "GBM", "HNSC", "KIRC", "KIRP", 
65
                 "LAML", "LGG", "LIHC", "LUAD", "LUSC", "OV", 
66
                 "PAAD", "PCPG", "READ", "SARC", "STAD", "THCA", 
67
                 "THYM", "UCEC", "COADREAD")
68
  if (length(intersect(CancerSite, AllCancers)) > 
69
      0) {
70
    cat("Loading MethylMix data.\n")
71
    cat("\tGetting MethylMix methylation states.\n")
72
    eval(parse(text = paste("MET_TCGA=MethylStates$", 
73
                            CancerSite, sep = "")), envir = environment())
74
    SampleNames = colnames(MET_TCGA)
75
    SampleNames = gsub("\\.", "-", SampleNames)
76
    if (CancerSite == "LAML") {
77
      colnames(MET_TCGA) = paste(SampleNames, 
78
                                 "-03", sep = "")
79
    } else {
80
      colnames(MET_TCGA) = paste(SampleNames, 
81
                                 "-01", sep = "")
82
    }
83
  } else {
84
    cat("MethylMix data for", CancerSite, "not available\n")
85
    MET_TCGA = matrix(0, 1, 1)
86
  }
87
  cat("Summarizing:\n")
88
  cat("\tFound", length(rownames(MA_TCGA)), "genes and", 
89
      length(colnames(MA_TCGA)), "samples for MA data.\n")
90
  cat("\tFound", length(rownames(CNV_TCGA)), "genes and", 
91
      length(colnames(CNV_TCGA)), "samples for GISTIC data before overlapping.\n")
92
  cat("\tFound", length(rownames(MET_TCGA)), "genes and", 
93
      length(colnames(MET_TCGA)), "samples for MethylMix data before overlapping.\n")
94
  cat("Overlapping genes and samples for GISTIC and MethylMix.\n")
95
  OverlapGenes = Reduce(intersect, list(rownames(MA_TCGA), 
96
                                        rownames(CNV_TCGA)))
97
  OverlapSamples = Reduce(intersect, list(colnames(MA_TCGA), 
98
                                          colnames(CNV_TCGA)))
99
  CNV_TCGA = CNV_TCGA[OverlapGenes, OverlapSamples]
100
  cat("\tFound", length(OverlapGenes), "overlapping genes and", 
101
      length(OverlapSamples), "samples for GISTIC data.\n")
102
  OverlapGenes = Reduce(intersect, list(rownames(MA_TCGA), 
103
                                        rownames(MET_TCGA)))
104
  OverlapSamples = Reduce(intersect, list(colnames(MA_TCGA), 
105
                                          colnames(MET_TCGA)))
106
  MET_TCGA = MET_TCGA[OverlapGenes, OverlapSamples]
107
  cat("\tFound", length(OverlapGenes), "overlapping genes and", 
108
      length(OverlapSamples), "samples for MethylMix data.\n")
109
  return(list(MA_TCGA = MA_TCGA, CNV_TCGA = CNV_TCGA, 
110
              MET_TCGA = MET_TCGA))
111
}
112
113
#' TCGA_Load_GISTICdata
114
#' 
115
#' @return result
116
#' @keywords internal
117
TCGA_Load_GISTICdata <- function(GisticDirectory) {
118
  GenesFile = paste(GisticDirectory, "all_data_by_genes.txt", 
119
                    sep = "")
120
  CGH_Data = read.csv(GenesFile, sep = "\t", row.names = 1, 
121
                      header = TRUE, na.strings = "NA")
122
  SampleNames = colnames(CGH_Data)
123
  SampleNames = gsub("\\.", "-", SampleNames)
124
  colnames(CGH_Data) = SampleNames
125
  Samplegroups = TCGA_GENERIC_GetSampleGroups(colnames(CGH_Data))
126
  CGH_Data = TCGA_GENERIC_CleanUpSampleNames(CGH_Data, 
127
                                             12)
128
  CGH_Data = CGH_Data[, -1]
129
  CGH_Data = CGH_Data[, -1]
130
  CGH_Data = as.matrix(CGH_Data)
131
  GenesFile = paste(GisticDirectory, "all_thresholded.by_genes.txt", 
132
                    sep = "")
133
  CGH_Data_Thresholded = read.csv(GenesFile, sep = "\t", 
134
                                  row.names = 1, header = TRUE, na.strings = "NA")
135
  SampleNames = colnames(CGH_Data_Thresholded)
136
  SampleNames = gsub("\\.", "-", SampleNames)
137
  colnames(CGH_Data_Thresholded) = SampleNames
138
  CGH_Data_Thresholded = TCGA_GENERIC_CleanUpSampleNames(CGH_Data_Thresholded, 
139
                                                         12)
140
  CGH_Data_Thresholded = CGH_Data_Thresholded[, -1]
141
  CGH_Data_Thresholded = CGH_Data_Thresholded[, -1]
142
  CGH_Data_Thresholded = as.matrix(CGH_Data_Thresholded)
143
  AMPfile = paste(GisticDirectory, "amp_genes.conf_99.txt", 
144
                  sep = "")
145
  AMPtable = read.csv(AMPfile, sep = "\t", header = TRUE, 
146
                      na.strings = "NA")
147
  AMPgenes <- as.vector(sapply(AMPtable[4:nrow(AMPtable), 
148
                                        2:ncol(AMPtable)], as.character))
149
  AMPgenes <- unique(AMPgenes[!AMPgenes %in% c("", 
150
                                               NA)])
151
  AMPgenes <- AMPgenes[order(AMPgenes)]
152
  DELfile = paste(GisticDirectory, "del_genes.conf_99.txt", 
153
                  sep = "")
154
  DELtable = read.csv(DELfile, sep = "\t", header = TRUE, 
155
                      na.strings = "NA")
156
  DELgenes <- as.vector(sapply(DELtable[4:nrow(DELtable), 
157
                                        2:ncol(DELtable)], as.character))
158
  DELgenes <- unique(DELgenes[!DELgenes %in% c("", 
159
                                               NA)])
160
  DELgenes <- DELgenes[order(DELgenes)]
161
  cat("There are", length(AMPgenes), "AMP genes and", 
162
      length(DELgenes), "DEL genes.\n")
163
  return(list(CGH_Data_Segmented = CGH_Data, CGH_Data_Thresholded = CGH_Data_Thresholded, 
164
              AMPgenes = AMPgenes, DELgenes = DELgenes))
165
}
166
167
#' Preprocess_MAdata
168
#'
169
#' @return result
170
#' @keywords internal
171
Preprocess_MAdata <- function(CancerSite=CancerSite, MAEO_ge=MAEO_ge,BatchData=BatchData) {
172
  MinPerBatch = 5
173
  cat("\tMissing value estimation.\n")
174
  MA_TCGA = TCGA_Load_MolecularData(MAEO_ge)
175
  Samplegroups = TCGA_GENERIC_GetSampleGroups(colnames(MA_TCGA))
176
  if (CancerSite == "LAML") {
177
    MA_TCGA = MA_TCGA[, Samplegroups$PeripheralBloodCancer]
178
  } else {
179
    MA_TCGA = MA_TCGA[, Samplegroups$Primary]
180
  }
181
  cat("\tBatch correction.\n")
182
  MA_TCGA = TCGA_BatchCorrection_MolecularData(MA_TCGA, 
183
                                               BatchData, MinPerBatch)
184
  cat("\tProcessing gene ids and merging.\n")
185
  Genes = rownames(MA_TCGA)
186
  SplitGenes = strsplit2(Genes, "\\|")
187
  rownames(MA_TCGA) = SplitGenes[, 1]
188
  MA_TCGA = MA_TCGA[!rownames(MA_TCGA) %in% "?", 
189
                    ]
190
  MA_TCGA = TCGA_GENERIC_MergeData(unique(rownames(MA_TCGA)), 
191
                                   MA_TCGA)
192
  return(MA_TCGA = MA_TCGA)
193
}
194
195
196
#' TCGA_Load_MolecularData
197
#'
198
#' @return result
199
#' @importFrom impute impute.knn
200
#' @keywords internal
201
TCGA_Load_MolecularData <- function(MAEO_ge) {
202
  MET_Data <- MAEO_ge
203
  if (rownames(MET_Data)[1] == "Composite Element REF") {
204
    cat("Removing first row with text stuff.\n")
205
    MET_Data = MET_Data[-1, ]
206
    Genes = rownames(MET_Data)
207
    MET_Data = apply(MET_Data, 2, as.numeric)
208
    rownames(MET_Data) = Genes
209
  }
210
  SampleNames = colnames(MET_Data)
211
  SampleNames = gsub("\\.", "-", SampleNames)
212
  colnames(MET_Data) = SampleNames
213
  MissingValueThreshold = 0.1
214
  NrMissingsPerGene = apply(MET_Data, 1, function(x) sum(is.na(x)))/ncol(MET_Data)
215
  cat("Removing", sum(NrMissingsPerGene > MissingValueThreshold), 
216
      "genes with more than 10% missing values.\n")
217
  if (sum(NrMissingsPerGene > MissingValueThreshold) > 
218
      0) 
219
    MET_Data = MET_Data[NrMissingsPerGene < MissingValueThreshold, 
220
                        ]
221
  NrMissingsPerSample = apply(MET_Data, 2, function(x) sum(is.na(x)))/nrow(MET_Data)
222
  cat("Removing", sum(NrMissingsPerSample > MissingValueThreshold), 
223
      "patients with more than 10% missing values.\n")
224
  if (sum(NrMissingsPerSample > MissingValueThreshold) > 
225
      0) 
226
    MET_Data = MET_Data[, NrMissingsPerSample < 
227
                          MissingValueThreshold]
228
  if (length(colnames(MET_Data)) > 1) {
229
    k = 15
230
    KNNresults = impute.knn(as.matrix(MET_Data), 
231
                            k)
232
    MET_Data_KNN = KNNresults$data
233
    MET_Data_KNN_Clean = TCGA_GENERIC_CleanUpSampleNames(MET_Data_KNN, 
234
                                                         15)
235
    return(MET_Data_KNN_Clean)
236
  } else {
237
    MET_Data_Clean = TCGA_GENERIC_CleanUpSampleNames(MET_Data, 
238
                                                     15)
239
    return(MET_Data_Clean)
240
  }
241
}
242
243
244
#' TCGA_GENERIC_CleanUpSampleNames
245
#' 
246
#' @return result
247
#' @keywords internal
248
TCGA_GENERIC_CleanUpSampleNames <- function(GEN_Data=GEN_Data, 
249
                                            IDlength = 12) {
250
  SampleNames = colnames(GEN_Data)
251
  SampleNamesShort = as.character(apply(as.matrix(SampleNames), 
252
                                        2, substr, 1, IDlength))
253
  if (length(SampleNamesShort) != length(unique(SampleNamesShort))) {
254
    Counts = table(SampleNamesShort)
255
    Doubles = rownames(Counts)[which(Counts > 1)]
256
    cat("Removing doubles for", length(Doubles), 
257
        "samples.\n")
258
    for (i in 1:length(Doubles)) {
259
      pos = grep(Doubles[i], SampleNames)
260
      GEN_Data = GEN_Data[, -pos[2:length(pos)]]
261
      SampleNames = colnames(GEN_Data)
262
    }
263
    SampleNames = colnames(GEN_Data)
264
    SampleNamesShort = as.character(apply(as.matrix(SampleNames), 
265
                                          2, substr, 1, IDlength))
266
    colnames(GEN_Data) = SampleNamesShort
267
  } else {
268
    colnames(GEN_Data) = SampleNamesShort
269
  }
270
  return(GEN_Data)
271
}
272
273
#' TCGA_GENERIC_GetSampleGroups
274
#'
275
#' @return result
276
#' @keywords internal
277
TCGA_GENERIC_GetSampleGroups <- function(SampleNames) {
278
  SampleGroups = list()
279
  Matches = regexpr("TCGA[.|-]\\w\\w[.|-]\\w\\w\\w\\w[.|-]01[.|-]*", 
280
                    SampleNames, perl = FALSE, useBytes = FALSE)
281
  SampleGroups$Primary = SampleNames[Matches == 1]
282
  Matches = regexpr("TCGA[.|-]\\w\\w[.|-]\\w\\w\\w\\w[.|-]02[.|-]*", 
283
                    SampleNames, perl = FALSE, useBytes = FALSE)
284
  SampleGroups$Recurrent = SampleNames[Matches == 
285
                                         1]
286
  Matches = regexpr("TCGA[.|-]\\w\\w[.|-]\\w\\w\\w\\w[.|-]03[.|-]*", 
287
                    SampleNames, perl = FALSE, useBytes = FALSE)
288
  SampleGroups$PeripheralBloodCancer = SampleNames[Matches == 
289
                                                     1]
290
  Matches = regexpr("TCGA[.|-]\\w\\w[.|-]\\w\\w\\w\\w[.|-]10[.|-]*", 
291
                    SampleNames, perl = FALSE, useBytes = FALSE)
292
  SampleGroups$BloodNormal = SampleNames[Matches == 
293
                                           1]
294
  Matches = regexpr("TCGA[.|-]\\w\\w[.|-]\\w\\w\\w\\w[.|-]11[.|-]*", 
295
                    SampleNames, perl = FALSE, useBytes = FALSE)
296
  SampleGroups$SolidNormal = SampleNames[Matches == 
297
                                           1]
298
  Matches = regexpr("TCGA[.|-]\\w\\w[.|-]\\w\\w\\w\\w[.|-]20[.|-]*", 
299
                    SampleNames, perl = FALSE, useBytes = FALSE)
300
  SampleGroups$CellLines = SampleNames[Matches == 
301
                                         1]
302
  Matches = regexpr("TCGA[.|-]\\w\\w[.|-]\\w\\w\\w\\w[.|-]06[.|-]*", 
303
                    SampleNames, perl = FALSE, useBytes = FALSE)
304
  SampleGroups$Metastatic = SampleNames[Matches == 
305
                                          1]
306
  return(SampleGroups)
307
}
308
309
#' TCGA_BatchCorrection_MolecularData
310
#' 
311
#' @return result
312
#' @keywords internal
313
TCGA_BatchCorrection_MolecularData <- function(GEN_Data =GEN_Data,
314
                                               BatchData = BatchData,
315
                                               MinInBatch = MinInBatch) {
316
  if (length(-which(BatchData[, 3] == 0)) > 0) {
317
    BatchData = BatchData[-which(BatchData[, 3] == 
318
                                   0), ]
319
  }
320
  PresentSamples = is.element(BatchData[, 1], colnames(GEN_Data))
321
  BatchDataSelected = BatchData[PresentSamples, ]
322
  if (sum(PresentSamples) != length(colnames(GEN_Data))) 
323
    BatchDataSelected = BatchData[-which(PresentSamples == 
324
                                           FALSE), ]
325
  BatchDataSelected$Batch <- factor(BatchDataSelected$Batch)
326
  NrPerBatch = table(BatchDataSelected$Batch)
327
  SmallBatches = NrPerBatch < MinInBatch
328
  BatchesToBeRemoved = names(SmallBatches)[which(SmallBatches == 
329
                                                   TRUE)]
330
  SamplesToBeRemoved = as.character(BatchDataSelected[which(BatchDataSelected$Batch %in% 
331
                                                              BatchesToBeRemoved), 1])
332
  if (length(colnames(GEN_Data)) - length(which(colnames(GEN_Data) %in% 
333
                                                SamplesToBeRemoved)) > 5) {
334
    # just checking if we have enough samples after
335
    # removing the too small batches
336
    if (length(which(colnames(GEN_Data) %in% SamplesToBeRemoved)) > 
337
        0) {
338
      cat("Removing", length(which(colnames(GEN_Data) %in% 
339
                                     SamplesToBeRemoved)), "samples because their batches are too small.\n")
340
      GEN_Data = GEN_Data[, -which(colnames(GEN_Data) %in% 
341
                                     SamplesToBeRemoved)]
342
    }
343
    BatchCheck = TCGA_GENERIC_CheckBatchEffect(GEN_Data, 
344
                                               BatchData)
345
    if (is.list(BatchCheck)) {
346
      GEN_Data_Corrected = TCGA_GENERIC_BatchCorrection(GEN_Data, 
347
                                                        BatchData)
348
      BatchCheck = TCGA_GENERIC_CheckBatchEffect(GEN_Data_Corrected, 
349
                                                 BatchData)
350
      return(GEN_Data_Corrected)
351
    } else {
352
      cat("Only one batch, no batch correction possible.\n")
353
      return(GEN_Data)
354
    }
355
  } else {
356
    cat("The nr of samples becomes to small, no batch correction possible.\n")
357
    return(GEN_Data)
358
  }
359
}
360
361
#' TCGA_GENERIC_BatchCorrection
362
#'
363
#' @return result
364
#' @keywords internal
365
TCGA_GENERIC_BatchCorrection <- function(GEN_Data = GEN_Data, 
366
                                         BatchData = BatchData) {
367
  WithBatchSamples = is.element(colnames(GEN_Data), 
368
                                BatchData[, 1])
369
  if (length(which(WithBatchSamples == FALSE)) > 
370
      0) 
371
    GEN_Data = GEN_Data[, -which(WithBatchSamples == 
372
                                   FALSE)]
373
  PresentSamples = is.element(BatchData[, 1], colnames(GEN_Data))
374
  BatchDataSelected = BatchData
375
  if (sum(PresentSamples) != length(colnames(GEN_Data))) 
376
    BatchDataSelected = BatchData[-which(PresentSamples == 
377
                                           FALSE), ]
378
  BatchDataSelected$Batch <- factor(BatchDataSelected$Batch)
379
  BatchDataSelected$ArrayName <- factor(BatchDataSelected$ArrayName)
380
  order <- match(colnames(GEN_Data), BatchDataSelected[, 
381
                                                       1])
382
  BatchDataSelected = BatchDataSelected[order, ]
383
  BatchDataSelected$Batch <- factor(BatchDataSelected$Batch)
384
  CombatResults = ComBat_NoFiles(GEN_Data, BatchDataSelected)
385
  GEN_Data_Corrected = CombatResults[, -1]
386
  class(GEN_Data_Corrected) <- "numeric"
387
  return(GEN_Data_Corrected)
388
}
389
390
#' ComBat_NoFiles
391
#' 
392
#' @return result
393
#' @keywords internal
394
ComBat_NoFiles <- function(dat, saminfo, type = "txt", 
395
                           write = FALSE, covariates = "all", par.prior = TRUE, 
396
                           filter = FALSE, skip = 0, prior.plots = FALSE) {
397
  cat("Reading Sample Information File\n")
398
  if (sum(colnames(saminfo) == "Batch") != 1) {
399
    return("ERROR: Sample Information File does not have a Batch column!")
400
  }
401
  expression_xls = "exp.txt"
402
  cat("Reading Expression Data File\n")
403
  dat <- dat[, trim.dat(dat)]
404
  if (skip > 0) {
405
    geneinfo <- as.matrix(dat[, 1:skip])
406
    dat <- dat[, -c(1:skip)]
407
  } else {
408
    geneinfo = NULL
409
  }
410
  if (filter) {
411
    ngenes <- nrow(dat)
412
    col <- ncol(dat)/2
413
    present <- apply(dat, 1, filter.absent, filter)
414
    dat <- dat[present, -(2 * (1:col))]
415
    if (skip > 0) {
416
      geneinfo <- geneinfo[present, ]
417
    }
418
    cat("Filtered genes absent in more than", filter, 
419
        "of samples. Genes remaining:", nrow(dat), 
420
        "; Genes filtered:", ngenes - nrow(dat), 
421
        "\n")
422
  }
423
  if (any(apply(dat, 2, mode) != "numeric")) {
424
    return("ERROR: Array expression columns contain non-numeric values! (Check your .xls file for non-numeric values and if this is not the problem, make a .csv file and use the type=csv option)")
425
  }
426
  tmp <- match(colnames(dat), saminfo[, 1])
427
  if (any(is.na(tmp))) {
428
    return("ERROR: Sample Information File and Data Array Names are not the same!")
429
  }
430
  tmp1 <- match(saminfo[, 1], colnames(dat))
431
  saminfo <- saminfo[tmp1[!is.na(tmp1)], ]
432
  if (any(covariates != "all")) {
433
    saminfo <- saminfo[, c(1:2, covariates)]
434
  }
435
  design <- design.mat(saminfo)
436
  batches <- list.batch(saminfo)
437
  n.batch <- length(batches)
438
  n.batches <- sapply(batches, length)
439
  n.array <- sum(n.batches)
440
  NAs = any(is.na(dat))
441
  if (NAs) {
442
    cat(c("Found", sum(is.na(dat)), "Missing Data Values\n"), 
443
        sep = " ")
444
  }
445
  cat("Standardizing Data across genes\n")
446
  if (!NAs) {
447
    B.hat <- solve(t(design) %*% design) %*% t(design) %*% 
448
      t(as.matrix(dat))
449
  } else {
450
    B.hat = apply(dat, 1, Beta.NA, design)
451
  }
452
  grand.mean <- t(n.batches/n.array) %*% B.hat[1:n.batch, 
453
                                               ]
454
  if (!NAs) {
455
    var.pooled <- ((dat - t(design %*% B.hat))^2) %*% 
456
      rep(1/n.array, n.array)
457
  } else {
458
    var.pooled <- apply(dat - t(design %*% B.hat), 
459
                        1, var, na.rm = TRUE)
460
  }
461
  stand.mean <- t(grand.mean) %*% t(rep(1, n.array))
462
  if (!is.null(design)) {
463
    tmp <- design
464
    tmp[, c(1:n.batch)] <- 0
465
    stand.mean <- stand.mean + t(tmp %*% B.hat)
466
  }
467
  s.data <- (dat - stand.mean)/(sqrt(var.pooled) %*% 
468
                                  t(rep(1, n.array)))
469
  cat("Fitting L/S model and finding priors\n")
470
  batch.design <- design[, 1:n.batch]
471
  if (!NAs) {
472
    gamma.hat <- solve(t(batch.design) %*% batch.design) %*% 
473
      t(batch.design) %*% t(as.matrix(s.data))
474
  } else {
475
    gamma.hat = apply(s.data, 1, Beta.NA, batch.design)
476
  }
477
  delta.hat <- NULL
478
  for (i in batches) {
479
    delta.hat <- rbind(delta.hat, apply(s.data[, 
480
                                               i], 1, var, na.rm = TRUE))
481
  }
482
  gamma.bar <- apply(gamma.hat, 1, mean)
483
  t2 <- apply(gamma.hat, 1, var)
484
  a.prior <- apply(delta.hat, 1, aprior)
485
  b.prior <- apply(delta.hat, 1, bprior)
486
  if (prior.plots & par.prior) {
487
    pdf(file = "prior_plots.pdf")
488
    par(mfrow = c(2, 2))
489
    tmp <- density(gamma.hat[1, ])
490
    plot(tmp, type = "l", main = "Density Plot")
491
    xx <- seq(min(tmp$x), max(tmp$x), length = 100)
492
    lines(xx, dnorm(xx, gamma.bar[1], sqrt(t2[1])), 
493
          col = 2)
494
    qqnorm(gamma.hat[1, ])
495
    qqline(gamma.hat[1, ], col = 2)
496
    tmp <- density(delta.hat[1, ])
497
    invgam <- 1/rgamma(ncol(delta.hat), a.prior[1], 
498
                       b.prior[1])
499
    tmp1 <- density(invgam)
500
    plot(tmp, typ = "l", main = "Density Plot", 
501
         ylim = c(0, max(tmp$y, tmp1$y)))
502
    lines(tmp1, col = 2)
503
    qqplot(delta.hat[1, ], invgam, xlab = "Sample Quantiles", 
504
           ylab = "Theoretical Quantiles")
505
    lines(c(0, max(invgam)), c(0, max(invgam)), 
506
          col = 2)
507
    title("Q-Q Plot")
508
    dev.off()
509
  }
510
  gamma.star <- delta.star <- NULL
511
  if (par.prior) {
512
    cat("Finding parametric adjustments\n")
513
    for (i in 1:n.batch) {
514
      temp <- it.sol(s.data[, batches[[i]]], 
515
                     gamma.hat[i, ], delta.hat[i, ], gamma.bar[i], 
516
                     t2[i], a.prior[i], b.prior[i])
517
      gamma.star <- rbind(gamma.star, temp[1, 
518
                                           ])
519
      delta.star <- rbind(delta.star, temp[2, 
520
                                           ])
521
    }
522
  } else {
523
    cat("Finding nonparametric adjustments\n")
524
    for (i in 1:n.batch) {
525
      temp <- int.eprior(as.matrix(s.data[, batches[[i]]]), 
526
                         gamma.hat[i, ], delta.hat[i, ])
527
      gamma.star <- rbind(gamma.star, temp[1, 
528
                                           ])
529
      delta.star <- rbind(delta.star, temp[2, 
530
                                           ])
531
    }
532
  }
533
  cat("Adjusting the Data\n")
534
  bayesdata <- s.data
535
  j <- 1
536
  for (i in batches) {
537
    bayesdata[, i] <- (bayesdata[, i] - t(batch.design[i, 
538
                                                       ] %*% gamma.star))/(sqrt(delta.star[j, 
539
                                                                                           ]) %*% t(rep(1, n.batches[j])))
540
    j <- j + 1
541
  }
542
  bayesdata <- (bayesdata * (sqrt(var.pooled) %*% 
543
                               t(rep(1, n.array)))) + stand.mean
544
  if (write) {
545
    output_file <- paste(expression_xls, "Adjusted", 
546
                         ".txt", sep = "_")
547
    outdata <- cbind(ProbeID = rownames(dat), bayesdata)
548
    write.table(outdata, file = output_file, sep = "\t")
549
    cat("Adjusted data saved in file:", output_file, 
550
        "\n")
551
  } else {
552
    return(cbind(rownames(dat), bayesdata))
553
  }
554
}
555
556
#' filter.absent
557
#'
558
#' filters data based on presence/absence call
559
#' @return result
560
#' @keywords internal
561
filter.absent <- function(x, pct) {
562
  present <- TRUE
563
  col <- length(x)/2
564
  pct.absent <- (sum(x[2 * (1:col)] == "A") + sum(x[2 * 
565
                                                      (1:col)] == "M"))/col
566
  if (pct.absent > pct) {
567
    present <- FALSE
568
  }
569
  return(present)
570
}
571
572
#' build.design
573
#'
574
#' Next two functions make the design matrix (X) from the sample info file
575
#' @return result
576
#' @keywords internal
577
build.design <- function(vec, des = NULL, start = 2) {
578
  tmp <- matrix(0, length(vec), nlevels(vec) - start + 
579
                  1)
580
  for (i in 1:ncol(tmp)) {
581
    tmp[, i] <- vec == levels(vec)[i + start - 
582
                                     1]
583
  }
584
  return(cbind(des, tmp))
585
}
586
587
#' design.mat
588
#'
589
#' @return result
590
#' @keywords internal
591
design.mat <- function(saminfo) {
592
  tmp <- which(colnames(saminfo) == "Batch")
593
  tmp1 <- as.factor(saminfo[, tmp])
594
  cat("Found", nlevels(tmp1), "batches\n")
595
  design <- build.design(tmp1, start = 1)
596
  ncov <- ncol(as.matrix(saminfo[, -c(1:2, tmp)]))
597
  cat("Found", ncov, "covariate(s)\n")
598
  if (ncov > 0) {
599
    for (j in 1:ncov) {
600
      tmp1 <- as.factor(as.matrix(saminfo[, -c(1:2, 
601
                                               tmp)])[, j])
602
      design <- build.design(tmp1, des = design)
603
    }
604
  }
605
  return(design)
606
}
607
608
#' list.batch
609
#'
610
#' Makes a list with elements pointing to which array belongs to which batch
611
#' @return result
612
#' @keywords internal
613
list.batch <- function(saminfo) {
614
  tmp1 <- as.factor(saminfo[, which(colnames(saminfo) == 
615
                                      "Batch")])
616
  batches <- lapply(1:nlevels(tmp1), function(x) (1:length(tmp1))[tmp1 == 
617
                                                                    levels(tmp1)[x]])
618
  return(batches)
619
}
620
621
#' trim.dat
622
#'
623
#' Trims the data of extra columns, note your array names cannot be named 'X' or start with 'X.'
624
#' @return result
625
#' @keywords internal
626
trim.dat <- function(dat) {
627
  tmp <- strsplit(colnames(dat), "\\.")
628
  tr <- sapply(1:length(tmp), function(x) tmp[[x]][1] != 
629
                 "X")
630
  return(tr)
631
}
632
633
#' aprior
634
#'
635
#' Following four find empirical hyper-prior values
636
#' @return result
637
#' @keywords internal
638
aprior <- function(gamma.hat) {
639
  m = mean(gamma.hat)
640
  s2 = var(gamma.hat)
641
  return((2 * s2 + m^2)/s2)
642
}
643
644
#' bprior
645
#'
646
#' @return result
647
#' @keywords internal
648
bprior <- function(gamma.hat) {
649
  m = mean(gamma.hat)
650
  s2 = var(gamma.hat)
651
  return((m * s2 + m^3)/s2)
652
}
653
654
#' postmean
655
#'
656
#' @return result
657
#' @keywords internal
658
postmean <- function(g.hat, g.bar, n, d.star, t2) {
659
  return((t2 * n * g.hat + d.star * g.bar)/(t2 * 
660
                                              n + d.star))
661
}
662
663
#' postvar
664
#'
665
#' @return result
666
#' @keywords internal
667
postvar <- function(sum2, n, a, b) {
668
  return((0.5 * sum2 + b)/(n/2 + a - 1))
669
}
670
671
#' it.sol
672
#'
673
#' Pass in entire data set, the design matrix for the entire data, the batch means, the batch variances, priors (m, t2, a, b), columns of the data  matrix for the batch. Uses the EM to find the parametric batch adjustments
674
#' 
675
#' @return result
676
#' @keywords internal
677
it.sol <- function(sdat, g.hat, d.hat, g.bar, t2, a, 
678
                   b, conv = 1e-04) {
679
  n <- apply(!is.na(sdat), 1, sum)
680
  g.old <- g.hat
681
  d.old <- d.hat
682
  change <- 1
683
  count <- 0
684
  while (change > conv) {
685
    g.new <- postmean(g.hat, g.bar, n, d.old, t2)
686
    sum2 <- apply((sdat - g.new %*% t(rep(1, ncol(sdat))))^2, 
687
                  1, sum, na.rm = TRUE)
688
    d.new <- postvar(sum2, n, a, b)
689
    change <- max(abs(g.new - g.old)/g.old, abs(d.new - 
690
                                                  d.old)/d.old)
691
    g.old <- g.new
692
    d.old <- d.new
693
    count <- count + 1
694
  }
695
  adjust <- rbind(g.new, d.new)
696
  rownames(adjust) <- c("g.star", "d.star")
697
  return(adjust)
698
}
699
700
#' L
701
#'
702
#' likelihood function
703
#' 
704
#' @return result
705
#' @keywords internal
706
L <- function(x, g.hat, d.hat) {
707
  return(prod(dnorm(x, g.hat, sqrt(d.hat))))
708
}
709
710
#' int.eprior
711
#'
712
#'Monte Carlo integration function to find the nonparametric adjustments
713
#' @return result
714
#' @keywords internal
715
int.eprior <- function(sdat, g.hat, d.hat) {
716
  g.star <- d.star <- NULL
717
  r <- nrow(sdat)
718
  for (i in 1:r) {
719
    g <- g.hat[-i]
720
    d <- d.hat[-i]
721
    x <- sdat[i, !is.na(sdat[i, ])]
722
    n <- length(x)
723
    j <- numeric(n) + 1
724
    dat <- matrix(as.numeric(x), length(g), n, 
725
                  byrow = TRUE)
726
    resid2 <- (dat - g)^2
727
    sum2 <- resid2 %*% j
728
    LH <- 1/(2 * pi * d)^(n/2) * exp(-sum2/(2 * 
729
                                              d))
730
    LH[LH == "NaN"] = 0
731
    g.star <- c(g.star, sum(g * LH)/sum(LH))
732
    d.star <- c(d.star, sum(d * LH)/sum(LH))
733
  }
734
  adjust <- rbind(g.star, d.star)
735
  rownames(adjust) <- c("g.star", "d.star")
736
  return(adjust)
737
}
738
739
740
#' Beta.NA
741
#'
742
#' @return result
743
#' @keywords internal
744
Beta.NA = function(y, X) {
745
  des = X[!is.na(y), ]
746
  y1 = y[!is.na(y)]
747
  B <- solve(t(des) %*% des) %*% t(des) %*% y1
748
  return(B)
749
}
750
751
#' TCGA_GENERIC_MergeData
752
#'
753
#' @return result
754
#' @keywords internal
755
TCGA_GENERIC_MergeData <- function(NewIDListUnique, 
756
                                   DataMatrix, MergeMethod) {
757
  NrUniqueGenes = length(NewIDListUnique)
758
  MergedData = matrix(0, NrUniqueGenes, length(colnames(DataMatrix)))
759
  for (i in 1:NrUniqueGenes) {
760
    tmpData = DataMatrix[which(rownames(DataMatrix) %in% 
761
                                 NewIDListUnique[i]), ]
762
    ifelse(length(rownames(tmpData)) > 1, MergedData[i, 
763
                                                     ] <- colMeans(tmpData), MergedData[i, ] <- tmpData)
764
  }
765
  rownames(MergedData) = NewIDListUnique
766
  colnames(MergedData) = colnames(DataMatrix)
767
  return(MergedData)
768
}
769
770
#' TCGA_GENERIC_CheckBatchEffect
771
#'
772
#' @return result
773
#' @keywords internal
774
TCGA_GENERIC_CheckBatchEffect <- function(GEN_Data, 
775
                                          BatchData) {
776
  Order = match(colnames(GEN_Data), BatchData[, 1])
777
  BatchDataSelected = BatchData[Order, ]
778
  BatchDataSelected$Batch <- factor(BatchDataSelected$Batch)
779
  PCAanalysis = prcomp(t(GEN_Data))
780
  PCdata = PCAanalysis$x
781
  #plot(PCdata[, 1] ~ BatchDataSelected[, 3])
782
  if (length(unique(BatchDataSelected$Batch)) > 1) {
783
    tmp = aov(PCdata[, 1] ~ BatchDataSelected[, 
784
                                              3])
785
    return(list(Pvalues = summary(tmp), PCA = PCdata, 
786
                BatchDataSelected = BatchDataSelected))
787
  } else {
788
    return(-1)
789
  }
790
}
791
792
#' Save_CancerSite
793
#'
794
#' @return result
795
#' @keywords internal
796
Save_CancerSite <- function(CancerSite, TargetDirectory, 
797
                            DataSetDirectories, ProcessedData) {
798
  if (length(DataSetDirectories$MAdirectory) > 1) {
799
    tmpMAdir = DataSetDirectories$MAdirectory[1]
800
  } else {
801
    tmpMAdir = DataSetDirectories$MAdirectory
802
  }
803
  MAdir = strsplit2(tmpMAdir, "/")
804
  tmpPos = grep("gdac", MAdir)
805
  MAversion = gsub("gdac_", "", MAdir[tmpPos[1]])
806
  CNVdir = strsplit2(DataSetDirectories$CNVdirectory, 
807
                     "/")
808
  CNVversion = gsub("gdac_", "", CNVdir[tmpPos[1]])
809
  METversion = "MethylMix2015"
810
  SaveFile = paste(TargetDirectory, "TCGA_", CancerSite, 
811
                   "_ProcessedData_MA", MAversion, "_CNV", CNVversion, 
812
                   "_MET", METversion, ".RData", sep = "")
813
  save(file = SaveFile, ProcessedData)
814
  return(SaveFile)
815
}