--- a +++ b/R/amaretto_preprocess.R @@ -0,0 +1,815 @@ +#' AMARETTO_Preprocess +#' +#' Wrapper code that analyzes process TCGA GISTIC (CNV) and gene expression (rna-seq or microarray) data via one call +#' @param DataSetDirectories DataSetDirectories +#' @param BatchData BatchData +#' @importFrom graphics lines par title +#' @import grDevices +#' @importFrom limma strsplit2 +#' @importFrom MultiAssayExperiment experiments +#' @importFrom stats aov prcomp qqline qqnorm qqplot rgamma +#' @importFrom utils data read.csv write.table install.packages +#' @return result +#' @export +#' +#' @examples +#'\dontrun{ +#' TargetDirectory <- "Downloads" # path to data download directory +#' CancerSite <- 'CHOL' +#' DataSetDirectories <- AMARETTO_Download(CancerSite,TargetDirectory) +#' ProcessedData <- AMARETTO_Preprocess(DataSetDirectories,BatchData) +#'} +AMARETTO_Preprocess <- function(DataSetDirectories = DataSetDirectories, + BatchData = BatchData) { + CancerSite <- DataSetDirectories[1] + MinPerBatch = 5 + MAEO <- readRDS(paste0(DataSetDirectories[2], "/", + CancerSite, "_RNASeq_MAEO.rds")) + query1 <- grep("RNASeq", names(experiments(MAEO))) + MAEO_ge <- as.matrix(assay(MAEO[[query1]])) + MAEO_ge <- apply(MAEO_ge, 2, log2) + MAEO_ge[is.infinite(MAEO_ge) & MAEO_ge < 0] <- NA + cat("Loading mRNA data.\n") + if (!is.null(MAEO)) { + MA_TCGA = Preprocess_MAdata(CancerSite = CancerSite, MAEO_ge = MAEO_ge,BatchData = BatchData) + } else { + stop("No RNASeq MAEO object found.\n") + } + cat("Loading CNV data.\n") + cat("\tProcessing GISTIC output.\n") + CGH_Data = TCGA_Load_GISTICdata(DataSetDirectories$CNVdirectory) + CNVgenes = c(CGH_Data$AMPgenes, CGH_Data$DELgenes) + CNVgenes = gsub("\\[", "", CNVgenes) + CNVgenes = gsub("\\]", "", CNVgenes) + CGH_Data$CGH_Data_Segmented = CGH_Data$CGH_Data_Segmented[unique(CNVgenes), + ] + SampleNames = colnames(CGH_Data$CGH_Data_Segmented) + if (CancerSite == "LAML") { + colnames(CGH_Data$CGH_Data_Segmented) = paste(SampleNames, + "-03", sep = "") + } else { + colnames(CGH_Data$CGH_Data_Segmented) = paste(SampleNames, + "-01", sep = "") + } + cat("\tBatch correction.\n") + CNV_TCGA = TCGA_BatchCorrection_MolecularData(GEN_Data = CGH_Data$CGH_Data_Segmented, + BatchData = BatchData, MinInBatch = MinPerBatch) + CNV_TCGA = CGH_Data$CGH_Data_Segmented + Genes = rownames(CNV_TCGA) + SplitGenes = strsplit2(Genes, "\\|") + rownames(CNV_TCGA) = SplitGenes[, 1] + CNV_TCGA = TCGA_GENERIC_MergeData(unique(rownames(CNV_TCGA)), + CNV_TCGA) + AllCancers = c("BLCA", "BRCA", "CESC", "CHOL", + "COAD", "ESCA", "GBM", "HNSC", "KIRC", "KIRP", + "LAML", "LGG", "LIHC", "LUAD", "LUSC", "OV", + "PAAD", "PCPG", "READ", "SARC", "STAD", "THCA", + "THYM", "UCEC", "COADREAD") + if (length(intersect(CancerSite, AllCancers)) > + 0) { + cat("Loading MethylMix data.\n") + cat("\tGetting MethylMix methylation states.\n") + eval(parse(text = paste("MET_TCGA=MethylStates$", + CancerSite, sep = "")), envir = environment()) + SampleNames = colnames(MET_TCGA) + SampleNames = gsub("\\.", "-", SampleNames) + if (CancerSite == "LAML") { + colnames(MET_TCGA) = paste(SampleNames, + "-03", sep = "") + } else { + colnames(MET_TCGA) = paste(SampleNames, + "-01", sep = "") + } + } else { + cat("MethylMix data for", CancerSite, "not available\n") + MET_TCGA = matrix(0, 1, 1) + } + cat("Summarizing:\n") + cat("\tFound", length(rownames(MA_TCGA)), "genes and", + length(colnames(MA_TCGA)), "samples for MA data.\n") + cat("\tFound", length(rownames(CNV_TCGA)), "genes and", + length(colnames(CNV_TCGA)), "samples for GISTIC data before overlapping.\n") + cat("\tFound", length(rownames(MET_TCGA)), "genes and", + length(colnames(MET_TCGA)), "samples for MethylMix data before overlapping.\n") + cat("Overlapping genes and samples for GISTIC and MethylMix.\n") + OverlapGenes = Reduce(intersect, list(rownames(MA_TCGA), + rownames(CNV_TCGA))) + OverlapSamples = Reduce(intersect, list(colnames(MA_TCGA), + colnames(CNV_TCGA))) + CNV_TCGA = CNV_TCGA[OverlapGenes, OverlapSamples] + cat("\tFound", length(OverlapGenes), "overlapping genes and", + length(OverlapSamples), "samples for GISTIC data.\n") + OverlapGenes = Reduce(intersect, list(rownames(MA_TCGA), + rownames(MET_TCGA))) + OverlapSamples = Reduce(intersect, list(colnames(MA_TCGA), + colnames(MET_TCGA))) + MET_TCGA = MET_TCGA[OverlapGenes, OverlapSamples] + cat("\tFound", length(OverlapGenes), "overlapping genes and", + length(OverlapSamples), "samples for MethylMix data.\n") + return(list(MA_TCGA = MA_TCGA, CNV_TCGA = CNV_TCGA, + MET_TCGA = MET_TCGA)) +} + +#' TCGA_Load_GISTICdata +#' +#' @return result +#' @keywords internal +TCGA_Load_GISTICdata <- function(GisticDirectory) { + GenesFile = paste(GisticDirectory, "all_data_by_genes.txt", + sep = "") + CGH_Data = read.csv(GenesFile, sep = "\t", row.names = 1, + header = TRUE, na.strings = "NA") + SampleNames = colnames(CGH_Data) + SampleNames = gsub("\\.", "-", SampleNames) + colnames(CGH_Data) = SampleNames + Samplegroups = TCGA_GENERIC_GetSampleGroups(colnames(CGH_Data)) + CGH_Data = TCGA_GENERIC_CleanUpSampleNames(CGH_Data, + 12) + CGH_Data = CGH_Data[, -1] + CGH_Data = CGH_Data[, -1] + CGH_Data = as.matrix(CGH_Data) + GenesFile = paste(GisticDirectory, "all_thresholded.by_genes.txt", + sep = "") + CGH_Data_Thresholded = read.csv(GenesFile, sep = "\t", + row.names = 1, header = TRUE, na.strings = "NA") + SampleNames = colnames(CGH_Data_Thresholded) + SampleNames = gsub("\\.", "-", SampleNames) + colnames(CGH_Data_Thresholded) = SampleNames + CGH_Data_Thresholded = TCGA_GENERIC_CleanUpSampleNames(CGH_Data_Thresholded, + 12) + CGH_Data_Thresholded = CGH_Data_Thresholded[, -1] + CGH_Data_Thresholded = CGH_Data_Thresholded[, -1] + CGH_Data_Thresholded = as.matrix(CGH_Data_Thresholded) + AMPfile = paste(GisticDirectory, "amp_genes.conf_99.txt", + sep = "") + AMPtable = read.csv(AMPfile, sep = "\t", header = TRUE, + na.strings = "NA") + AMPgenes <- as.vector(sapply(AMPtable[4:nrow(AMPtable), + 2:ncol(AMPtable)], as.character)) + AMPgenes <- unique(AMPgenes[!AMPgenes %in% c("", + NA)]) + AMPgenes <- AMPgenes[order(AMPgenes)] + DELfile = paste(GisticDirectory, "del_genes.conf_99.txt", + sep = "") + DELtable = read.csv(DELfile, sep = "\t", header = TRUE, + na.strings = "NA") + DELgenes <- as.vector(sapply(DELtable[4:nrow(DELtable), + 2:ncol(DELtable)], as.character)) + DELgenes <- unique(DELgenes[!DELgenes %in% c("", + NA)]) + DELgenes <- DELgenes[order(DELgenes)] + cat("There are", length(AMPgenes), "AMP genes and", + length(DELgenes), "DEL genes.\n") + return(list(CGH_Data_Segmented = CGH_Data, CGH_Data_Thresholded = CGH_Data_Thresholded, + AMPgenes = AMPgenes, DELgenes = DELgenes)) +} + +#' Preprocess_MAdata +#' +#' @return result +#' @keywords internal +Preprocess_MAdata <- function(CancerSite=CancerSite, MAEO_ge=MAEO_ge,BatchData=BatchData) { + MinPerBatch = 5 + cat("\tMissing value estimation.\n") + MA_TCGA = TCGA_Load_MolecularData(MAEO_ge) + Samplegroups = TCGA_GENERIC_GetSampleGroups(colnames(MA_TCGA)) + if (CancerSite == "LAML") { + MA_TCGA = MA_TCGA[, Samplegroups$PeripheralBloodCancer] + } else { + MA_TCGA = MA_TCGA[, Samplegroups$Primary] + } + cat("\tBatch correction.\n") + MA_TCGA = TCGA_BatchCorrection_MolecularData(MA_TCGA, + BatchData, MinPerBatch) + cat("\tProcessing gene ids and merging.\n") + Genes = rownames(MA_TCGA) + SplitGenes = strsplit2(Genes, "\\|") + rownames(MA_TCGA) = SplitGenes[, 1] + MA_TCGA = MA_TCGA[!rownames(MA_TCGA) %in% "?", + ] + MA_TCGA = TCGA_GENERIC_MergeData(unique(rownames(MA_TCGA)), + MA_TCGA) + return(MA_TCGA = MA_TCGA) +} + + +#' TCGA_Load_MolecularData +#' +#' @return result +#' @importFrom impute impute.knn +#' @keywords internal +TCGA_Load_MolecularData <- function(MAEO_ge) { + MET_Data <- MAEO_ge + if (rownames(MET_Data)[1] == "Composite Element REF") { + cat("Removing first row with text stuff.\n") + MET_Data = MET_Data[-1, ] + Genes = rownames(MET_Data) + MET_Data = apply(MET_Data, 2, as.numeric) + rownames(MET_Data) = Genes + } + SampleNames = colnames(MET_Data) + SampleNames = gsub("\\.", "-", SampleNames) + colnames(MET_Data) = SampleNames + MissingValueThreshold = 0.1 + NrMissingsPerGene = apply(MET_Data, 1, function(x) sum(is.na(x)))/ncol(MET_Data) + cat("Removing", sum(NrMissingsPerGene > MissingValueThreshold), + "genes with more than 10% missing values.\n") + if (sum(NrMissingsPerGene > MissingValueThreshold) > + 0) + MET_Data = MET_Data[NrMissingsPerGene < MissingValueThreshold, + ] + NrMissingsPerSample = apply(MET_Data, 2, function(x) sum(is.na(x)))/nrow(MET_Data) + cat("Removing", sum(NrMissingsPerSample > MissingValueThreshold), + "patients with more than 10% missing values.\n") + if (sum(NrMissingsPerSample > MissingValueThreshold) > + 0) + MET_Data = MET_Data[, NrMissingsPerSample < + MissingValueThreshold] + if (length(colnames(MET_Data)) > 1) { + k = 15 + KNNresults = impute.knn(as.matrix(MET_Data), + k) + MET_Data_KNN = KNNresults$data + MET_Data_KNN_Clean = TCGA_GENERIC_CleanUpSampleNames(MET_Data_KNN, + 15) + return(MET_Data_KNN_Clean) + } else { + MET_Data_Clean = TCGA_GENERIC_CleanUpSampleNames(MET_Data, + 15) + return(MET_Data_Clean) + } +} + + +#' TCGA_GENERIC_CleanUpSampleNames +#' +#' @return result +#' @keywords internal +TCGA_GENERIC_CleanUpSampleNames <- function(GEN_Data=GEN_Data, + IDlength = 12) { + SampleNames = colnames(GEN_Data) + SampleNamesShort = as.character(apply(as.matrix(SampleNames), + 2, substr, 1, IDlength)) + if (length(SampleNamesShort) != length(unique(SampleNamesShort))) { + Counts = table(SampleNamesShort) + Doubles = rownames(Counts)[which(Counts > 1)] + cat("Removing doubles for", length(Doubles), + "samples.\n") + for (i in 1:length(Doubles)) { + pos = grep(Doubles[i], SampleNames) + GEN_Data = GEN_Data[, -pos[2:length(pos)]] + SampleNames = colnames(GEN_Data) + } + SampleNames = colnames(GEN_Data) + SampleNamesShort = as.character(apply(as.matrix(SampleNames), + 2, substr, 1, IDlength)) + colnames(GEN_Data) = SampleNamesShort + } else { + colnames(GEN_Data) = SampleNamesShort + } + return(GEN_Data) +} + +#' TCGA_GENERIC_GetSampleGroups +#' +#' @return result +#' @keywords internal +TCGA_GENERIC_GetSampleGroups <- function(SampleNames) { + SampleGroups = list() + Matches = regexpr("TCGA[.|-]\\w\\w[.|-]\\w\\w\\w\\w[.|-]01[.|-]*", + SampleNames, perl = FALSE, useBytes = FALSE) + SampleGroups$Primary = SampleNames[Matches == 1] + Matches = regexpr("TCGA[.|-]\\w\\w[.|-]\\w\\w\\w\\w[.|-]02[.|-]*", + SampleNames, perl = FALSE, useBytes = FALSE) + SampleGroups$Recurrent = SampleNames[Matches == + 1] + Matches = regexpr("TCGA[.|-]\\w\\w[.|-]\\w\\w\\w\\w[.|-]03[.|-]*", + SampleNames, perl = FALSE, useBytes = FALSE) + SampleGroups$PeripheralBloodCancer = SampleNames[Matches == + 1] + Matches = regexpr("TCGA[.|-]\\w\\w[.|-]\\w\\w\\w\\w[.|-]10[.|-]*", + SampleNames, perl = FALSE, useBytes = FALSE) + SampleGroups$BloodNormal = SampleNames[Matches == + 1] + Matches = regexpr("TCGA[.|-]\\w\\w[.|-]\\w\\w\\w\\w[.|-]11[.|-]*", + SampleNames, perl = FALSE, useBytes = FALSE) + SampleGroups$SolidNormal = SampleNames[Matches == + 1] + Matches = regexpr("TCGA[.|-]\\w\\w[.|-]\\w\\w\\w\\w[.|-]20[.|-]*", + SampleNames, perl = FALSE, useBytes = FALSE) + SampleGroups$CellLines = SampleNames[Matches == + 1] + Matches = regexpr("TCGA[.|-]\\w\\w[.|-]\\w\\w\\w\\w[.|-]06[.|-]*", + SampleNames, perl = FALSE, useBytes = FALSE) + SampleGroups$Metastatic = SampleNames[Matches == + 1] + return(SampleGroups) +} + +#' TCGA_BatchCorrection_MolecularData +#' +#' @return result +#' @keywords internal +TCGA_BatchCorrection_MolecularData <- function(GEN_Data =GEN_Data, + BatchData = BatchData, + MinInBatch = MinInBatch) { + if (length(-which(BatchData[, 3] == 0)) > 0) { + BatchData = BatchData[-which(BatchData[, 3] == + 0), ] + } + PresentSamples = is.element(BatchData[, 1], colnames(GEN_Data)) + BatchDataSelected = BatchData[PresentSamples, ] + if (sum(PresentSamples) != length(colnames(GEN_Data))) + BatchDataSelected = BatchData[-which(PresentSamples == + FALSE), ] + BatchDataSelected$Batch <- factor(BatchDataSelected$Batch) + NrPerBatch = table(BatchDataSelected$Batch) + SmallBatches = NrPerBatch < MinInBatch + BatchesToBeRemoved = names(SmallBatches)[which(SmallBatches == + TRUE)] + SamplesToBeRemoved = as.character(BatchDataSelected[which(BatchDataSelected$Batch %in% + BatchesToBeRemoved), 1]) + if (length(colnames(GEN_Data)) - length(which(colnames(GEN_Data) %in% + SamplesToBeRemoved)) > 5) { + # just checking if we have enough samples after + # removing the too small batches + if (length(which(colnames(GEN_Data) %in% SamplesToBeRemoved)) > + 0) { + cat("Removing", length(which(colnames(GEN_Data) %in% + SamplesToBeRemoved)), "samples because their batches are too small.\n") + GEN_Data = GEN_Data[, -which(colnames(GEN_Data) %in% + SamplesToBeRemoved)] + } + BatchCheck = TCGA_GENERIC_CheckBatchEffect(GEN_Data, + BatchData) + if (is.list(BatchCheck)) { + GEN_Data_Corrected = TCGA_GENERIC_BatchCorrection(GEN_Data, + BatchData) + BatchCheck = TCGA_GENERIC_CheckBatchEffect(GEN_Data_Corrected, + BatchData) + return(GEN_Data_Corrected) + } else { + cat("Only one batch, no batch correction possible.\n") + return(GEN_Data) + } + } else { + cat("The nr of samples becomes to small, no batch correction possible.\n") + return(GEN_Data) + } +} + +#' TCGA_GENERIC_BatchCorrection +#' +#' @return result +#' @keywords internal +TCGA_GENERIC_BatchCorrection <- function(GEN_Data = GEN_Data, + BatchData = BatchData) { + WithBatchSamples = is.element(colnames(GEN_Data), + BatchData[, 1]) + if (length(which(WithBatchSamples == FALSE)) > + 0) + GEN_Data = GEN_Data[, -which(WithBatchSamples == + FALSE)] + PresentSamples = is.element(BatchData[, 1], colnames(GEN_Data)) + BatchDataSelected = BatchData + if (sum(PresentSamples) != length(colnames(GEN_Data))) + BatchDataSelected = BatchData[-which(PresentSamples == + FALSE), ] + BatchDataSelected$Batch <- factor(BatchDataSelected$Batch) + BatchDataSelected$ArrayName <- factor(BatchDataSelected$ArrayName) + order <- match(colnames(GEN_Data), BatchDataSelected[, + 1]) + BatchDataSelected = BatchDataSelected[order, ] + BatchDataSelected$Batch <- factor(BatchDataSelected$Batch) + CombatResults = ComBat_NoFiles(GEN_Data, BatchDataSelected) + GEN_Data_Corrected = CombatResults[, -1] + class(GEN_Data_Corrected) <- "numeric" + return(GEN_Data_Corrected) +} + +#' ComBat_NoFiles +#' +#' @return result +#' @keywords internal +ComBat_NoFiles <- function(dat, saminfo, type = "txt", + write = FALSE, covariates = "all", par.prior = TRUE, + filter = FALSE, skip = 0, prior.plots = FALSE) { + cat("Reading Sample Information File\n") + if (sum(colnames(saminfo) == "Batch") != 1) { + return("ERROR: Sample Information File does not have a Batch column!") + } + expression_xls = "exp.txt" + cat("Reading Expression Data File\n") + dat <- dat[, trim.dat(dat)] + if (skip > 0) { + geneinfo <- as.matrix(dat[, 1:skip]) + dat <- dat[, -c(1:skip)] + } else { + geneinfo = NULL + } + if (filter) { + ngenes <- nrow(dat) + col <- ncol(dat)/2 + present <- apply(dat, 1, filter.absent, filter) + dat <- dat[present, -(2 * (1:col))] + if (skip > 0) { + geneinfo <- geneinfo[present, ] + } + cat("Filtered genes absent in more than", filter, + "of samples. Genes remaining:", nrow(dat), + "; Genes filtered:", ngenes - nrow(dat), + "\n") + } + if (any(apply(dat, 2, mode) != "numeric")) { + return("ERROR: Array expression columns contain non-numeric values! (Check your .xls file for non-numeric values and if this is not the problem, make a .csv file and use the type=csv option)") + } + tmp <- match(colnames(dat), saminfo[, 1]) + if (any(is.na(tmp))) { + return("ERROR: Sample Information File and Data Array Names are not the same!") + } + tmp1 <- match(saminfo[, 1], colnames(dat)) + saminfo <- saminfo[tmp1[!is.na(tmp1)], ] + if (any(covariates != "all")) { + saminfo <- saminfo[, c(1:2, covariates)] + } + design <- design.mat(saminfo) + batches <- list.batch(saminfo) + n.batch <- length(batches) + n.batches <- sapply(batches, length) + n.array <- sum(n.batches) + NAs = any(is.na(dat)) + if (NAs) { + cat(c("Found", sum(is.na(dat)), "Missing Data Values\n"), + sep = " ") + } + cat("Standardizing Data across genes\n") + if (!NAs) { + B.hat <- solve(t(design) %*% design) %*% t(design) %*% + t(as.matrix(dat)) + } else { + B.hat = apply(dat, 1, Beta.NA, design) + } + grand.mean <- t(n.batches/n.array) %*% B.hat[1:n.batch, + ] + if (!NAs) { + var.pooled <- ((dat - t(design %*% B.hat))^2) %*% + rep(1/n.array, n.array) + } else { + var.pooled <- apply(dat - t(design %*% B.hat), + 1, var, na.rm = TRUE) + } + stand.mean <- t(grand.mean) %*% t(rep(1, n.array)) + if (!is.null(design)) { + tmp <- design + tmp[, c(1:n.batch)] <- 0 + stand.mean <- stand.mean + t(tmp %*% B.hat) + } + s.data <- (dat - stand.mean)/(sqrt(var.pooled) %*% + t(rep(1, n.array))) + cat("Fitting L/S model and finding priors\n") + batch.design <- design[, 1:n.batch] + if (!NAs) { + gamma.hat <- solve(t(batch.design) %*% batch.design) %*% + t(batch.design) %*% t(as.matrix(s.data)) + } else { + gamma.hat = apply(s.data, 1, Beta.NA, batch.design) + } + delta.hat <- NULL + for (i in batches) { + delta.hat <- rbind(delta.hat, apply(s.data[, + i], 1, var, na.rm = TRUE)) + } + gamma.bar <- apply(gamma.hat, 1, mean) + t2 <- apply(gamma.hat, 1, var) + a.prior <- apply(delta.hat, 1, aprior) + b.prior <- apply(delta.hat, 1, bprior) + if (prior.plots & par.prior) { + pdf(file = "prior_plots.pdf") + par(mfrow = c(2, 2)) + tmp <- density(gamma.hat[1, ]) + plot(tmp, type = "l", main = "Density Plot") + xx <- seq(min(tmp$x), max(tmp$x), length = 100) + lines(xx, dnorm(xx, gamma.bar[1], sqrt(t2[1])), + col = 2) + qqnorm(gamma.hat[1, ]) + qqline(gamma.hat[1, ], col = 2) + tmp <- density(delta.hat[1, ]) + invgam <- 1/rgamma(ncol(delta.hat), a.prior[1], + b.prior[1]) + tmp1 <- density(invgam) + plot(tmp, typ = "l", main = "Density Plot", + ylim = c(0, max(tmp$y, tmp1$y))) + lines(tmp1, col = 2) + qqplot(delta.hat[1, ], invgam, xlab = "Sample Quantiles", + ylab = "Theoretical Quantiles") + lines(c(0, max(invgam)), c(0, max(invgam)), + col = 2) + title("Q-Q Plot") + dev.off() + } + gamma.star <- delta.star <- NULL + if (par.prior) { + cat("Finding parametric adjustments\n") + for (i in 1:n.batch) { + temp <- it.sol(s.data[, batches[[i]]], + gamma.hat[i, ], delta.hat[i, ], gamma.bar[i], + t2[i], a.prior[i], b.prior[i]) + gamma.star <- rbind(gamma.star, temp[1, + ]) + delta.star <- rbind(delta.star, temp[2, + ]) + } + } else { + cat("Finding nonparametric adjustments\n") + for (i in 1:n.batch) { + temp <- int.eprior(as.matrix(s.data[, batches[[i]]]), + gamma.hat[i, ], delta.hat[i, ]) + gamma.star <- rbind(gamma.star, temp[1, + ]) + delta.star <- rbind(delta.star, temp[2, + ]) + } + } + cat("Adjusting the Data\n") + bayesdata <- s.data + j <- 1 + for (i in batches) { + bayesdata[, i] <- (bayesdata[, i] - t(batch.design[i, + ] %*% gamma.star))/(sqrt(delta.star[j, + ]) %*% t(rep(1, n.batches[j]))) + j <- j + 1 + } + bayesdata <- (bayesdata * (sqrt(var.pooled) %*% + t(rep(1, n.array)))) + stand.mean + if (write) { + output_file <- paste(expression_xls, "Adjusted", + ".txt", sep = "_") + outdata <- cbind(ProbeID = rownames(dat), bayesdata) + write.table(outdata, file = output_file, sep = "\t") + cat("Adjusted data saved in file:", output_file, + "\n") + } else { + return(cbind(rownames(dat), bayesdata)) + } +} + +#' filter.absent +#' +#' filters data based on presence/absence call +#' @return result +#' @keywords internal +filter.absent <- function(x, pct) { + present <- TRUE + col <- length(x)/2 + pct.absent <- (sum(x[2 * (1:col)] == "A") + sum(x[2 * + (1:col)] == "M"))/col + if (pct.absent > pct) { + present <- FALSE + } + return(present) +} + +#' build.design +#' +#' Next two functions make the design matrix (X) from the sample info file +#' @return result +#' @keywords internal +build.design <- function(vec, des = NULL, start = 2) { + tmp <- matrix(0, length(vec), nlevels(vec) - start + + 1) + for (i in 1:ncol(tmp)) { + tmp[, i] <- vec == levels(vec)[i + start - + 1] + } + return(cbind(des, tmp)) +} + +#' design.mat +#' +#' @return result +#' @keywords internal +design.mat <- function(saminfo) { + tmp <- which(colnames(saminfo) == "Batch") + tmp1 <- as.factor(saminfo[, tmp]) + cat("Found", nlevels(tmp1), "batches\n") + design <- build.design(tmp1, start = 1) + ncov <- ncol(as.matrix(saminfo[, -c(1:2, tmp)])) + cat("Found", ncov, "covariate(s)\n") + if (ncov > 0) { + for (j in 1:ncov) { + tmp1 <- as.factor(as.matrix(saminfo[, -c(1:2, + tmp)])[, j]) + design <- build.design(tmp1, des = design) + } + } + return(design) +} + +#' list.batch +#' +#' Makes a list with elements pointing to which array belongs to which batch +#' @return result +#' @keywords internal +list.batch <- function(saminfo) { + tmp1 <- as.factor(saminfo[, which(colnames(saminfo) == + "Batch")]) + batches <- lapply(1:nlevels(tmp1), function(x) (1:length(tmp1))[tmp1 == + levels(tmp1)[x]]) + return(batches) +} + +#' trim.dat +#' +#' Trims the data of extra columns, note your array names cannot be named 'X' or start with 'X.' +#' @return result +#' @keywords internal +trim.dat <- function(dat) { + tmp <- strsplit(colnames(dat), "\\.") + tr <- sapply(1:length(tmp), function(x) tmp[[x]][1] != + "X") + return(tr) +} + +#' aprior +#' +#' Following four find empirical hyper-prior values +#' @return result +#' @keywords internal +aprior <- function(gamma.hat) { + m = mean(gamma.hat) + s2 = var(gamma.hat) + return((2 * s2 + m^2)/s2) +} + +#' bprior +#' +#' @return result +#' @keywords internal +bprior <- function(gamma.hat) { + m = mean(gamma.hat) + s2 = var(gamma.hat) + return((m * s2 + m^3)/s2) +} + +#' postmean +#' +#' @return result +#' @keywords internal +postmean <- function(g.hat, g.bar, n, d.star, t2) { + return((t2 * n * g.hat + d.star * g.bar)/(t2 * + n + d.star)) +} + +#' postvar +#' +#' @return result +#' @keywords internal +postvar <- function(sum2, n, a, b) { + return((0.5 * sum2 + b)/(n/2 + a - 1)) +} + +#' it.sol +#' +#' Pass in entire data set, the design matrix for the entire data, the batch means, the batch variances, priors (m, t2, a, b), columns of the data matrix for the batch. Uses the EM to find the parametric batch adjustments +#' +#' @return result +#' @keywords internal +it.sol <- function(sdat, g.hat, d.hat, g.bar, t2, a, + b, conv = 1e-04) { + n <- apply(!is.na(sdat), 1, sum) + g.old <- g.hat + d.old <- d.hat + change <- 1 + count <- 0 + while (change > conv) { + g.new <- postmean(g.hat, g.bar, n, d.old, t2) + sum2 <- apply((sdat - g.new %*% t(rep(1, ncol(sdat))))^2, + 1, sum, na.rm = TRUE) + d.new <- postvar(sum2, n, a, b) + change <- max(abs(g.new - g.old)/g.old, abs(d.new - + d.old)/d.old) + g.old <- g.new + d.old <- d.new + count <- count + 1 + } + adjust <- rbind(g.new, d.new) + rownames(adjust) <- c("g.star", "d.star") + return(adjust) +} + +#' L +#' +#' likelihood function +#' +#' @return result +#' @keywords internal +L <- function(x, g.hat, d.hat) { + return(prod(dnorm(x, g.hat, sqrt(d.hat)))) +} + +#' int.eprior +#' +#'Monte Carlo integration function to find the nonparametric adjustments +#' @return result +#' @keywords internal +int.eprior <- function(sdat, g.hat, d.hat) { + g.star <- d.star <- NULL + r <- nrow(sdat) + for (i in 1:r) { + g <- g.hat[-i] + d <- d.hat[-i] + x <- sdat[i, !is.na(sdat[i, ])] + n <- length(x) + j <- numeric(n) + 1 + dat <- matrix(as.numeric(x), length(g), n, + byrow = TRUE) + resid2 <- (dat - g)^2 + sum2 <- resid2 %*% j + LH <- 1/(2 * pi * d)^(n/2) * exp(-sum2/(2 * + d)) + LH[LH == "NaN"] = 0 + g.star <- c(g.star, sum(g * LH)/sum(LH)) + d.star <- c(d.star, sum(d * LH)/sum(LH)) + } + adjust <- rbind(g.star, d.star) + rownames(adjust) <- c("g.star", "d.star") + return(adjust) +} + + +#' Beta.NA +#' +#' @return result +#' @keywords internal +Beta.NA = function(y, X) { + des = X[!is.na(y), ] + y1 = y[!is.na(y)] + B <- solve(t(des) %*% des) %*% t(des) %*% y1 + return(B) +} + +#' TCGA_GENERIC_MergeData +#' +#' @return result +#' @keywords internal +TCGA_GENERIC_MergeData <- function(NewIDListUnique, + DataMatrix, MergeMethod) { + NrUniqueGenes = length(NewIDListUnique) + MergedData = matrix(0, NrUniqueGenes, length(colnames(DataMatrix))) + for (i in 1:NrUniqueGenes) { + tmpData = DataMatrix[which(rownames(DataMatrix) %in% + NewIDListUnique[i]), ] + ifelse(length(rownames(tmpData)) > 1, MergedData[i, + ] <- colMeans(tmpData), MergedData[i, ] <- tmpData) + } + rownames(MergedData) = NewIDListUnique + colnames(MergedData) = colnames(DataMatrix) + return(MergedData) +} + +#' TCGA_GENERIC_CheckBatchEffect +#' +#' @return result +#' @keywords internal +TCGA_GENERIC_CheckBatchEffect <- function(GEN_Data, + BatchData) { + Order = match(colnames(GEN_Data), BatchData[, 1]) + BatchDataSelected = BatchData[Order, ] + BatchDataSelected$Batch <- factor(BatchDataSelected$Batch) + PCAanalysis = prcomp(t(GEN_Data)) + PCdata = PCAanalysis$x + #plot(PCdata[, 1] ~ BatchDataSelected[, 3]) + if (length(unique(BatchDataSelected$Batch)) > 1) { + tmp = aov(PCdata[, 1] ~ BatchDataSelected[, + 3]) + return(list(Pvalues = summary(tmp), PCA = PCdata, + BatchDataSelected = BatchDataSelected)) + } else { + return(-1) + } +} + +#' Save_CancerSite +#' +#' @return result +#' @keywords internal +Save_CancerSite <- function(CancerSite, TargetDirectory, + DataSetDirectories, ProcessedData) { + if (length(DataSetDirectories$MAdirectory) > 1) { + tmpMAdir = DataSetDirectories$MAdirectory[1] + } else { + tmpMAdir = DataSetDirectories$MAdirectory + } + MAdir = strsplit2(tmpMAdir, "/") + tmpPos = grep("gdac", MAdir) + MAversion = gsub("gdac_", "", MAdir[tmpPos[1]]) + CNVdir = strsplit2(DataSetDirectories$CNVdirectory, + "/") + CNVversion = gsub("gdac_", "", CNVdir[tmpPos[1]]) + METversion = "MethylMix2015" + SaveFile = paste(TargetDirectory, "TCGA_", CancerSite, + "_ProcessedData_MA", MAversion, "_CNV", CNVversion, + "_MET", METversion, ".RData", sep = "") + save(file = SaveFile, ProcessedData) + return(SaveFile) +} \ No newline at end of file