--- a
+++ b/createBulkAtlantisFlock.R
@@ -0,0 +1,491 @@
+library(gridExtra)
+library(foreach)
+library(atlantis)
+
+createBulkAtlantisFlock <- function(analysis.description, run_mode, predMat.file, targetMat.file, anno.file, featureSetToUse, kMinNumSamples=50, seed=300, batch.size=1, predFeaturesPerTarget=NULL, fitControlSettings="default", config.summary=NULL, predictOnly=NULL, permutation.count=NULL, only.permutations=F,
+                                    code.dir=".",
+                                    test.fraction=NA,
+                                    predFeatureRegExps=NULL) {
+  # analysis.description: a description which will be saved in the run directory
+  # run_mode: ignored
+  # mustHave: a vector of dataset ids which must have a value present to be included in the prediction matrix
+  # niceToHave: a vector of dataset ids to include in the prediction matrix, but these may have missing values in the prediction matrix
+  # featureSetToUse: one of "all", "perTarget", "perTargetAndSelf", "perGroup" or "single", determining how to
+  #   subset the prediction matrix for a given target.
+  # predFeaturesPerTarget: if featureSetToUse = "perTarget" or "perTargetAndSelf", this should be a dataframe with the two
+  #   columns c("target", "partner"), used to look up which genes to use as predictive features for a given target;
+  #   if featureSetToUse = "perGroup", it should have the columns c("group", "target", "group.name").
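+  #
+  # A sketch of the expected shape, shown here with hypothetical gene and group values:
+  #   for "perTarget" / "perTargetAndSelf":
+  #     predFeaturesPerTarget <- data.frame(target=c("BRAF", "BRAF"), partner=c("RAF1", "MAP2K1"))
+  #   for "perGroup":
+  #     predFeaturesPerTarget <- data.frame(group=c(1, 1), target=c("BRAF", "KRAS"), group.name=c("MAPK", "MAPK"))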
("GS_NA" %in% sapply(targets, function(t){t$ID}))) + } else { + # filter out targets which have a single output value + print("before filtering out singles") + print(length(targets)) + is.bad.target <- sapply(seq(dim(targetMat)[[2]]), function(i) { t<-targetMat[,i]; t<-t[!is.na(t)]; length(unique(t))==1 } ) + targets <- targets[!is.bad.target] + targets <- lapply(targets, function(t) { list(ID=t, output.prefix=t) }) + print("after filtering out targets with a single target value") + print(length(targets)) + stopifnot(grep("GS_NA", targets)!=0) + } + + common <- list( + fitControlSettings=fitControlSettings, + targetMat.file=targetMat.file, + predMat.file=predMat.file, + anno.file=anno.file, + output.dir=output.dir, + featureSetToUse=featureSetToUse, + kMinNumSamples=kMinNumSamples, + predFeaturesPerTarget.file=predFeaturesPerTarget.file, + seed=seed, + quality.dist.file=paste(output.dir, "null_quality_distribution.txt", sep=""), + config.summary=config.summary, + predictOnly=predictOnly, + permutation.count=permutation.count, + only.permutations=only.permutations, + batch.size=batch.size, + predFeatureRegExps=predFeatureRegExps + ) + + sources <- c("createBulkAtlantisFlock.R") + + common$sources <- sources + + # add in runs for each permuted version used for the null + permutations <- sample(targets, permutation.count, replace=T) + for(i in seq_along(permutations)) { + permutations[[i]]$permute.seed <- i + seed + } + + if(common$only.permutations) { + tasks <- permutations + } else { + tasks <- c(targets, permutations) + } + + tasks <- split(tasks, as.integer(seq(length(tasks))/common$batch.size)) + + if(!is.na(test.fraction)) { + task.count <- as.integer(ceiling(test.fraction * length(tasks))) + cat("Only running", task.count, "tasks out of", length(tasks), "because test.fraction was set to", test.fraction, "\n") + tasks <- tasks [ seq(task.count) ] + } + + flock.run(tasks, task_function='per_target', gather_function='gather', sources=common$sources, flock_common_state=common) +} + +prev.proc.time <- NULL + +time.checkpoint <- function(label) { +# now <- proc.time() +# if(!is.null(prev.proc.time)) { +# cat(sprintf("%s: %f\n", label, (now["elapsed"] - prev.proc.time["elapsed"]))) +# } else { +# cat(sprintf("%s: first\n", label)) +# } +# prev.proc.time <<- now +} + +# given a gene symbol, returns a list of functionaly related genes based on dataframe which was persisted +get.related.genes <- function(predFeaturesPerTarget, targetToLookup, target, include.self) { + matches <- as.character(predFeaturesPerTarget$target) == targetToLookup + partners <- as.character(predFeaturesPerTarget[matches, "partner"]) + if(include.self) { + ret <- unique(c(target, partners)) + } else { + ret <- setdiff(unique(partners), target) + } + + return(ret) +# sapply(ret, getFeatureNameFromFeatureID) +} + +get.gene.group <- function(predFeaturesPerTarget, groupToLookup, target, include.self) { + partners <- as.character(predFeaturesPerTarget[predFeaturesPerTarget$group == groupToLookup, "target"]) + if(include.self) { + ret <- unique(c(target, partners)) + } else { + ret <- setdiff(unique(partners), target) + } + return(ret) +# sapply(ret, getFeatureNameFromFeatureID) +} + +determine.feature.genes <- function(target, featureSetToUse, predFeaturesPerTarget.file) { + if(!is.null(predFeaturesPerTarget.file)) { + load(predFeaturesPerTarget.file) + } + + if (featureSetToUse == "all") { + genes <- NULL + } else if (featureSetToUse == "perTarget" || featureSetToUse == "perTargetAndSelf") { + targetName <- 
+
+determine.feature.genes <- function(target, featureSetToUse, predFeaturesPerTarget.file) {
+  if(!is.null(predFeaturesPerTarget.file)) {
+    load(predFeaturesPerTarget.file)
+  }
+
+  if (featureSetToUse == "all") {
+    genes <- NULL
+  } else if (featureSetToUse == "perTarget" || featureSetToUse == "perTargetAndSelf") {
+    targetName <- getFeatureNameFromFeatureID(target$ID)
+    stopifnot(!is.null(targetName))
+    genes <- get.related.genes(predFeaturesPerTarget, targetName, targetName, featureSetToUse == "perTargetAndSelf")
+    cat("found", length(genes), "gene names to be used as features for", targetName, "\n")
+  } else if (featureSetToUse == "perGroup") {
+    stopifnot(all(colnames(predFeaturesPerTarget) == c("group", "target", "group.name")))
+    group <- target$group
+    stopifnot(!is.null(group))
+    stopifnot(!is.null(target$gene))
+    genes <- get.gene.group(predFeaturesPerTarget, group, target$gene, FALSE)
+  } else {
+    stopifnot(featureSetToUse == "single")
+    targetName <- getFeatureNameFromFeatureID(target$ID)
+    genes <- targetName
+    stopifnot(!is.null(genes))
+  }
+
+  return(genes)
+}
+
+per_target <- function(per.task.state, common.state, output.file, run) {
+  library(foreach)
+
+  flock_run_dir <- run$dir
+
+  analysis.dir <- basename(flock_run_dir)
+  analysis.ID <- analysis.dir
+  output.dir <- flock_common_state$output.dir
+  predMat.file <- flock_common_state$predMat.file
+  targetMat.file <- flock_common_state$targetMat.file
+  anno.file <- flock_common_state$anno.file
+  kMinNumSamples <- flock_common_state$kMinNumSamples
+  predFeaturesPerTarget.file <- flock_common_state$predFeaturesPerTarget.file
+  config.summary <- flock_common_state$config.summary
+  predFeatureRegExps <- flock_common_state$predFeatureRegExps
+
+  fitControlSettings <- flock_common_state$fitControlSettings
+
+  # depending on featureSetToUse, each fit below restricts the prediction matrix to the genes
+  # related to the target gene and uses only those for prediction.
+  ####
+  targetMat.file <- flock_common_state$targetMat.file
+  load(targetMat.file)
+
+  featureSetToUse <- flock_common_state$featureSetToUse
+
+  fit.single.target <- function(target) {
+    # allow target to be specified as either a string or a list of (ID, group)
+    if(is.character(target)) {
+      target <- list(ID=target, targetUsedToLookupRelated=target, permute.seed=NULL)
+    }
+
+    targetID <- target$ID
+    permute.seed <- target$permute.seed
+
+    print(targetID)
+    cat(sprintf("Generating model for %s\n", targetID))
+    set.seed(flock_common_state$seed)
+
+    if (is.numeric(targetID)) {
+      targetID <- colnames(targetMat)[targetID]
+    }
+
+    stopifnot(length(grep("NO_CURRENT", targetID)) == 0)
+
+    permuteRows <- function(seed, targetMat) {
+      # shuffle each column of the target matrix independently with a fixed seed,
+      # then restore the random number generator state
+      saved <- .Random.seed
+      set.seed(seed)
+      permutedTargetMat <- targetMat
+      for(i in seq(ncol(targetMat))) {
+        permutedTargetMat[,i] <- sample(targetMat[,i])
+      }
+      assign(".Random.seed", saved, envir=globalenv())
+      permutedTargetMat
+    }
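+
+    # What permuteRows gives us, conceptually (hypothetical values, shown as a comment only):
+    #   targetMat[, "GS_X"]                  # e.g. c(0.1, -2.3, 0.7, -0.4)
+    #   permuteRows(7, targetMat)[, "GS_X"]  # a reshuffling such as c(0.7, 0.1, -0.4, -2.3)
+    # Each column keeps its original values (and hence its distribution) but loses any
+    # association with the samples and their predictive features, which is what the
+    # "-NULL-" runs below rely on to build the null distribution of model quality.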
+
+    if(featureSetToUse == "all") {
+      predFeatureRegExps <- ".*"
+      genes <- NULL
+    } else {
+      genes <- determine.feature.genes(target, featureSetToUse, predFeaturesPerTarget.file)
+    }
+
+    output.prefix <- target$output.prefix
+    group.name <- target$group.name
+    if(is.null(group.name)) {
+      group.name <- NA
+    } else {
+      config.summary <- c(config.summary, paste("group:", group.name))
+    }
+
+    # if this run is for the null distribution, fit against the permuted target matrix
+    if(!is.null(permute.seed)) {
+      permutedTargetMat <- permuteRows(permute.seed, targetMat)
+
+      res <- runATLANTIS(
+        analysis.ID=analysis.ID,
+        targetID=targetID,
+        output.prefix=paste(output.prefix, "-NULL-", permute.seed, sep=''),
+        makePlot=FALSE,
+        output.dir=paste(dirname(output.dir), '/temp', sep=''),
+        fitControlSettings=fitControlSettings,
+        additionalFeatures=NULL,
+        predFeatureNamesToUse=genes,
+        predFeatureRegExps=predFeatureRegExps,
+        predMat.file=predMat.file,
+        targetMat=permutedTargetMat,
+        anno.file=anno.file,
+        kMinNumSamples=kMinNumSamples,
+        save.params=F,
+        save.model=F,
+        save.featureData=F)
+    } else {
+      res <- runATLANTIS(
+        analysis.ID=analysis.ID,
+        targetID=targetID,
+        makePlot=TRUE,
+        output.dir=output.dir,
+        output.prefix=output.prefix,
+        fitControlSettings=fitControlSettings,
+        additionalFeatures=NULL,
+        predFeatureNamesToUse=genes,
+        predFeatureRegExps=predFeatureRegExps,
+        predMat.file=predMat.file,
+        targetMat=targetMat,
+        anno.file=anno.file,
+        kMinNumSamples=kMinNumSamples,
+        save.params=F,
+        save.model=F,
+        report.summary=config.summary,
+        predictOnly=flock_common_state$predictOnly
+      )
+    }
+
+    list(targetID=targetID,
+         result=res,
+         permute.seed=permute.seed,
+         output.prefix=output.prefix,
+         group.name=group.name)
+  }
+
+  task.results <- lapply(flock_per_task_state, fit.single.target)
+
+  save(task.results, file=output.file)
+}
+
+zscore <- function(x) {
+  (x - mean(x, na.rm=T)) / sd(x, na.rm=T)
+}
+
+gather <- function(per.task.state, common.state, output.file, run) {
+  task.results <- do.call(c, lapply(flock_per_task_state, function(job.details) {
+    e <- new.env()
+    load(job.details$flock_output_file, envir=e)
+    e$task.results
+  }))
+
+  is.permuted <- sapply(task.results, function(x) { !is.null(x$permute.seed) })
+
+  null.dist.details <- foreach(task.result=task.results[is.permuted], .combine=rbind) %do% {
+    load(task.result$result$fit.file)
+    if(!is.null(fit.info$OOB$weighted.cor)) {
+      data.frame(quality=fit.info$OOB$quality, weighted.cor=fit.info$OOB$weighted.cor, weighted.cor.R2=fit.info$OOB$weighted.cor.R2, with.more.weight=sum(min(fit.info$OOB$weights) != fit.info$OOB$weights), sample.count=length(fit.info$targetVec), failure.reason=NA, targetID=task.result$targetID, nfeatures=fit.info$OOB$nfeatures)
+    } else {
+      data.frame(quality=NA, weighted.cor=NA, weighted.cor.R2=NA, with.more.weight=NA, sample.count=NA, failure.reason=fit.info$failure.reason, targetID=task.result$targetID, nfeatures=NA)
+    }
+  }
+
+  null.distribution.values <- na.omit(null.dist.details$quality)
+  pval.from.null.distribution <- function(x) { (sum(null.distribution.values >= x) + 1) / (length(null.distribution.values) + 1) }
+
+  write.table(null.distribution.values, file=flock_common_state$quality.dist.file, col.names=F, row.names=F)
+  write.csv(null.dist.details, file=paste(flock_common_state$quality.dist.file, "-details.csv", sep=''), row.names=F)
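+
+  # pval.from.null.distribution is the usual add-one permutation estimate:
+  #   p(x) = (#{null qualities >= x} + 1) / (N + 1)
+  # e.g. (hypothetically) with N = 99 null qualities of which 4 are >= the observed R^2,
+  # it returns (4 + 1) / (99 + 1) = 0.05; the +1 terms keep the p-value above zero even
+  # when the observed value beats every permutation.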
+
+  # summarize the runs into a table
+
+  fits <- lapply(task.results[!is.permuted], function(job) {
+    if(!is.null(job$result) && !is.null(job$result$fit.file)) {
+      load(job$result$fit.file)
+      # compute the top-feature statistics, then reduce memory by dropping unused fields
+      top.feature <- names(which.max(fit.info$varImp))
+      if(!is.null(top.feature)) {
+        top.feature.values <- fit.info$predData[,top.feature]
+        wt.cor.top.feature <- atlantis:::weighted.cor(top.feature.values, fit.info$targetVec, fit.info$OOB$weights)
+        target.top.pred.corr <- cor(fit.info$targetVec, top.feature.values, use='pairwise.complete.obs')
+        lines.with.more.weight <- which(min(fit.info$OOB$weights) < fit.info$OOB$weights)
+        sensitive.lines.zmean.top.feature <- mean(zscore(top.feature.values)[lines.with.more.weight], na.rm=T)
+      } else {
+        target.top.pred.corr <- NA
+        wt.cor.top.feature <- NA
+        sensitive.lines.zmean.top.feature <- NA
+      }
+      fit.info$top.feature <- top.feature
+      fit.info$target.top.pred.corr <- target.top.pred.corr
+      fit.info$wt.cor.top.feature <- wt.cor.top.feature
+      fit.info$sensitive.lines.zmean.top.feature <- sensitive.lines.zmean.top.feature
+      fit.info$predData <- NULL
+      fit.info$predictors <- NULL
+      fit.info
+    } else {
+      NULL
+    }
+  })
+
+  taskResults <- task.results[!is.permuted]
+
+  varsPerModel <- foreach(taskResult=taskResults, fit.info=fits, .combine=rbind) %do% {
+    if(length(fit.info$varImp) == 0) {
+      NULL
+    } else {
+      d <- data.frame(variable=names(fit.info$varImp), varImportance=fit.info$varImp, targetID=taskResult$targetID, group.name=taskResult$group.name)
+      rownames(d) <- NULL
+      d
+    }
+  }
+
+  coerce.to.na <- function(x) { ifelse(is.null(x), NA, x) }
+
+  qualPerModel <- foreach(taskResult=taskResults, fit.info=fits, .combine=rbind) %do% {
+    targetID <- taskResult$targetID
+    Rsquared <- coerce.to.na(fit.info$OOB$quality)
+    if(!is.null(fit.info$failure.reason)) {
+      Rsquared <- NA
+    }
+    target.top.pred.corr <- fit.info$target.top.pred.corr
+    top.feature <- fit.info$top.feature
+    wt.Rsquared <- fit.info$OOB$weighted.cor.R2
+    Pvalue <- pval.from.null.distribution(Rsquared)
+    n <- names(fit.info$varImp[top.feature])
+    group.name <- taskResult$group.name
+    if(is.null(group.name)) {
+      group.name <- NA
+    }
+    output.prefix <- taskResult$output.prefix
+    target.min <- min(fit.info$targetVec, na.rm=T)
+    sample.count <- length(fit.info$targetVec)
+    with.more.weight <- sum(min(fit.info$OOB$weights) != fit.info$OOB$weights)
+    nfeatures <- fit.info$OOB$nfeatures
+    if(is.null(nfeatures)) {
+      nfeatures <- NA
+    }
+    print("nfeatures")
+    print(nfeatures)
+
+    #stopifnot(!is.null(Rsquared))
+    #stopifnot(!is.null(targetID))
+
+    data.frame(Rsquared=Rsquared,
+               targetID=targetID,
+               topFeature=coerce.to.na(n),
+               Pvalue=coerce.to.na(Pvalue),
+               topFeatureCor=target.top.pred.corr,
+               targetMin=target.min,
+               group.name=group.name,
+               output.prefix=output.prefix,
+               wt.Rsquared=coerce.to.na(wt.Rsquared),
+               sample.count=sample.count,
+               with.more.weight=with.more.weight,
+               failure.reason=coerce.to.na(fit.info$failure.reason),
+               nfeatures=nfeatures,
+               sensitive.lines.zmean.top.feature=fit.info$sensitive.lines.zmean.top.feature,
+               wt.cor.top.feature=fit.info$wt.cor.top.feature)
+  }
+
+  qualPerModel$percentile <- rank(qualPerModel$Rsquared, na.last=F) / nrow(qualPerModel)
+  qualPerModel$fdr <- p.adjust(qualPerModel$Pvalue, method="fdr")
+
+  cat("writing summary\n")
+  write.csv(varsPerModel, file=paste(flock_run_dir, '/results/varsPerModel.csv', sep=''), row.names=F)
+  write.csv(qualPerModel, file=paste(flock_run_dir, '/results/qualPerModel.csv', sep=''), row.names=F)
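+
+  # Downstream, the per-model summary can be consumed directly; a minimal sketch (file paths as
+  # written above, the fdr threshold is arbitrary):
+  #   qual <- read.csv(file.path(flock_run_dir, "results/qualPerModel.csv"))
+  #   hits <- qual[!is.na(qual$fdr) & qual$fdr < 0.05, c("targetID", "Rsquared", "Pvalue", "fdr", "topFeature")]
+  #   hits[order(hits$fdr), ]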
+
+  annTable.file <- flock_common_state$anno.file
+
+  # make a report summary plot with:
+  # 1. BRAF, KRAS, CTNNB1, ESR1
+  # 2. a random 4 out of the top 20 models of this bulk run in terms of R^2
+  # 3. a random 4 models with p-value > 0.1
+
+  if(nrow(qualPerModel) > 4) {
+
+    sample.at.most <- function(v, n) {
+      sample(v, min(length(v), n))
+    }
+
+    randTop4 <- sample.at.most(qualPerModel$targetID[rank(-qualPerModel$Rsquared) <= 20], 4)
+    randBad4 <- sample.at.most(qualPerModel$targetID[qualPerModel$Pvalue > 0.1], 4)
+
+    add.atlantis.plots <- function(targetIds) {
+      for(targetID in targetIds) {
+        e <- new.env()
+        fi <- sprintf("%s/results/%s_fit.info_cforest.Rdata", flock_run_dir, targetID)
+        sum.file <- sprintf("%s/results/%s_ATLANTIS_Summary.Rdata", flock_run_dir, targetID)
+        if(file.exists(fi) && file.exists(sum.file)) {
+          load(fi, envir=e)
+          PlotATLANTISresults(e$fit.info, sum.file,
+                              NA, NULL, annTable.file)
+        } else {
+          # TODO: write something to the report to indicate the target is not present
+        }
+      }
+    }
+
+    pdf(file.path(flock_run_dir, 'results/summary.pdf'), width=11*1.5, height=8.5*1.5)
+    pardefault <- par(no.readonly=TRUE)
+    grid.table(head(qualPerModel[order(qualPerModel$Rsquared, decreasing=T),], 20))
+    par(pardefault)
+    add.atlantis.plots(c("GS_BRAF", "GS_KRAS", "GS_CTNNB1", "GS_ESR1"))
+    par(pardefault)
+    add.atlantis.plots(randTop4)
+    par(pardefault)
+    add.atlantis.plots(randBad4)
+    par(pardefault)
+    if(length(null.distribution.values) > 0) {
+      hist(null.distribution.values)
+    }
+    par(pardefault)
+    dev.off()
+  }
+}
+
+# given a feature ID (a compound string that includes a feature type and a feature name),
+# returns the feature name
+getFeatureNameFromFeatureID <- function(ID) {
+  res <- strsplit(ID, "[_:]")[[1]]
+  if (length(res) < 2)
+    NULL
+  else
+    res[[2]]
+}
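+
+# For example, getFeatureNameFromFeatureID("GS_BRAF") returns "BRAF" (the name after the "GS"
+# type prefix), and an ID with no "_" or ":" separator returns NULL.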