--- a
+++ b/createBulkAtlantisFlock.R
@@ -0,0 +1,491 @@
+library(gridExtra)
+library(foreach)
+library(atlantis)
+
+createBulkAtlantisFlock <- function(analysis.description, run_mode, predMat.file, targetMat.file, anno.file, featureSetToUse, kMinNumSamples=50, seed=300, batch.size=1, predFeaturesPerTarget=NULL, fitControlSettings="default", config.summary=NULL, predictOnly=NULL, permutation.count=NULL, only.permutations=F,
+                                    code.dir=".",
+                                    test.fraction=NA,
+                                    predFeatureRegExps=NULL) {
+  # analysis.description: a description which will be saved in the run directory
+  # run_mode: ignored
+  # mustHave: a vector of dataset ids which must have a value present to be included in the prediction matrix
+  # niceToHave: a vector of dataset ids to include in the prediction matrix, but these may have missing values in the prediction matrix
+  # featureSetToUse: one of "all", "perTarget", "perTargetAndSelf", "perGroup" or "single", determining how to
+  #   subset the prediction matrix for a given target.
+  # predFeaturesPerTarget: if featureSetToUse = "perTarget" or "perTargetAndSelf", this should be a dataframe with the two
+  #   columns c("target", "partner"), used to look up which genes to use as predictive features for a given target;
+  #   if featureSetToUse = "perGroup", it should have the columns c("group", "target", "group.name").
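+  #
+  # A sketch of the expected shape, shown here with hypothetical gene and group values:
+  #   for "perTarget" / "perTargetAndSelf":
+  #     predFeaturesPerTarget <- data.frame(target=c("BRAF", "BRAF"), partner=c("RAF1", "MAP2K1"))
+  #   for "perGroup":
+  #     predFeaturesPerTarget <- data.frame(group=c(1, 1), target=c("BRAF", "KRAS"), group.name=c("MAPK", "MAPK"))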
("GS_NA" %in% sapply(targets, function(t){t$ID}))) + } else { + # filter out targets which have a single output value + print("before filtering out singles") + print(length(targets)) + is.bad.target <- sapply(seq(dim(targetMat)[[2]]), function(i) { t<-targetMat[,i]; t<-t[!is.na(t)]; length(unique(t))==1 } ) + targets <- targets[!is.bad.target] + targets <- lapply(targets, function(t) { list(ID=t, output.prefix=t) }) + print("after filtering out targets with a single target value") + print(length(targets)) + stopifnot(grep("GS_NA", targets)!=0) + } + + common <- list( + fitControlSettings=fitControlSettings, + targetMat.file=targetMat.file, + predMat.file=predMat.file, + anno.file=anno.file, + output.dir=output.dir, + featureSetToUse=featureSetToUse, + kMinNumSamples=kMinNumSamples, + predFeaturesPerTarget.file=predFeaturesPerTarget.file, + seed=seed, + quality.dist.file=paste(output.dir, "null_quality_distribution.txt", sep=""), + config.summary=config.summary, + predictOnly=predictOnly, + permutation.count=permutation.count, + only.permutations=only.permutations, + batch.size=batch.size, + predFeatureRegExps=predFeatureRegExps + ) + + sources <- c("createBulkAtlantisFlock.R") + + common$sources <- sources + + # add in runs for each permuted version used for the null + permutations <- sample(targets, permutation.count, replace=T) + for(i in seq_along(permutations)) { + permutations[[i]]$permute.seed <- i + seed + } + + if(common$only.permutations) { + tasks <- permutations + } else { + tasks <- c(targets, permutations) + } + + tasks <- split(tasks, as.integer(seq(length(tasks))/common$batch.size)) + + if(!is.na(test.fraction)) { + task.count <- as.integer(ceiling(test.fraction * length(tasks))) + cat("Only running", task.count, "tasks out of", length(tasks), "because test.fraction was set to", test.fraction, "\n") + tasks <- tasks [ seq(task.count) ] + } + + flock.run(tasks, task_function='per_target', gather_function='gather', sources=common$sources, flock_common_state=common) +} + +prev.proc.time <- NULL + +time.checkpoint <- function(label) { +# now <- proc.time() +# if(!is.null(prev.proc.time)) { +# cat(sprintf("%s: %f\n", label, (now["elapsed"] - prev.proc.time["elapsed"]))) +# } else { +# cat(sprintf("%s: first\n", label)) +# } +# prev.proc.time <<- now +} + +# given a gene symbol, returns a list of functionaly related genes based on dataframe which was persisted +get.related.genes <- function(predFeaturesPerTarget, targetToLookup, target, include.self) { + matches <- as.character(predFeaturesPerTarget$target) == targetToLookup + partners <- as.character(predFeaturesPerTarget[matches, "partner"]) + if(include.self) { + ret <- unique(c(target, partners)) + } else { + ret <- setdiff(unique(partners), target) + } + + return(ret) +# sapply(ret, getFeatureNameFromFeatureID) +} + +get.gene.group <- function(predFeaturesPerTarget, groupToLookup, target, include.self) { + partners <- as.character(predFeaturesPerTarget[predFeaturesPerTarget$group == groupToLookup, "target"]) + if(include.self) { + ret <- unique(c(target, partners)) + } else { + ret <- setdiff(unique(partners), target) + } + return(ret) +# sapply(ret, getFeatureNameFromFeatureID) +} + +determine.feature.genes <- function(target, featureSetToUse, predFeaturesPerTarget.file) { + if(!is.null(predFeaturesPerTarget.file)) { + load(predFeaturesPerTarget.file) + } + + if (featureSetToUse == "all") { + genes <- NULL + } else if (featureSetToUse == "perTarget" || featureSetToUse == "perTargetAndSelf") { + targetName <- 
+
+determine.feature.genes <- function(target, featureSetToUse, predFeaturesPerTarget.file) {
+  if(!is.null(predFeaturesPerTarget.file)) {
+    load(predFeaturesPerTarget.file)
+  }
+
+  if (featureSetToUse == "all") {
+    genes <- NULL
+  } else if (featureSetToUse == "perTarget" || featureSetToUse == "perTargetAndSelf") {
+    targetName <- getFeatureNameFromFeatureID(target$ID)
+    stopifnot(!is.null(targetName))
+    genes <- get.related.genes(predFeaturesPerTarget, targetName, targetName, featureSetToUse == "perTargetAndSelf")
+    cat("found", length(genes), "gene names to be used as features for", targetName, "\n")
+  } else if (featureSetToUse == "perGroup") {
+    stopifnot(all(colnames(predFeaturesPerTarget) == c("group", "target", "group.name")))
+    group <- target$group
+    stopifnot(!is.null(group))
+    stopifnot(!is.null(target$gene))
+    genes <- get.gene.group(predFeaturesPerTarget, group, target$gene, FALSE)
+  } else {
+    stopifnot(featureSetToUse == "single")
+    targetName <- getFeatureNameFromFeatureID(target$ID)
+    genes <- targetName
+    stopifnot(!is.null(genes))
+  }
+
+  return(genes)
+}
+
+per_target <- function(per.task.state, common.state, output.file, run) {
+  library(foreach)
+
+  flock_run_dir <- run$dir
+
+  analysis.dir <- basename(flock_run_dir)
+  analysis.ID <- analysis.dir
+  output.dir <- flock_common_state$output.dir
+  predMat.file <- flock_common_state$predMat.file
+  targetMat.file <- flock_common_state$targetMat.file
+  anno.file <- flock_common_state$anno.file
+  kMinNumSamples <- flock_common_state$kMinNumSamples
+  predFeaturesPerTarget.file <- flock_common_state$predFeaturesPerTarget.file
+  config.summary <- flock_common_state$config.summary
+  predFeatureRegExps <- flock_common_state$predFeatureRegExps
+
+  fitControlSettings <- flock_common_state$fitControlSettings
+
+  # depending on featureSetToUse, each fit below restricts the prediction matrix to the genes
+  # related to the target gene and uses only those for prediction.
+  ####
+  targetMat.file <- flock_common_state$targetMat.file
+  load(targetMat.file)
+
+  featureSetToUse <- flock_common_state$featureSetToUse
+
+  fit.single.target <- function(target) {
+    # allow target to be specified as either a string or a list of (ID, group)
+    if(is.character(target)) {
+      target <- list(ID=target, targetUsedToLookupRelated=target, permute.seed=NULL)
+    }
+
+    targetID <- target$ID
+    permute.seed <- target$permute.seed
+
+    print(targetID)
+    cat(sprintf("Generating model for %s\n", targetID))
+    set.seed(flock_common_state$seed)
+
+    if (is.numeric(targetID)) {
+      targetID <- colnames(targetMat)[targetID]
+    }
+
+    stopifnot(length(grep("NO_CURRENT", targetID)) == 0)
+
+    permuteRows <- function(seed, targetMat) {
+      # shuffle each column of the target matrix independently with a fixed seed,
+      # then restore the random number generator state
+      saved <- .Random.seed
+      set.seed(seed)
+      permutedTargetMat <- targetMat
+      for(i in seq(ncol(targetMat))) {
+        permutedTargetMat[,i] <- sample(targetMat[,i])
+      }
+      assign(".Random.seed", saved, envir=globalenv())
+      permutedTargetMat
+    }
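+
+    # What permuteRows gives us, conceptually (hypothetical values, shown as a comment only):
+    #   targetMat[, "GS_X"]                  # e.g. c(0.1, -2.3, 0.7, -0.4)
+    #   permuteRows(7, targetMat)[, "GS_X"]  # a reshuffling such as c(0.7, 0.1, -0.4, -2.3)
+    # Each column keeps its original values (and hence its distribution) but loses any
+    # association with the samples and their predictive features, which is what the
+    # "-NULL-" runs below rely on to build the null distribution of model quality.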
+
+    if(featureSetToUse == "all") {
+      predFeatureRegExps <- ".*"
+      genes <- NULL
+    } else {
+      genes <- determine.feature.genes(target, featureSetToUse, predFeaturesPerTarget.file)
+    }
+
+    output.prefix <- target$output.prefix
+    group.name <- target$group.name
+    if(is.null(group.name)) {
+      group.name <- NA
+    } else {
+      config.summary <- c(config.summary, paste("group:", group.name))
+    }
+
+    # if this run is for the null distribution, fit against the permuted target matrix
+    if(!is.null(permute.seed)) {
+      permutedTargetMat <- permuteRows(permute.seed, targetMat)
+
+      res <- runATLANTIS(
+        analysis.ID=analysis.ID,
+        targetID=targetID,
+        output.prefix=paste(output.prefix, "-NULL-", permute.seed, sep=''),
+        makePlot=FALSE,
+        output.dir=paste(dirname(output.dir), '/temp', sep=''),
+        fitControlSettings=fitControlSettings,
+        additionalFeatures=NULL,
+        predFeatureNamesToUse=genes,
+        predFeatureRegExps=predFeatureRegExps,
+        predMat.file=predMat.file,
+        targetMat=permutedTargetMat,
+        anno.file=anno.file,
+        kMinNumSamples=kMinNumSamples,
+        save.params=F,
+        save.model=F,
+        save.featureData=F)
+    } else {
+      res <- runATLANTIS(
+        analysis.ID=analysis.ID,
+        targetID=targetID,
+        makePlot=TRUE,
+        output.dir=output.dir,
+        output.prefix=output.prefix,
+        fitControlSettings=fitControlSettings,
+        additionalFeatures=NULL,
+        predFeatureNamesToUse=genes,
+        predFeatureRegExps=predFeatureRegExps,
+        predMat.file=predMat.file,
+        targetMat=targetMat,
+        anno.file=anno.file,
+        kMinNumSamples=kMinNumSamples,
+        save.params=F,
+        save.model=F,
+        report.summary=config.summary,
+        predictOnly=flock_common_state$predictOnly
+      )
+    }
+
+    list(targetID=targetID,
+         result=res,
+         permute.seed=permute.seed,
+         output.prefix=output.prefix,
+         group.name=group.name)
+  }
+
+  task.results <- lapply(flock_per_task_state, fit.single.target)
+
+  save(task.results, file=output.file)
+}
+
+zscore <- function(x) {
+  (x - mean(x, na.rm=T)) / sd(x, na.rm=T)
+}
+
+gather <- function(per.task.state, common.state, output.file, run) {
+  task.results <- do.call(c, lapply(flock_per_task_state, function(job.details) {
+    e <- new.env()
+    load(job.details$flock_output_file, envir=e)
+    e$task.results
+  }))
+
+  is.permuted <- sapply(task.results, function(x) { !is.null(x$permute.seed) })
+
+  null.dist.details <- foreach(task.result=task.results[is.permuted], .combine=rbind) %do% {
+    load(task.result$result$fit.file)
+    if(!is.null(fit.info$OOB$weighted.cor)) {
+      data.frame(quality=fit.info$OOB$quality, weighted.cor=fit.info$OOB$weighted.cor, weighted.cor.R2=fit.info$OOB$weighted.cor.R2, with.more.weight=sum(min(fit.info$OOB$weights) != fit.info$OOB$weights), sample.count=length(fit.info$targetVec), failure.reason=NA, targetID=task.result$targetID, nfeatures=fit.info$OOB$nfeatures)
+    } else {
+      data.frame(quality=NA, weighted.cor=NA, weighted.cor.R2=NA, with.more.weight=NA, sample.count=NA, failure.reason=fit.info$failure.reason, targetID=task.result$targetID, nfeatures=NA)
+    }
+  }
+
+  null.distribution.values <- na.omit(null.dist.details$quality)
+  pval.from.null.distribution <- function(x) { (sum(null.distribution.values >= x) + 1) / (length(null.distribution.values) + 1) }
+
+  write.table(null.distribution.values, file=flock_common_state$quality.dist.file, col.names=F, row.names=F)
+  write.csv(null.dist.details, file=paste(flock_common_state$quality.dist.file, "-details.csv", sep=''), row.names=F)
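+
+  # pval.from.null.distribution is the usual add-one permutation estimate:
+  #   p(x) = (#{null qualities >= x} + 1) / (N + 1)
+  # e.g. (hypothetically) with N = 99 null qualities of which 4 are >= the observed R^2,
+  # it returns (4 + 1) / (99 + 1) = 0.05; the +1 terms keep the p-value above zero even
+  # when the observed value beats every permutation.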
+
+  # summarize the runs into a table
+
+  fits <- lapply(task.results[!is.permuted], function(job) {
+    if(!is.null(job$result) && !is.null(job$result$fit.file)) {
+      load(job$result$fit.file)
+      # compute the top-feature statistics, then reduce memory by dropping unused fields
+      top.feature <- names(which.max(fit.info$varImp))
+      if(!is.null(top.feature)) {
+        top.feature.values <- fit.info$predData[,top.feature]
+        wt.cor.top.feature <- atlantis:::weighted.cor(top.feature.values, fit.info$targetVec, fit.info$OOB$weights)
+        target.top.pred.corr <- cor(fit.info$targetVec, top.feature.values, use='pairwise.complete.obs')
+        lines.with.more.weight <- which(min(fit.info$OOB$weights) < fit.info$OOB$weights)
+        sensitive.lines.zmean.top.feature <- mean(zscore(top.feature.values)[lines.with.more.weight], na.rm=T)
+      } else {
+        target.top.pred.corr <- NA
+        wt.cor.top.feature <- NA
+        sensitive.lines.zmean.top.feature <- NA
+      }
+      fit.info$top.feature <- top.feature
+      fit.info$target.top.pred.corr <- target.top.pred.corr
+      fit.info$wt.cor.top.feature <- wt.cor.top.feature
+      fit.info$sensitive.lines.zmean.top.feature <- sensitive.lines.zmean.top.feature
+      fit.info$predData <- NULL
+      fit.info$predictors <- NULL
+      fit.info
+    } else {
+      NULL
+    }
+  })
+
+  taskResults <- task.results[!is.permuted]
+
+  varsPerModel <- foreach(taskResult=taskResults, fit.info=fits, .combine=rbind) %do% {
+    if(length(fit.info$varImp) == 0) {
+      NULL
+    } else {
+      d <- data.frame(variable=names(fit.info$varImp), varImportance=fit.info$varImp, targetID=taskResult$targetID, group.name=taskResult$group.name)
+      rownames(d) <- NULL
+      d
+    }
+  }
+
+  coerce.to.na <- function(x) { ifelse(is.null(x), NA, x) }
+
+  qualPerModel <- foreach(taskResult=taskResults, fit.info=fits, .combine=rbind) %do% {
+    targetID <- taskResult$targetID
+    Rsquared <- coerce.to.na(fit.info$OOB$quality)
+    if(!is.null(fit.info$failure.reason)) {
+      Rsquared <- NA
+    }
+    target.top.pred.corr <- fit.info$target.top.pred.corr
+    top.feature <- fit.info$top.feature
+    wt.Rsquared <- fit.info$OOB$weighted.cor.R2
+    Pvalue <- pval.from.null.distribution(Rsquared)
+    n <- names(fit.info$varImp[top.feature])
+    group.name <- taskResult$group.name
+    if(is.null(group.name)) {
+      group.name <- NA
+    }
+    output.prefix <- taskResult$output.prefix
+    target.min <- min(fit.info$targetVec, na.rm=T)
+    sample.count <- length(fit.info$targetVec)
+    with.more.weight <- sum(min(fit.info$OOB$weights) != fit.info$OOB$weights)
+    nfeatures <- fit.info$OOB$nfeatures
+    if(is.null(nfeatures)) {
+      nfeatures <- NA
+    }
+    print("nfeatures")
+    print(nfeatures)
+
+    #stopifnot(!is.null(Rsquared))
+    #stopifnot(!is.null(targetID))
+
+    data.frame(Rsquared=Rsquared,
+               targetID=targetID,
+               topFeature=coerce.to.na(n),
+               Pvalue=coerce.to.na(Pvalue),
+               topFeatureCor=target.top.pred.corr,
+               targetMin=target.min,
+               group.name=group.name,
+               output.prefix=output.prefix,
+               wt.Rsquared=coerce.to.na(wt.Rsquared),
+               sample.count=sample.count,
+               with.more.weight=with.more.weight,
+               failure.reason=coerce.to.na(fit.info$failure.reason),
+               nfeatures=nfeatures,
+               sensitive.lines.zmean.top.feature=fit.info$sensitive.lines.zmean.top.feature,
+               wt.cor.top.feature=fit.info$wt.cor.top.feature)
+  }
+
+  qualPerModel$percentile <- rank(qualPerModel$Rsquared, na.last=F) / nrow(qualPerModel)
+  qualPerModel$fdr <- p.adjust(qualPerModel$Pvalue, method="fdr")
+
+  cat("writing summary\n")
+  write.csv(varsPerModel, file=paste(flock_run_dir, '/results/varsPerModel.csv', sep=''), row.names=F)
+  write.csv(qualPerModel, file=paste(flock_run_dir, '/results/qualPerModel.csv', sep=''), row.names=F)
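+
+  # Downstream, the per-model summary can be consumed directly; a minimal sketch (file paths as
+  # written above, the fdr threshold is arbitrary):
+  #   qual <- read.csv(file.path(flock_run_dir, "results/qualPerModel.csv"))
+  #   hits <- qual[!is.na(qual$fdr) & qual$fdr < 0.05, c("targetID", "Rsquared", "Pvalue", "fdr", "topFeature")]
+  #   hits[order(hits$fdr), ]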
+
+  annTable.file <- flock_common_state$anno.file
+
+  # make a report summary plot with:
+  # 1. BRAF, KRAS, CTNNB1, ESR1
+  # 2. a random 4 out of the top 20 models of this bulk run in terms of R^2
+  # 3. a random 4 models with p-value > 0.1
+
+  if(nrow(qualPerModel) > 4) {
+
+    sample.at.most <- function(v, n) {
+      sample(v, min(length(v), n))
+    }
+
+    randTop4 <- sample.at.most(qualPerModel$targetID[rank(-qualPerModel$Rsquared) <= 20], 4)
+    randBad4 <- sample.at.most(qualPerModel$targetID[qualPerModel$Pvalue > 0.1], 4)
+
+    add.atlantis.plots <- function(targetIds) {
+      for(targetID in targetIds) {
+        e <- new.env()
+        fi <- sprintf("%s/results/%s_fit.info_cforest.Rdata", flock_run_dir, targetID)
+        sum.file <- sprintf("%s/results/%s_ATLANTIS_Summary.Rdata", flock_run_dir, targetID)
+        if(file.exists(fi) && file.exists(sum.file)) {
+          load(fi, envir=e)
+          PlotATLANTISresults(e$fit.info, sum.file,
+                              NA, NULL, annTable.file)
+        } else {
+          # TODO: write something to the report to indicate the target is not present
+        }
+      }
+    }
+
+    pdf(file.path(flock_run_dir, 'results/summary.pdf'), width=11*1.5, height=8.5*1.5)
+    pardefault <- par(no.readonly=TRUE)
+    grid.table(head(qualPerModel[order(qualPerModel$Rsquared, decreasing=T),], 20))
+    par(pardefault)
+    add.atlantis.plots(c("GS_BRAF", "GS_KRAS", "GS_CTNNB1", "GS_ESR1"))
+    par(pardefault)
+    add.atlantis.plots(randTop4)
+    par(pardefault)
+    add.atlantis.plots(randBad4)
+    par(pardefault)
+    if(length(null.distribution.values) > 0) {
+      hist(null.distribution.values)
+    }
+    par(pardefault)
+    dev.off()
+  }
+}
+
+# given a feature ID (a compound string that includes a feature type and a feature name),
+# returns the feature name
+getFeatureNameFromFeatureID <- function(ID) {
+  res <- strsplit(ID, "[_:]")[[1]]
+  if (length(res) < 2)
+    NULL
+  else
+    res[[2]]
+}
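+
+# For example, getFeatureNameFromFeatureID("GS_BRAF") returns "BRAF" (the name after the "GS"
+# type prefix), and an ID with no "_" or ":" separator returns NULL.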