Diff of /createBulkAtlantisFlock.R [000000] .. [fbf06f]

Switch to unified view

a b/createBulkAtlantisFlock.R
1
library(gridExtra)
2
library(foreach)
3
library(atlantis)
4
5
createBulkAtlantisFlock <- function(analysis.description, run_mode, predMat.file, targetMat.file, anno.file, featureSetToUse, kMinNumSamples=50, seed=300, batch.size=1, predFeaturesPerTarget=NULL, fitControlSettings="default", config.summary=NULL, predictOnly=NULL, permutation.count=NULL, only.permutations=F, 
6
  code.dir=".",
7
  test.fraction=NA,
8
  predFeatureRegExps=NULL) {
9
  # analysis.description: a description which will be saved in the run directory
10
  # run_mode: ignored
11
  # mustHave: a vector of dataset ids which must have a value present to be included in the prediction matrix
12
  # niceToHave: a vector of dataset ids to include in the prediction matrix, but these may have missing values in the prediction matrix
13
  # featureSetToUse: one of "all", "perTarget", "perTargetAndSelf" or "single" determining how to subset the prediction matrix when
14
  #                  for a given target.
15
  # predFeaturesPerTarget: if featureSetToUse = "perTarget" or "perTargetAndSelf" then this should be a dataframe with two columns: c("target", "partner") which will be used to look up which genes to use as predicitive features based on a given target
16
17
  if (featureSetToUse == "perTarget" || featureSetToUse == "perTargetAndSelf") {
18
      stopifnot(!is.null(predFeaturesPerTarget) && is.data.frame(predFeaturesPerTarget) && all(colnames(predFeaturesPerTarget) == c("target", "partner")))
19
  } else if(featureSetToUse == "perGroup") {
20
     stopifnot(!is.null(predFeaturesPerTarget) && is.data.frame(predFeaturesPerTarget) && all(colnames(predFeaturesPerTarget) == c("group", "target", "group.name")))
21
  }
22
23
  # flock_run_dir is the name of the directory of where the results will be written
24
  # the analysis.ID is the trailing dir name
25
  analysis.ID <- basename(flock_run_dir)
26
  output.dir <- paste(flock_run_dir, '/results/', sep='')
27
  dir.create(output.dir)
28
29
  # cache the functional related genes if we'll need them
30
  predFeaturesPerTarget.file <- NULL
31
  if(featureSetToUse == "perTarget" || featureSetToUse == "perTargetAndSelf" || featureSetToUse == "perGroup") {
32
    stopifnot(!is.null(predFeaturesPerTarget))
33
34
    predFeaturesPerTarget.file <- paste(output.dir, '/featuresPerTarget.Rdata', sep="")
35
    save(predFeaturesPerTarget, file=predFeaturesPerTarget.file)
36
  }
37
38
  # load the target matrix to determine the targets
39
  load(targetMat.file);
40
  stopifnot(is.matrix(targetMat) || is.data.frame(targetMat))
41
  targets <- colnames(targetMat);
42
  if(is.null(permutation.count)) {
43
    permutation.count <- length(targets)
44
  }
45
46
  if(featureSetToUse == "perGroup") {
47
    # then we want to create a target per member in our predFeaturesPerTarget table
48
    with.valid.targets <- predFeaturesPerTarget[paste("GS_",predFeaturesPerTarget$target,sep='') %in% targets,,drop=F]
49
    targets <- lapply(seq(nrow( with.valid.targets)), function(i) { 
50
    x<-with.valid.targets[i,]; 
51
        list(ID=paste("GS_",as.character(x$target),sep=''), 
52
             gene=as.character(x$target), 
53
             group=as.character(x$group), 
54
             output.prefix=paste(x$group,'-',x$target,sep=''),
55
             group.name=as.character(x$group.name)) 
56
        } )
57
    stopifnot(! ("GS_NA" %in% sapply(targets, function(t){t$ID})))
58
  } else {
59
    # filter out targets which have a single output value
60
    print("before filtering out singles")
61
    print(length(targets))
62
    is.bad.target <- sapply(seq(dim(targetMat)[[2]]), function(i) { t<-targetMat[,i]; t<-t[!is.na(t)]; length(unique(t))==1 } )
63
    targets <- targets[!is.bad.target]
64
    targets <- lapply(targets, function(t) { list(ID=t, output.prefix=t) })
65
    print("after filtering out targets with a single target value")
66
    print(length(targets))
67
    stopifnot(grep("GS_NA", targets)!=0)
68
  }
69
70
  common <- list( 
71
    fitControlSettings=fitControlSettings,
72
    targetMat.file=targetMat.file,
73
    predMat.file=predMat.file,
74
    anno.file=anno.file,
75
    output.dir=output.dir,
76
    featureSetToUse=featureSetToUse,
77
    kMinNumSamples=kMinNumSamples,
78
    predFeaturesPerTarget.file=predFeaturesPerTarget.file,
79
    seed=seed,
80
    quality.dist.file=paste(output.dir, "null_quality_distribution.txt", sep=""),
81
    config.summary=config.summary,
82
    predictOnly=predictOnly,
83
    permutation.count=permutation.count,
84
    only.permutations=only.permutations,
85
    batch.size=batch.size,
86
    predFeatureRegExps=predFeatureRegExps
87
    )
88
89
  sources <- c("createBulkAtlantisFlock.R")
90
91
  common$sources <- sources
92
93
  # add in runs for each permuted version used for the null
94
  permutations <- sample(targets, permutation.count, replace=T)
95
  for(i in seq_along(permutations)) { 
96
    permutations[[i]]$permute.seed <- i + seed
97
  }
98
99
  if(common$only.permutations) {
100
    tasks <- permutations
101
  } else {
102
    tasks <- c(targets, permutations)
103
  }
104
105
  tasks <- split(tasks, as.integer(seq(length(tasks))/common$batch.size))  
106
107
  if(!is.na(test.fraction)) {
108
    task.count <- as.integer(ceiling(test.fraction * length(tasks)))
109
    cat("Only running", task.count, "tasks out of", length(tasks), "because test.fraction was set to", test.fraction, "\n")
110
    tasks <- tasks [ seq(task.count) ]
111
  }
112
113
  flock.run(tasks, task_function='per_target', gather_function='gather', sources=common$sources, flock_common_state=common)
114
}
115
116
prev.proc.time <- NULL
117
118
time.checkpoint <- function(label) {
119
#  now <- proc.time()
120
#  if(!is.null(prev.proc.time)) {
121
#    cat(sprintf("%s: %f\n", label, (now["elapsed"] - prev.proc.time["elapsed"])))
122
#  } else {
123
#    cat(sprintf("%s: first\n", label))
124
#  }
125
#  prev.proc.time <<- now
126
}
127
128
# given a gene symbol, returns a list of functionaly related genes based on dataframe which was persisted
129
get.related.genes <- function(predFeaturesPerTarget, targetToLookup, target, include.self) {
130
  matches <- as.character(predFeaturesPerTarget$target) == targetToLookup
131
  partners <- as.character(predFeaturesPerTarget[matches, "partner"])
132
  if(include.self) {
133
    ret <- unique(c(target, partners))
134
  } else {
135
    ret <- setdiff(unique(partners), target)
136
  }
137
138
  return(ret)
139
#  sapply(ret, getFeatureNameFromFeatureID)
140
}
141
142
get.gene.group <- function(predFeaturesPerTarget, groupToLookup, target, include.self) {
143
  partners <- as.character(predFeaturesPerTarget[predFeaturesPerTarget$group == groupToLookup, "target"])
144
  if(include.self) {
145
    ret <- unique(c(target, partners))
146
  } else {
147
    ret <- setdiff(unique(partners), target)
148
  }
149
  return(ret)
150
#  sapply(ret, getFeatureNameFromFeatureID)
151
}
152
153
determine.feature.genes <- function(target, featureSetToUse, predFeaturesPerTarget.file) {
154
  if(!is.null(predFeaturesPerTarget.file)) {
155
    load(predFeaturesPerTarget.file)
156
  }
157
158
  if (featureSetToUse == "all") {
159
    genes <- NULL
160
  } else if (featureSetToUse == "perTarget" || featureSetToUse == "perTargetAndSelf") {
161
    targetName <- getFeatureNameFromFeatureID(target$ID)
162
    stopifnot(!is.null(targetName))
163
    genes <- get.related.genes(predFeaturesPerTarget, targetName, targetName, featureSetToUse == "perTargetAndSelf")
164
    cat("found", length(genes), "gene names to be used as features for", targetName, "\n")
165
  } else if (featureSetToUse == "perGroup") {
166
    stopifnot(all(colnames(predFeaturesPerTarget) == c("group", "target", "group.name")))
167
    group <- target$group
168
    stopifnot(! is.null(group))
169
    stopifnot(! is.null(target$gene))
170
    genes <- get.gene.group(predFeaturesPerTarget, group, target$gene, FALSE)
171
  } else {
172
    stopifnot(featureSetToUse ==  "single")
173
    targetName <- getFeatureNameFromFeatureID(target$ID)
174
    genes <- targetName
175
    stopifnot(!is.null(genes))
176
  }
177
178
  return(genes)
179
}
180
181
per_target <- function(per.task.state, common.state, output.file, run) {
182
  library(foreach)
183
184
  flock_run_dir <- run$dir
185
186
  analysis.dir <- basename(flock_run_dir);
187
  analysis.ID <- analysis.dir;
188
  output.dir <- flock_common_state$output.dir
189
  predMat.file <- flock_common_state$predMat.file
190
  targetMat.file <- flock_common_state$targetMat.file
191
  anno.file <- flock_common_state$anno
192
  kMinNumSamples <- flock_common_state$kMinNumSamples
193
  predFeaturesPerTarget.file <- flock_common_state$predFeaturesPerTarget.file
194
  config.summary <- flock_common_state$config.summary
195
  predFeatureRegExps <- flock_common_state$predFeatureRegExps
196
197
  fitControlSettings = flock_common_state$fitControlSettings
198
199
  # the following figures out what are the 100 most similar genes to target gene and uses only them
200
  # for prediction.
201
  ####
202
  targetMat.file <- flock_common_state$targetMat.file
203
  load(targetMat.file)
204
205
  featureSetToUse <- flock_common_state$featureSetToUse
206
207
  fit.single.target <- function(target) {
208
    # allow target to be specified as either a string or a list of (ID, group)
209
    if(is.character(target)) {
210
      target <- list(ID=target, targetUsedToLookupRelated=target, permute.seed=NULL)
211
    }
212
213
    targetID <- target$ID
214
    permute.seed <- target$permute.seed
215
216
    print(targetID)
217
    cat(sprintf("Generating model for %s\n", targetID))
218
    set.seed(flock_common_state$seed)
219
220
    if (is.numeric(targetID)) {
221
      targetID <- colnames(targetMat)[targetID]
222
    }
223
224
    stopifnot(length(grep("NO_CURRENT", targetID)) == 0)
225
226
    permuteRows <- function(seed, targetMat) {
227
      saved <- .Random.seed
228
      set.seed(seed)
229
      permutedTargetMat <- targetMat
230
      for(i in seq(ncol(targetMat))) { 
231
        permutedTargetMat[,i] = sample(targetMat[,i]) 
232
      }
233
      .Random.seed <- saved
234
      permutedTargetMat
235
    }
236
237
    if(featureSetToUse == "all") {
238
      predFeatureRegExps <- ".*"
239
      genes <- NULL
240
    } else {
241
      genes <- determine.feature.genes(target, featureSetToUse, predFeaturesPerTarget.file)
242
    }
243
244
    output.prefix <- target$output.prefix
245
    group.name <- target$group.name
246
    if(is.null(group.name)) {
247
      group.name <- NA
248
    } else {
249
      config.summary <- c(config.summary, paste("group:", group.name))
250
    }
251
252
    # if this is for the null distribution
253
    if(!is.null(permute.seed)) {
254
      permutedTargetMat <- permuteRows(permute.seed, targetMat)
255
256
      res <- runATLANTIS(
257
        analysis.ID=analysis.ID,
258
        targetID=targetID,
259
        output.prefix=paste(output.prefix, "-NULL-",permute.seed, sep=''),
260
        makePlot=FALSE,
261
        output.dir = paste(dirname(output.dir), '/temp', sep=''),
262
        fitControlSettings = fitControlSettings,
263
        additionalFeatures=NULL,
264
        predFeatureNamesToUse=genes,
265
        predFeatureRegExps=predFeatureRegExps,
266
        predMat.file = predMat.file,
267
        targetMat = permutedTargetMat,
268
        anno.file=anno.file,
269
        kMinNumSamples=kMinNumSamples,
270
        save.params=F,
271
        save.model=F,
272
        save.featureData=F)
273
    } else {
274
      res <- runATLANTIS(
275
        analysis.ID=analysis.ID,
276
        targetID=targetID, 
277
        makePlot=TRUE,
278
        output.dir = output.dir,
279
        output.prefix=output.prefix,
280
        fitControlSettings = fitControlSettings,
281
        additionalFeatures=NULL,
282
        predFeatureNamesToUse=genes,
283
        predFeatureRegExps=predFeatureRegExps,
284
        predMat.file = predMat.file,
285
        targetMat = targetMat,
286
        anno.file=anno.file,
287
        kMinNumSamples=kMinNumSamples,
288
        save.params=F,
289
        save.model=F,
290
        report.summary=config.summary,
291
        predictOnly=flock_common_state$predictOnly
292
        )
293
    }
294
295
    list(targetID=targetID,
296
         result=res,
297
         permute.seed=permute.seed,
298
         output.prefix=output.prefix,
299
         group.name=group.name)
300
301
  }
302
303
  task.results <- lapply(flock_per_task_state, fit.single.target)
304
305
  save(task.results, file=output.file)
306
}
307
308
    zscore <- function(x) {
309
     (x-mean(x, na.rm=T))/sd(x, na.rm=T)
310
    }
311
312
313
gather <- function(per.task.state, common.state, output.file, run) {
314
  task.results <- do.call(c, lapply(flock_per_task_state, function(job.details) { 
315
    e <- new.env()
316
    load(job.details$flock_output_file, envir=e)
317
    e$task.results
318
  }))
319
320
  is.permuted <- sapply(task.results, function(x) { !is.null(x$permute.seed) } )
321
322
  null.dist.details <- foreach(task.result=task.results[is.permuted], .combine=rbind) %do% {
323
    load(task.result$result$fit.file)
324
    if(!is.null(fit.info$OOB$weighted.cor)) {
325
      data.frame(quality=fit.info$OOB$quality, weighted.cor=fit.info$OOB$weighted.cor, weighted.cor.R2=fit.info$OOB$weighted.cor.R2, with.more.weight=sum(min(fit.info$OOB$weights) != fit.info$OOB$weights), sample.count=length(fit.info$targetVec), failure.reason=NA, targetID=task.result$targetID, nfeatures=fit.info$OOB$nfeatures)
326
    } else {
327
      data.frame(quality=NA, weighted.cor=NA, weighted.cor.R2=NA, with.more.weight=NA, sample.count=NA, failure.reason=fit.info$failure.reason, targetID=task.result$targetID, nfeatures=NA)
328
    }
329
  }
330
331
  null.distribution.values <- na.omit(null.dist.details$quality[!is.na(null.dist.details$quality)])
332
  pval.from.null.distribution <- function(x) { (sum(null.distribution.values >= x)+1)/(length(null.distribution.values)+1) }
333
334
  write.table(null.distribution.values, file=flock_common_state$quality.dist.file, col.names=F, row.names=F)
335
  write.csv(null.dist.details, file=paste(flock_common_state$quality.dist.file,"-details.csv",sep=''),row.names=F)
336
337
  # summarize the runs into a table 
338
339
  fits <- lapply(task.results[!is.permuted], function(job) { 
340
    if(!is.null(job$result) && !is.null(job$result$fit.file)) {
341
        load(job$result$fit.file)
342
        # reduce memory by dropping unused fields
343
        top.feature <- names(which.max(fit.info$varImp))
344
        if(!is.null(top.feature)) {
345
          top.feature.values <- fit.info$predData[,top.feature]
346
          wt.cor.top.feature <- atlantis:::weighted.cor(top.feature.values, fit.info$targetVec, fit.info$OOB$weights)
347
          target.top.pred.corr <- cor(fit.info$targetVec, top.feature.values, use='pairwise.complete')
348
          lines.with.more.weight <- which(min(fit.info$OOB$weights) < fit.info$OOB$weights)
349
          sensitive.lines.zmean.top.feature <- mean(zscore(top.feature.values)[lines.with.more.weight], na.rm=T)
350
        } else {
351
          target.top.pred.corr <- NA
352
          wt.cor.top.feature <- NA
353
          sensitive.lines.zmean.top.feature <- NA
354
        }
355
        fit.info$top.feature <- top.feature
356
        fit.info$target.top.pred.corr <- target.top.pred.corr
357
        fit.info$wt.cor.top.feature <- wt.cor.top.feature
358
        fit.info$sensitive.lines.zmean.top.feature <- sensitive.lines.zmean.top.feature
359
        fit.info$predData <- NULL
360
        fit.info$predictors <- NULL
361
        fit.info
362
    } else {
363
      NULL
364
    }
365
  })
366
367
  taskResults <- task.results[!is.permuted]
368
369
  varsPerModel <- foreach(taskResult=taskResults, fit.info=fits, .combine=rbind) %do% {
370
    if(length(fit.info$varImp) == 0) {
371
      NULL
372
    } else {
373
      d <- data.frame(variable=names(fit.info$varImp), varImportance=fit.info$varImp, targetID=taskResult$targetID, group.name=taskResult$group.name)
374
      rownames(d) <- NULL
375
      d
376
    }
377
  }
378
379
  coerce.to.na <- function(x) { ifelse(is.null(x), NA, x) }
380
381
  qualPerModel <- foreach(taskResult=taskResults, fit.info=fits, .combine=rbind) %do% {
382
    targetID <- taskResult$targetID
383
    Rsquared <- coerce.to.na(fit.info$OOB$quality)
384
    if(!is.null(fit.info$failure.reason)) {
385
      Rsquared <- NA
386
    }
387
    target.top.pred.corr <- fit.info$target.top.pred.corr
388
    top.feature <- fit.info$top.feature
389
    wt.Rsquared <- fit.info$OOB$weighted.cor.R2
390
    Pvalue <- pval.from.null.distribution(Rsquared)
391
    n <- names(fit.info$varImp[top.feature])
392
    group.name <- taskResult$group.name
393
    if(is.null(group.name)) {
394
      group.name <- NA
395
    }
396
    output.prefix <- taskResult$output.prefix
397
    target.min <- min(fit.info$targetVec, na.rm=T)
398
    sample.count <- length(fit.info$targetVec)
399
    with.more.weight <- sum(min(fit.info$OOB$weights) != fit.info$OOB$weights)
400
    nfeatures=fit.info$OOB$nfeatures
401
    if(is.null(nfeatures)) { 
402
      nfeatures <- NA
403
    }
404
    print("nfeatures")
405
    print(nfeatures)
406
407
    #stopifnot(!is.null(Rsquared))
408
    #stopifnot(!is.null(targetID))
409
410
    data.frame(Rsquared=Rsquared,
411
               targetID=targetID, 
412
               topFeature=coerce.to.na(n), 
413
               Pvalue=coerce.to.na(Pvalue), 
414
               topFeatureCor=target.top.pred.corr, 
415
               targetMin=target.min,
416
               group.name=group.name,
417
               output.prefix=output.prefix,
418
               wt.Rsquared=coerce.to.na(wt.Rsquared),
419
               sample.count=sample.count,
420
               with.more.weight=with.more.weight,
421
               failure.reason=coerce.to.na(fit.info$failure.reason),
422
               nfeatures=nfeatures,
423
               sensitive.lines.zmean.top.feature=fit.info$sensitive.lines.zmean.top.feature,
424
               wt.cor.top.feature=fit.info$wt.cor.top.feature)
425
  }
426
427
  qualPerModel$percentile <- rank(qualPerModel$Rsquared,na.last=F)/nrow(qualPerModel)
428
  qualPerModel$fdr <- p.adjust(qualPerModel$Pvalue, method="fdr")
429
430
  cat("writing summary\n")
431
  write.csv(varsPerModel, file=paste(flock_run_dir,'/results/varsPerModel.csv',sep=''), row.names=F)
432
  write.csv(qualPerModel, file=paste(flock_run_dir,'/results/qualPerModel.csv',sep=''), row.names=F)
433
434
  annTable.file <- flock_common_state$anno.file
435
436
  # make report summary plot with:
437
  # 1. BRAF, KRAS, CTNNB1, ESR1
438
  # 2. random 4 out of the top 20 models of that bulk run in terms of R^2
439
  # 3. random 4 models with p-value > 0.1 
440
441
  if(length(qualPerModel) > 4) {
442
443
    sample.at.most <- function(v, n) {
444
       sample(v, min(length(v), n))
445
    }
446
447
    randTop4 <- sample.at.most(qualPerModel$targetID[rank(-qualPerModel$Rsquared) <= 20], 4)
448
    randBad4 <- sample.at.most(qualPerModel$targetID[qualPerModel$Pvalue > 0.1], 4)
449
450
    add.atlantis.plots <- function(targetIds) {
451
      for(targetID in targetIds) {
452
        e <- new.env()
453
        fi <- sprintf("%s/results/%s_fit.info_cforest.Rdata", flock_run_dir, targetID)
454
        sum.file <- sprintf("%s/results/%s_ATLANTIS_Summary.Rdata", flock_run_dir, targetID)
455
        if(file.exists(fi) && file.exists(sum.file)) {
456
          load(fi, envir=e)
457
          PlotATLANTISresults(e$fit.info, sum.file,
458
            NA, NULL, annTable.file)
459
        } else {
460
          # write something to file to indicate target not present
461
        }
462
      }
463
    }
464
465
    pdf(file.path(flock_run_dir, 'results/summary.pdf'), w=11*1.5, h=8.5*1.5)
466
    pardefault <- par()
467
    grid.table(head(qualPerModel[order(qualPerModel$Rsquared, decreasing=T),],20))
468
    par(pardefault)
469
    add.atlantis.plots(c("GS_BRAF", "GS_KRAS", "GS_CTNNB1", "GS_ESR1"))
470
    par(pardefault)
471
    add.atlantis.plots(randTop4)
472
    par(pardefault)
473
    add.atlantis.plots(randBad4)
474
    par(pardefault)
475
    if(length(null.distribution.values) > 0) {
476
      hist(null.distribution.values)
477
    }
478
    par(pardefault)
479
    dev.off()
480
  }
481
}
482
483
# given a feature ID (a compound string that includes a feature type and a feature name),
484
# returns the feature name
485
getFeatureNameFromFeatureID <- function(ID) {
486
  res <- strsplit(ID, "[_:]")[[1]]
487
  if (length(res) < 2)
488
    NULL
489
  else
490
    res[[2]]
491
}