Diff of /benchmark.R [000000] .. [835948]

Switch to unified view

a b/benchmark.R
1
SUBTYPES.DATA = list(
2
  list(name='aml', only.primary=F, is.rna.seq=T, is.mirna.seq=T, display.name='AML'),
3
  list(name='breast', only.primary=T, is.rna.seq=T, is.mirna.seq=T, display.name='BIC'),
4
  list(name='colon', only.primary=T, is.rna.seq=T, is.mirna.seq=T, display.name='COAD'),
5
  list(name='gbm', only.primary=T, is.rna.seq=F, is.mirna.seq=F, display.name='GBM'),
6
  list(name='kidney', only.primary=T, is.rna.seq=T, is.mirna.seq=T, display.name='KIRC'),
7
  list(name='liver', only.primary=T, is.rna.seq=T, is.mirna.seq=T, display.name='LIHC'),
8
  list(name='lung', only.primary=T, is.rna.seq=T, is.mirna.seq=T, display.name='LUSC'),
9
  list(name='melanoma', only.primary=F, is.rna.seq=T, is.mirna.seq=T, display.name='SKCM'),
10
  list(name='ovarian', only.primary=T, is.rna.seq=T, is.mirna.seq=T, display.name='OV'),
11
  list(name='sarcoma', only.primary=T, is.rna.seq=T, is.mirna.seq=T, display.name='SARC'))
12
13
MAX.NUM.CLUSTERS = 15
14
15
OMIC.SUBSETS = list('multi_omics', 'exp', 'methy', 'mirna')
16
names(OMIC.SUBSETS) = c('all', '1', '2', '3')
17
18
get.clustering.results.dir.path <- function() {
19
  return('RESULTS_DIR_PATH')
20
}
21
22
get.plots.dir.path <- function() {
23
  results.dir.path = get.clustering.results.dir.path()
24
  return(file.path(results.dir.path, 'plots'))
25
}
26
27
get.tables.dir.path <- function() {
28
  results.dir.path = get.clustering.results.dir.path()
29
  return(file.path(results.dir.path, 'tables'))
30
}
31
32
subtype.to.display.name <- function(subtype) {
33
  for (i in 1:length(SUBTYPES.DATA)) {
34
    if (SUBTYPES.DATA[[i]]$name == subtype) {
35
      return(SUBTYPES.DATA[[i]]$display.name)
36
    }
37
  }
38
}
39
40
set.omics.list.attr <- function(subtype.raw.data, subtype.data) {
41
  attr(subtype.raw.data[[1]], 'is.seq') = subtype.data$is.rna.seq
42
  attr(subtype.raw.data[[2]], 'is.seq') = F
43
  attr(subtype.raw.data[[3]], 'is.seq') = subtype.data$is.mirna.seq
44
  return(subtype.raw.data)
45
}
46
47
ALGORITHM.NAMES = c('kmeans', 'spectral', 'lracluster', 'pins', 'snf', 'mkl', 
48
                     'mcca', 'nmf', 'iCluster')
49
ALGORITHM.DISPLAY.NAMES = as.list(c('K-means', 'Spectral', 'LRAcluster', 'PINS', 
50
                           'SNF', 'rMKL-LPP', 'MCCA', 'MultiNMF', 'iClusterBayes'))
51
names(ALGORITHM.DISPLAY.NAMES) = ALGORITHM.NAMES
52
            
53
print.matrix.latex.format <- function(mat) {
54
  print(do.call(paste, as.list(c(colnames(mat), sep=' & '))))
55
  for (i in 1:nrow(mat)) {
56
    print(do.call(paste, as.list(c(rownames(mat)[i], round(mat[i,], digits=2), sep=' & '))))
57
  }
58
}
59
60
61
perform.all.analyses <- function(benchmark.ret) {
62
  par(mar=rep(1, 4))
63
  for (i in 1:4) {
64
    
65
    cur.func = list(benchmark.omics.time, benchmark.omics.num.clusters,
66
      benchmark.omics.surv, benchmark.omics.clinical)[[i]]
67
    for (omic.subset in names(OMIC.SUBSETS)) {
68
      print(paste('current omic subset ', omic.subset))
69
      benchmark.data = cur.func(benchmark.ret, omic.subset)
70
71
      displayed.benchmark.data = benchmark.data
72
      colnames(displayed.benchmark.data)[1:ncol(displayed.benchmark.data)] = 
73
        sapply(as.list(colnames(displayed.benchmark.data)[1:ncol(displayed.benchmark.data)]), 
74
    subtype.to.display.name)
75
      rownames(displayed.benchmark.data) = unlist(ALGORITHM.DISPLAY.NAMES[rownames(displayed.benchmark.data)])
76
      print.matrix.latex.format(displayed.benchmark.data)
77
      
78
      table.name = c('runtime', 'num_cluster', 'survival', 'clinical')[i]
79
      write.csv(displayed.benchmark.data, file=file.path(get.tables.dir.path(), paste0(table.name, '_', OMIC.SUBSETS[[omic.subset]], '.csv')))
80
    }
81
    
82
    
83
    print('------------------------')
84
  }
85
  
86
  # plots that include all datasets
87
  for (i in 1:4) {
88
    omic.subset = names(OMIC.SUBSETS)[[i]]
89
    benchmark.surv = benchmark.omics.surv(benchmark.ret, omic.subset)
90
    benchmark.clinical = benchmark.omics.clinical(benchmark.ret, omic.subset)
91
    plot.name = list('multi_omics_surv_clinical.tiff', 'exp_surv_clinical.tiff', 
92
      'methy_surv_clinical.tiff', 'mirna_surv_clinical.tiff')[[i]]
93
    create.clinical.survival.plots(benchmark.surv, benchmark.clinical, plot.name)
94
  }
95
  
96
  
97
  # plots for the mean behaviour
98
  create.mean.clinical.survival.plot(benchmark.ret)
99
}
100
101
get.best.single.omic.mat <- function(single.omic.list1, single.omic.list2) {
102
  best.mat1 = matrix(0, ncol=ncol(single.omic.list1[[1]]), 
103
                            nrow=nrow(single.omic.list1[[1]]))
104
  best.mat2 = matrix(0, ncol=ncol(single.omic.list1[[1]]), 
105
                            nrow=nrow(single.omic.list1[[1]]))
106
  for (i in 1:nrow(best.mat1)) {
107
    for (j in 1:ncol(best.mat1)) {
108
      values1 = sapply(single.omic.list1, function(mat) mat[i, j])
109
      values2 = sapply(single.omic.list2, function(mat) mat[i, j])
110
      if (any(is.na(values1))) {
111
        best.mat1[i, j] = NA
112
    best.mat2[i, j] = NA
113
      } else {
114
        best.omic = which.max(values1)
115
        best.mat1[i, j] = values1[best.omic]
116
    best.mat2[i, j] = values2[best.omic]
117
      }
118
    }
119
  }
120
  return(list(best.mat1, best.mat2))
121
}
122
123
create.mean.clinical.survival.plot <- function(benchmark.ret) {
124
  tiff(file.path(get.plots.dir.path(), 'mean_surv_clinical.tiff'), width=4500, height=2250, res=300)
125
  layout(matrix(c(1, 1, 2, 2, 3, 3, 4, 4, 4, 5, 5, 5), nrow=2, byrow=T))
126
  single.omic.surv.benchmarks = list()
127
  single.omic.clin.benchmarks = list()
128
  for (i in 2:6) {
129
    omic.subset = c(names(OMIC.SUBSETS), 'best_surv', 'best_clin')[i]
130
    alg.cols = c("#8DD3C7", "#BEBADA", "#FB8072", "#80B1D3", "#FDB462", "#B3DE69", "#FCCDE5", "#BC80BD", "#842121")
131
    pch = c(rep(15, 3), rep(3, 3), rep(19, 3))
132
    
133
    if (i <= 4) {
134
      benchmark.surv = benchmark.omics.surv(benchmark.ret, omic.subset)
135
      benchmark.clinical = benchmark.omics.clinical(benchmark.ret, omic.subset)
136
    
137
      if (i %in% 2:4) {
138
        single.omic.surv.benchmarks[[i - 1]] = benchmark.surv
139
        single.omic.clin.benchmarks[[i - 1]] = benchmark.clinical
140
      }
141
    
142
    } else if (i == 5) {
143
      best.mats = get.best.single.omic.mat(single.omic.surv.benchmarks, single.omic.clin.benchmarks)
144
      benchmark.surv = best.mats[[1]]
145
      benchmark.clinical = best.mats[[2]]
146
147
    } else {
148
      best.mats = get.best.single.omic.mat(single.omic.clin.benchmarks, single.omic.surv.benchmarks)
149
      benchmark.surv = best.mats[[2]]
150
      benchmark.clinical = best.mats[[1]]
151
    }
152
    
153
    surv.sum = rowSums(benchmark.surv, na.rm=T)
154
    clin.sum = rowSums(benchmark.clinical, na.rm=T)
155
    
156
    # for mcca, the sum is for an empty vector, and is equal 0, so we remove this
157
    available.indices = surv.sum != 0
158
    surv.sum = surv.sum[available.indices]
159
    clin.sum = clin.sum[available.indices]
160
    alg.cols = alg.cols[available.indices]
161
    pch = pch[available.indices]
162
    
163
    
164
    if (omic.subset == '1') {
165
      xlab = '-log10(logrank pvalue)'
166
      ylab = '# enriched clinical parameters'
167
    } else {
168
      xlab = ''
169
      ylab = ''
170
    }
171
    
172
    if (i %in% c(1, 5, 6)) {
173
      y.min = min(clin.sum) - 1
174
      x.min = min(surv.sum) - 1
175
    } else {
176
      y.min = 0
177
      x.min = 0
178
    }
179
    
180
    
181
    subplot.name = c('Multi-omics', 'Gene Expression', 'DNA Methylation', 'miRNA Expression',
182
                     'Best single-omic (survival)', 'Best single-omic (clinical)')[i]
183
    plot(surv.sum, clin.sum, main=subplot.name, xlab=xlab, ylab=ylab,
184
         xlim=c(x.min, max(surv.sum) + 1), ylim=c(y.min, max(clin.sum + 1)), col=alg.cols, pch=pch, cex.lab=1.8, cex=4, cex.axis=1.5, cex.main=2.5, lwd=4)
185
  }
186
  dev.off()
187
}
188
189
create.clinical.survival.plots <- function(benchmark.surv, benchmark.clinical, plot.name) {
190
  num.subtypes = ncol(benchmark.surv)
191
  tiff(file.path(get.plots.dir.path(), plot.name), width=4500, height=1875, res=300)
192
  alg.cols = c("#8DD3C7", "#BEBADA", "#FB8072", "#80B1D3", "#FDB462", "#B3DE69", "#FCCDE5", "#BC80BD", "#842121")
193
  pch = c(rep(15, 3), rep(3, 3), rep(19, 3))
194
195
  par(mfrow=c(2, num.subtypes / 2), mar=c(4.1, 4, 3.2, 1))
196
  for (i in 1:num.subtypes) {
197
    subtype = colnames(benchmark.surv)[i]
198
    subtype.surv = benchmark.surv[,subtype]
199
    subtype.clinical = benchmark.clinical[,subtype]
200
    
201
    available.indices = !is.na(subtype.surv)
202
    subtype.surv = subtype.surv[available.indices]
203
    subtype.clinical = subtype.clinical[available.indices]
204
    current.cols = alg.cols[available.indices]
205
    current.pch = pch[available.indices]
206
    
207
    surv.significance = -log10(0.05)
208
    
209
    if (i == 1) {
210
      xlab = '-log10(logrank pvalue)'
211
      ylab = '# enriched clinical parameters'
212
    } else {
213
      xlab = ''
214
      ylab = ''
215
    }
216
    
217
    plot(subtype.surv, subtype.clinical, main=subtype.to.display.name(subtype), xlab=xlab, ylab=ylab,
218
         xlim=c(0, max(subtype.surv, surv.significance) + 0.2), ylim=c(0, max(subtype.clinical + 1)), col=current.cols, pch=current.pch, cex.lab=1.4, cex=2.4, cex.axis=1.5, cex.main=1.8, lwd=3)
219
    abline(v=surv.significance, col='red')
220
  }
221
  
222
  dev.off()
223
}
224
225
plot.legend <- function() {
226
  alg.cols = c("#8DD3C7", "#BEBADA", "#FB8072", "#80B1D3", "#FDB462", "#B3DE69", "#FCCDE5", "#BC80BD", "#842121")
227
  pch = c(rep(15, 3), rep(3, 3), rep(19, 3))
228
  lwds = c(rep(NA, 3), rep(3, 3), rep(NA, 3))
229
  pt.cexs = 1.8
230
  algorithm.names = ALGORITHM.DISPLAY.NAMES
231
  algorithm.names[length(algorithm.names)] = paste0(algorithm.names[length(algorithm.names)], ' ')
232
  width=4200
233
  
234
  if (F) {
235
    alg.cols = alg.cols[-7]
236
    pch = pch[-7]
237
    lwds = lwds[-7]
238
    algorithm.names = algorithm.names[-7]
239
    pt.cexs = pt.cexs[-7]
240
    width=3800
241
  }
242
  
243
  tiff(file.path(get.plots.dir.path(), 'legend.tif'), width=width, res=300)
244
  par(font=2, mar=rep(1, 4))
245
  plot(0,xaxt='n',yaxt='n',bty='n',pch='',ylab='',xlab='')
246
  legend(0.58, 1, legend=algorithm.names, col=alg.cols, pch=pch, horiz=T, pt.cex=pt.cexs, pt.lwd=lwds)
247
  dev.off()
248
}
249
250
analyze.benchmark <- function() {
251
  all.clusterings = list()
252
  all.timings = list()
253
  for (i in 1:length(SUBTYPES.DATA)) {
254
    current.subtype.data = SUBTYPES.DATA[[i]]
255
    subtype = current.subtype.data$name
256
    subtype.raw.data = get.raw.data(subtype, 
257
                                    only.primary=current.subtype.data$only.primary)
258
                    
259
    all.clusterings[[subtype]] = list()
260
    all.timings[[subtype]] = list()
261
    
262
    for (algorithm.name in ALGORITHM.NAMES) {
263
      all.clusterings[[subtype]][[algorithm.name]] = list()
264
      all.timings[[subtype]][[algorithm.name]] = list()
265
      for (j in c('all', '1', '2', '3')) {
266
        clustering.path = file.path(get.clustering.results.dir.path(),
267
                                  paste(subtype, algorithm.name, j, sep='_'))
268
        timing.path = file.path(get.clustering.results.dir.path(),
269
                                  paste(subtype, algorithm.name, j, 'timing', sep='_'))
270
    load(clustering.path)
271
    load(timing.path)
272
    if (!any(is.na(clustering))) {
273
      names(clustering) = colnames(subtype.raw.data[[1]])
274
    }
275
    
276
    all.clusterings[[subtype]][[algorithm.name]][[j]] = clustering
277
    all.timings[[subtype]][[algorithm.name]][[j]] = timing
278
      }
279
    }
280
  }
281
  return(list(all.clusterings=all.clusterings, all.timings=all.timings))
282
}
283
284
check.empirical.surv <- function(old.pvals, new.pvals) {
285
  is.in.conf = matrix(0, ncol=ncol(old.pvals), nrow=nrow(old.pvals))
286
  for (i in 1:nrow(old.pvals)) {
287
    for (j in 1:ncol(old.pvals)) {
288
      old.pval = old.pvals[i, j]
289
      if (is.na(old.pval)) {
290
        is.in.conf[i, j] = NA
291
    next
292
      }
293
      if (old.pval == 1e-10) old.pval = 1e-5
294
      num.runs = floor(30 / old.pval)
295
      new.pval = new.pvals[i, j]
296
      num.success = round(new.pval * num.runs)
297
      print(c(num.success, num.runs))
298
      
299
      conf.int = binom.test(num.success, num.runs)$conf.int
300
      in.conf.int = old.pval >= conf.int[1]  & old.pval <= conf.int[2]
301
      is.in.conf[i, j] = in.conf.int
302
    }
303
  }
304
  return(is.in.conf)
305
}
306
307
get.empirical.surv <- function(clustering, subtype) {
308
  set.seed(42)
309
  surv.ret = check.survival(clustering, subtype)
310
  orig.chisq = surv.ret$chisq
311
  orig.pvalue = get.logrank.pvalue(surv.ret)
312
  # The initial number of permutations to run
313
  num.perms = round(min(max(10 / orig.pvalue, 1000), 1e6))
314
  should.continue = T
315
  
316
  total.num.perms = 0
317
  total.num.extreme.chisq = 0
318
  
319
  while (should.continue) {
320
    print('Another iteration in empirical survival calculation')
321
    print(num.perms)
322
    perm.chisq = as.numeric(mclapply(1:num.perms, function(i) {
323
      cur.clustering = sample(clustering)
324
      names(cur.clustering) = names(clustering)
325
      cur.chisq = check.survival(cur.clustering, subtype)$chisq
326
      return(cur.chisq)
327
    }, mc.cores=50))
328
    
329
    total.num.perms = total.num.perms + num.perms
330
    total.num.extreme.chisq = total.num.extreme.chisq + sum(perm.chisq >= orig.chisq)
331
    
332
    binom.ret = binom.test(total.num.extreme.chisq, total.num.perms)
333
    cur.pvalue = binom.ret$estimate
334
    cur.conf.int = binom.ret$conf.int
335
    
336
    print(c(total.num.extreme.chisq, total.num.perms))
337
    print(cur.pvalue)
338
    print(cur.conf.int)
339
    
340
    sig.threshold = 0.05
341
    is.conf.small = ((cur.conf.int[2] - cur.pvalue) < min(cur.pvalue / 10, 0.01)) & ((cur.pvalue - cur.conf.int[1]) < min(cur.pvalue / 10, 0.01))
342
    is.threshold.in.conf = cur.conf.int[1] < sig.threshold & cur.conf.int[2] > sig.threshold
343
    if ((is.conf.small & !is.threshold.in.conf) | (total.num.perms > 2e7)) {
344
    #if (is.conf.small) {
345
      should.continue = F
346
    } else {
347
      num.perms = 1e5
348
    }
349
  }
350
  
351
  return(list(pvalue = cur.pvalue, conf.int = cur.conf.int, total.num.perms=total.num.perms, 
352
              total.num.extreme.chisq=total.num.extreme.chisq))
353
}
354
355
benchmark.omics.surv <- function(benchmark.results, omics='all') {
356
357
  
358
  all.surv.pvalues = matrix(1, ncol=length(SUBTYPES.DATA), nrow=length(ALGORITHM.NAMES))
359
  rownames(all.surv.pvalues) = ALGORITHM.NAMES
360
  colnames(all.surv.pvalues) = sapply(SUBTYPES.DATA, function(x) x$name)
361
  all.clusterings = benchmark.results$all.clusterings
362
  for (i in 1:length(all.clusterings)) {
363
    subtype = colnames(all.surv.pvalues)[i]
364
    subtype.clusterings = all.clusterings[[subtype]]
365
    for (j in 1:length(subtype.clusterings)) {
366
      surv.path = file.path(get.clustering.results.dir.path(),
367
                                  paste(subtype, ALGORITHM.NAMES[j], omics, 'surv', sep='_'))
368
      if (file.exists(surv.path)) {
369
        load(surv.path)
370
    pvalue = empirical.surv.ret$pvalue
371
    
372
      } else {
373
        clustering = subtype.clusterings[[ALGORITHM.NAMES[j]]][[omics]]
374
        if (length(table(clustering)) > 1) {
375
          #pvalue = -log10(get.logrank.pvalue(check.survival(clustering, subtype)))
376
      empirical.surv.ret = get.empirical.surv(clustering, subtype)
377
      save(empirical.surv.ret, file=surv.path)
378
      pvalue = empirical.surv.ret$pvalue
379
      
380
381
        } else {
382
          pvalue = NA
383
        }  
384
      }
385
      all.surv.pvalues[j, i] = -log10(pvalue)
386
    }
387
  }
388
389
  return(all.surv.pvalues)
390
}
391
392
benchmark.omics.num.clusters <- function(benchmark.results, omics='all') {
393
  num.clusters = matrix(1, ncol=length(SUBTYPES.DATA), nrow=length(ALGORITHM.NAMES))
394
  rownames(num.clusters) = ALGORITHM.NAMES
395
  colnames(num.clusters) = sapply(SUBTYPES.DATA, function(x) x$name)
396
  all.clusterings = benchmark.results$all.clusterings
397
  for (i in 1:length(all.clusterings)) {
398
    subtype = colnames(num.clusters)[i]
399
    subtype.clusterings = all.clusterings[[subtype]]
400
    for (j in 1:length(subtype.clusterings)) {
401
      clustering = subtype.clusterings[[ALGORITHM.NAMES[j]]][[omics]]
402
      num.clusters[j, i] = max(clustering)
403
    }
404
  }
405
  return(num.clusters)
406
}
407
408
benchmark.omics.clinical <- function(benchmark.results, omics='all') {
409
                  
410
  num.clinical.enrich = matrix(1, ncol=length(SUBTYPES.DATA), nrow=length(ALGORITHM.NAMES))
411
  rownames(num.clinical.enrich) = ALGORITHM.NAMES
412
  colnames(num.clinical.enrich) = sapply(SUBTYPES.DATA, function(x) x$name)
413
  total.num.tested.parameters = 0
414
  all.clusterings = benchmark.results$all.clusterings
415
  for (i in 1:length(all.clusterings)) {
416
    subtype = colnames(num.clinical.enrich)[i]
417
    subtype.clusterings = all.clusterings[[subtype]]
418
    for (j in 1:length(subtype.clusterings)) {
419
      print('checking clinical enrichment')
420
      print(c(i, j))
421
      
422
      
423
      clustering = subtype.clusterings[[ALGORITHM.NAMES[j]]][[omics]]
424
      if (any(is.na(clustering))) {
425
        num.clinical.enrich[j, i] = NA
426
      } else {
427
        clin.path = file.path(get.clustering.results.dir.path(),
428
                                  paste(subtype, ALGORITHM.NAMES[j], omics, 'clin', sep='_'))
429
                  
430
        if (file.exists(clin.path)) {
431
      load(clin.path)
432
    } else {
433
      enrichment.pvalues = check.clinical.enrichment(clustering, subtype)
434
      save(enrichment.pvalues, file=clin.path)
435
    }
436
        
437
    if (j == 1) {
438
      total.num.tested.parameters = total.num.tested.parameters + length(enrichment.pvalues)
439
    }
440
        num.clinical.enrich[j, i] = sum(enrichment.pvalues * length(enrichment.pvalues) < 0.05)
441
      }
442
    }
443
  }
444
  print(paste0('Total number of parameters tested:', total.num.tested.parameters))
445
  return(num.clinical.enrich)
446
}
447
448
benchmark.omics.time <- function(benchmark.results, omics='all') {
449
  all.alg.times = matrix(1, ncol=length(SUBTYPES.DATA), nrow=length(ALGORITHM.NAMES))
450
  rownames(all.alg.times) = ALGORITHM.NAMES
451
  colnames(all.alg.times) = sapply(SUBTYPES.DATA, function(x) x$name)
452
  all.timings = benchmark.results$all.timings
453
  for (i in 1:length(all.timings)) {
454
    subtype = colnames(all.alg.times)[i]
455
    subtype.timings = all.timings[[subtype]]
456
    for (j in 1:length(subtype.timings)) {
457
      timing = subtype.timings[[ALGORITHM.NAMES[j]]][[omics]]
458
      all.alg.times[j, i] = timing
459
    }
460
  }
461
  return(all.alg.times)
462
}
463
464
benchmark.alg.agreement <- function(benchmark.results, omics='all') {
465
  all.clusterings = benchmark.results$all.clusterings
466
  rand.matrix = matrix(NA, ncol=length(ALGORITHM.NAMES), nrow=length(ALGORITHM.NAMES))
467
  colnames(rand.matrix) = ALGORITHM.NAMES
468
  rownames(rand.matrix) = ALGORITHM.NAMES
469
  for (alg1.index in 1:length(ALGORITHM.NAMES)) {
470
    alg1 = ALGORITHM.NAMES[alg1.index]
471
    for (alg2.index in 1:length(ALGORITHM.NAMES)) {
472
      alg2 = ALGORITHM.NAMES[alg2.index]
473
      rands = c()
474
      for (i in 1:length(SUBTYPES.DATA)) {
475
        clustering1 = all.clusterings[[i]][[alg1]][[omics]]
476
    clustering2 = all.clusterings[[i]][[alg2]][[omics]]
477
    rands = c(rands, adj.rand.index(clustering1, clustering2))
478
      }
479
      mean.rand = mean(rands)
480
      rand.matrix[alg1.index, alg2.index] = mean.rand
481
    }
482
  }
483
  return(rand.matrix)
484
}
485
486
get.logrank.pvalue <- function(survdiff.res) {
487
  1 - pchisq(survdiff.res$chisq, length(survdiff.res$n) - 1)  
488
}
489
490
run.benchmark <- function() {
491
  for (i in 1:length(SUBTYPES.DATA)) {
492
    current.subtype.data = SUBTYPES.DATA[[i]]
493
    subtype = current.subtype.data$name
494
    subtype.raw.data = get.raw.data(subtype, 
495
                                    only.primary=current.subtype.data$only.primary)
496
    
497
    subtype.raw.data = set.omics.list.attr(subtype.raw.data, 
498
                                           current.subtype.data)
499
    
500
    for (algorithm.name in ALGORITHM.NAMES) {
501
      for (j in c('all', '1', '2', '3')) {
502
        set.seed(42)
503
        print(paste('subtype', subtype, 'running algorithm', algorithm.name, j))
504
        clustering.path = file.path(get.clustering.results.dir.path(),
505
                                  paste(subtype, algorithm.name, j, sep='_'))
506
        timing.path = file.path(get.clustering.results.dir.path(),
507
                                  paste(subtype, algorithm.name, j, 'timing', sep='_'))
508
    
509
    
510
        if (!file.exists(clustering.path)) {
511
          algorithm.func.name = paste0('run.', algorithm.name)
512
          algorithm.func = get(algorithm.func.name)
513
      if (j == 'all') {
514
        cur.iteration.data = subtype.raw.data
515
      } else {
516
        cur.iteration.data = subtype.raw.data[as.numeric(j)]
517
      }
518
          algorithm.ret = algorithm.func(cur.iteration.data, current.subtype.data)
519
          clustering = algorithm.ret$clustering
520
          timing = algorithm.ret$timing
521
      print('before saving')
522
          save(clustering, file = clustering.path)
523
          save(timing, file = timing.path)
524
        }
525
      }
526
    }
527
  }
528
}
529
530
get.dataset.dir.path <- function() {
531
  return('DATASETS_PATH')
532
}
533
534
log.and.normalize <- function(omics.data, subtype.data, normalize=T,
535
                              filter.var=F) {
536
  # filter features with no variance at all
537
  for (i in 1:length(omics.data)) {
538
    omics.data[[i]] = omics.data[[i]][apply(omics.data[[i]], 1, var) > 0,]
539
  }
540
                  
541
  for (i in 1:length(omics.data)) {
542
    if (attr(omics.data[[i]], 'is.seq')) {
543
      omics.data[[i]] = log(1+omics.data[[i]])
544
    }
545
  }
546
  
547
  if (filter.var) {
548
    omics.data = lapply(omics.data, keep.high.var.features)
549
  }
550
  
551
  if (normalize) {
552
    omics.data = lapply(omics.data, normalize.matrix)    
553
  }
554
  
555
  return(omics.data)
556
}
557
558
normalize.matrix <- function(data.matrix) {
559
  temp = data.matrix - rowMeans(data.matrix)
560
  should.keep = (apply(temp, 1, sd) != 0)
561
  return ((temp / apply(temp, 1, sd))[should.keep, ])
562
}
563
564
filter.non.tumor.samples <- function(raw.datum, only.primary=only.primary) {
565
  # 01 is primary, 06 is metastatic, 03 is blood derived cancer
566
  if (!only.primary)
567
    return(raw.datum[,substring(colnames(raw.datum), 14, 15) %in% c('01', '03', '06')])
568
  else
569
    return(raw.datum[,substring(colnames(raw.datum), 14, 15) %in% c('01')])
570
}
571
572
get.fixed.names <- function(patient.names, include.type=F) {
573
  # fix the TCGA names to only include the patient ids
574
  if (include.type) {
575
    return(gsub('-', '\\.', toupper(substring(patient.names, 1, 15))))
576
  } else {
577
    return(gsub('-', '\\.', toupper(substring(patient.names, 1, 12))))  
578
  }
579
}
580
581
fix.patient.names <- function(subtype.raw.data, include.type=F) {
582
  for (i in 1:length(subtype.raw.data)) {
583
    colnames(subtype.raw.data[[i]]) = get.fixed.names(colnames(subtype.raw.data[[i]]),
584
                                                      include.type)
585
  }
586
  return(subtype.raw.data)
587
}
588
589
get.raw.data <- function(subtype.name,
590
                         datasets.path = get.dataset.dir.path(),
591
                         only.primary=NA) {
592
  omics.dir = file.path(datasets.path, subtype.name)
593
  omics.files = list.files(omics.dir)
594
  omics.files = setdiff(omics.files, c('survival'))  
595
  raw.data = lapply(file.path(omics.dir, omics.files), read.table)
596
  
597
  if (!is.na(only.primary)) {
598
    raw.data = lapply(raw.data, function(x) filter.non.tumor.samples(x, only.primary = only.primary))
599
  }
600
  name.corrected.data = fix.patient.names(raw.data)
601
  patients.intersection = Reduce(intersect, lapply(name.corrected.data, colnames))
602
  ret.data = lapply(name.corrected.data, function(datum) datum[,patients.intersection])  
603
  return(ret.data)
604
}
605
606
get.elbow <- function(values, is.max) {
607
  second.derivatives = c()
608
  for (i in 2:(length(values) - 1)) {
609
    second.derivative = values[i + 1] + values[i - 1] - 2 * values[i]
610
    second.derivatives = c(second.derivatives, second.derivative)
611
  }
612
  print(second.derivatives)
613
  if (is.max) {
614
    return(which.max(second.derivatives) + 1)
615
  } else {
616
    return(which.min(second.derivatives) + 1)
617
  }
618
}
619
620
# Does not support a single omic dataset
621
run.mcca <- function(omics.list, subtype.data) {
622
  if (length(omics.list) == 1) {
623
    return(list(clustering=rep(NA, ncol(omics.list[[1]])), timing=1))
624
  }
625
  start = Sys.time()
626
  omics.list = log.and.normalize(omics.list, subtype.data, 
627
                                 normalize = T,
628
                                 filter.var = T)
629
  
630
  subtype = subtype.data$name
631
  omics.transposed = lapply(omics.list, t)
632
  cca.ret = PMA::MultiCCA(omics.transposed, 
633
                          ncomponents = MAX.NUM.CLUSTERS)
634
  sample.rep = omics.transposed[[1]] %*% cca.ret$ws[[1]]
635
  
636
  explained.vars = sapply(1:MAX.NUM.CLUSTERS, 
637
              function(i) sum(unlist(apply(sample.rep[1:i,,drop=F], 2, var))))
638
  
639
  dimension = get.elbow(explained.vars, is.max=F)
640
  print(dimension)
641
  sample.rep = sample.rep[,1:dimension]
642
  sils = c()
643
  clustering.per.num.clusters = list()
644
  for (num.clusters in 2:MAX.NUM.CLUSTERS) {
645
    cur.clustering = kmeans(sample.rep, num.clusters, iter.max=100, nstart=30)$cluster  
646
    sil = get.clustering.silhouette(list(t(sample.rep)), cur.clustering)
647
    sils = c(sils, sil)
648
    clustering.per.num.clusters[[num.clusters - 1]] = cur.clustering
649
  }
650
  # NOTE: the next line contains an error. We mistakenly selected the minimal rather maximal silhouette.
651
  # See more details in: http://acgt.cs.tau.ac.il/multi_omic_benchmark/download.html.
652
  cca.clustering = clustering.per.num.clusters[[which.min(sils)]]
653
  time.taken = as.numeric(Sys.time() - start, units='secs')
654
  return(list(clustering=cca.clustering, timing=time.taken))
655
}
656
657
run.snf <- function(omics.list, subtype.data) {
658
  start = Sys.time()
659
  omics.list = log.and.normalize(omics.list, subtype.data)
660
  subtype = subtype.data$name
661
  alpha=0.5
662
  T.val=30
663
  num.neighbors = round(ncol(omics.list[[1]]) / 10)
664
  similarity.data = lapply(omics.list, function(x) {affinityMatrix(dist2(as.matrix(t(x)),as.matrix(t(x))), 
665
                                                                   num.neighbors, alpha)})
666
  if (length(similarity.data) == 1) {
667
    W = similarity.data[[1]]
668
  } else {
669
    W = SNF(similarity.data, num.neighbors, T.val)  
670
  }
671
  
672
  num.clusters = estimateNumberOfClustersGivenGraph(W, 2:MAX.NUM.CLUSTERS)[[3]]  
673
  clustering = spectralClustering(W, num.clusters)
674
  time.taken = as.numeric(Sys.time() - start, units='secs')
675
  return(list(clustering=clustering, timing=time.taken))
676
}
677
678
run.iCluster <- function(omics.list, subtype.data) {
679
  omics.list = log.and.normalize(omics.list, subtype.data, normalize = F)
680
681
  start = Sys.time()
682
  subtype = subtype.data$name
683
  dev.ratios = c()
684
  icluster.rets = list()
685
686
  if (length(omics.list) == 1) {
687
    icluster.ret = iClusterPlus::tune.iClusterBayes(cpus=(MAX.NUM.CLUSTERS - 1), t(omics.list[[1]]), 
688
                              K=1:(MAX.NUM.CLUSTERS - 1), type=c('gaussian'))$fit
689
  } else {
690
    icluster.ret = iClusterPlus::tune.iClusterBayes(cpus=(MAX.NUM.CLUSTERS - 1), t(omics.list[[1]]), 
691
                              t(omics.list[[2]]), 
692
                              t(omics.list[[3]]), 
693
                              K=1:(MAX.NUM.CLUSTERS - 1), type=rep('gaussian', 3))$fit
694
  }
695
  dev.ratios = lapply(1:(MAX.NUM.CLUSTERS - 1), function(i) icluster.ret[[i]]$dev.ratio)
696
697
  print('dev.ratios are:')
698
  print(dev.ratios)
699
  
700
  optimal.solution = icluster.ret[[which.max(dev.ratios)]]
701
  time.taken = as.numeric(Sys.time() - start, units='secs')
702
  return(list(clustering=optimal.solution$clusters, 
703
              timing=time.taken))
704
}
705
706
get.mkl.binary.path = function() {
707
  return('MKL_BINARY_PATH')
708
}
709
710
get.mkl.arguments.path = function() {
711
  return('MKL_ARGS_PATH')
712
}
713
714
715
run.mkl <- function(omics.list, subtype.data) {
716
  start = Sys.time()
717
  omics.list = log.and.normalize(omics.list, subtype.data)
718
  subtype = subtype.data$name
719
  omics.list = lapply(omics.list, normalize.matrix)
720
  time.taken = as.numeric(Sys.time() - start, units='secs')
721
  export.subtype.to.mkl(omics.list, subtype)
722
  
723
  start = Sys.time()
724
  bin.path = get.mkl.binary.path()
725
  subtype.dir = paste0(get.mkl.arguments.path(), subtype, '\\')
726
  paste0(subtype.dir, 'kernels')
727
  command = paste(bin.path, paste0(subtype.dir, 'kernels'),
728
                  paste0(subtype.dir, 'ids'),
729
                  paste0(subtype.dir, 'output'), 
730
                  '9', '5')
731
  command.return = system(command)
732
  stopifnot(command.return == 0)
733
  time.taken2 = as.numeric(Sys.time() - start, units='secs')
734
  clustering = get.mkl.clustering(subtype)
735
  return(list(clustering=clustering, 
736
              timing=time.taken + time.taken2))
737
}
738
739
run.nmf <- function(omics.list, subtype.data) {
740
  total.time.taken = 0
741
  start = Sys.time()
742
  omics.list = log.and.normalize(omics.list, subtype.data,
743
                                 filter.var = T, normalize = F)
744
  time.taken = as.numeric(Sys.time() - start, units='secs')
745
  total.time.taken = total.time.taken + time.taken
746
  
747
  save.subtype.matlab.format(omics.list)
748
  subtype = subtype.data$name
749
  if (length(omics.list) > 1) {
750
    command.ret = system('MULTI_NMF_COMMAND')
751
    stopifnot(command.ret == 0)
752
    nmf.timing = read.csv('SOME_TEMP_PATH_TIMING', header=F)[1, 1]
753
    total.time.taken = total.time.taken + nmf.timing
754
  } else {
755
   for (k in 1:MAX.NUM.CLUSTERS) {
756
     start = Sys.time()
757
     file.name = paste0('SOME_TEMP_PATH/', k, '_consensus')
758
     nmf.ret = nmf(omics.list[[1]], k, method='lee')
759
     coef.mat = t(coef(nmf.ret))
760
     time.taken = as.numeric(Sys.time() - start, units='secs')
761
     total.time.taken = total.time.taken + time.taken
762
     write.table(coef.mat, file=file.name, quote=F, row.names=F, col.names=F, sep=',')
763
   }
764
  }
765
  
766
  explained.vars = c()
767
  clustering.per.num.clusters = list()
768
  for (k in 1:MAX.NUM.CLUSTERS) {
769
    file.name = paste0('SOME_TEMP_PATH/', k, '_consensus')
770
    consensus.mat = read.csv(file.name, header=F)
771
    
772
    start = Sys.time()
773
    cur.clustering = apply(consensus.mat, 1, which.max)
774
    explained.var = sum(unlist(apply(consensus.mat, 2, var)))
775
    explained.vars = c(explained.vars, explained.var)
776
    clustering.per.num.clusters[[k]] = cur.clustering
777
    time.taken = as.numeric(Sys.time() - start, units='secs')
778
    total.time.taken = total.time.taken + time.taken
779
  }
780
  
781
  dimension = get.elbow(explained.vars, is.max=F)
782
  nmf.clustering = clustering.per.num.clusters[[dimension]]
783
  return(list(clustering=nmf.clustering, timing=total.time.taken))  
784
}
785
786
run.pins <- function(omics.list, subtype.data) {
787
  start = Sys.time()
788
  omics.list = log.and.normalize(omics.list, subtype.data, normalize = F)
789
  subtype = subtype.data$name
790
  omics.transposed = lapply(omics.list, t)
791
  if (length(omics.list) == 1) {
792
    pins.ret = PINSPlus::PerturbationClustering(data=omics.transposed[[1]],
793
                                            kMax = MAX.NUM.CLUSTERS)
794
    clustering = pins.ret$cluster
795
    
796
  } else {
797
    pins.ret = PINSPlus::SubtypingOmicsData(dataList=omics.transposed,
798
                                            kMax = MAX.NUM.CLUSTERS)
799
    clustering = pins.ret$cluster2
800
  }
801
  time.taken = as.numeric(Sys.time() - start, units='secs')
802
  return(list(clustering=clustering, timing=time.taken))
803
}
804
805
run.lracluster <- function(omics.list, subtype.data) {
806
  omics.list = log.and.normalize(omics.list, subtype.data, normalize = F)
807
  
808
  subtype = subtype.data$name
809
  start = Sys.time()
810
  
811
  dim.range = 1:MAX.NUM.CLUSTERS
812
  all.clustering.results = list()
813
  
814
  omics.matrix.list = lapply(omics.list, as.matrix)
815
  for (dimension in dim.range) {
816
    print(paste('running lra cluster for dimension', dimension))
817
    data.names = c('gene expression', 'methylation', 'miRNA expression')
818
    clustering.results = LRAcluster(omics.matrix.list, 
819
                                    rep('gaussian', length(omics.list)), 
820
                                    dimension=dimension, data.names)
821
    all.clustering.results[[dimension]] = clustering.results
822
  }
823
  explained.var = sapply(all.clustering.results, function(x) x$potential)
824
  print(explained.var)
825
  dimension = get.elbow(explained.var, is.max=F)
826
  print(dimension)
827
  solution = all.clustering.results[[dimension]]$coordinate
828
  
829
  sils = c()
830
  clustering.per.num.clusters = list()
831
  for (num.clusters in 2:MAX.NUM.CLUSTERS) {
832
    print(paste('running kmeans in lra cluster for num clusters', num.clusters))
833
    cur.clustering = kmeans(t(solution), num.clusters, iter.max=100, nstart=60)$cluster
834
    sil = get.clustering.silhouette(list(solution), cur.clustering)
835
    sils = c(sils, sil)
836
    clustering.per.num.clusters[[num.clusters - 1]] = cur.clustering
837
  }
838
  print(sils)
839
  # NOTE: the next line contains an error. We mistakenly selected the minimal rather maximal silhouette.
840
  # See more details in: http://acgt.cs.tau.ac.il/multi_omic_benchmark/download.html.
841
  chosen.clustering = clustering.per.num.clusters[[which.min(sils)]]
842
  time.taken = as.numeric(Sys.time() - start, units='secs')
843
  return(list(clustering=chosen.clustering, timing=time.taken))
844
}
845
846
run.kmeans <- function(omics.list, subtype.data) {
847
  start = Sys.time()
848
  omics.list = log.and.normalize(omics.list, subtype.data, 
849
                                 filter.var = T)
850
  
851
  subtype = subtype.data$name
852
  all.withinss = c()
853
  all.clusterings = list()
854
  k.range = 1:MAX.NUM.CLUSTERS
855
  for (k in k.range) {
856
    concat.omics = do.call(rbind, omics.list)
857
    kmeans.ret = kmeans(t(concat.omics), k, iter.max=100, nstart=60)
858
    all.withinss = c(all.withinss, kmeans.ret$tot.withinss)
859
    all.clusterings[[k]] = kmeans.ret$cluster
860
  }
861
  
862
  best.k = get.elbow(all.withinss, is.max=T)
863
  time.taken = as.numeric(Sys.time() - start, units='secs')
864
  return(list(clustering=all.clusterings[[best.k]], 
865
              timing=time.taken))
866
}
867
868
run.spectral <- function(omics.list, subtype.data) {
869
  start = Sys.time()
870
  omics.list = log.and.normalize(omics.list, subtype.data, 
871
                                 filter.var = T)
872
  subtype = subtype.data$name
873
  concat.omics = do.call(rbind, omics.list)
874
  
875
  similarity.data = affinityMatrix(dist2(as.matrix(t(concat.omics)),
876
                                         as.matrix(t(concat.omics))), 
877
                                   20, 0.5)
878
  num.clusters = estimateNumberOfClustersGivenGraph(similarity.data, 
879
                                      2:MAX.NUM.CLUSTERS)[[3]]  
880
  clustering = spectralClustering(similarity.data, num.clusters)
881
  time.taken = as.numeric(Sys.time() - start, units='secs')
882
  return(list(clustering=clustering, timing=time.taken))
883
}
884
885
load.libraries <- function() {
886
  library('PMA')
887
  library('R.matlab')
888
  library('SNFtool')
889
  library('PINSPlus')
890
  #library('LRAcluster')
891
  library('kernlab')
892
  library('survival')
893
  library('NMF')
894
  
895
  # bioconductor packages
896
  source("https://bioconductor.org/biocLite.R")
897
  biocLite("impute")
898
  #biocLite("iClusterPlus")
899
}
900
901
902
903
########################################
904
###############   MKL    ###############
905
########################################
906
907
radial.basis <- function(mat, gamma) {
908
  if (missing(gamma)) {
909
    gamma = 1 / (2*nrow(mat)**2)
910
  }
911
  npatients = ncol(mat)
912
  output.mat = matrix(0, ncol=npatients, nrow=npatients)
913
  for (i in 1:npatients) {
914
    for (j in 1:npatients) {
915
      output.mat[i, j] = exp(-norm(as.matrix(mat[,i] - mat[,j]), type = 'F')**2 * gamma)
916
    }
917
  }
918
  
919
  D = apply(output.mat, 2, sum) / npatients
920
  E = sum(D) / npatients
921
  J = matrix(1, nrow=npatients, ncol=1) %*% D
922
  ret = output.mat - J - t(J) + E * matrix(1, ncol=npatients, nrow=npatients)
923
  ret = diag(1/sqrt(diag(ret))) %*% ret %*% diag(1/sqrt(diag(ret)))
924
  return(ret)
925
}
926
927
clear.dir <- function(dir.path) {
928
  files.in.dir = list.files(dir.path)
929
  for (file.in.dir in files.in.dir) {
930
    full.file.path = file.path(dir.path, file.in.dir)
931
    file.remove(full.file.path)
932
  }
933
}
934
935
export.subtype.to.mkl <- function(omics.list, dir.name) {
936
  
937
  folder.path = file.path(get.mkl.arguments.path(), dir.name)
938
  if (!dir.exists(folder.path)) {
939
    dir.create(folder.path)
940
  }
941
  
942
  kernels.path = file.path(folder.path, 'kernels')
943
  
944
  if (!dir.exists(kernels.path)) {
945
    dir.create(kernels.path)
946
  }
947
  clear.dir(kernels.path)
948
  
949
  gammas = 10 ** seq(-6, 6, by=3)
950
  for (i in 1:length(omics.list)) {
951
    for (j in 1:length(gammas)) {
952
      datum = omics.list[[i]]
953
      gamma = gammas[[j]] / (2*nrow(datum)**2)
954
      mat = radial.basis(datum, gamma)
955
      R.matlab::writeMat(file.path(kernels.path, paste(i, '_', j, sep='')), mat=mat)
956
    }
957
  }
958
  
959
  output.path = file.path(folder.path, 'output')
960
  if (!dir.exists(output.path)) {
961
    dir.create(output.path)
962
  }
963
  clear.dir(output.path)
964
  
965
  write.table(colnames(omics.list[[1]]), file=file.path(folder.path, 'ids'),
966
              quote=F, row.names = F, col.names = F)
967
  
968
}
969
970
get.mkl.clustering <- function(dir.name) {
971
  folder.path = file.path(get.mkl.arguments.path(), dir.name)
972
  output.path = file.path(folder.path, 'output')
973
  output.files = list.files(output.path)
974
  clustering = read.csv(file.path(output.path, output.files[grep('clusters', output.files)]))[,2]
975
  return(clustering)
976
}
977
978
check.survival <- function(groups, subtype, survival.file.path) {
979
  if (missing(survival.file.path)) {
980
    survival.file.path = get.subtype.survival.path(subtype)
981
  }
982
  survival.data = read.table(survival.file.path, header = TRUE)
983
  patient.names = names(groups)
984
  patient.names.in.file = as.character(survival.data[, 1])
985
  patient.names.in.file = toupper(gsub('-', '\\.', substring(patient.names.in.file, 1, 12)))
986
  
987
  stopifnot(all(patient.names %in% patient.names.in.file))
988
  
989
  indices = match(patient.names, patient.names.in.file)
990
  ordered.survival.data = survival.data[indices,]
991
  ordered.survival.data["cluster"] <- groups
992
  ordered.survival.data$Survival[is.na(ordered.survival.data$Survival)] = 0
993
  ordered.survival.data$Death[is.na(ordered.survival.data$Death)] = 0
994
  return(survdiff(Surv(Survival, Death) ~ cluster, data=ordered.survival.data))
995
  
996
}
997
998
get.subtype.survival.path <- function(subtype) {
999
  datasets.path = get.dataset.dir.path()
1000
  survival.file.path = file.path(datasets.path, subtype, 'survival')
1001
  return(survival.file.path)
1002
}
1003
1004
get.clustering.silhouette <- function(raw.data, clustering) {
1005
  sils = c()
1006
  for (i in 1:length(raw.data)) {
1007
    x = raw.data[[i]]
1008
    distmatrix = dist2(as.matrix(t(x)),as.matrix(t(x)))
1009
    sil = silhouette(clustering, dmatrix = distmatrix)[,3]
1010
    sils = c(sils, mean(sil))
1011
  }
1012
  return(mean(sils))
1013
}
1014
1015
get.clinical.params.dir <- function() {
1016
  return('CLINICAL_PARAMS_PATH')
1017
}
1018
1019
get.clinical.params <- function(subtype.name) {
1020
  clinical.data.path = paste(get.clinical.params.dir(), subtype.name, sep = '')
1021
  clinical.params = read.table(clinical.data.path,
1022
                               sep='\t', header=T, row.names = 1, stringsAsFactors = F)
1023
  rownames.with.duplicates = get.fixed.names(rownames(clinical.params))  
1024
  clinical.params = clinical.params[!duplicated(rownames.with.duplicates),]
1025
  rownames(clinical.params) = rownames.with.duplicates[!duplicated(rownames.with.duplicates)]
1026
  return(clinical.params)
1027
}
1028
1029
check.clinical.enrichment <- function(clustering, subtype.name) {
1030
  clinical.params = get.clinical.params(subtype.name)  
1031
  
1032
  clinical.metadata = list(gender='DISCRETE', age_at_initial_pathologic_diagnosis='NUMERIC',
1033
    pathologic_M='DISCRETE', pathologic_N='DISCRETE', pathologic_T='DISCRETE', pathologic_stage='DISCRETE')
1034
  
1035
  pvalues = c()
1036
  
1037
  params.being.tested = c()
1038
  
1039
  for (clinical.param in names(clinical.metadata)) {
1040
    
1041
    if (!(clinical.param %in% colnames(clinical.params))) {
1042
      #print(paste0('WARNING: ', clinical.param, ' does not appear for subtype ', subtype.name))
1043
      next
1044
    }
1045
    
1046
    clinical.values = clinical.params[names(clustering),clinical.param]
1047
    is.discrete.param = clinical.metadata[clinical.param] == 'DISCRETE'
1048
    is.numeric.param = clinical.metadata[clinical.param] == 'NUMERIC'
1049
    stopifnot(is.discrete.param | is.numeric.param)
1050
    
1051
    # skip parameter if many missing values
1052
    
1053
    if (is.numeric.param) {
1054
      numeric.entries = !is.na(as.numeric(clinical.values))
1055
      if (2 * sum(numeric.entries) < length(clinical.values)) {
1056
        #print(paste0('WARNING: skipping on ', clinical.param, ' for subtype ', subtype.name))
1057
        next
1058
      }
1059
    } else {
1060
      not.na.entries = !is.na(clinical.values)
1061
      should.skip = F
1062
      if (2 * sum(not.na.entries) < length(clinical.values)) {
1063
        should.skip = T
1064
      } else if (length(table(clinical.values[not.na.entries])) == 1) {
1065
        should.skip = T
1066
      }
1067
      if (should.skip) {
1068
        #print(paste0('WARNING: skipping on ', clinical.param, ' for subtype ', subtype.name))
1069
        next
1070
      }
1071
    }
1072
    
1073
    params.being.tested = c(params.being.tested, clinical.param)
1074
    
1075
    if (is.discrete.param) {
1076
      #clustering.with.clinical = cbind(clustering, clinical.values)
1077
      #tbl = table(as.data.frame(clustering.with.clinical[!is.na(clinical.values),]))
1078
      #test.res = chisq.test(tbl)
1079
      #pvalue = test.res$p.value
1080
      pvalue = get.empirical.clinical(clustering[!is.na(clinical.values)], clinical.values[!is.na(clinical.values)], T)
1081
      
1082
    } else if (is.numeric.param) {
1083
      #test.res = kruskal.test(as.numeric(clinical.values[numeric.entries]),
1084
      #             clustering[numeric.entries])
1085
      #pvalue = test.res$p.value
1086
      pvalue = get.empirical.clinical(clustering[numeric.entries], as.numeric(clinical.values[numeric.entries]), F)
1087
    }
1088
    
1089
    pvalues = c(pvalues, pvalue)
1090
    
1091
  }
1092
  names(pvalues) = params.being.tested
1093
  return(pvalues)
1094
}
1095
1096
get.empirical.clinical <- function(clustering, clinical.values, is.chisq) {
1097
  set.seed(42)
1098
  if (is.chisq) {
1099
      clustering.with.clinical = cbind(clustering, clinical.values)
1100
      tbl = table(as.data.frame(clustering.with.clinical))
1101
      test.res = chisq.test(tbl)
1102
  } else {
1103
    test.res = kruskal.test(as.numeric(clinical.values), clustering)
1104
  }
1105
  orig.pvalue = test.res$p.value
1106
  num.iter = 1000
1107
  total.num.iters = 0
1108
  total.num.extreme = 0
1109
  should.continue = T
1110
  
1111
  while (should.continue) {
1112
    print('another iteration in empirical clinical')
1113
    perm.pvalues = as.numeric(mclapply(1:num.iter, function(i) {
1114
      cur.clustering = sample(clustering)
1115
      names(cur.clustering) = names(clustering)
1116
    
1117
      if (is.chisq) {
1118
        clustering.with.clinical = cbind(cur.clustering, clinical.values)
1119
        tbl = table(as.data.frame(clustering.with.clinical))
1120
        test.res = chisq.test(tbl)
1121
      } else {
1122
        test.res = kruskal.test(as.numeric(clinical.values), cur.clustering)
1123
      }
1124
      cur.pvalue = test.res$p.value
1125
      return(cur.pvalue)
1126
    }, mc.cores=50))
1127
    total.num.iters = total.num.iters + num.iter
1128
    total.num.extreme = total.num.extreme + sum(perm.pvalues <= orig.pvalue)
1129
    
1130
    binom.ret = binom.test(total.num.extreme, total.num.iters)
1131
    cur.pvalue = binom.ret$estimate
1132
    cur.conf.int = binom.ret$conf.int
1133
    
1134
    sig.threshold = 0.05
1135
    is.threshold.in.conf = cur.conf.int[1] < sig.threshold & cur.conf.int[2] > sig.threshold
1136
    if (!is.threshold.in.conf | total.num.iters > 1e5) {
1137
      should.continue = F
1138
    }
1139
  }
1140
  return(cur.pvalue)
1141
}
1142
1143
get.nmf.datasets.dir <- function() {
1144
  return('NMF_DATASETS_PATH')
1145
}
1146
1147
save.subtype.matlab.format <- function(subtype.raw.data) {
1148
  data.names = c('1', '2', '3')
1149
  
1150
  full.dir.name = get.nmf.datasets.dir()
1151
  dir.create(full.dir.name, showWarnings = F)
1152
  
1153
  for (i in 1:length(subtype.raw.data)) {
1154
    full.file.name = file.path(full.dir.name, data.names[i])
1155
    write.table(subtype.raw.data[[i]], full.file.name)
1156
  }
1157
}
1158
1159
keep.high.var.features <- function(omic, num.features=2000) {
1160
  if (nrow(omic) < num.features) {
1161
    return(omic)
1162
  } else {
1163
    feature.vars = apply(omic, 1, var)
1164
    threshold = feature.vars[order(feature.vars, decreasing = T)][num.features]
1165
    return(omic[feature.vars >= threshold,])    
1166
  }
1167
}