--- a +++ b/benchmark.R @@ -0,0 +1,1167 @@ +SUBTYPES.DATA = list( + list(name='aml', only.primary=F, is.rna.seq=T, is.mirna.seq=T, display.name='AML'), + list(name='breast', only.primary=T, is.rna.seq=T, is.mirna.seq=T, display.name='BIC'), + list(name='colon', only.primary=T, is.rna.seq=T, is.mirna.seq=T, display.name='COAD'), + list(name='gbm', only.primary=T, is.rna.seq=F, is.mirna.seq=F, display.name='GBM'), + list(name='kidney', only.primary=T, is.rna.seq=T, is.mirna.seq=T, display.name='KIRC'), + list(name='liver', only.primary=T, is.rna.seq=T, is.mirna.seq=T, display.name='LIHC'), + list(name='lung', only.primary=T, is.rna.seq=T, is.mirna.seq=T, display.name='LUSC'), + list(name='melanoma', only.primary=F, is.rna.seq=T, is.mirna.seq=T, display.name='SKCM'), + list(name='ovarian', only.primary=T, is.rna.seq=T, is.mirna.seq=T, display.name='OV'), + list(name='sarcoma', only.primary=T, is.rna.seq=T, is.mirna.seq=T, display.name='SARC')) + +MAX.NUM.CLUSTERS = 15 + +OMIC.SUBSETS = list('multi_omics', 'exp', 'methy', 'mirna') +names(OMIC.SUBSETS) = c('all', '1', '2', '3') + +get.clustering.results.dir.path <- function() { + return('RESULTS_DIR_PATH') +} + +get.plots.dir.path <- function() { + results.dir.path = get.clustering.results.dir.path() + return(file.path(results.dir.path, 'plots')) +} + +get.tables.dir.path <- function() { + results.dir.path = get.clustering.results.dir.path() + return(file.path(results.dir.path, 'tables')) +} + +subtype.to.display.name <- function(subtype) { + for (i in 1:length(SUBTYPES.DATA)) { + if (SUBTYPES.DATA[[i]]$name == subtype) { + return(SUBTYPES.DATA[[i]]$display.name) + } + } +} + +set.omics.list.attr <- function(subtype.raw.data, subtype.data) { + attr(subtype.raw.data[[1]], 'is.seq') = subtype.data$is.rna.seq + attr(subtype.raw.data[[2]], 'is.seq') = F + attr(subtype.raw.data[[3]], 'is.seq') = subtype.data$is.mirna.seq + return(subtype.raw.data) +} + +ALGORITHM.NAMES = c('kmeans', 'spectral', 'lracluster', 'pins', 'snf', 'mkl', + 'mcca', 'nmf', 'iCluster') +ALGORITHM.DISPLAY.NAMES = as.list(c('K-means', 'Spectral', 'LRAcluster', 'PINS', + 'SNF', 'rMKL-LPP', 'MCCA', 'MultiNMF', 'iClusterBayes')) +names(ALGORITHM.DISPLAY.NAMES) = ALGORITHM.NAMES + +print.matrix.latex.format <- function(mat) { + print(do.call(paste, as.list(c(colnames(mat), sep=' & ')))) + for (i in 1:nrow(mat)) { + print(do.call(paste, as.list(c(rownames(mat)[i], round(mat[i,], digits=2), sep=' & ')))) + } +} + + +perform.all.analyses <- function(benchmark.ret) { + par(mar=rep(1, 4)) + for (i in 1:4) { + + cur.func = list(benchmark.omics.time, benchmark.omics.num.clusters, + benchmark.omics.surv, benchmark.omics.clinical)[[i]] + for (omic.subset in names(OMIC.SUBSETS)) { + print(paste('current omic subset ', omic.subset)) + benchmark.data = cur.func(benchmark.ret, omic.subset) + + displayed.benchmark.data = benchmark.data + colnames(displayed.benchmark.data)[1:ncol(displayed.benchmark.data)] = + sapply(as.list(colnames(displayed.benchmark.data)[1:ncol(displayed.benchmark.data)]), + subtype.to.display.name) + rownames(displayed.benchmark.data) = unlist(ALGORITHM.DISPLAY.NAMES[rownames(displayed.benchmark.data)]) + print.matrix.latex.format(displayed.benchmark.data) + + table.name = c('runtime', 'num_cluster', 'survival', 'clinical')[i] + write.csv(displayed.benchmark.data, file=file.path(get.tables.dir.path(), paste0(table.name, '_', OMIC.SUBSETS[[omic.subset]], '.csv'))) + } + + + print('------------------------') + } + + # plots that include all datasets + for (i in 1:4) { + omic.subset = names(OMIC.SUBSETS)[[i]] + benchmark.surv = benchmark.omics.surv(benchmark.ret, omic.subset) + benchmark.clinical = benchmark.omics.clinical(benchmark.ret, omic.subset) + plot.name = list('multi_omics_surv_clinical.tiff', 'exp_surv_clinical.tiff', + 'methy_surv_clinical.tiff', 'mirna_surv_clinical.tiff')[[i]] + create.clinical.survival.plots(benchmark.surv, benchmark.clinical, plot.name) + } + + + # plots for the mean behaviour + create.mean.clinical.survival.plot(benchmark.ret) +} + +get.best.single.omic.mat <- function(single.omic.list1, single.omic.list2) { + best.mat1 = matrix(0, ncol=ncol(single.omic.list1[[1]]), + nrow=nrow(single.omic.list1[[1]])) + best.mat2 = matrix(0, ncol=ncol(single.omic.list1[[1]]), + nrow=nrow(single.omic.list1[[1]])) + for (i in 1:nrow(best.mat1)) { + for (j in 1:ncol(best.mat1)) { + values1 = sapply(single.omic.list1, function(mat) mat[i, j]) + values2 = sapply(single.omic.list2, function(mat) mat[i, j]) + if (any(is.na(values1))) { + best.mat1[i, j] = NA + best.mat2[i, j] = NA + } else { + best.omic = which.max(values1) + best.mat1[i, j] = values1[best.omic] + best.mat2[i, j] = values2[best.omic] + } + } + } + return(list(best.mat1, best.mat2)) +} + +create.mean.clinical.survival.plot <- function(benchmark.ret) { + tiff(file.path(get.plots.dir.path(), 'mean_surv_clinical.tiff'), width=4500, height=2250, res=300) + layout(matrix(c(1, 1, 2, 2, 3, 3, 4, 4, 4, 5, 5, 5), nrow=2, byrow=T)) + single.omic.surv.benchmarks = list() + single.omic.clin.benchmarks = list() + for (i in 2:6) { + omic.subset = c(names(OMIC.SUBSETS), 'best_surv', 'best_clin')[i] + alg.cols = c("#8DD3C7", "#BEBADA", "#FB8072", "#80B1D3", "#FDB462", "#B3DE69", "#FCCDE5", "#BC80BD", "#842121") + pch = c(rep(15, 3), rep(3, 3), rep(19, 3)) + + if (i <= 4) { + benchmark.surv = benchmark.omics.surv(benchmark.ret, omic.subset) + benchmark.clinical = benchmark.omics.clinical(benchmark.ret, omic.subset) + + if (i %in% 2:4) { + single.omic.surv.benchmarks[[i - 1]] = benchmark.surv + single.omic.clin.benchmarks[[i - 1]] = benchmark.clinical + } + + } else if (i == 5) { + best.mats = get.best.single.omic.mat(single.omic.surv.benchmarks, single.omic.clin.benchmarks) + benchmark.surv = best.mats[[1]] + benchmark.clinical = best.mats[[2]] + + } else { + best.mats = get.best.single.omic.mat(single.omic.clin.benchmarks, single.omic.surv.benchmarks) + benchmark.surv = best.mats[[2]] + benchmark.clinical = best.mats[[1]] + } + + surv.sum = rowSums(benchmark.surv, na.rm=T) + clin.sum = rowSums(benchmark.clinical, na.rm=T) + + # for mcca, the sum is for an empty vector, and is equal 0, so we remove this + available.indices = surv.sum != 0 + surv.sum = surv.sum[available.indices] + clin.sum = clin.sum[available.indices] + alg.cols = alg.cols[available.indices] + pch = pch[available.indices] + + + if (omic.subset == '1') { + xlab = '-log10(logrank pvalue)' + ylab = '# enriched clinical parameters' + } else { + xlab = '' + ylab = '' + } + + if (i %in% c(1, 5, 6)) { + y.min = min(clin.sum) - 1 + x.min = min(surv.sum) - 1 + } else { + y.min = 0 + x.min = 0 + } + + + subplot.name = c('Multi-omics', 'Gene Expression', 'DNA Methylation', 'miRNA Expression', + 'Best single-omic (survival)', 'Best single-omic (clinical)')[i] + plot(surv.sum, clin.sum, main=subplot.name, xlab=xlab, ylab=ylab, + xlim=c(x.min, max(surv.sum) + 1), ylim=c(y.min, max(clin.sum + 1)), col=alg.cols, pch=pch, cex.lab=1.8, cex=4, cex.axis=1.5, cex.main=2.5, lwd=4) + } + dev.off() +} + +create.clinical.survival.plots <- function(benchmark.surv, benchmark.clinical, plot.name) { + num.subtypes = ncol(benchmark.surv) + tiff(file.path(get.plots.dir.path(), plot.name), width=4500, height=1875, res=300) + alg.cols = c("#8DD3C7", "#BEBADA", "#FB8072", "#80B1D3", "#FDB462", "#B3DE69", "#FCCDE5", "#BC80BD", "#842121") + pch = c(rep(15, 3), rep(3, 3), rep(19, 3)) + + par(mfrow=c(2, num.subtypes / 2), mar=c(4.1, 4, 3.2, 1)) + for (i in 1:num.subtypes) { + subtype = colnames(benchmark.surv)[i] + subtype.surv = benchmark.surv[,subtype] + subtype.clinical = benchmark.clinical[,subtype] + + available.indices = !is.na(subtype.surv) + subtype.surv = subtype.surv[available.indices] + subtype.clinical = subtype.clinical[available.indices] + current.cols = alg.cols[available.indices] + current.pch = pch[available.indices] + + surv.significance = -log10(0.05) + + if (i == 1) { + xlab = '-log10(logrank pvalue)' + ylab = '# enriched clinical parameters' + } else { + xlab = '' + ylab = '' + } + + plot(subtype.surv, subtype.clinical, main=subtype.to.display.name(subtype), xlab=xlab, ylab=ylab, + xlim=c(0, max(subtype.surv, surv.significance) + 0.2), ylim=c(0, max(subtype.clinical + 1)), col=current.cols, pch=current.pch, cex.lab=1.4, cex=2.4, cex.axis=1.5, cex.main=1.8, lwd=3) + abline(v=surv.significance, col='red') + } + + dev.off() +} + +plot.legend <- function() { + alg.cols = c("#8DD3C7", "#BEBADA", "#FB8072", "#80B1D3", "#FDB462", "#B3DE69", "#FCCDE5", "#BC80BD", "#842121") + pch = c(rep(15, 3), rep(3, 3), rep(19, 3)) + lwds = c(rep(NA, 3), rep(3, 3), rep(NA, 3)) + pt.cexs = 1.8 + algorithm.names = ALGORITHM.DISPLAY.NAMES + algorithm.names[length(algorithm.names)] = paste0(algorithm.names[length(algorithm.names)], ' ') + width=4200 + + if (F) { + alg.cols = alg.cols[-7] + pch = pch[-7] + lwds = lwds[-7] + algorithm.names = algorithm.names[-7] + pt.cexs = pt.cexs[-7] + width=3800 + } + + tiff(file.path(get.plots.dir.path(), 'legend.tif'), width=width, res=300) + par(font=2, mar=rep(1, 4)) + plot(0,xaxt='n',yaxt='n',bty='n',pch='',ylab='',xlab='') + legend(0.58, 1, legend=algorithm.names, col=alg.cols, pch=pch, horiz=T, pt.cex=pt.cexs, pt.lwd=lwds) + dev.off() +} + +analyze.benchmark <- function() { + all.clusterings = list() + all.timings = list() + for (i in 1:length(SUBTYPES.DATA)) { + current.subtype.data = SUBTYPES.DATA[[i]] + subtype = current.subtype.data$name + subtype.raw.data = get.raw.data(subtype, + only.primary=current.subtype.data$only.primary) + + all.clusterings[[subtype]] = list() + all.timings[[subtype]] = list() + + for (algorithm.name in ALGORITHM.NAMES) { + all.clusterings[[subtype]][[algorithm.name]] = list() + all.timings[[subtype]][[algorithm.name]] = list() + for (j in c('all', '1', '2', '3')) { + clustering.path = file.path(get.clustering.results.dir.path(), + paste(subtype, algorithm.name, j, sep='_')) + timing.path = file.path(get.clustering.results.dir.path(), + paste(subtype, algorithm.name, j, 'timing', sep='_')) + load(clustering.path) + load(timing.path) + if (!any(is.na(clustering))) { + names(clustering) = colnames(subtype.raw.data[[1]]) + } + + all.clusterings[[subtype]][[algorithm.name]][[j]] = clustering + all.timings[[subtype]][[algorithm.name]][[j]] = timing + } + } + } + return(list(all.clusterings=all.clusterings, all.timings=all.timings)) +} + +check.empirical.surv <- function(old.pvals, new.pvals) { + is.in.conf = matrix(0, ncol=ncol(old.pvals), nrow=nrow(old.pvals)) + for (i in 1:nrow(old.pvals)) { + for (j in 1:ncol(old.pvals)) { + old.pval = old.pvals[i, j] + if (is.na(old.pval)) { + is.in.conf[i, j] = NA + next + } + if (old.pval == 1e-10) old.pval = 1e-5 + num.runs = floor(30 / old.pval) + new.pval = new.pvals[i, j] + num.success = round(new.pval * num.runs) + print(c(num.success, num.runs)) + + conf.int = binom.test(num.success, num.runs)$conf.int + in.conf.int = old.pval >= conf.int[1] & old.pval <= conf.int[2] + is.in.conf[i, j] = in.conf.int + } + } + return(is.in.conf) +} + +get.empirical.surv <- function(clustering, subtype) { + set.seed(42) + surv.ret = check.survival(clustering, subtype) + orig.chisq = surv.ret$chisq + orig.pvalue = get.logrank.pvalue(surv.ret) + # The initial number of permutations to run + num.perms = round(min(max(10 / orig.pvalue, 1000), 1e6)) + should.continue = T + + total.num.perms = 0 + total.num.extreme.chisq = 0 + + while (should.continue) { + print('Another iteration in empirical survival calculation') + print(num.perms) + perm.chisq = as.numeric(mclapply(1:num.perms, function(i) { + cur.clustering = sample(clustering) + names(cur.clustering) = names(clustering) + cur.chisq = check.survival(cur.clustering, subtype)$chisq + return(cur.chisq) + }, mc.cores=50)) + + total.num.perms = total.num.perms + num.perms + total.num.extreme.chisq = total.num.extreme.chisq + sum(perm.chisq >= orig.chisq) + + binom.ret = binom.test(total.num.extreme.chisq, total.num.perms) + cur.pvalue = binom.ret$estimate + cur.conf.int = binom.ret$conf.int + + print(c(total.num.extreme.chisq, total.num.perms)) + print(cur.pvalue) + print(cur.conf.int) + + sig.threshold = 0.05 + is.conf.small = ((cur.conf.int[2] - cur.pvalue) < min(cur.pvalue / 10, 0.01)) & ((cur.pvalue - cur.conf.int[1]) < min(cur.pvalue / 10, 0.01)) + is.threshold.in.conf = cur.conf.int[1] < sig.threshold & cur.conf.int[2] > sig.threshold + if ((is.conf.small & !is.threshold.in.conf) | (total.num.perms > 2e7)) { + #if (is.conf.small) { + should.continue = F + } else { + num.perms = 1e5 + } + } + + return(list(pvalue = cur.pvalue, conf.int = cur.conf.int, total.num.perms=total.num.perms, + total.num.extreme.chisq=total.num.extreme.chisq)) +} + +benchmark.omics.surv <- function(benchmark.results, omics='all') { + + + all.surv.pvalues = matrix(1, ncol=length(SUBTYPES.DATA), nrow=length(ALGORITHM.NAMES)) + rownames(all.surv.pvalues) = ALGORITHM.NAMES + colnames(all.surv.pvalues) = sapply(SUBTYPES.DATA, function(x) x$name) + all.clusterings = benchmark.results$all.clusterings + for (i in 1:length(all.clusterings)) { + subtype = colnames(all.surv.pvalues)[i] + subtype.clusterings = all.clusterings[[subtype]] + for (j in 1:length(subtype.clusterings)) { + surv.path = file.path(get.clustering.results.dir.path(), + paste(subtype, ALGORITHM.NAMES[j], omics, 'surv', sep='_')) + if (file.exists(surv.path)) { + load(surv.path) + pvalue = empirical.surv.ret$pvalue + + } else { + clustering = subtype.clusterings[[ALGORITHM.NAMES[j]]][[omics]] + if (length(table(clustering)) > 1) { + #pvalue = -log10(get.logrank.pvalue(check.survival(clustering, subtype))) + empirical.surv.ret = get.empirical.surv(clustering, subtype) + save(empirical.surv.ret, file=surv.path) + pvalue = empirical.surv.ret$pvalue + + + } else { + pvalue = NA + } + } + all.surv.pvalues[j, i] = -log10(pvalue) + } + } + + return(all.surv.pvalues) +} + +benchmark.omics.num.clusters <- function(benchmark.results, omics='all') { + num.clusters = matrix(1, ncol=length(SUBTYPES.DATA), nrow=length(ALGORITHM.NAMES)) + rownames(num.clusters) = ALGORITHM.NAMES + colnames(num.clusters) = sapply(SUBTYPES.DATA, function(x) x$name) + all.clusterings = benchmark.results$all.clusterings + for (i in 1:length(all.clusterings)) { + subtype = colnames(num.clusters)[i] + subtype.clusterings = all.clusterings[[subtype]] + for (j in 1:length(subtype.clusterings)) { + clustering = subtype.clusterings[[ALGORITHM.NAMES[j]]][[omics]] + num.clusters[j, i] = max(clustering) + } + } + return(num.clusters) +} + +benchmark.omics.clinical <- function(benchmark.results, omics='all') { + + num.clinical.enrich = matrix(1, ncol=length(SUBTYPES.DATA), nrow=length(ALGORITHM.NAMES)) + rownames(num.clinical.enrich) = ALGORITHM.NAMES + colnames(num.clinical.enrich) = sapply(SUBTYPES.DATA, function(x) x$name) + total.num.tested.parameters = 0 + all.clusterings = benchmark.results$all.clusterings + for (i in 1:length(all.clusterings)) { + subtype = colnames(num.clinical.enrich)[i] + subtype.clusterings = all.clusterings[[subtype]] + for (j in 1:length(subtype.clusterings)) { + print('checking clinical enrichment') + print(c(i, j)) + + + clustering = subtype.clusterings[[ALGORITHM.NAMES[j]]][[omics]] + if (any(is.na(clustering))) { + num.clinical.enrich[j, i] = NA + } else { + clin.path = file.path(get.clustering.results.dir.path(), + paste(subtype, ALGORITHM.NAMES[j], omics, 'clin', sep='_')) + + if (file.exists(clin.path)) { + load(clin.path) + } else { + enrichment.pvalues = check.clinical.enrichment(clustering, subtype) + save(enrichment.pvalues, file=clin.path) + } + + if (j == 1) { + total.num.tested.parameters = total.num.tested.parameters + length(enrichment.pvalues) + } + num.clinical.enrich[j, i] = sum(enrichment.pvalues * length(enrichment.pvalues) < 0.05) + } + } + } + print(paste0('Total number of parameters tested:', total.num.tested.parameters)) + return(num.clinical.enrich) +} + +benchmark.omics.time <- function(benchmark.results, omics='all') { + all.alg.times = matrix(1, ncol=length(SUBTYPES.DATA), nrow=length(ALGORITHM.NAMES)) + rownames(all.alg.times) = ALGORITHM.NAMES + colnames(all.alg.times) = sapply(SUBTYPES.DATA, function(x) x$name) + all.timings = benchmark.results$all.timings + for (i in 1:length(all.timings)) { + subtype = colnames(all.alg.times)[i] + subtype.timings = all.timings[[subtype]] + for (j in 1:length(subtype.timings)) { + timing = subtype.timings[[ALGORITHM.NAMES[j]]][[omics]] + all.alg.times[j, i] = timing + } + } + return(all.alg.times) +} + +benchmark.alg.agreement <- function(benchmark.results, omics='all') { + all.clusterings = benchmark.results$all.clusterings + rand.matrix = matrix(NA, ncol=length(ALGORITHM.NAMES), nrow=length(ALGORITHM.NAMES)) + colnames(rand.matrix) = ALGORITHM.NAMES + rownames(rand.matrix) = ALGORITHM.NAMES + for (alg1.index in 1:length(ALGORITHM.NAMES)) { + alg1 = ALGORITHM.NAMES[alg1.index] + for (alg2.index in 1:length(ALGORITHM.NAMES)) { + alg2 = ALGORITHM.NAMES[alg2.index] + rands = c() + for (i in 1:length(SUBTYPES.DATA)) { + clustering1 = all.clusterings[[i]][[alg1]][[omics]] + clustering2 = all.clusterings[[i]][[alg2]][[omics]] + rands = c(rands, adj.rand.index(clustering1, clustering2)) + } + mean.rand = mean(rands) + rand.matrix[alg1.index, alg2.index] = mean.rand + } + } + return(rand.matrix) +} + +get.logrank.pvalue <- function(survdiff.res) { + 1 - pchisq(survdiff.res$chisq, length(survdiff.res$n) - 1) +} + +run.benchmark <- function() { + for (i in 1:length(SUBTYPES.DATA)) { + current.subtype.data = SUBTYPES.DATA[[i]] + subtype = current.subtype.data$name + subtype.raw.data = get.raw.data(subtype, + only.primary=current.subtype.data$only.primary) + + subtype.raw.data = set.omics.list.attr(subtype.raw.data, + current.subtype.data) + + for (algorithm.name in ALGORITHM.NAMES) { + for (j in c('all', '1', '2', '3')) { + set.seed(42) + print(paste('subtype', subtype, 'running algorithm', algorithm.name, j)) + clustering.path = file.path(get.clustering.results.dir.path(), + paste(subtype, algorithm.name, j, sep='_')) + timing.path = file.path(get.clustering.results.dir.path(), + paste(subtype, algorithm.name, j, 'timing', sep='_')) + + + if (!file.exists(clustering.path)) { + algorithm.func.name = paste0('run.', algorithm.name) + algorithm.func = get(algorithm.func.name) + if (j == 'all') { + cur.iteration.data = subtype.raw.data + } else { + cur.iteration.data = subtype.raw.data[as.numeric(j)] + } + algorithm.ret = algorithm.func(cur.iteration.data, current.subtype.data) + clustering = algorithm.ret$clustering + timing = algorithm.ret$timing + print('before saving') + save(clustering, file = clustering.path) + save(timing, file = timing.path) + } + } + } + } +} + +get.dataset.dir.path <- function() { + return('DATASETS_PATH') +} + +log.and.normalize <- function(omics.data, subtype.data, normalize=T, + filter.var=F) { + # filter features with no variance at all + for (i in 1:length(omics.data)) { + omics.data[[i]] = omics.data[[i]][apply(omics.data[[i]], 1, var) > 0,] + } + + for (i in 1:length(omics.data)) { + if (attr(omics.data[[i]], 'is.seq')) { + omics.data[[i]] = log(1+omics.data[[i]]) + } + } + + if (filter.var) { + omics.data = lapply(omics.data, keep.high.var.features) + } + + if (normalize) { + omics.data = lapply(omics.data, normalize.matrix) + } + + return(omics.data) +} + +normalize.matrix <- function(data.matrix) { + temp = data.matrix - rowMeans(data.matrix) + should.keep = (apply(temp, 1, sd) != 0) + return ((temp / apply(temp, 1, sd))[should.keep, ]) +} + +filter.non.tumor.samples <- function(raw.datum, only.primary=only.primary) { + # 01 is primary, 06 is metastatic, 03 is blood derived cancer + if (!only.primary) + return(raw.datum[,substring(colnames(raw.datum), 14, 15) %in% c('01', '03', '06')]) + else + return(raw.datum[,substring(colnames(raw.datum), 14, 15) %in% c('01')]) +} + +get.fixed.names <- function(patient.names, include.type=F) { + # fix the TCGA names to only include the patient ids + if (include.type) { + return(gsub('-', '\\.', toupper(substring(patient.names, 1, 15)))) + } else { + return(gsub('-', '\\.', toupper(substring(patient.names, 1, 12)))) + } +} + +fix.patient.names <- function(subtype.raw.data, include.type=F) { + for (i in 1:length(subtype.raw.data)) { + colnames(subtype.raw.data[[i]]) = get.fixed.names(colnames(subtype.raw.data[[i]]), + include.type) + } + return(subtype.raw.data) +} + +get.raw.data <- function(subtype.name, + datasets.path = get.dataset.dir.path(), + only.primary=NA) { + omics.dir = file.path(datasets.path, subtype.name) + omics.files = list.files(omics.dir) + omics.files = setdiff(omics.files, c('survival')) + raw.data = lapply(file.path(omics.dir, omics.files), read.table) + + if (!is.na(only.primary)) { + raw.data = lapply(raw.data, function(x) filter.non.tumor.samples(x, only.primary = only.primary)) + } + name.corrected.data = fix.patient.names(raw.data) + patients.intersection = Reduce(intersect, lapply(name.corrected.data, colnames)) + ret.data = lapply(name.corrected.data, function(datum) datum[,patients.intersection]) + return(ret.data) +} + +get.elbow <- function(values, is.max) { + second.derivatives = c() + for (i in 2:(length(values) - 1)) { + second.derivative = values[i + 1] + values[i - 1] - 2 * values[i] + second.derivatives = c(second.derivatives, second.derivative) + } + print(second.derivatives) + if (is.max) { + return(which.max(second.derivatives) + 1) + } else { + return(which.min(second.derivatives) + 1) + } +} + +# Does not support a single omic dataset +run.mcca <- function(omics.list, subtype.data) { + if (length(omics.list) == 1) { + return(list(clustering=rep(NA, ncol(omics.list[[1]])), timing=1)) + } + start = Sys.time() + omics.list = log.and.normalize(omics.list, subtype.data, + normalize = T, + filter.var = T) + + subtype = subtype.data$name + omics.transposed = lapply(omics.list, t) + cca.ret = PMA::MultiCCA(omics.transposed, + ncomponents = MAX.NUM.CLUSTERS) + sample.rep = omics.transposed[[1]] %*% cca.ret$ws[[1]] + + explained.vars = sapply(1:MAX.NUM.CLUSTERS, + function(i) sum(unlist(apply(sample.rep[1:i,,drop=F], 2, var)))) + + dimension = get.elbow(explained.vars, is.max=F) + print(dimension) + sample.rep = sample.rep[,1:dimension] + sils = c() + clustering.per.num.clusters = list() + for (num.clusters in 2:MAX.NUM.CLUSTERS) { + cur.clustering = kmeans(sample.rep, num.clusters, iter.max=100, nstart=30)$cluster + sil = get.clustering.silhouette(list(t(sample.rep)), cur.clustering) + sils = c(sils, sil) + clustering.per.num.clusters[[num.clusters - 1]] = cur.clustering + } + # NOTE: the next line contains an error. We mistakenly selected the minimal rather maximal silhouette. + # See more details in: http://acgt.cs.tau.ac.il/multi_omic_benchmark/download.html. + cca.clustering = clustering.per.num.clusters[[which.min(sils)]] + time.taken = as.numeric(Sys.time() - start, units='secs') + return(list(clustering=cca.clustering, timing=time.taken)) +} + +run.snf <- function(omics.list, subtype.data) { + start = Sys.time() + omics.list = log.and.normalize(omics.list, subtype.data) + subtype = subtype.data$name + alpha=0.5 + T.val=30 + num.neighbors = round(ncol(omics.list[[1]]) / 10) + similarity.data = lapply(omics.list, function(x) {affinityMatrix(dist2(as.matrix(t(x)),as.matrix(t(x))), + num.neighbors, alpha)}) + if (length(similarity.data) == 1) { + W = similarity.data[[1]] + } else { + W = SNF(similarity.data, num.neighbors, T.val) + } + + num.clusters = estimateNumberOfClustersGivenGraph(W, 2:MAX.NUM.CLUSTERS)[[3]] + clustering = spectralClustering(W, num.clusters) + time.taken = as.numeric(Sys.time() - start, units='secs') + return(list(clustering=clustering, timing=time.taken)) +} + +run.iCluster <- function(omics.list, subtype.data) { + omics.list = log.and.normalize(omics.list, subtype.data, normalize = F) + + start = Sys.time() + subtype = subtype.data$name + dev.ratios = c() + icluster.rets = list() + + if (length(omics.list) == 1) { + icluster.ret = iClusterPlus::tune.iClusterBayes(cpus=(MAX.NUM.CLUSTERS - 1), t(omics.list[[1]]), + K=1:(MAX.NUM.CLUSTERS - 1), type=c('gaussian'))$fit + } else { + icluster.ret = iClusterPlus::tune.iClusterBayes(cpus=(MAX.NUM.CLUSTERS - 1), t(omics.list[[1]]), + t(omics.list[[2]]), + t(omics.list[[3]]), + K=1:(MAX.NUM.CLUSTERS - 1), type=rep('gaussian', 3))$fit + } + dev.ratios = lapply(1:(MAX.NUM.CLUSTERS - 1), function(i) icluster.ret[[i]]$dev.ratio) + + print('dev.ratios are:') + print(dev.ratios) + + optimal.solution = icluster.ret[[which.max(dev.ratios)]] + time.taken = as.numeric(Sys.time() - start, units='secs') + return(list(clustering=optimal.solution$clusters, + timing=time.taken)) +} + +get.mkl.binary.path = function() { + return('MKL_BINARY_PATH') +} + +get.mkl.arguments.path = function() { + return('MKL_ARGS_PATH') +} + + +run.mkl <- function(omics.list, subtype.data) { + start = Sys.time() + omics.list = log.and.normalize(omics.list, subtype.data) + subtype = subtype.data$name + omics.list = lapply(omics.list, normalize.matrix) + time.taken = as.numeric(Sys.time() - start, units='secs') + export.subtype.to.mkl(omics.list, subtype) + + start = Sys.time() + bin.path = get.mkl.binary.path() + subtype.dir = paste0(get.mkl.arguments.path(), subtype, '\\') + paste0(subtype.dir, 'kernels') + command = paste(bin.path, paste0(subtype.dir, 'kernels'), + paste0(subtype.dir, 'ids'), + paste0(subtype.dir, 'output'), + '9', '5') + command.return = system(command) + stopifnot(command.return == 0) + time.taken2 = as.numeric(Sys.time() - start, units='secs') + clustering = get.mkl.clustering(subtype) + return(list(clustering=clustering, + timing=time.taken + time.taken2)) +} + +run.nmf <- function(omics.list, subtype.data) { + total.time.taken = 0 + start = Sys.time() + omics.list = log.and.normalize(omics.list, subtype.data, + filter.var = T, normalize = F) + time.taken = as.numeric(Sys.time() - start, units='secs') + total.time.taken = total.time.taken + time.taken + + save.subtype.matlab.format(omics.list) + subtype = subtype.data$name + if (length(omics.list) > 1) { + command.ret = system('MULTI_NMF_COMMAND') + stopifnot(command.ret == 0) + nmf.timing = read.csv('SOME_TEMP_PATH_TIMING', header=F)[1, 1] + total.time.taken = total.time.taken + nmf.timing + } else { + for (k in 1:MAX.NUM.CLUSTERS) { + start = Sys.time() + file.name = paste0('SOME_TEMP_PATH/', k, '_consensus') + nmf.ret = nmf(omics.list[[1]], k, method='lee') + coef.mat = t(coef(nmf.ret)) + time.taken = as.numeric(Sys.time() - start, units='secs') + total.time.taken = total.time.taken + time.taken + write.table(coef.mat, file=file.name, quote=F, row.names=F, col.names=F, sep=',') + } + } + + explained.vars = c() + clustering.per.num.clusters = list() + for (k in 1:MAX.NUM.CLUSTERS) { + file.name = paste0('SOME_TEMP_PATH/', k, '_consensus') + consensus.mat = read.csv(file.name, header=F) + + start = Sys.time() + cur.clustering = apply(consensus.mat, 1, which.max) + explained.var = sum(unlist(apply(consensus.mat, 2, var))) + explained.vars = c(explained.vars, explained.var) + clustering.per.num.clusters[[k]] = cur.clustering + time.taken = as.numeric(Sys.time() - start, units='secs') + total.time.taken = total.time.taken + time.taken + } + + dimension = get.elbow(explained.vars, is.max=F) + nmf.clustering = clustering.per.num.clusters[[dimension]] + return(list(clustering=nmf.clustering, timing=total.time.taken)) +} + +run.pins <- function(omics.list, subtype.data) { + start = Sys.time() + omics.list = log.and.normalize(omics.list, subtype.data, normalize = F) + subtype = subtype.data$name + omics.transposed = lapply(omics.list, t) + if (length(omics.list) == 1) { + pins.ret = PINSPlus::PerturbationClustering(data=omics.transposed[[1]], + kMax = MAX.NUM.CLUSTERS) + clustering = pins.ret$cluster + + } else { + pins.ret = PINSPlus::SubtypingOmicsData(dataList=omics.transposed, + kMax = MAX.NUM.CLUSTERS) + clustering = pins.ret$cluster2 + } + time.taken = as.numeric(Sys.time() - start, units='secs') + return(list(clustering=clustering, timing=time.taken)) +} + +run.lracluster <- function(omics.list, subtype.data) { + omics.list = log.and.normalize(omics.list, subtype.data, normalize = F) + + subtype = subtype.data$name + start = Sys.time() + + dim.range = 1:MAX.NUM.CLUSTERS + all.clustering.results = list() + + omics.matrix.list = lapply(omics.list, as.matrix) + for (dimension in dim.range) { + print(paste('running lra cluster for dimension', dimension)) + data.names = c('gene expression', 'methylation', 'miRNA expression') + clustering.results = LRAcluster(omics.matrix.list, + rep('gaussian', length(omics.list)), + dimension=dimension, data.names) + all.clustering.results[[dimension]] = clustering.results + } + explained.var = sapply(all.clustering.results, function(x) x$potential) + print(explained.var) + dimension = get.elbow(explained.var, is.max=F) + print(dimension) + solution = all.clustering.results[[dimension]]$coordinate + + sils = c() + clustering.per.num.clusters = list() + for (num.clusters in 2:MAX.NUM.CLUSTERS) { + print(paste('running kmeans in lra cluster for num clusters', num.clusters)) + cur.clustering = kmeans(t(solution), num.clusters, iter.max=100, nstart=60)$cluster + sil = get.clustering.silhouette(list(solution), cur.clustering) + sils = c(sils, sil) + clustering.per.num.clusters[[num.clusters - 1]] = cur.clustering + } + print(sils) + # NOTE: the next line contains an error. We mistakenly selected the minimal rather maximal silhouette. + # See more details in: http://acgt.cs.tau.ac.il/multi_omic_benchmark/download.html. + chosen.clustering = clustering.per.num.clusters[[which.min(sils)]] + time.taken = as.numeric(Sys.time() - start, units='secs') + return(list(clustering=chosen.clustering, timing=time.taken)) +} + +run.kmeans <- function(omics.list, subtype.data) { + start = Sys.time() + omics.list = log.and.normalize(omics.list, subtype.data, + filter.var = T) + + subtype = subtype.data$name + all.withinss = c() + all.clusterings = list() + k.range = 1:MAX.NUM.CLUSTERS + for (k in k.range) { + concat.omics = do.call(rbind, omics.list) + kmeans.ret = kmeans(t(concat.omics), k, iter.max=100, nstart=60) + all.withinss = c(all.withinss, kmeans.ret$tot.withinss) + all.clusterings[[k]] = kmeans.ret$cluster + } + + best.k = get.elbow(all.withinss, is.max=T) + time.taken = as.numeric(Sys.time() - start, units='secs') + return(list(clustering=all.clusterings[[best.k]], + timing=time.taken)) +} + +run.spectral <- function(omics.list, subtype.data) { + start = Sys.time() + omics.list = log.and.normalize(omics.list, subtype.data, + filter.var = T) + subtype = subtype.data$name + concat.omics = do.call(rbind, omics.list) + + similarity.data = affinityMatrix(dist2(as.matrix(t(concat.omics)), + as.matrix(t(concat.omics))), + 20, 0.5) + num.clusters = estimateNumberOfClustersGivenGraph(similarity.data, + 2:MAX.NUM.CLUSTERS)[[3]] + clustering = spectralClustering(similarity.data, num.clusters) + time.taken = as.numeric(Sys.time() - start, units='secs') + return(list(clustering=clustering, timing=time.taken)) +} + +load.libraries <- function() { + library('PMA') + library('R.matlab') + library('SNFtool') + library('PINSPlus') + #library('LRAcluster') + library('kernlab') + library('survival') + library('NMF') + + # bioconductor packages + source("https://bioconductor.org/biocLite.R") + biocLite("impute") + #biocLite("iClusterPlus") +} + + + +######################################## +############### MKL ############### +######################################## + +radial.basis <- function(mat, gamma) { + if (missing(gamma)) { + gamma = 1 / (2*nrow(mat)**2) + } + npatients = ncol(mat) + output.mat = matrix(0, ncol=npatients, nrow=npatients) + for (i in 1:npatients) { + for (j in 1:npatients) { + output.mat[i, j] = exp(-norm(as.matrix(mat[,i] - mat[,j]), type = 'F')**2 * gamma) + } + } + + D = apply(output.mat, 2, sum) / npatients + E = sum(D) / npatients + J = matrix(1, nrow=npatients, ncol=1) %*% D + ret = output.mat - J - t(J) + E * matrix(1, ncol=npatients, nrow=npatients) + ret = diag(1/sqrt(diag(ret))) %*% ret %*% diag(1/sqrt(diag(ret))) + return(ret) +} + +clear.dir <- function(dir.path) { + files.in.dir = list.files(dir.path) + for (file.in.dir in files.in.dir) { + full.file.path = file.path(dir.path, file.in.dir) + file.remove(full.file.path) + } +} + +export.subtype.to.mkl <- function(omics.list, dir.name) { + + folder.path = file.path(get.mkl.arguments.path(), dir.name) + if (!dir.exists(folder.path)) { + dir.create(folder.path) + } + + kernels.path = file.path(folder.path, 'kernels') + + if (!dir.exists(kernels.path)) { + dir.create(kernels.path) + } + clear.dir(kernels.path) + + gammas = 10 ** seq(-6, 6, by=3) + for (i in 1:length(omics.list)) { + for (j in 1:length(gammas)) { + datum = omics.list[[i]] + gamma = gammas[[j]] / (2*nrow(datum)**2) + mat = radial.basis(datum, gamma) + R.matlab::writeMat(file.path(kernels.path, paste(i, '_', j, sep='')), mat=mat) + } + } + + output.path = file.path(folder.path, 'output') + if (!dir.exists(output.path)) { + dir.create(output.path) + } + clear.dir(output.path) + + write.table(colnames(omics.list[[1]]), file=file.path(folder.path, 'ids'), + quote=F, row.names = F, col.names = F) + +} + +get.mkl.clustering <- function(dir.name) { + folder.path = file.path(get.mkl.arguments.path(), dir.name) + output.path = file.path(folder.path, 'output') + output.files = list.files(output.path) + clustering = read.csv(file.path(output.path, output.files[grep('clusters', output.files)]))[,2] + return(clustering) +} + +check.survival <- function(groups, subtype, survival.file.path) { + if (missing(survival.file.path)) { + survival.file.path = get.subtype.survival.path(subtype) + } + survival.data = read.table(survival.file.path, header = TRUE) + patient.names = names(groups) + patient.names.in.file = as.character(survival.data[, 1]) + patient.names.in.file = toupper(gsub('-', '\\.', substring(patient.names.in.file, 1, 12))) + + stopifnot(all(patient.names %in% patient.names.in.file)) + + indices = match(patient.names, patient.names.in.file) + ordered.survival.data = survival.data[indices,] + ordered.survival.data["cluster"] <- groups + ordered.survival.data$Survival[is.na(ordered.survival.data$Survival)] = 0 + ordered.survival.data$Death[is.na(ordered.survival.data$Death)] = 0 + return(survdiff(Surv(Survival, Death) ~ cluster, data=ordered.survival.data)) + +} + +get.subtype.survival.path <- function(subtype) { + datasets.path = get.dataset.dir.path() + survival.file.path = file.path(datasets.path, subtype, 'survival') + return(survival.file.path) +} + +get.clustering.silhouette <- function(raw.data, clustering) { + sils = c() + for (i in 1:length(raw.data)) { + x = raw.data[[i]] + distmatrix = dist2(as.matrix(t(x)),as.matrix(t(x))) + sil = silhouette(clustering, dmatrix = distmatrix)[,3] + sils = c(sils, mean(sil)) + } + return(mean(sils)) +} + +get.clinical.params.dir <- function() { + return('CLINICAL_PARAMS_PATH') +} + +get.clinical.params <- function(subtype.name) { + clinical.data.path = paste(get.clinical.params.dir(), subtype.name, sep = '') + clinical.params = read.table(clinical.data.path, + sep='\t', header=T, row.names = 1, stringsAsFactors = F) + rownames.with.duplicates = get.fixed.names(rownames(clinical.params)) + clinical.params = clinical.params[!duplicated(rownames.with.duplicates),] + rownames(clinical.params) = rownames.with.duplicates[!duplicated(rownames.with.duplicates)] + return(clinical.params) +} + +check.clinical.enrichment <- function(clustering, subtype.name) { + clinical.params = get.clinical.params(subtype.name) + + clinical.metadata = list(gender='DISCRETE', age_at_initial_pathologic_diagnosis='NUMERIC', + pathologic_M='DISCRETE', pathologic_N='DISCRETE', pathologic_T='DISCRETE', pathologic_stage='DISCRETE') + + pvalues = c() + + params.being.tested = c() + + for (clinical.param in names(clinical.metadata)) { + + if (!(clinical.param %in% colnames(clinical.params))) { + #print(paste0('WARNING: ', clinical.param, ' does not appear for subtype ', subtype.name)) + next + } + + clinical.values = clinical.params[names(clustering),clinical.param] + is.discrete.param = clinical.metadata[clinical.param] == 'DISCRETE' + is.numeric.param = clinical.metadata[clinical.param] == 'NUMERIC' + stopifnot(is.discrete.param | is.numeric.param) + + # skip parameter if many missing values + + if (is.numeric.param) { + numeric.entries = !is.na(as.numeric(clinical.values)) + if (2 * sum(numeric.entries) < length(clinical.values)) { + #print(paste0('WARNING: skipping on ', clinical.param, ' for subtype ', subtype.name)) + next + } + } else { + not.na.entries = !is.na(clinical.values) + should.skip = F + if (2 * sum(not.na.entries) < length(clinical.values)) { + should.skip = T + } else if (length(table(clinical.values[not.na.entries])) == 1) { + should.skip = T + } + if (should.skip) { + #print(paste0('WARNING: skipping on ', clinical.param, ' for subtype ', subtype.name)) + next + } + } + + params.being.tested = c(params.being.tested, clinical.param) + + if (is.discrete.param) { + #clustering.with.clinical = cbind(clustering, clinical.values) + #tbl = table(as.data.frame(clustering.with.clinical[!is.na(clinical.values),])) + #test.res = chisq.test(tbl) + #pvalue = test.res$p.value + pvalue = get.empirical.clinical(clustering[!is.na(clinical.values)], clinical.values[!is.na(clinical.values)], T) + + } else if (is.numeric.param) { + #test.res = kruskal.test(as.numeric(clinical.values[numeric.entries]), + # clustering[numeric.entries]) + #pvalue = test.res$p.value + pvalue = get.empirical.clinical(clustering[numeric.entries], as.numeric(clinical.values[numeric.entries]), F) + } + + pvalues = c(pvalues, pvalue) + + } + names(pvalues) = params.being.tested + return(pvalues) +} + +get.empirical.clinical <- function(clustering, clinical.values, is.chisq) { + set.seed(42) + if (is.chisq) { + clustering.with.clinical = cbind(clustering, clinical.values) + tbl = table(as.data.frame(clustering.with.clinical)) + test.res = chisq.test(tbl) + } else { + test.res = kruskal.test(as.numeric(clinical.values), clustering) + } + orig.pvalue = test.res$p.value + num.iter = 1000 + total.num.iters = 0 + total.num.extreme = 0 + should.continue = T + + while (should.continue) { + print('another iteration in empirical clinical') + perm.pvalues = as.numeric(mclapply(1:num.iter, function(i) { + cur.clustering = sample(clustering) + names(cur.clustering) = names(clustering) + + if (is.chisq) { + clustering.with.clinical = cbind(cur.clustering, clinical.values) + tbl = table(as.data.frame(clustering.with.clinical)) + test.res = chisq.test(tbl) + } else { + test.res = kruskal.test(as.numeric(clinical.values), cur.clustering) + } + cur.pvalue = test.res$p.value + return(cur.pvalue) + }, mc.cores=50)) + total.num.iters = total.num.iters + num.iter + total.num.extreme = total.num.extreme + sum(perm.pvalues <= orig.pvalue) + + binom.ret = binom.test(total.num.extreme, total.num.iters) + cur.pvalue = binom.ret$estimate + cur.conf.int = binom.ret$conf.int + + sig.threshold = 0.05 + is.threshold.in.conf = cur.conf.int[1] < sig.threshold & cur.conf.int[2] > sig.threshold + if (!is.threshold.in.conf | total.num.iters > 1e5) { + should.continue = F + } + } + return(cur.pvalue) +} + +get.nmf.datasets.dir <- function() { + return('NMF_DATASETS_PATH') +} + +save.subtype.matlab.format <- function(subtype.raw.data) { + data.names = c('1', '2', '3') + + full.dir.name = get.nmf.datasets.dir() + dir.create(full.dir.name, showWarnings = F) + + for (i in 1:length(subtype.raw.data)) { + full.file.name = file.path(full.dir.name, data.names[i]) + write.table(subtype.raw.data[[i]], full.file.name) + } +} + +keep.high.var.features <- function(omic, num.features=2000) { + if (nrow(omic) < num.features) { + return(omic) + } else { + feature.vars = apply(omic, 1, var) + threshold = feature.vars[order(feature.vars, decreasing = T)][num.features] + return(omic[feature.vars >= threshold,]) + } +}