Multi-Omics-Benchmark / Git / Diff of /benchmark.R

Models:
AlyssaS/
Multi-Omics-Benchmark
Downloads: 1
Diff of /benchmark.R [000000] .. [835948]
Switch to side-by-side view

--- a
+++ b/benchmark.R
@@ -0,0 +1,1167 @@
+SUBTYPES.DATA = list(
+  list(name='aml', only.primary=F, is.rna.seq=T, is.mirna.seq=T, display.name='AML'),
+  list(name='breast', only.primary=T, is.rna.seq=T, is.mirna.seq=T, display.name='BIC'),
+  list(name='colon', only.primary=T, is.rna.seq=T, is.mirna.seq=T, display.name='COAD'),
+  list(name='gbm', only.primary=T, is.rna.seq=F, is.mirna.seq=F, display.name='GBM'),
+  list(name='kidney', only.primary=T, is.rna.seq=T, is.mirna.seq=T, display.name='KIRC'),
+  list(name='liver', only.primary=T, is.rna.seq=T, is.mirna.seq=T, display.name='LIHC'),
+  list(name='lung', only.primary=T, is.rna.seq=T, is.mirna.seq=T, display.name='LUSC'),
+  list(name='melanoma', only.primary=F, is.rna.seq=T, is.mirna.seq=T, display.name='SKCM'),
+  list(name='ovarian', only.primary=T, is.rna.seq=T, is.mirna.seq=T, display.name='OV'),
+  list(name='sarcoma', only.primary=T, is.rna.seq=T, is.mirna.seq=T, display.name='SARC'))
+
+MAX.NUM.CLUSTERS = 15
+
+OMIC.SUBSETS = list('multi_omics', 'exp', 'methy', 'mirna')
+names(OMIC.SUBSETS) = c('all', '1', '2', '3')
+
+get.clustering.results.dir.path <- function() {
+  return('RESULTS_DIR_PATH')
+}
+
+get.plots.dir.path <- function() {
+  results.dir.path = get.clustering.results.dir.path()
+  return(file.path(results.dir.path, 'plots'))
+}
+
+get.tables.dir.path <- function() {
+  results.dir.path = get.clustering.results.dir.path()
+  return(file.path(results.dir.path, 'tables'))
+}
+
+subtype.to.display.name <- function(subtype) {
+  for (i in 1:length(SUBTYPES.DATA)) {
+    if (SUBTYPES.DATA[[i]]$name == subtype) {
+      return(SUBTYPES.DATA[[i]]$display.name)
+    }
+  }
+}
+
+set.omics.list.attr <- function(subtype.raw.data, subtype.data) {
+  attr(subtype.raw.data[[1]], 'is.seq') = subtype.data$is.rna.seq
+  attr(subtype.raw.data[[2]], 'is.seq') = F
+  attr(subtype.raw.data[[3]], 'is.seq') = subtype.data$is.mirna.seq
+  return(subtype.raw.data)
+}
+
+ALGORITHM.NAMES = c('kmeans', 'spectral', 'lracluster', 'pins', 'snf', 'mkl', 
+                     'mcca', 'nmf', 'iCluster')
+ALGORITHM.DISPLAY.NAMES = as.list(c('K-means', 'Spectral', 'LRAcluster', 'PINS', 
+                           'SNF', 'rMKL-LPP', 'MCCA', 'MultiNMF', 'iClusterBayes'))
+names(ALGORITHM.DISPLAY.NAMES) = ALGORITHM.NAMES
+			
+print.matrix.latex.format <- function(mat) {
+  print(do.call(paste, as.list(c(colnames(mat), sep=' & '))))
+  for (i in 1:nrow(mat)) {
+    print(do.call(paste, as.list(c(rownames(mat)[i], round(mat[i,], digits=2), sep=' & '))))
+  }
+}
+
+
+perform.all.analyses <- function(benchmark.ret) {
+  par(mar=rep(1, 4))
+  for (i in 1:4) {
+    
+    cur.func = list(benchmark.omics.time, benchmark.omics.num.clusters,
+      benchmark.omics.surv, benchmark.omics.clinical)[[i]]
+    for (omic.subset in names(OMIC.SUBSETS)) {
+      print(paste('current omic subset ', omic.subset))
+      benchmark.data = cur.func(benchmark.ret, omic.subset)
+
+      displayed.benchmark.data = benchmark.data
+      colnames(displayed.benchmark.data)[1:ncol(displayed.benchmark.data)] = 
+        sapply(as.list(colnames(displayed.benchmark.data)[1:ncol(displayed.benchmark.data)]), 
+	subtype.to.display.name)
+      rownames(displayed.benchmark.data) = unlist(ALGORITHM.DISPLAY.NAMES[rownames(displayed.benchmark.data)])
+      print.matrix.latex.format(displayed.benchmark.data)
+      
+      table.name = c('runtime', 'num_cluster', 'survival', 'clinical')[i]
+      write.csv(displayed.benchmark.data, file=file.path(get.tables.dir.path(), paste0(table.name, '_', OMIC.SUBSETS[[omic.subset]], '.csv')))
+    }
+    
+    
+    print('------------------------')
+  }
+  
+  # plots that include all datasets
+  for (i in 1:4) {
+    omic.subset = names(OMIC.SUBSETS)[[i]]
+    benchmark.surv = benchmark.omics.surv(benchmark.ret, omic.subset)
+    benchmark.clinical = benchmark.omics.clinical(benchmark.ret, omic.subset)
+    plot.name = list('multi_omics_surv_clinical.tiff', 'exp_surv_clinical.tiff', 
+      'methy_surv_clinical.tiff', 'mirna_surv_clinical.tiff')[[i]]
+    create.clinical.survival.plots(benchmark.surv, benchmark.clinical, plot.name)
+  }
+  
+  
+  # plots for the mean behaviour
+  create.mean.clinical.survival.plot(benchmark.ret)
+}
+
+get.best.single.omic.mat <- function(single.omic.list1, single.omic.list2) {
+  best.mat1 = matrix(0, ncol=ncol(single.omic.list1[[1]]), 
+                            nrow=nrow(single.omic.list1[[1]]))
+  best.mat2 = matrix(0, ncol=ncol(single.omic.list1[[1]]), 
+                            nrow=nrow(single.omic.list1[[1]]))
+  for (i in 1:nrow(best.mat1)) {
+    for (j in 1:ncol(best.mat1)) {
+      values1 = sapply(single.omic.list1, function(mat) mat[i, j])
+      values2 = sapply(single.omic.list2, function(mat) mat[i, j])
+      if (any(is.na(values1))) {
+        best.mat1[i, j] = NA
+	best.mat2[i, j] = NA
+      } else {
+        best.omic = which.max(values1)
+        best.mat1[i, j] = values1[best.omic]
+	best.mat2[i, j] = values2[best.omic]
+      }
+    }
+  }
+  return(list(best.mat1, best.mat2))
+}
+
+create.mean.clinical.survival.plot <- function(benchmark.ret) {
+  tiff(file.path(get.plots.dir.path(), 'mean_surv_clinical.tiff'), width=4500, height=2250, res=300)
+  layout(matrix(c(1, 1, 2, 2, 3, 3, 4, 4, 4, 5, 5, 5), nrow=2, byrow=T))
+  single.omic.surv.benchmarks = list()
+  single.omic.clin.benchmarks = list()
+  for (i in 2:6) {
+    omic.subset = c(names(OMIC.SUBSETS), 'best_surv', 'best_clin')[i]
+    alg.cols = c("#8DD3C7", "#BEBADA", "#FB8072", "#80B1D3", "#FDB462", "#B3DE69", "#FCCDE5", "#BC80BD", "#842121")
+    pch = c(rep(15, 3), rep(3, 3), rep(19, 3))
+    
+    if (i <= 4) {
+      benchmark.surv = benchmark.omics.surv(benchmark.ret, omic.subset)
+      benchmark.clinical = benchmark.omics.clinical(benchmark.ret, omic.subset)
+    
+      if (i %in% 2:4) {
+        single.omic.surv.benchmarks[[i - 1]] = benchmark.surv
+        single.omic.clin.benchmarks[[i - 1]] = benchmark.clinical
+      }
+    
+    } else if (i == 5) {
+      best.mats = get.best.single.omic.mat(single.omic.surv.benchmarks, single.omic.clin.benchmarks)
+      benchmark.surv = best.mats[[1]]
+      benchmark.clinical = best.mats[[2]]
+
+    } else {
+      best.mats = get.best.single.omic.mat(single.omic.clin.benchmarks, single.omic.surv.benchmarks)
+      benchmark.surv = best.mats[[2]]
+      benchmark.clinical = best.mats[[1]]
+    }
+    
+    surv.sum = rowSums(benchmark.surv, na.rm=T)
+    clin.sum = rowSums(benchmark.clinical, na.rm=T)
+    
+    # for mcca, the sum is for an empty vector, and is equal 0, so we remove this
+    available.indices = surv.sum != 0
+    surv.sum = surv.sum[available.indices]
+    clin.sum = clin.sum[available.indices]
+    alg.cols = alg.cols[available.indices]
+    pch = pch[available.indices]
+    
+    
+    if (omic.subset == '1') {
+      xlab = '-log10(logrank pvalue)'
+      ylab = '# enriched clinical parameters'
+    } else {
+      xlab = ''
+      ylab = ''
+    }
+    
+    if (i %in% c(1, 5, 6)) {
+      y.min = min(clin.sum) - 1
+      x.min = min(surv.sum) - 1
+    } else {
+      y.min = 0
+      x.min = 0
+    }
+    
+    
+    subplot.name = c('Multi-omics', 'Gene Expression', 'DNA Methylation', 'miRNA Expression',
+                     'Best single-omic (survival)', 'Best single-omic (clinical)')[i]
+    plot(surv.sum, clin.sum, main=subplot.name, xlab=xlab, ylab=ylab,
+         xlim=c(x.min, max(surv.sum) + 1), ylim=c(y.min, max(clin.sum + 1)), col=alg.cols, pch=pch, cex.lab=1.8, cex=4, cex.axis=1.5, cex.main=2.5, lwd=4)
+  }
+  dev.off()
+}
+
+create.clinical.survival.plots <- function(benchmark.surv, benchmark.clinical, plot.name) {
+  num.subtypes = ncol(benchmark.surv)
+  tiff(file.path(get.plots.dir.path(), plot.name), width=4500, height=1875, res=300)
+  alg.cols = c("#8DD3C7", "#BEBADA", "#FB8072", "#80B1D3", "#FDB462", "#B3DE69", "#FCCDE5", "#BC80BD", "#842121")
+  pch = c(rep(15, 3), rep(3, 3), rep(19, 3))
+
+  par(mfrow=c(2, num.subtypes / 2), mar=c(4.1, 4, 3.2, 1))
+  for (i in 1:num.subtypes) {
+    subtype = colnames(benchmark.surv)[i]
+    subtype.surv = benchmark.surv[,subtype]
+    subtype.clinical = benchmark.clinical[,subtype]
+    
+    available.indices = !is.na(subtype.surv)
+    subtype.surv = subtype.surv[available.indices]
+    subtype.clinical = subtype.clinical[available.indices]
+    current.cols = alg.cols[available.indices]
+    current.pch = pch[available.indices]
+    
+    surv.significance = -log10(0.05)
+    
+    if (i == 1) {
+      xlab = '-log10(logrank pvalue)'
+      ylab = '# enriched clinical parameters'
+    } else {
+      xlab = ''
+      ylab = ''
+    }
+    
+    plot(subtype.surv, subtype.clinical, main=subtype.to.display.name(subtype), xlab=xlab, ylab=ylab,
+         xlim=c(0, max(subtype.surv, surv.significance) + 0.2), ylim=c(0, max(subtype.clinical + 1)), col=current.cols, pch=current.pch, cex.lab=1.4, cex=2.4, cex.axis=1.5, cex.main=1.8, lwd=3)
+    abline(v=surv.significance, col='red')
+  }
+  
+  dev.off()
+}
+
+plot.legend <- function() {
+  alg.cols = c("#8DD3C7", "#BEBADA", "#FB8072", "#80B1D3", "#FDB462", "#B3DE69", "#FCCDE5", "#BC80BD", "#842121")
+  pch = c(rep(15, 3), rep(3, 3), rep(19, 3))
+  lwds = c(rep(NA, 3), rep(3, 3), rep(NA, 3))
+  pt.cexs = 1.8
+  algorithm.names = ALGORITHM.DISPLAY.NAMES
+  algorithm.names[length(algorithm.names)] = paste0(algorithm.names[length(algorithm.names)], ' ')
+  width=4200
+  
+  if (F) {
+    alg.cols = alg.cols[-7]
+    pch = pch[-7]
+    lwds = lwds[-7]
+    algorithm.names = algorithm.names[-7]
+    pt.cexs = pt.cexs[-7]
+    width=3800
+  }
+  
+  tiff(file.path(get.plots.dir.path(), 'legend.tif'), width=width, res=300)
+  par(font=2, mar=rep(1, 4))
+  plot(0,xaxt='n',yaxt='n',bty='n',pch='',ylab='',xlab='')
+  legend(0.58, 1, legend=algorithm.names, col=alg.cols, pch=pch, horiz=T, pt.cex=pt.cexs, pt.lwd=lwds)
+  dev.off()
+}
+
+analyze.benchmark <- function() {
+  all.clusterings = list()
+  all.timings = list()
+  for (i in 1:length(SUBTYPES.DATA)) {
+    current.subtype.data = SUBTYPES.DATA[[i]]
+    subtype = current.subtype.data$name
+    subtype.raw.data = get.raw.data(subtype, 
+                                    only.primary=current.subtype.data$only.primary)
+				    
+    all.clusterings[[subtype]] = list()
+    all.timings[[subtype]] = list()
+    
+    for (algorithm.name in ALGORITHM.NAMES) {
+      all.clusterings[[subtype]][[algorithm.name]] = list()
+      all.timings[[subtype]][[algorithm.name]] = list()
+      for (j in c('all', '1', '2', '3')) {
+        clustering.path = file.path(get.clustering.results.dir.path(),
+                                  paste(subtype, algorithm.name, j, sep='_'))
+        timing.path = file.path(get.clustering.results.dir.path(),
+                                  paste(subtype, algorithm.name, j, 'timing', sep='_'))
+	load(clustering.path)
+	load(timing.path)
+	if (!any(is.na(clustering))) {
+	  names(clustering) = colnames(subtype.raw.data[[1]])
+	}
+	
+	all.clusterings[[subtype]][[algorithm.name]][[j]] = clustering
+	all.timings[[subtype]][[algorithm.name]][[j]] = timing
+      }
+    }
+  }
+  return(list(all.clusterings=all.clusterings, all.timings=all.timings))
+}
+
+check.empirical.surv <- function(old.pvals, new.pvals) {
+  is.in.conf = matrix(0, ncol=ncol(old.pvals), nrow=nrow(old.pvals))
+  for (i in 1:nrow(old.pvals)) {
+    for (j in 1:ncol(old.pvals)) {
+      old.pval = old.pvals[i, j]
+      if (is.na(old.pval)) {
+        is.in.conf[i, j] = NA
+	next
+      }
+      if (old.pval == 1e-10) old.pval = 1e-5
+      num.runs = floor(30 / old.pval)
+      new.pval = new.pvals[i, j]
+      num.success = round(new.pval * num.runs)
+      print(c(num.success, num.runs))
+      
+      conf.int = binom.test(num.success, num.runs)$conf.int
+      in.conf.int = old.pval >= conf.int[1]  & old.pval <= conf.int[2]
+      is.in.conf[i, j] = in.conf.int
+    }
+  }
+  return(is.in.conf)
+}
+
+get.empirical.surv <- function(clustering, subtype) {
+  set.seed(42)
+  surv.ret = check.survival(clustering, subtype)
+  orig.chisq = surv.ret$chisq
+  orig.pvalue = get.logrank.pvalue(surv.ret)
+  # The initial number of permutations to run
+  num.perms = round(min(max(10 / orig.pvalue, 1000), 1e6))
+  should.continue = T
+  
+  total.num.perms = 0
+  total.num.extreme.chisq = 0
+  
+  while (should.continue) {
+    print('Another iteration in empirical survival calculation')
+    print(num.perms)
+    perm.chisq = as.numeric(mclapply(1:num.perms, function(i) {
+      cur.clustering = sample(clustering)
+      names(cur.clustering) = names(clustering)
+      cur.chisq = check.survival(cur.clustering, subtype)$chisq
+      return(cur.chisq)
+    }, mc.cores=50))
+    
+    total.num.perms = total.num.perms + num.perms
+    total.num.extreme.chisq = total.num.extreme.chisq + sum(perm.chisq >= orig.chisq)
+    
+    binom.ret = binom.test(total.num.extreme.chisq, total.num.perms)
+    cur.pvalue = binom.ret$estimate
+    cur.conf.int = binom.ret$conf.int
+    
+    print(c(total.num.extreme.chisq, total.num.perms))
+    print(cur.pvalue)
+    print(cur.conf.int)
+    
+    sig.threshold = 0.05
+    is.conf.small = ((cur.conf.int[2] - cur.pvalue) < min(cur.pvalue / 10, 0.01)) & ((cur.pvalue - cur.conf.int[1]) < min(cur.pvalue / 10, 0.01))
+    is.threshold.in.conf = cur.conf.int[1] < sig.threshold & cur.conf.int[2] > sig.threshold
+    if ((is.conf.small & !is.threshold.in.conf) | (total.num.perms > 2e7)) {
+    #if (is.conf.small) {
+      should.continue = F
+    } else {
+      num.perms = 1e5
+    }
+  }
+  
+  return(list(pvalue = cur.pvalue, conf.int = cur.conf.int, total.num.perms=total.num.perms, 
+              total.num.extreme.chisq=total.num.extreme.chisq))
+}
+
+benchmark.omics.surv <- function(benchmark.results, omics='all') {
+
+  
+  all.surv.pvalues = matrix(1, ncol=length(SUBTYPES.DATA), nrow=length(ALGORITHM.NAMES))
+  rownames(all.surv.pvalues) = ALGORITHM.NAMES
+  colnames(all.surv.pvalues) = sapply(SUBTYPES.DATA, function(x) x$name)
+  all.clusterings = benchmark.results$all.clusterings
+  for (i in 1:length(all.clusterings)) {
+    subtype = colnames(all.surv.pvalues)[i]
+    subtype.clusterings = all.clusterings[[subtype]]
+    for (j in 1:length(subtype.clusterings)) {
+      surv.path = file.path(get.clustering.results.dir.path(),
+                                  paste(subtype, ALGORITHM.NAMES[j], omics, 'surv', sep='_'))
+      if (file.exists(surv.path)) {
+        load(surv.path)
+	pvalue = empirical.surv.ret$pvalue
+	
+      } else {
+        clustering = subtype.clusterings[[ALGORITHM.NAMES[j]]][[omics]]
+        if (length(table(clustering)) > 1) {
+          #pvalue = -log10(get.logrank.pvalue(check.survival(clustering, subtype)))
+	  empirical.surv.ret = get.empirical.surv(clustering, subtype)
+	  save(empirical.surv.ret, file=surv.path)
+	  pvalue = empirical.surv.ret$pvalue
+	  
+
+        } else {
+          pvalue = NA
+        }  
+      }
+      all.surv.pvalues[j, i] = -log10(pvalue)
+    }
+  }
+
+  return(all.surv.pvalues)
+}
+
+benchmark.omics.num.clusters <- function(benchmark.results, omics='all') {
+  num.clusters = matrix(1, ncol=length(SUBTYPES.DATA), nrow=length(ALGORITHM.NAMES))
+  rownames(num.clusters) = ALGORITHM.NAMES
+  colnames(num.clusters) = sapply(SUBTYPES.DATA, function(x) x$name)
+  all.clusterings = benchmark.results$all.clusterings
+  for (i in 1:length(all.clusterings)) {
+    subtype = colnames(num.clusters)[i]
+    subtype.clusterings = all.clusterings[[subtype]]
+    for (j in 1:length(subtype.clusterings)) {
+      clustering = subtype.clusterings[[ALGORITHM.NAMES[j]]][[omics]]
+      num.clusters[j, i] = max(clustering)
+    }
+  }
+  return(num.clusters)
+}
+
+benchmark.omics.clinical <- function(benchmark.results, omics='all') {
+				  
+  num.clinical.enrich = matrix(1, ncol=length(SUBTYPES.DATA), nrow=length(ALGORITHM.NAMES))
+  rownames(num.clinical.enrich) = ALGORITHM.NAMES
+  colnames(num.clinical.enrich) = sapply(SUBTYPES.DATA, function(x) x$name)
+  total.num.tested.parameters = 0
+  all.clusterings = benchmark.results$all.clusterings
+  for (i in 1:length(all.clusterings)) {
+    subtype = colnames(num.clinical.enrich)[i]
+    subtype.clusterings = all.clusterings[[subtype]]
+    for (j in 1:length(subtype.clusterings)) {
+      print('checking clinical enrichment')
+      print(c(i, j))
+      
+      
+      clustering = subtype.clusterings[[ALGORITHM.NAMES[j]]][[omics]]
+      if (any(is.na(clustering))) {
+        num.clinical.enrich[j, i] = NA
+      } else {
+        clin.path = file.path(get.clustering.results.dir.path(),
+                                  paste(subtype, ALGORITHM.NAMES[j], omics, 'clin', sep='_'))
+				  
+        if (file.exists(clin.path)) {
+	  load(clin.path)
+	} else {
+	  enrichment.pvalues = check.clinical.enrichment(clustering, subtype)
+	  save(enrichment.pvalues, file=clin.path)
+	}
+        
+	if (j == 1) {
+	  total.num.tested.parameters = total.num.tested.parameters + length(enrichment.pvalues)
+	}
+        num.clinical.enrich[j, i] = sum(enrichment.pvalues * length(enrichment.pvalues) < 0.05)
+      }
+    }
+  }
+  print(paste0('Total number of parameters tested:', total.num.tested.parameters))
+  return(num.clinical.enrich)
+}
+
+benchmark.omics.time <- function(benchmark.results, omics='all') {
+  all.alg.times = matrix(1, ncol=length(SUBTYPES.DATA), nrow=length(ALGORITHM.NAMES))
+  rownames(all.alg.times) = ALGORITHM.NAMES
+  colnames(all.alg.times) = sapply(SUBTYPES.DATA, function(x) x$name)
+  all.timings = benchmark.results$all.timings
+  for (i in 1:length(all.timings)) {
+    subtype = colnames(all.alg.times)[i]
+    subtype.timings = all.timings[[subtype]]
+    for (j in 1:length(subtype.timings)) {
+      timing = subtype.timings[[ALGORITHM.NAMES[j]]][[omics]]
+      all.alg.times[j, i] = timing
+    }
+  }
+  return(all.alg.times)
+}
+
+benchmark.alg.agreement <- function(benchmark.results, omics='all') {
+  all.clusterings = benchmark.results$all.clusterings
+  rand.matrix = matrix(NA, ncol=length(ALGORITHM.NAMES), nrow=length(ALGORITHM.NAMES))
+  colnames(rand.matrix) = ALGORITHM.NAMES
+  rownames(rand.matrix) = ALGORITHM.NAMES
+  for (alg1.index in 1:length(ALGORITHM.NAMES)) {
+    alg1 = ALGORITHM.NAMES[alg1.index]
+    for (alg2.index in 1:length(ALGORITHM.NAMES)) {
+      alg2 = ALGORITHM.NAMES[alg2.index]
+      rands = c()
+      for (i in 1:length(SUBTYPES.DATA)) {
+        clustering1 = all.clusterings[[i]][[alg1]][[omics]]
+	clustering2 = all.clusterings[[i]][[alg2]][[omics]]
+	rands = c(rands, adj.rand.index(clustering1, clustering2))
+      }
+      mean.rand = mean(rands)
+      rand.matrix[alg1.index, alg2.index] = mean.rand
+    }
+  }
+  return(rand.matrix)
+}
+
+get.logrank.pvalue <- function(survdiff.res) {
+  1 - pchisq(survdiff.res$chisq, length(survdiff.res$n) - 1)  
+}
+
+run.benchmark <- function() {
+  for (i in 1:length(SUBTYPES.DATA)) {
+    current.subtype.data = SUBTYPES.DATA[[i]]
+    subtype = current.subtype.data$name
+    subtype.raw.data = get.raw.data(subtype, 
+                                    only.primary=current.subtype.data$only.primary)
+    
+    subtype.raw.data = set.omics.list.attr(subtype.raw.data, 
+                                           current.subtype.data)
+    
+    for (algorithm.name in ALGORITHM.NAMES) {
+      for (j in c('all', '1', '2', '3')) {
+        set.seed(42)
+        print(paste('subtype', subtype, 'running algorithm', algorithm.name, j))
+        clustering.path = file.path(get.clustering.results.dir.path(),
+                                  paste(subtype, algorithm.name, j, sep='_'))
+        timing.path = file.path(get.clustering.results.dir.path(),
+                                  paste(subtype, algorithm.name, j, 'timing', sep='_'))
+	
+	
+        if (!file.exists(clustering.path)) {
+          algorithm.func.name = paste0('run.', algorithm.name)
+          algorithm.func = get(algorithm.func.name)
+	  if (j == 'all') {
+	    cur.iteration.data = subtype.raw.data
+	  } else {
+	    cur.iteration.data = subtype.raw.data[as.numeric(j)]
+	  }
+          algorithm.ret = algorithm.func(cur.iteration.data, current.subtype.data)
+          clustering = algorithm.ret$clustering
+          timing = algorithm.ret$timing
+	  print('before saving')
+          save(clustering, file = clustering.path)
+          save(timing, file = timing.path)
+        }
+      }
+    }
+  }
+}
+
+get.dataset.dir.path <- function() {
+  return('DATASETS_PATH')
+}
+
+log.and.normalize <- function(omics.data, subtype.data, normalize=T,
+                              filter.var=F) {
+  # filter features with no variance at all
+  for (i in 1:length(omics.data)) {
+    omics.data[[i]] = omics.data[[i]][apply(omics.data[[i]], 1, var) > 0,]
+  }
+			      
+  for (i in 1:length(omics.data)) {
+    if (attr(omics.data[[i]], 'is.seq')) {
+      omics.data[[i]] = log(1+omics.data[[i]])
+    }
+  }
+  
+  if (filter.var) {
+    omics.data = lapply(omics.data, keep.high.var.features)
+  }
+  
+  if (normalize) {
+    omics.data = lapply(omics.data, normalize.matrix)    
+  }
+  
+  return(omics.data)
+}
+
+normalize.matrix <- function(data.matrix) {
+  temp = data.matrix - rowMeans(data.matrix)
+  should.keep = (apply(temp, 1, sd) != 0)
+  return ((temp / apply(temp, 1, sd))[should.keep, ])
+}
+
+filter.non.tumor.samples <- function(raw.datum, only.primary=only.primary) {
+  # 01 is primary, 06 is metastatic, 03 is blood derived cancer
+  if (!only.primary)
+    return(raw.datum[,substring(colnames(raw.datum), 14, 15) %in% c('01', '03', '06')])
+  else
+    return(raw.datum[,substring(colnames(raw.datum), 14, 15) %in% c('01')])
+}
+
+get.fixed.names <- function(patient.names, include.type=F) {
+  # fix the TCGA names to only include the patient ids
+  if (include.type) {
+    return(gsub('-', '\\.', toupper(substring(patient.names, 1, 15))))
+  } else {
+    return(gsub('-', '\\.', toupper(substring(patient.names, 1, 12))))  
+  }
+}
+
+fix.patient.names <- function(subtype.raw.data, include.type=F) {
+  for (i in 1:length(subtype.raw.data)) {
+    colnames(subtype.raw.data[[i]]) = get.fixed.names(colnames(subtype.raw.data[[i]]),
+                                                      include.type)
+  }
+  return(subtype.raw.data)
+}
+
+get.raw.data <- function(subtype.name,
+                         datasets.path = get.dataset.dir.path(),
+                         only.primary=NA) {
+  omics.dir = file.path(datasets.path, subtype.name)
+  omics.files = list.files(omics.dir)
+  omics.files = setdiff(omics.files, c('survival'))  
+  raw.data = lapply(file.path(omics.dir, omics.files), read.table)
+  
+  if (!is.na(only.primary)) {
+    raw.data = lapply(raw.data, function(x) filter.non.tumor.samples(x, only.primary = only.primary))
+  }
+  name.corrected.data = fix.patient.names(raw.data)
+  patients.intersection = Reduce(intersect, lapply(name.corrected.data, colnames))
+  ret.data = lapply(name.corrected.data, function(datum) datum[,patients.intersection])  
+  return(ret.data)
+}
+
+get.elbow <- function(values, is.max) {
+  second.derivatives = c()
+  for (i in 2:(length(values) - 1)) {
+    second.derivative = values[i + 1] + values[i - 1] - 2 * values[i]
+    second.derivatives = c(second.derivatives, second.derivative)
+  }
+  print(second.derivatives)
+  if (is.max) {
+    return(which.max(second.derivatives) + 1)
+  } else {
+    return(which.min(second.derivatives) + 1)
+  }
+}
+
+# Does not support a single omic dataset
+run.mcca <- function(omics.list, subtype.data) {
+  if (length(omics.list) == 1) {
+    return(list(clustering=rep(NA, ncol(omics.list[[1]])), timing=1))
+  }
+  start = Sys.time()
+  omics.list = log.and.normalize(omics.list, subtype.data, 
+                                 normalize = T,
+                                 filter.var = T)
+  
+  subtype = subtype.data$name
+  omics.transposed = lapply(omics.list, t)
+  cca.ret = PMA::MultiCCA(omics.transposed, 
+                          ncomponents = MAX.NUM.CLUSTERS)
+  sample.rep = omics.transposed[[1]] %*% cca.ret$ws[[1]]
+  
+  explained.vars = sapply(1:MAX.NUM.CLUSTERS, 
+              function(i) sum(unlist(apply(sample.rep[1:i,,drop=F], 2, var))))
+  
+  dimension = get.elbow(explained.vars, is.max=F)
+  print(dimension)
+  sample.rep = sample.rep[,1:dimension]
+  sils = c()
+  clustering.per.num.clusters = list()
+  for (num.clusters in 2:MAX.NUM.CLUSTERS) {
+    cur.clustering = kmeans(sample.rep, num.clusters, iter.max=100, nstart=30)$cluster  
+    sil = get.clustering.silhouette(list(t(sample.rep)), cur.clustering)
+    sils = c(sils, sil)
+    clustering.per.num.clusters[[num.clusters - 1]] = cur.clustering
+  }
+  # NOTE: the next line contains an error. We mistakenly selected the minimal rather maximal silhouette.
+  # See more details in: http://acgt.cs.tau.ac.il/multi_omic_benchmark/download.html.
+  cca.clustering = clustering.per.num.clusters[[which.min(sils)]]
+  time.taken = as.numeric(Sys.time() - start, units='secs')
+  return(list(clustering=cca.clustering, timing=time.taken))
+}
+
+run.snf <- function(omics.list, subtype.data) {
+  start = Sys.time()
+  omics.list = log.and.normalize(omics.list, subtype.data)
+  subtype = subtype.data$name
+  alpha=0.5
+  T.val=30
+  num.neighbors = round(ncol(omics.list[[1]]) / 10)
+  similarity.data = lapply(omics.list, function(x) {affinityMatrix(dist2(as.matrix(t(x)),as.matrix(t(x))), 
+                                                                   num.neighbors, alpha)})
+  if (length(similarity.data) == 1) {
+    W = similarity.data[[1]]
+  } else {
+    W = SNF(similarity.data, num.neighbors, T.val)  
+  }
+  
+  num.clusters = estimateNumberOfClustersGivenGraph(W, 2:MAX.NUM.CLUSTERS)[[3]]  
+  clustering = spectralClustering(W, num.clusters)
+  time.taken = as.numeric(Sys.time() - start, units='secs')
+  return(list(clustering=clustering, timing=time.taken))
+}
+
+run.iCluster <- function(omics.list, subtype.data) {
+  omics.list = log.and.normalize(omics.list, subtype.data, normalize = F)
+
+  start = Sys.time()
+  subtype = subtype.data$name
+  dev.ratios = c()
+  icluster.rets = list()
+
+  if (length(omics.list) == 1) {
+    icluster.ret = iClusterPlus::tune.iClusterBayes(cpus=(MAX.NUM.CLUSTERS - 1), t(omics.list[[1]]), 
+                              K=1:(MAX.NUM.CLUSTERS - 1), type=c('gaussian'))$fit
+  } else {
+    icluster.ret = iClusterPlus::tune.iClusterBayes(cpus=(MAX.NUM.CLUSTERS - 1), t(omics.list[[1]]), 
+                              t(omics.list[[2]]), 
+                              t(omics.list[[3]]), 
+                              K=1:(MAX.NUM.CLUSTERS - 1), type=rep('gaussian', 3))$fit
+  }
+  dev.ratios = lapply(1:(MAX.NUM.CLUSTERS - 1), function(i) icluster.ret[[i]]$dev.ratio)
+
+  print('dev.ratios are:')
+  print(dev.ratios)
+  
+  optimal.solution = icluster.ret[[which.max(dev.ratios)]]
+  time.taken = as.numeric(Sys.time() - start, units='secs')
+  return(list(clustering=optimal.solution$clusters, 
+              timing=time.taken))
+}
+
+get.mkl.binary.path = function() {
+  return('MKL_BINARY_PATH')
+}
+
+get.mkl.arguments.path = function() {
+  return('MKL_ARGS_PATH')
+}
+
+
+run.mkl <- function(omics.list, subtype.data) {
+  start = Sys.time()
+  omics.list = log.and.normalize(omics.list, subtype.data)
+  subtype = subtype.data$name
+  omics.list = lapply(omics.list, normalize.matrix)
+  time.taken = as.numeric(Sys.time() - start, units='secs')
+  export.subtype.to.mkl(omics.list, subtype)
+  
+  start = Sys.time()
+  bin.path = get.mkl.binary.path()
+  subtype.dir = paste0(get.mkl.arguments.path(), subtype, '\\')
+  paste0(subtype.dir, 'kernels')
+  command = paste(bin.path, paste0(subtype.dir, 'kernels'),
+                  paste0(subtype.dir, 'ids'),
+                  paste0(subtype.dir, 'output'), 
+                  '9', '5')
+  command.return = system(command)
+  stopifnot(command.return == 0)
+  time.taken2 = as.numeric(Sys.time() - start, units='secs')
+  clustering = get.mkl.clustering(subtype)
+  return(list(clustering=clustering, 
+              timing=time.taken + time.taken2))
+}
+
+run.nmf <- function(omics.list, subtype.data) {
+  total.time.taken = 0
+  start = Sys.time()
+  omics.list = log.and.normalize(omics.list, subtype.data,
+                                 filter.var = T, normalize = F)
+  time.taken = as.numeric(Sys.time() - start, units='secs')
+  total.time.taken = total.time.taken + time.taken
+  
+  save.subtype.matlab.format(omics.list)
+  subtype = subtype.data$name
+  if (length(omics.list) > 1) {
+    command.ret = system('MULTI_NMF_COMMAND')
+    stopifnot(command.ret == 0)
+    nmf.timing = read.csv('SOME_TEMP_PATH_TIMING', header=F)[1, 1]
+    total.time.taken = total.time.taken + nmf.timing
+  } else {
+   for (k in 1:MAX.NUM.CLUSTERS) {
+     start = Sys.time()
+     file.name = paste0('SOME_TEMP_PATH/', k, '_consensus')
+     nmf.ret = nmf(omics.list[[1]], k, method='lee')
+     coef.mat = t(coef(nmf.ret))
+     time.taken = as.numeric(Sys.time() - start, units='secs')
+     total.time.taken = total.time.taken + time.taken
+     write.table(coef.mat, file=file.name, quote=F, row.names=F, col.names=F, sep=',')
+   }
+  }
+  
+  explained.vars = c()
+  clustering.per.num.clusters = list()
+  for (k in 1:MAX.NUM.CLUSTERS) {
+    file.name = paste0('SOME_TEMP_PATH/', k, '_consensus')
+    consensus.mat = read.csv(file.name, header=F)
+    
+    start = Sys.time()
+    cur.clustering = apply(consensus.mat, 1, which.max)
+    explained.var = sum(unlist(apply(consensus.mat, 2, var)))
+    explained.vars = c(explained.vars, explained.var)
+    clustering.per.num.clusters[[k]] = cur.clustering
+    time.taken = as.numeric(Sys.time() - start, units='secs')
+    total.time.taken = total.time.taken + time.taken
+  }
+  
+  dimension = get.elbow(explained.vars, is.max=F)
+  nmf.clustering = clustering.per.num.clusters[[dimension]]
+  return(list(clustering=nmf.clustering, timing=total.time.taken))  
+}
+
+run.pins <- function(omics.list, subtype.data) {
+  start = Sys.time()
+  omics.list = log.and.normalize(omics.list, subtype.data, normalize = F)
+  subtype = subtype.data$name
+  omics.transposed = lapply(omics.list, t)
+  if (length(omics.list) == 1) {
+    pins.ret = PINSPlus::PerturbationClustering(data=omics.transposed[[1]],
+                                            kMax = MAX.NUM.CLUSTERS)
+    clustering = pins.ret$cluster
+    
+  } else {
+    pins.ret = PINSPlus::SubtypingOmicsData(dataList=omics.transposed,
+                                            kMax = MAX.NUM.CLUSTERS)
+    clustering = pins.ret$cluster2
+  }
+  time.taken = as.numeric(Sys.time() - start, units='secs')
+  return(list(clustering=clustering, timing=time.taken))
+}
+
+run.lracluster <- function(omics.list, subtype.data) {
+  omics.list = log.and.normalize(omics.list, subtype.data, normalize = F)
+  
+  subtype = subtype.data$name
+  start = Sys.time()
+  
+  dim.range = 1:MAX.NUM.CLUSTERS
+  all.clustering.results = list()
+  
+  omics.matrix.list = lapply(omics.list, as.matrix)
+  for (dimension in dim.range) {
+    print(paste('running lra cluster for dimension', dimension))
+    data.names = c('gene expression', 'methylation', 'miRNA expression')
+    clustering.results = LRAcluster(omics.matrix.list, 
+                                    rep('gaussian', length(omics.list)), 
+                                    dimension=dimension, data.names)
+    all.clustering.results[[dimension]] = clustering.results
+  }
+  explained.var = sapply(all.clustering.results, function(x) x$potential)
+  print(explained.var)
+  dimension = get.elbow(explained.var, is.max=F)
+  print(dimension)
+  solution = all.clustering.results[[dimension]]$coordinate
+  
+  sils = c()
+  clustering.per.num.clusters = list()
+  for (num.clusters in 2:MAX.NUM.CLUSTERS) {
+    print(paste('running kmeans in lra cluster for num clusters', num.clusters))
+    cur.clustering = kmeans(t(solution), num.clusters, iter.max=100, nstart=60)$cluster
+    sil = get.clustering.silhouette(list(solution), cur.clustering)
+    sils = c(sils, sil)
+    clustering.per.num.clusters[[num.clusters - 1]] = cur.clustering
+  }
+  print(sils)
+  # NOTE: the next line contains an error. We mistakenly selected the minimal rather maximal silhouette.
+  # See more details in: http://acgt.cs.tau.ac.il/multi_omic_benchmark/download.html.
+  chosen.clustering = clustering.per.num.clusters[[which.min(sils)]]
+  time.taken = as.numeric(Sys.time() - start, units='secs')
+  return(list(clustering=chosen.clustering, timing=time.taken))
+}
+
+run.kmeans <- function(omics.list, subtype.data) {
+  start = Sys.time()
+  omics.list = log.and.normalize(omics.list, subtype.data, 
+                                 filter.var = T)
+  
+  subtype = subtype.data$name
+  all.withinss = c()
+  all.clusterings = list()
+  k.range = 1:MAX.NUM.CLUSTERS
+  for (k in k.range) {
+    concat.omics = do.call(rbind, omics.list)
+    kmeans.ret = kmeans(t(concat.omics), k, iter.max=100, nstart=60)
+    all.withinss = c(all.withinss, kmeans.ret$tot.withinss)
+    all.clusterings[[k]] = kmeans.ret$cluster
+  }
+  
+  best.k = get.elbow(all.withinss, is.max=T)
+  time.taken = as.numeric(Sys.time() - start, units='secs')
+  return(list(clustering=all.clusterings[[best.k]], 
+              timing=time.taken))
+}
+
+run.spectral <- function(omics.list, subtype.data) {
+  start = Sys.time()
+  omics.list = log.and.normalize(omics.list, subtype.data, 
+                                 filter.var = T)
+  subtype = subtype.data$name
+  concat.omics = do.call(rbind, omics.list)
+  
+  similarity.data = affinityMatrix(dist2(as.matrix(t(concat.omics)),
+                                         as.matrix(t(concat.omics))), 
+                                   20, 0.5)
+  num.clusters = estimateNumberOfClustersGivenGraph(similarity.data, 
+                                      2:MAX.NUM.CLUSTERS)[[3]]  
+  clustering = spectralClustering(similarity.data, num.clusters)
+  time.taken = as.numeric(Sys.time() - start, units='secs')
+  return(list(clustering=clustering, timing=time.taken))
+}
+
+load.libraries <- function() {
+  library('PMA')
+  library('R.matlab')
+  library('SNFtool')
+  library('PINSPlus')
+  #library('LRAcluster')
+  library('kernlab')
+  library('survival')
+  library('NMF')
+  
+  # bioconductor packages
+  source("https://bioconductor.org/biocLite.R")
+  biocLite("impute")
+  #biocLite("iClusterPlus")
+}
+
+
+
+########################################
+###############   MKL    ###############
+########################################
+
+radial.basis <- function(mat, gamma) {
+  if (missing(gamma)) {
+    gamma = 1 / (2*nrow(mat)**2)
+  }
+  npatients = ncol(mat)
+  output.mat = matrix(0, ncol=npatients, nrow=npatients)
+  for (i in 1:npatients) {
+    for (j in 1:npatients) {
+      output.mat[i, j] = exp(-norm(as.matrix(mat[,i] - mat[,j]), type = 'F')**2 * gamma)
+    }
+  }
+  
+  D = apply(output.mat, 2, sum) / npatients
+  E = sum(D) / npatients
+  J = matrix(1, nrow=npatients, ncol=1) %*% D
+  ret = output.mat - J - t(J) + E * matrix(1, ncol=npatients, nrow=npatients)
+  ret = diag(1/sqrt(diag(ret))) %*% ret %*% diag(1/sqrt(diag(ret)))
+  return(ret)
+}
+
+clear.dir <- function(dir.path) {
+  files.in.dir = list.files(dir.path)
+  for (file.in.dir in files.in.dir) {
+    full.file.path = file.path(dir.path, file.in.dir)
+    file.remove(full.file.path)
+  }
+}
+
+export.subtype.to.mkl <- function(omics.list, dir.name) {
+  
+  folder.path = file.path(get.mkl.arguments.path(), dir.name)
+  if (!dir.exists(folder.path)) {
+    dir.create(folder.path)
+  }
+  
+  kernels.path = file.path(folder.path, 'kernels')
+  
+  if (!dir.exists(kernels.path)) {
+    dir.create(kernels.path)
+  }
+  clear.dir(kernels.path)
+  
+  gammas = 10 ** seq(-6, 6, by=3)
+  for (i in 1:length(omics.list)) {
+    for (j in 1:length(gammas)) {
+      datum = omics.list[[i]]
+      gamma = gammas[[j]] / (2*nrow(datum)**2)
+      mat = radial.basis(datum, gamma)
+      R.matlab::writeMat(file.path(kernels.path, paste(i, '_', j, sep='')), mat=mat)
+    }
+  }
+  
+  output.path = file.path(folder.path, 'output')
+  if (!dir.exists(output.path)) {
+    dir.create(output.path)
+  }
+  clear.dir(output.path)
+  
+  write.table(colnames(omics.list[[1]]), file=file.path(folder.path, 'ids'),
+              quote=F, row.names = F, col.names = F)
+  
+}
+
+get.mkl.clustering <- function(dir.name) {
+  folder.path = file.path(get.mkl.arguments.path(), dir.name)
+  output.path = file.path(folder.path, 'output')
+  output.files = list.files(output.path)
+  clustering = read.csv(file.path(output.path, output.files[grep('clusters', output.files)]))[,2]
+  return(clustering)
+}
+
+check.survival <- function(groups, subtype, survival.file.path) {
+  if (missing(survival.file.path)) {
+    survival.file.path = get.subtype.survival.path(subtype)
+  }
+  survival.data = read.table(survival.file.path, header = TRUE)
+  patient.names = names(groups)
+  patient.names.in.file = as.character(survival.data[, 1])
+  patient.names.in.file = toupper(gsub('-', '\\.', substring(patient.names.in.file, 1, 12)))
+  
+  stopifnot(all(patient.names %in% patient.names.in.file))
+  
+  indices = match(patient.names, patient.names.in.file)
+  ordered.survival.data = survival.data[indices,]
+  ordered.survival.data["cluster"] <- groups
+  ordered.survival.data$Survival[is.na(ordered.survival.data$Survival)] = 0
+  ordered.survival.data$Death[is.na(ordered.survival.data$Death)] = 0
+  return(survdiff(Surv(Survival, Death) ~ cluster, data=ordered.survival.data))
+  
+}
+
+get.subtype.survival.path <- function(subtype) {
+  datasets.path = get.dataset.dir.path()
+  survival.file.path = file.path(datasets.path, subtype, 'survival')
+  return(survival.file.path)
+}
+
+get.clustering.silhouette <- function(raw.data, clustering) {
+  sils = c()
+  for (i in 1:length(raw.data)) {
+    x = raw.data[[i]]
+    distmatrix = dist2(as.matrix(t(x)),as.matrix(t(x)))
+    sil = silhouette(clustering, dmatrix = distmatrix)[,3]
+    sils = c(sils, mean(sil))
+  }
+  return(mean(sils))
+}
+
+get.clinical.params.dir <- function() {
+  return('CLINICAL_PARAMS_PATH')
+}
+
+get.clinical.params <- function(subtype.name) {
+  clinical.data.path = paste(get.clinical.params.dir(), subtype.name, sep = '')
+  clinical.params = read.table(clinical.data.path,
+                               sep='\t', header=T, row.names = 1, stringsAsFactors = F)
+  rownames.with.duplicates = get.fixed.names(rownames(clinical.params))  
+  clinical.params = clinical.params[!duplicated(rownames.with.duplicates),]
+  rownames(clinical.params) = rownames.with.duplicates[!duplicated(rownames.with.duplicates)]
+  return(clinical.params)
+}
+
+check.clinical.enrichment <- function(clustering, subtype.name) {
+  clinical.params = get.clinical.params(subtype.name)  
+  
+  clinical.metadata = list(gender='DISCRETE', age_at_initial_pathologic_diagnosis='NUMERIC',
+    pathologic_M='DISCRETE', pathologic_N='DISCRETE', pathologic_T='DISCRETE', pathologic_stage='DISCRETE')
+  
+  pvalues = c()
+  
+  params.being.tested = c()
+  
+  for (clinical.param in names(clinical.metadata)) {
+    
+    if (!(clinical.param %in% colnames(clinical.params))) {
+      #print(paste0('WARNING: ', clinical.param, ' does not appear for subtype ', subtype.name))
+      next
+    }
+    
+    clinical.values = clinical.params[names(clustering),clinical.param]
+    is.discrete.param = clinical.metadata[clinical.param] == 'DISCRETE'
+    is.numeric.param = clinical.metadata[clinical.param] == 'NUMERIC'
+    stopifnot(is.discrete.param | is.numeric.param)
+    
+    # skip parameter if many missing values
+    
+    if (is.numeric.param) {
+      numeric.entries = !is.na(as.numeric(clinical.values))
+      if (2 * sum(numeric.entries) < length(clinical.values)) {
+        #print(paste0('WARNING: skipping on ', clinical.param, ' for subtype ', subtype.name))
+        next
+      }
+    } else {
+      not.na.entries = !is.na(clinical.values)
+      should.skip = F
+      if (2 * sum(not.na.entries) < length(clinical.values)) {
+        should.skip = T
+      } else if (length(table(clinical.values[not.na.entries])) == 1) {
+        should.skip = T
+      }
+      if (should.skip) {
+        #print(paste0('WARNING: skipping on ', clinical.param, ' for subtype ', subtype.name))
+        next
+      }
+    }
+    
+    params.being.tested = c(params.being.tested, clinical.param)
+    
+    if (is.discrete.param) {
+      #clustering.with.clinical = cbind(clustering, clinical.values)
+      #tbl = table(as.data.frame(clustering.with.clinical[!is.na(clinical.values),]))
+      #test.res = chisq.test(tbl)
+      #pvalue = test.res$p.value
+      pvalue = get.empirical.clinical(clustering[!is.na(clinical.values)], clinical.values[!is.na(clinical.values)], T)
+      
+    } else if (is.numeric.param) {
+      #test.res = kruskal.test(as.numeric(clinical.values[numeric.entries]),
+      #				clustering[numeric.entries])
+      #pvalue = test.res$p.value
+      pvalue = get.empirical.clinical(clustering[numeric.entries], as.numeric(clinical.values[numeric.entries]), F)
+    }
+    
+    pvalues = c(pvalues, pvalue)
+    
+  }
+  names(pvalues) = params.being.tested
+  return(pvalues)
+}
+
+get.empirical.clinical <- function(clustering, clinical.values, is.chisq) {
+  set.seed(42)
+  if (is.chisq) {
+      clustering.with.clinical = cbind(clustering, clinical.values)
+      tbl = table(as.data.frame(clustering.with.clinical))
+      test.res = chisq.test(tbl)
+  } else {
+    test.res = kruskal.test(as.numeric(clinical.values), clustering)
+  }
+  orig.pvalue = test.res$p.value
+  num.iter = 1000
+  total.num.iters = 0
+  total.num.extreme = 0
+  should.continue = T
+  
+  while (should.continue) {
+    print('another iteration in empirical clinical')
+    perm.pvalues = as.numeric(mclapply(1:num.iter, function(i) {
+      cur.clustering = sample(clustering)
+      names(cur.clustering) = names(clustering)
+    
+      if (is.chisq) {
+        clustering.with.clinical = cbind(cur.clustering, clinical.values)
+        tbl = table(as.data.frame(clustering.with.clinical))
+        test.res = chisq.test(tbl)
+      } else {
+        test.res = kruskal.test(as.numeric(clinical.values), cur.clustering)
+      }
+      cur.pvalue = test.res$p.value
+      return(cur.pvalue)
+    }, mc.cores=50))
+    total.num.iters = total.num.iters + num.iter
+    total.num.extreme = total.num.extreme + sum(perm.pvalues <= orig.pvalue)
+    
+    binom.ret = binom.test(total.num.extreme, total.num.iters)
+    cur.pvalue = binom.ret$estimate
+    cur.conf.int = binom.ret$conf.int
+    
+    sig.threshold = 0.05
+    is.threshold.in.conf = cur.conf.int[1] < sig.threshold & cur.conf.int[2] > sig.threshold
+    if (!is.threshold.in.conf | total.num.iters > 1e5) {
+      should.continue = F
+    }
+  }
+  return(cur.pvalue)
+}
+
+get.nmf.datasets.dir <- function() {
+  return('NMF_DATASETS_PATH')
+}
+
+save.subtype.matlab.format <- function(subtype.raw.data) {
+  data.names = c('1', '2', '3')
+  
+  full.dir.name = get.nmf.datasets.dir()
+  dir.create(full.dir.name, showWarnings = F)
+  
+  for (i in 1:length(subtype.raw.data)) {
+    full.file.name = file.path(full.dir.name, data.names[i])
+    write.table(subtype.raw.data[[i]], full.file.name)
+  }
+}
+
+keep.high.var.features <- function(omic, num.features=2000) {
+  if (nrow(omic) < num.features) {
+    return(omic)
+  } else {
+    feature.vars = apply(omic, 1, var)
+    threshold = feature.vars[order(feature.vars, decreasing = T)][num.features]
+    return(omic[feature.vars >= threshold,])    
+  }
+}