Multi-Omics-Benchmark / Git / [835948] /benchmark.R

Models:
AlyssaS/
Multi-Omics-Benchmark
Downloads: 1
[835948]: / benchmark.R
History
Download this file
1168 lines (994 with data), 42.7 kB

SUBTYPES.DATA = list(
  list(name='aml', only.primary=F, is.rna.seq=T, is.mirna.seq=T, display.name='AML'),
  list(name='breast', only.primary=T, is.rna.seq=T, is.mirna.seq=T, display.name='BIC'),
  list(name='colon', only.primary=T, is.rna.seq=T, is.mirna.seq=T, display.name='COAD'),
  list(name='gbm', only.primary=T, is.rna.seq=F, is.mirna.seq=F, display.name='GBM'),
  list(name='kidney', only.primary=T, is.rna.seq=T, is.mirna.seq=T, display.name='KIRC'),
  list(name='liver', only.primary=T, is.rna.seq=T, is.mirna.seq=T, display.name='LIHC'),
  list(name='lung', only.primary=T, is.rna.seq=T, is.mirna.seq=T, display.name='LUSC'),
  list(name='melanoma', only.primary=F, is.rna.seq=T, is.mirna.seq=T, display.name='SKCM'),
  list(name='ovarian', only.primary=T, is.rna.seq=T, is.mirna.seq=T, display.name='OV'),
  list(name='sarcoma', only.primary=T, is.rna.seq=T, is.mirna.seq=T, display.name='SARC'))

MAX.NUM.CLUSTERS = 15

OMIC.SUBSETS = list('multi_omics', 'exp', 'methy', 'mirna')
names(OMIC.SUBSETS) = c('all', '1', '2', '3')

get.clustering.results.dir.path <- function() {
  return('RESULTS_DIR_PATH')
}

get.plots.dir.path <- function() {
  results.dir.path = get.clustering.results.dir.path()
  return(file.path(results.dir.path, 'plots'))
}

get.tables.dir.path <- function() {
  results.dir.path = get.clustering.results.dir.path()
  return(file.path(results.dir.path, 'tables'))
}

subtype.to.display.name <- function(subtype) {
  for (i in 1:length(SUBTYPES.DATA)) {
    if (SUBTYPES.DATA[[i]]$name == subtype) {
      return(SUBTYPES.DATA[[i]]$display.name)
    }
  }
}

set.omics.list.attr <- function(subtype.raw.data, subtype.data) {
  attr(subtype.raw.data[[1]], 'is.seq') = subtype.data$is.rna.seq
  attr(subtype.raw.data[[2]], 'is.seq') = F
  attr(subtype.raw.data[[3]], 'is.seq') = subtype.data$is.mirna.seq
  return(subtype.raw.data)
}

ALGORITHM.NAMES = c('kmeans', 'spectral', 'lracluster', 'pins', 'snf', 'mkl', 
                     'mcca', 'nmf', 'iCluster')
ALGORITHM.DISPLAY.NAMES = as.list(c('K-means', 'Spectral', 'LRAcluster', 'PINS', 
                           'SNF', 'rMKL-LPP', 'MCCA', 'MultiNMF', 'iClusterBayes'))
names(ALGORITHM.DISPLAY.NAMES) = ALGORITHM.NAMES
			
print.matrix.latex.format <- function(mat) {
  print(do.call(paste, as.list(c(colnames(mat), sep=' & '))))
  for (i in 1:nrow(mat)) {
    print(do.call(paste, as.list(c(rownames(mat)[i], round(mat[i,], digits=2), sep=' & '))))
  }
}


perform.all.analyses <- function(benchmark.ret) {
  par(mar=rep(1, 4))
  for (i in 1:4) {
    
    cur.func = list(benchmark.omics.time, benchmark.omics.num.clusters,
      benchmark.omics.surv, benchmark.omics.clinical)[[i]]
    for (omic.subset in names(OMIC.SUBSETS)) {
      print(paste('current omic subset ', omic.subset))
      benchmark.data = cur.func(benchmark.ret, omic.subset)

      displayed.benchmark.data = benchmark.data
      colnames(displayed.benchmark.data)[1:ncol(displayed.benchmark.data)] = 
        sapply(as.list(colnames(displayed.benchmark.data)[1:ncol(displayed.benchmark.data)]), 
	subtype.to.display.name)
      rownames(displayed.benchmark.data) = unlist(ALGORITHM.DISPLAY.NAMES[rownames(displayed.benchmark.data)])
      print.matrix.latex.format(displayed.benchmark.data)
      
      table.name = c('runtime', 'num_cluster', 'survival', 'clinical')[i]
      write.csv(displayed.benchmark.data, file=file.path(get.tables.dir.path(), paste0(table.name, '_', OMIC.SUBSETS[[omic.subset]], '.csv')))
    }
    
    
    print('------------------------')
  }
  
  # plots that include all datasets
  for (i in 1:4) {
    omic.subset = names(OMIC.SUBSETS)[[i]]
    benchmark.surv = benchmark.omics.surv(benchmark.ret, omic.subset)
    benchmark.clinical = benchmark.omics.clinical(benchmark.ret, omic.subset)
    plot.name = list('multi_omics_surv_clinical.tiff', 'exp_surv_clinical.tiff', 
      'methy_surv_clinical.tiff', 'mirna_surv_clinical.tiff')[[i]]
    create.clinical.survival.plots(benchmark.surv, benchmark.clinical, plot.name)
  }
  
  
  # plots for the mean behaviour
  create.mean.clinical.survival.plot(benchmark.ret)
}

get.best.single.omic.mat <- function(single.omic.list1, single.omic.list2) {
  best.mat1 = matrix(0, ncol=ncol(single.omic.list1[[1]]), 
                            nrow=nrow(single.omic.list1[[1]]))
  best.mat2 = matrix(0, ncol=ncol(single.omic.list1[[1]]), 
                            nrow=nrow(single.omic.list1[[1]]))
  for (i in 1:nrow(best.mat1)) {
    for (j in 1:ncol(best.mat1)) {
      values1 = sapply(single.omic.list1, function(mat) mat[i, j])
      values2 = sapply(single.omic.list2, function(mat) mat[i, j])
      if (any(is.na(values1))) {
        best.mat1[i, j] = NA
	best.mat2[i, j] = NA
      } else {
        best.omic = which.max(values1)
        best.mat1[i, j] = values1[best.omic]
	best.mat2[i, j] = values2[best.omic]
      }
    }
  }
  return(list(best.mat1, best.mat2))
}

create.mean.clinical.survival.plot <- function(benchmark.ret) {
  tiff(file.path(get.plots.dir.path(), 'mean_surv_clinical.tiff'), width=4500, height=2250, res=300)
  layout(matrix(c(1, 1, 2, 2, 3, 3, 4, 4, 4, 5, 5, 5), nrow=2, byrow=T))
  single.omic.surv.benchmarks = list()
  single.omic.clin.benchmarks = list()
  for (i in 2:6) {
    omic.subset = c(names(OMIC.SUBSETS), 'best_surv', 'best_clin')[i]
    alg.cols = c("#8DD3C7", "#BEBADA", "#FB8072", "#80B1D3", "#FDB462", "#B3DE69", "#FCCDE5", "#BC80BD", "#842121")
    pch = c(rep(15, 3), rep(3, 3), rep(19, 3))
    
    if (i <= 4) {
      benchmark.surv = benchmark.omics.surv(benchmark.ret, omic.subset)
      benchmark.clinical = benchmark.omics.clinical(benchmark.ret, omic.subset)
    
      if (i %in% 2:4) {
        single.omic.surv.benchmarks[[i - 1]] = benchmark.surv
        single.omic.clin.benchmarks[[i - 1]] = benchmark.clinical
      }
    
    } else if (i == 5) {
      best.mats = get.best.single.omic.mat(single.omic.surv.benchmarks, single.omic.clin.benchmarks)
      benchmark.surv = best.mats[[1]]
      benchmark.clinical = best.mats[[2]]

    } else {
      best.mats = get.best.single.omic.mat(single.omic.clin.benchmarks, single.omic.surv.benchmarks)
      benchmark.surv = best.mats[[2]]
      benchmark.clinical = best.mats[[1]]
    }
    
    surv.sum = rowSums(benchmark.surv, na.rm=T)
    clin.sum = rowSums(benchmark.clinical, na.rm=T)
    
    # for mcca, the sum is for an empty vector, and is equal 0, so we remove this
    available.indices = surv.sum != 0
    surv.sum = surv.sum[available.indices]
    clin.sum = clin.sum[available.indices]
    alg.cols = alg.cols[available.indices]
    pch = pch[available.indices]
    
    
    if (omic.subset == '1') {
      xlab = '-log10(logrank pvalue)'
      ylab = '# enriched clinical parameters'
    } else {
      xlab = ''
      ylab = ''
    }
    
    if (i %in% c(1, 5, 6)) {
      y.min = min(clin.sum) - 1
      x.min = min(surv.sum) - 1
    } else {
      y.min = 0
      x.min = 0
    }
    
    
    subplot.name = c('Multi-omics', 'Gene Expression', 'DNA Methylation', 'miRNA Expression',
                     'Best single-omic (survival)', 'Best single-omic (clinical)')[i]
    plot(surv.sum, clin.sum, main=subplot.name, xlab=xlab, ylab=ylab,
         xlim=c(x.min, max(surv.sum) + 1), ylim=c(y.min, max(clin.sum + 1)), col=alg.cols, pch=pch, cex.lab=1.8, cex=4, cex.axis=1.5, cex.main=2.5, lwd=4)
  }
  dev.off()
}

create.clinical.survival.plots <- function(benchmark.surv, benchmark.clinical, plot.name) {
  num.subtypes = ncol(benchmark.surv)
  tiff(file.path(get.plots.dir.path(), plot.name), width=4500, height=1875, res=300)
  alg.cols = c("#8DD3C7", "#BEBADA", "#FB8072", "#80B1D3", "#FDB462", "#B3DE69", "#FCCDE5", "#BC80BD", "#842121")
  pch = c(rep(15, 3), rep(3, 3), rep(19, 3))

  par(mfrow=c(2, num.subtypes / 2), mar=c(4.1, 4, 3.2, 1))
  for (i in 1:num.subtypes) {
    subtype = colnames(benchmark.surv)[i]
    subtype.surv = benchmark.surv[,subtype]
    subtype.clinical = benchmark.clinical[,subtype]
    
    available.indices = !is.na(subtype.surv)
    subtype.surv = subtype.surv[available.indices]
    subtype.clinical = subtype.clinical[available.indices]
    current.cols = alg.cols[available.indices]
    current.pch = pch[available.indices]
    
    surv.significance = -log10(0.05)
    
    if (i == 1) {
      xlab = '-log10(logrank pvalue)'
      ylab = '# enriched clinical parameters'
    } else {
      xlab = ''
      ylab = ''
    }
    
    plot(subtype.surv, subtype.clinical, main=subtype.to.display.name(subtype), xlab=xlab, ylab=ylab,
         xlim=c(0, max(subtype.surv, surv.significance) + 0.2), ylim=c(0, max(subtype.clinical + 1)), col=current.cols, pch=current.pch, cex.lab=1.4, cex=2.4, cex.axis=1.5, cex.main=1.8, lwd=3)
    abline(v=surv.significance, col='red')
  }
  
  dev.off()
}

plot.legend <- function() {
  alg.cols = c("#8DD3C7", "#BEBADA", "#FB8072", "#80B1D3", "#FDB462", "#B3DE69", "#FCCDE5", "#BC80BD", "#842121")
  pch = c(rep(15, 3), rep(3, 3), rep(19, 3))
  lwds = c(rep(NA, 3), rep(3, 3), rep(NA, 3))
  pt.cexs = 1.8
  algorithm.names = ALGORITHM.DISPLAY.NAMES
  algorithm.names[length(algorithm.names)] = paste0(algorithm.names[length(algorithm.names)], ' ')
  width=4200
  
  if (F) {
    alg.cols = alg.cols[-7]
    pch = pch[-7]
    lwds = lwds[-7]
    algorithm.names = algorithm.names[-7]
    pt.cexs = pt.cexs[-7]
    width=3800
  }
  
  tiff(file.path(get.plots.dir.path(), 'legend.tif'), width=width, res=300)
  par(font=2, mar=rep(1, 4))
  plot(0,xaxt='n',yaxt='n',bty='n',pch='',ylab='',xlab='')
  legend(0.58, 1, legend=algorithm.names, col=alg.cols, pch=pch, horiz=T, pt.cex=pt.cexs, pt.lwd=lwds)
  dev.off()
}

analyze.benchmark <- function() {
  all.clusterings = list()
  all.timings = list()
  for (i in 1:length(SUBTYPES.DATA)) {
    current.subtype.data = SUBTYPES.DATA[[i]]
    subtype = current.subtype.data$name
    subtype.raw.data = get.raw.data(subtype, 
                                    only.primary=current.subtype.data$only.primary)
				    
    all.clusterings[[subtype]] = list()
    all.timings[[subtype]] = list()
    
    for (algorithm.name in ALGORITHM.NAMES) {
      all.clusterings[[subtype]][[algorithm.name]] = list()
      all.timings[[subtype]][[algorithm.name]] = list()
      for (j in c('all', '1', '2', '3')) {
        clustering.path = file.path(get.clustering.results.dir.path(),
                                  paste(subtype, algorithm.name, j, sep='_'))
        timing.path = file.path(get.clustering.results.dir.path(),
                                  paste(subtype, algorithm.name, j, 'timing', sep='_'))
	load(clustering.path)
	load(timing.path)
	if (!any(is.na(clustering))) {
	  names(clustering) = colnames(subtype.raw.data[[1]])
	}
	
	all.clusterings[[subtype]][[algorithm.name]][[j]] = clustering
	all.timings[[subtype]][[algorithm.name]][[j]] = timing
      }
    }
  }
  return(list(all.clusterings=all.clusterings, all.timings=all.timings))
}

check.empirical.surv <- function(old.pvals, new.pvals) {
  is.in.conf = matrix(0, ncol=ncol(old.pvals), nrow=nrow(old.pvals))
  for (i in 1:nrow(old.pvals)) {
    for (j in 1:ncol(old.pvals)) {
      old.pval = old.pvals[i, j]
      if (is.na(old.pval)) {
        is.in.conf[i, j] = NA
	next
      }
      if (old.pval == 1e-10) old.pval = 1e-5
      num.runs = floor(30 / old.pval)
      new.pval = new.pvals[i, j]
      num.success = round(new.pval * num.runs)
      print(c(num.success, num.runs))
      
      conf.int = binom.test(num.success, num.runs)$conf.int
      in.conf.int = old.pval >= conf.int[1]  & old.pval <= conf.int[2]
      is.in.conf[i, j] = in.conf.int
    }
  }
  return(is.in.conf)
}

get.empirical.surv <- function(clustering, subtype) {
  set.seed(42)
  surv.ret = check.survival(clustering, subtype)
  orig.chisq = surv.ret$chisq
  orig.pvalue = get.logrank.pvalue(surv.ret)
  # The initial number of permutations to run
  num.perms = round(min(max(10 / orig.pvalue, 1000), 1e6))
  should.continue = T
  
  total.num.perms = 0
  total.num.extreme.chisq = 0
  
  while (should.continue) {
    print('Another iteration in empirical survival calculation')
    print(num.perms)
    perm.chisq = as.numeric(mclapply(1:num.perms, function(i) {
      cur.clustering = sample(clustering)
      names(cur.clustering) = names(clustering)
      cur.chisq = check.survival(cur.clustering, subtype)$chisq
      return(cur.chisq)
    }, mc.cores=50))
    
    total.num.perms = total.num.perms + num.perms
    total.num.extreme.chisq = total.num.extreme.chisq + sum(perm.chisq >= orig.chisq)
    
    binom.ret = binom.test(total.num.extreme.chisq, total.num.perms)
    cur.pvalue = binom.ret$estimate
    cur.conf.int = binom.ret$conf.int
    
    print(c(total.num.extreme.chisq, total.num.perms))
    print(cur.pvalue)
    print(cur.conf.int)
    
    sig.threshold = 0.05
    is.conf.small = ((cur.conf.int[2] - cur.pvalue) < min(cur.pvalue / 10, 0.01)) & ((cur.pvalue - cur.conf.int[1]) < min(cur.pvalue / 10, 0.01))
    is.threshold.in.conf = cur.conf.int[1] < sig.threshold & cur.conf.int[2] > sig.threshold
    if ((is.conf.small & !is.threshold.in.conf) | (total.num.perms > 2e7)) {
    #if (is.conf.small) {
      should.continue = F
    } else {
      num.perms = 1e5
    }
  }
  
  return(list(pvalue = cur.pvalue, conf.int = cur.conf.int, total.num.perms=total.num.perms, 
              total.num.extreme.chisq=total.num.extreme.chisq))
}

benchmark.omics.surv <- function(benchmark.results, omics='all') {

  
  all.surv.pvalues = matrix(1, ncol=length(SUBTYPES.DATA), nrow=length(ALGORITHM.NAMES))
  rownames(all.surv.pvalues) = ALGORITHM.NAMES
  colnames(all.surv.pvalues) = sapply(SUBTYPES.DATA, function(x) x$name)
  all.clusterings = benchmark.results$all.clusterings
  for (i in 1:length(all.clusterings)) {
    subtype = colnames(all.surv.pvalues)[i]
    subtype.clusterings = all.clusterings[[subtype]]
    for (j in 1:length(subtype.clusterings)) {
      surv.path = file.path(get.clustering.results.dir.path(),
                                  paste(subtype, ALGORITHM.NAMES[j], omics, 'surv', sep='_'))
      if (file.exists(surv.path)) {
        load(surv.path)
	pvalue = empirical.surv.ret$pvalue
	
      } else {
        clustering = subtype.clusterings[[ALGORITHM.NAMES[j]]][[omics]]
        if (length(table(clustering)) > 1) {
          #pvalue = -log10(get.logrank.pvalue(check.survival(clustering, subtype)))
	  empirical.surv.ret = get.empirical.surv(clustering, subtype)
	  save(empirical.surv.ret, file=surv.path)
	  pvalue = empirical.surv.ret$pvalue
	  

        } else {
          pvalue = NA
        }  
      }
      all.surv.pvalues[j, i] = -log10(pvalue)
    }
  }

  return(all.surv.pvalues)
}

benchmark.omics.num.clusters <- function(benchmark.results, omics='all') {
  num.clusters = matrix(1, ncol=length(SUBTYPES.DATA), nrow=length(ALGORITHM.NAMES))
  rownames(num.clusters) = ALGORITHM.NAMES
  colnames(num.clusters) = sapply(SUBTYPES.DATA, function(x) x$name)
  all.clusterings = benchmark.results$all.clusterings
  for (i in 1:length(all.clusterings)) {
    subtype = colnames(num.clusters)[i]
    subtype.clusterings = all.clusterings[[subtype]]
    for (j in 1:length(subtype.clusterings)) {
      clustering = subtype.clusterings[[ALGORITHM.NAMES[j]]][[omics]]
      num.clusters[j, i] = max(clustering)
    }
  }
  return(num.clusters)
}

benchmark.omics.clinical <- function(benchmark.results, omics='all') {
				  
  num.clinical.enrich = matrix(1, ncol=length(SUBTYPES.DATA), nrow=length(ALGORITHM.NAMES))
  rownames(num.clinical.enrich) = ALGORITHM.NAMES
  colnames(num.clinical.enrich) = sapply(SUBTYPES.DATA, function(x) x$name)
  total.num.tested.parameters = 0
  all.clusterings = benchmark.results$all.clusterings
  for (i in 1:length(all.clusterings)) {
    subtype = colnames(num.clinical.enrich)[i]
    subtype.clusterings = all.clusterings[[subtype]]
    for (j in 1:length(subtype.clusterings)) {
      print('checking clinical enrichment')
      print(c(i, j))
      
      
      clustering = subtype.clusterings[[ALGORITHM.NAMES[j]]][[omics]]
      if (any(is.na(clustering))) {
        num.clinical.enrich[j, i] = NA
      } else {
        clin.path = file.path(get.clustering.results.dir.path(),
                                  paste(subtype, ALGORITHM.NAMES[j], omics, 'clin', sep='_'))
				  
        if (file.exists(clin.path)) {
	  load(clin.path)
	} else {
	  enrichment.pvalues = check.clinical.enrichment(clustering, subtype)
	  save(enrichment.pvalues, file=clin.path)
	}
        
	if (j == 1) {
	  total.num.tested.parameters = total.num.tested.parameters + length(enrichment.pvalues)
	}
        num.clinical.enrich[j, i] = sum(enrichment.pvalues * length(enrichment.pvalues) < 0.05)
      }
    }
  }
  print(paste0('Total number of parameters tested:', total.num.tested.parameters))
  return(num.clinical.enrich)
}

benchmark.omics.time <- function(benchmark.results, omics='all') {
  all.alg.times = matrix(1, ncol=length(SUBTYPES.DATA), nrow=length(ALGORITHM.NAMES))
  rownames(all.alg.times) = ALGORITHM.NAMES
  colnames(all.alg.times) = sapply(SUBTYPES.DATA, function(x) x$name)
  all.timings = benchmark.results$all.timings
  for (i in 1:length(all.timings)) {
    subtype = colnames(all.alg.times)[i]
    subtype.timings = all.timings[[subtype]]
    for (j in 1:length(subtype.timings)) {
      timing = subtype.timings[[ALGORITHM.NAMES[j]]][[omics]]
      all.alg.times[j, i] = timing
    }
  }
  return(all.alg.times)
}

benchmark.alg.agreement <- function(benchmark.results, omics='all') {
  all.clusterings = benchmark.results$all.clusterings
  rand.matrix = matrix(NA, ncol=length(ALGORITHM.NAMES), nrow=length(ALGORITHM.NAMES))
  colnames(rand.matrix) = ALGORITHM.NAMES
  rownames(rand.matrix) = ALGORITHM.NAMES
  for (alg1.index in 1:length(ALGORITHM.NAMES)) {
    alg1 = ALGORITHM.NAMES[alg1.index]
    for (alg2.index in 1:length(ALGORITHM.NAMES)) {
      alg2 = ALGORITHM.NAMES[alg2.index]
      rands = c()
      for (i in 1:length(SUBTYPES.DATA)) {
        clustering1 = all.clusterings[[i]][[alg1]][[omics]]
	clustering2 = all.clusterings[[i]][[alg2]][[omics]]
	rands = c(rands, adj.rand.index(clustering1, clustering2))
      }
      mean.rand = mean(rands)
      rand.matrix[alg1.index, alg2.index] = mean.rand
    }
  }
  return(rand.matrix)
}

get.logrank.pvalue <- function(survdiff.res) {
  1 - pchisq(survdiff.res$chisq, length(survdiff.res$n) - 1)  
}

run.benchmark <- function() {
  for (i in 1:length(SUBTYPES.DATA)) {
    current.subtype.data = SUBTYPES.DATA[[i]]
    subtype = current.subtype.data$name
    subtype.raw.data = get.raw.data(subtype, 
                                    only.primary=current.subtype.data$only.primary)
    
    subtype.raw.data = set.omics.list.attr(subtype.raw.data, 
                                           current.subtype.data)
    
    for (algorithm.name in ALGORITHM.NAMES) {
      for (j in c('all', '1', '2', '3')) {
        set.seed(42)
        print(paste('subtype', subtype, 'running algorithm', algorithm.name, j))
        clustering.path = file.path(get.clustering.results.dir.path(),
                                  paste(subtype, algorithm.name, j, sep='_'))
        timing.path = file.path(get.clustering.results.dir.path(),
                                  paste(subtype, algorithm.name, j, 'timing', sep='_'))
	
	
        if (!file.exists(clustering.path)) {
          algorithm.func.name = paste0('run.', algorithm.name)
          algorithm.func = get(algorithm.func.name)
	  if (j == 'all') {
	    cur.iteration.data = subtype.raw.data
	  } else {
	    cur.iteration.data = subtype.raw.data[as.numeric(j)]
	  }
          algorithm.ret = algorithm.func(cur.iteration.data, current.subtype.data)
          clustering = algorithm.ret$clustering
          timing = algorithm.ret$timing
	  print('before saving')
          save(clustering, file = clustering.path)
          save(timing, file = timing.path)
        }
      }
    }
  }
}

get.dataset.dir.path <- function() {
  return('DATASETS_PATH')
}

log.and.normalize <- function(omics.data, subtype.data, normalize=T,
                              filter.var=F) {
  # filter features with no variance at all
  for (i in 1:length(omics.data)) {
    omics.data[[i]] = omics.data[[i]][apply(omics.data[[i]], 1, var) > 0,]
  }
			      
  for (i in 1:length(omics.data)) {
    if (attr(omics.data[[i]], 'is.seq')) {
      omics.data[[i]] = log(1+omics.data[[i]])
    }
  }
  
  if (filter.var) {
    omics.data = lapply(omics.data, keep.high.var.features)
  }
  
  if (normalize) {
    omics.data = lapply(omics.data, normalize.matrix)    
  }
  
  return(omics.data)
}

normalize.matrix <- function(data.matrix) {
  temp = data.matrix - rowMeans(data.matrix)
  should.keep = (apply(temp, 1, sd) != 0)
  return ((temp / apply(temp, 1, sd))[should.keep, ])
}

filter.non.tumor.samples <- function(raw.datum, only.primary=only.primary) {
  # 01 is primary, 06 is metastatic, 03 is blood derived cancer
  if (!only.primary)
    return(raw.datum[,substring(colnames(raw.datum), 14, 15) %in% c('01', '03', '06')])
  else
    return(raw.datum[,substring(colnames(raw.datum), 14, 15) %in% c('01')])
}

get.fixed.names <- function(patient.names, include.type=F) {
  # fix the TCGA names to only include the patient ids
  if (include.type) {
    return(gsub('-', '\\.', toupper(substring(patient.names, 1, 15))))
  } else {
    return(gsub('-', '\\.', toupper(substring(patient.names, 1, 12))))  
  }
}

fix.patient.names <- function(subtype.raw.data, include.type=F) {
  for (i in 1:length(subtype.raw.data)) {
    colnames(subtype.raw.data[[i]]) = get.fixed.names(colnames(subtype.raw.data[[i]]),
                                                      include.type)
  }
  return(subtype.raw.data)
}

get.raw.data <- function(subtype.name,
                         datasets.path = get.dataset.dir.path(),
                         only.primary=NA) {
  omics.dir = file.path(datasets.path, subtype.name)
  omics.files = list.files(omics.dir)
  omics.files = setdiff(omics.files, c('survival'))  
  raw.data = lapply(file.path(omics.dir, omics.files), read.table)
  
  if (!is.na(only.primary)) {
    raw.data = lapply(raw.data, function(x) filter.non.tumor.samples(x, only.primary = only.primary))
  }
  name.corrected.data = fix.patient.names(raw.data)
  patients.intersection = Reduce(intersect, lapply(name.corrected.data, colnames))
  ret.data = lapply(name.corrected.data, function(datum) datum[,patients.intersection])  
  return(ret.data)
}

get.elbow <- function(values, is.max) {
  second.derivatives = c()
  for (i in 2:(length(values) - 1)) {
    second.derivative = values[i + 1] + values[i - 1] - 2 * values[i]
    second.derivatives = c(second.derivatives, second.derivative)
  }
  print(second.derivatives)
  if (is.max) {
    return(which.max(second.derivatives) + 1)
  } else {
    return(which.min(second.derivatives) + 1)
  }
}

# Does not support a single omic dataset
run.mcca <- function(omics.list, subtype.data) {
  if (length(omics.list) == 1) {
    return(list(clustering=rep(NA, ncol(omics.list[[1]])), timing=1))
  }
  start = Sys.time()
  omics.list = log.and.normalize(omics.list, subtype.data, 
                                 normalize = T,
                                 filter.var = T)
  
  subtype = subtype.data$name
  omics.transposed = lapply(omics.list, t)
  cca.ret = PMA::MultiCCA(omics.transposed, 
                          ncomponents = MAX.NUM.CLUSTERS)
  sample.rep = omics.transposed[[1]] %*% cca.ret$ws[[1]]
  
  explained.vars = sapply(1:MAX.NUM.CLUSTERS, 
              function(i) sum(unlist(apply(sample.rep[1:i,,drop=F], 2, var))))
  
  dimension = get.elbow(explained.vars, is.max=F)
  print(dimension)
  sample.rep = sample.rep[,1:dimension]
  sils = c()
  clustering.per.num.clusters = list()
  for (num.clusters in 2:MAX.NUM.CLUSTERS) {
    cur.clustering = kmeans(sample.rep, num.clusters, iter.max=100, nstart=30)$cluster  
    sil = get.clustering.silhouette(list(t(sample.rep)), cur.clustering)
    sils = c(sils, sil)
    clustering.per.num.clusters[[num.clusters - 1]] = cur.clustering
  }
  # NOTE: the next line contains an error. We mistakenly selected the minimal rather maximal silhouette.
  # See more details in: http://acgt.cs.tau.ac.il/multi_omic_benchmark/download.html.
  cca.clustering = clustering.per.num.clusters[[which.min(sils)]]
  time.taken = as.numeric(Sys.time() - start, units='secs')
  return(list(clustering=cca.clustering, timing=time.taken))
}

run.snf <- function(omics.list, subtype.data) {
  start = Sys.time()
  omics.list = log.and.normalize(omics.list, subtype.data)
  subtype = subtype.data$name
  alpha=0.5
  T.val=30
  num.neighbors = round(ncol(omics.list[[1]]) / 10)
  similarity.data = lapply(omics.list, function(x) {affinityMatrix(dist2(as.matrix(t(x)),as.matrix(t(x))), 
                                                                   num.neighbors, alpha)})
  if (length(similarity.data) == 1) {
    W = similarity.data[[1]]
  } else {
    W = SNF(similarity.data, num.neighbors, T.val)  
  }
  
  num.clusters = estimateNumberOfClustersGivenGraph(W, 2:MAX.NUM.CLUSTERS)[[3]]  
  clustering = spectralClustering(W, num.clusters)
  time.taken = as.numeric(Sys.time() - start, units='secs')
  return(list(clustering=clustering, timing=time.taken))
}

run.iCluster <- function(omics.list, subtype.data) {
  omics.list = log.and.normalize(omics.list, subtype.data, normalize = F)

  start = Sys.time()
  subtype = subtype.data$name
  dev.ratios = c()
  icluster.rets = list()

  if (length(omics.list) == 1) {
    icluster.ret = iClusterPlus::tune.iClusterBayes(cpus=(MAX.NUM.CLUSTERS - 1), t(omics.list[[1]]), 
                              K=1:(MAX.NUM.CLUSTERS - 1), type=c('gaussian'))$fit
  } else {
    icluster.ret = iClusterPlus::tune.iClusterBayes(cpus=(MAX.NUM.CLUSTERS - 1), t(omics.list[[1]]), 
                              t(omics.list[[2]]), 
                              t(omics.list[[3]]), 
                              K=1:(MAX.NUM.CLUSTERS - 1), type=rep('gaussian', 3))$fit
  }
  dev.ratios = lapply(1:(MAX.NUM.CLUSTERS - 1), function(i) icluster.ret[[i]]$dev.ratio)

  print('dev.ratios are:')
  print(dev.ratios)
  
  optimal.solution = icluster.ret[[which.max(dev.ratios)]]
  time.taken = as.numeric(Sys.time() - start, units='secs')
  return(list(clustering=optimal.solution$clusters, 
              timing=time.taken))
}

get.mkl.binary.path = function() {
  return('MKL_BINARY_PATH')
}

get.mkl.arguments.path = function() {
  return('MKL_ARGS_PATH')
}


run.mkl <- function(omics.list, subtype.data) {
  start = Sys.time()
  omics.list = log.and.normalize(omics.list, subtype.data)
  subtype = subtype.data$name
  omics.list = lapply(omics.list, normalize.matrix)
  time.taken = as.numeric(Sys.time() - start, units='secs')
  export.subtype.to.mkl(omics.list, subtype)
  
  start = Sys.time()
  bin.path = get.mkl.binary.path()
  subtype.dir = paste0(get.mkl.arguments.path(), subtype, '\\')
  paste0(subtype.dir, 'kernels')
  command = paste(bin.path, paste0(subtype.dir, 'kernels'),
                  paste0(subtype.dir, 'ids'),
                  paste0(subtype.dir, 'output'), 
                  '9', '5')
  command.return = system(command)
  stopifnot(command.return == 0)
  time.taken2 = as.numeric(Sys.time() - start, units='secs')
  clustering = get.mkl.clustering(subtype)
  return(list(clustering=clustering, 
              timing=time.taken + time.taken2))
}

run.nmf <- function(omics.list, subtype.data) {
  total.time.taken = 0
  start = Sys.time()
  omics.list = log.and.normalize(omics.list, subtype.data,
                                 filter.var = T, normalize = F)
  time.taken = as.numeric(Sys.time() - start, units='secs')
  total.time.taken = total.time.taken + time.taken
  
  save.subtype.matlab.format(omics.list)
  subtype = subtype.data$name
  if (length(omics.list) > 1) {
    command.ret = system('MULTI_NMF_COMMAND')
    stopifnot(command.ret == 0)
    nmf.timing = read.csv('SOME_TEMP_PATH_TIMING', header=F)[1, 1]
    total.time.taken = total.time.taken + nmf.timing
  } else {
   for (k in 1:MAX.NUM.CLUSTERS) {
     start = Sys.time()
     file.name = paste0('SOME_TEMP_PATH/', k, '_consensus')
     nmf.ret = nmf(omics.list[[1]], k, method='lee')
     coef.mat = t(coef(nmf.ret))
     time.taken = as.numeric(Sys.time() - start, units='secs')
     total.time.taken = total.time.taken + time.taken
     write.table(coef.mat, file=file.name, quote=F, row.names=F, col.names=F, sep=',')
   }
  }
  
  explained.vars = c()
  clustering.per.num.clusters = list()
  for (k in 1:MAX.NUM.CLUSTERS) {
    file.name = paste0('SOME_TEMP_PATH/', k, '_consensus')
    consensus.mat = read.csv(file.name, header=F)
    
    start = Sys.time()
    cur.clustering = apply(consensus.mat, 1, which.max)
    explained.var = sum(unlist(apply(consensus.mat, 2, var)))
    explained.vars = c(explained.vars, explained.var)
    clustering.per.num.clusters[[k]] = cur.clustering
    time.taken = as.numeric(Sys.time() - start, units='secs')
    total.time.taken = total.time.taken + time.taken
  }
  
  dimension = get.elbow(explained.vars, is.max=F)
  nmf.clustering = clustering.per.num.clusters[[dimension]]
  return(list(clustering=nmf.clustering, timing=total.time.taken))  
}

run.pins <- function(omics.list, subtype.data) {
  start = Sys.time()
  omics.list = log.and.normalize(omics.list, subtype.data, normalize = F)
  subtype = subtype.data$name
  omics.transposed = lapply(omics.list, t)
  if (length(omics.list) == 1) {
    pins.ret = PINSPlus::PerturbationClustering(data=omics.transposed[[1]],
                                            kMax = MAX.NUM.CLUSTERS)
    clustering = pins.ret$cluster
    
  } else {
    pins.ret = PINSPlus::SubtypingOmicsData(dataList=omics.transposed,
                                            kMax = MAX.NUM.CLUSTERS)
    clustering = pins.ret$cluster2
  }
  time.taken = as.numeric(Sys.time() - start, units='secs')
  return(list(clustering=clustering, timing=time.taken))
}

run.lracluster <- function(omics.list, subtype.data) {
  omics.list = log.and.normalize(omics.list, subtype.data, normalize = F)
  
  subtype = subtype.data$name
  start = Sys.time()
  
  dim.range = 1:MAX.NUM.CLUSTERS
  all.clustering.results = list()
  
  omics.matrix.list = lapply(omics.list, as.matrix)
  for (dimension in dim.range) {
    print(paste('running lra cluster for dimension', dimension))
    data.names = c('gene expression', 'methylation', 'miRNA expression')
    clustering.results = LRAcluster(omics.matrix.list, 
                                    rep('gaussian', length(omics.list)), 
                                    dimension=dimension, data.names)
    all.clustering.results[[dimension]] = clustering.results
  }
  explained.var = sapply(all.clustering.results, function(x) x$potential)
  print(explained.var)
  dimension = get.elbow(explained.var, is.max=F)
  print(dimension)
  solution = all.clustering.results[[dimension]]$coordinate
  
  sils = c()
  clustering.per.num.clusters = list()
  for (num.clusters in 2:MAX.NUM.CLUSTERS) {
    print(paste('running kmeans in lra cluster for num clusters', num.clusters))
    cur.clustering = kmeans(t(solution), num.clusters, iter.max=100, nstart=60)$cluster
    sil = get.clustering.silhouette(list(solution), cur.clustering)
    sils = c(sils, sil)
    clustering.per.num.clusters[[num.clusters - 1]] = cur.clustering
  }
  print(sils)
  # NOTE: the next line contains an error. We mistakenly selected the minimal rather maximal silhouette.
  # See more details in: http://acgt.cs.tau.ac.il/multi_omic_benchmark/download.html.
  chosen.clustering = clustering.per.num.clusters[[which.min(sils)]]
  time.taken = as.numeric(Sys.time() - start, units='secs')
  return(list(clustering=chosen.clustering, timing=time.taken))
}

run.kmeans <- function(omics.list, subtype.data) {
  start = Sys.time()
  omics.list = log.and.normalize(omics.list, subtype.data, 
                                 filter.var = T)
  
  subtype = subtype.data$name
  all.withinss = c()
  all.clusterings = list()
  k.range = 1:MAX.NUM.CLUSTERS
  for (k in k.range) {
    concat.omics = do.call(rbind, omics.list)
    kmeans.ret = kmeans(t(concat.omics), k, iter.max=100, nstart=60)
    all.withinss = c(all.withinss, kmeans.ret$tot.withinss)
    all.clusterings[[k]] = kmeans.ret$cluster
  }
  
  best.k = get.elbow(all.withinss, is.max=T)
  time.taken = as.numeric(Sys.time() - start, units='secs')
  return(list(clustering=all.clusterings[[best.k]], 
              timing=time.taken))
}

run.spectral <- function(omics.list, subtype.data) {
  start = Sys.time()
  omics.list = log.and.normalize(omics.list, subtype.data, 
                                 filter.var = T)
  subtype = subtype.data$name
  concat.omics = do.call(rbind, omics.list)
  
  similarity.data = affinityMatrix(dist2(as.matrix(t(concat.omics)),
                                         as.matrix(t(concat.omics))), 
                                   20, 0.5)
  num.clusters = estimateNumberOfClustersGivenGraph(similarity.data, 
                                      2:MAX.NUM.CLUSTERS)[[3]]  
  clustering = spectralClustering(similarity.data, num.clusters)
  time.taken = as.numeric(Sys.time() - start, units='secs')
  return(list(clustering=clustering, timing=time.taken))
}

load.libraries <- function() {
  library('PMA')
  library('R.matlab')
  library('SNFtool')
  library('PINSPlus')
  #library('LRAcluster')
  library('kernlab')
  library('survival')
  library('NMF')
  
  # bioconductor packages
  source("https://bioconductor.org/biocLite.R")
  biocLite("impute")
  #biocLite("iClusterPlus")
}



########################################
###############   MKL    ###############
########################################

radial.basis <- function(mat, gamma) {
  if (missing(gamma)) {
    gamma = 1 / (2*nrow(mat)**2)
  }
  npatients = ncol(mat)
  output.mat = matrix(0, ncol=npatients, nrow=npatients)
  for (i in 1:npatients) {
    for (j in 1:npatients) {
      output.mat[i, j] = exp(-norm(as.matrix(mat[,i] - mat[,j]), type = 'F')**2 * gamma)
    }
  }
  
  D = apply(output.mat, 2, sum) / npatients
  E = sum(D) / npatients
  J = matrix(1, nrow=npatients, ncol=1) %*% D
  ret = output.mat - J - t(J) + E * matrix(1, ncol=npatients, nrow=npatients)
  ret = diag(1/sqrt(diag(ret))) %*% ret %*% diag(1/sqrt(diag(ret)))
  return(ret)
}

clear.dir <- function(dir.path) {
  files.in.dir = list.files(dir.path)
  for (file.in.dir in files.in.dir) {
    full.file.path = file.path(dir.path, file.in.dir)
    file.remove(full.file.path)
  }
}

export.subtype.to.mkl <- function(omics.list, dir.name) {
  
  folder.path = file.path(get.mkl.arguments.path(), dir.name)
  if (!dir.exists(folder.path)) {
    dir.create(folder.path)
  }
  
  kernels.path = file.path(folder.path, 'kernels')
  
  if (!dir.exists(kernels.path)) {
    dir.create(kernels.path)
  }
  clear.dir(kernels.path)
  
  gammas = 10 ** seq(-6, 6, by=3)
  for (i in 1:length(omics.list)) {
    for (j in 1:length(gammas)) {
      datum = omics.list[[i]]
      gamma = gammas[[j]] / (2*nrow(datum)**2)
      mat = radial.basis(datum, gamma)
      R.matlab::writeMat(file.path(kernels.path, paste(i, '_', j, sep='')), mat=mat)
    }
  }
  
  output.path = file.path(folder.path, 'output')
  if (!dir.exists(output.path)) {
    dir.create(output.path)
  }
  clear.dir(output.path)
  
  write.table(colnames(omics.list[[1]]), file=file.path(folder.path, 'ids'),
              quote=F, row.names = F, col.names = F)
  
}

get.mkl.clustering <- function(dir.name) {
  folder.path = file.path(get.mkl.arguments.path(), dir.name)
  output.path = file.path(folder.path, 'output')
  output.files = list.files(output.path)
  clustering = read.csv(file.path(output.path, output.files[grep('clusters', output.files)]))[,2]
  return(clustering)
}

check.survival <- function(groups, subtype, survival.file.path) {
  if (missing(survival.file.path)) {
    survival.file.path = get.subtype.survival.path(subtype)
  }
  survival.data = read.table(survival.file.path, header = TRUE)
  patient.names = names(groups)
  patient.names.in.file = as.character(survival.data[, 1])
  patient.names.in.file = toupper(gsub('-', '\\.', substring(patient.names.in.file, 1, 12)))
  
  stopifnot(all(patient.names %in% patient.names.in.file))
  
  indices = match(patient.names, patient.names.in.file)
  ordered.survival.data = survival.data[indices,]
  ordered.survival.data["cluster"] <- groups
  ordered.survival.data$Survival[is.na(ordered.survival.data$Survival)] = 0
  ordered.survival.data$Death[is.na(ordered.survival.data$Death)] = 0
  return(survdiff(Surv(Survival, Death) ~ cluster, data=ordered.survival.data))
  
}

get.subtype.survival.path <- function(subtype) {
  datasets.path = get.dataset.dir.path()
  survival.file.path = file.path(datasets.path, subtype, 'survival')
  return(survival.file.path)
}

get.clustering.silhouette <- function(raw.data, clustering) {
  sils = c()
  for (i in 1:length(raw.data)) {
    x = raw.data[[i]]
    distmatrix = dist2(as.matrix(t(x)),as.matrix(t(x)))
    sil = silhouette(clustering, dmatrix = distmatrix)[,3]
    sils = c(sils, mean(sil))
  }
  return(mean(sils))
}

get.clinical.params.dir <- function() {
  return('CLINICAL_PARAMS_PATH')
}

get.clinical.params <- function(subtype.name) {
  clinical.data.path = paste(get.clinical.params.dir(), subtype.name, sep = '')
  clinical.params = read.table(clinical.data.path,
                               sep='\t', header=T, row.names = 1, stringsAsFactors = F)
  rownames.with.duplicates = get.fixed.names(rownames(clinical.params))  
  clinical.params = clinical.params[!duplicated(rownames.with.duplicates),]
  rownames(clinical.params) = rownames.with.duplicates[!duplicated(rownames.with.duplicates)]
  return(clinical.params)
}

check.clinical.enrichment <- function(clustering, subtype.name) {
  clinical.params = get.clinical.params(subtype.name)  
  
  clinical.metadata = list(gender='DISCRETE', age_at_initial_pathologic_diagnosis='NUMERIC',
    pathologic_M='DISCRETE', pathologic_N='DISCRETE', pathologic_T='DISCRETE', pathologic_stage='DISCRETE')
  
  pvalues = c()
  
  params.being.tested = c()
  
  for (clinical.param in names(clinical.metadata)) {
    
    if (!(clinical.param %in% colnames(clinical.params))) {
      #print(paste0('WARNING: ', clinical.param, ' does not appear for subtype ', subtype.name))
      next
    }
    
    clinical.values = clinical.params[names(clustering),clinical.param]
    is.discrete.param = clinical.metadata[clinical.param] == 'DISCRETE'
    is.numeric.param = clinical.metadata[clinical.param] == 'NUMERIC'
    stopifnot(is.discrete.param | is.numeric.param)
    
    # skip parameter if many missing values
    
    if (is.numeric.param) {
      numeric.entries = !is.na(as.numeric(clinical.values))
      if (2 * sum(numeric.entries) < length(clinical.values)) {
        #print(paste0('WARNING: skipping on ', clinical.param, ' for subtype ', subtype.name))
        next
      }
    } else {
      not.na.entries = !is.na(clinical.values)
      should.skip = F
      if (2 * sum(not.na.entries) < length(clinical.values)) {
        should.skip = T
      } else if (length(table(clinical.values[not.na.entries])) == 1) {
        should.skip = T
      }
      if (should.skip) {
        #print(paste0('WARNING: skipping on ', clinical.param, ' for subtype ', subtype.name))
        next
      }
    }
    
    params.being.tested = c(params.being.tested, clinical.param)
    
    if (is.discrete.param) {
      #clustering.with.clinical = cbind(clustering, clinical.values)
      #tbl = table(as.data.frame(clustering.with.clinical[!is.na(clinical.values),]))
      #test.res = chisq.test(tbl)
      #pvalue = test.res$p.value
      pvalue = get.empirical.clinical(clustering[!is.na(clinical.values)], clinical.values[!is.na(clinical.values)], T)
      
    } else if (is.numeric.param) {
      #test.res = kruskal.test(as.numeric(clinical.values[numeric.entries]),
      #				clustering[numeric.entries])
      #pvalue = test.res$p.value
      pvalue = get.empirical.clinical(clustering[numeric.entries], as.numeric(clinical.values[numeric.entries]), F)
    }
    
    pvalues = c(pvalues, pvalue)
    
  }
  names(pvalues) = params.being.tested
  return(pvalues)
}

get.empirical.clinical <- function(clustering, clinical.values, is.chisq) {
  set.seed(42)
  if (is.chisq) {
      clustering.with.clinical = cbind(clustering, clinical.values)
      tbl = table(as.data.frame(clustering.with.clinical))
      test.res = chisq.test(tbl)
  } else {
    test.res = kruskal.test(as.numeric(clinical.values), clustering)
  }
  orig.pvalue = test.res$p.value
  num.iter = 1000
  total.num.iters = 0
  total.num.extreme = 0
  should.continue = T
  
  while (should.continue) {
    print('another iteration in empirical clinical')
    perm.pvalues = as.numeric(mclapply(1:num.iter, function(i) {
      cur.clustering = sample(clustering)
      names(cur.clustering) = names(clustering)
    
      if (is.chisq) {
        clustering.with.clinical = cbind(cur.clustering, clinical.values)
        tbl = table(as.data.frame(clustering.with.clinical))
        test.res = chisq.test(tbl)
      } else {
        test.res = kruskal.test(as.numeric(clinical.values), cur.clustering)
      }
      cur.pvalue = test.res$p.value
      return(cur.pvalue)
    }, mc.cores=50))
    total.num.iters = total.num.iters + num.iter
    total.num.extreme = total.num.extreme + sum(perm.pvalues <= orig.pvalue)
    
    binom.ret = binom.test(total.num.extreme, total.num.iters)
    cur.pvalue = binom.ret$estimate
    cur.conf.int = binom.ret$conf.int
    
    sig.threshold = 0.05
    is.threshold.in.conf = cur.conf.int[1] < sig.threshold & cur.conf.int[2] > sig.threshold
    if (!is.threshold.in.conf | total.num.iters > 1e5) {
      should.continue = F
    }
  }
  return(cur.pvalue)
}

get.nmf.datasets.dir <- function() {
  return('NMF_DATASETS_PATH')
}

save.subtype.matlab.format <- function(subtype.raw.data) {
  data.names = c('1', '2', '3')
  
  full.dir.name = get.nmf.datasets.dir()
  dir.create(full.dir.name, showWarnings = F)
  
  for (i in 1:length(subtype.raw.data)) {
    full.file.name = file.path(full.dir.name, data.names[i])
    write.table(subtype.raw.data[[i]], full.file.name)
  }
}

keep.high.var.features <- function(omic, num.features=2000) {
  if (nrow(omic) < num.features) {
    return(omic)
  } else {
    feature.vars = apply(omic, 1, var)
    threshold = feature.vars[order(feature.vars, decreasing = T)][num.features]
    return(omic[feature.vars >= threshold,])    
  }
}