--- a +++ b/exseek/scripts/differential_expression.R @@ -0,0 +1,210 @@ +#! /usr/bin/env Rscript + +suppressPackageStartupMessages(library(argparse)) +parser <- ArgumentParser(description='Differential expression') +parser$add_argument('-i', '--matrix', type='character', required=TRUE, + help='input count matrix. Rows are genes. Columns are samples.') +parser$add_argument('-c', '--classes', type='character', required=TRUE, + help='input sample class information. Column 1: sample_id, Column 2: class') +parser$add_argument('-s', '--samples', type='character', + help='input file containing sample ids for differential expression') +parser$add_argument('-p', '--positive-class', type='character', required=TRUE, + help='comma-separated class names to use as positive class') +parser$add_argument('-n', '--negative-class', type='character', required=TRUE, + help='comma-separated class names to use as negative class') +parser$add_argument('-b', '--batch', type='character', required=FALSE, + help='batch information to remove') +parser$add_argument('--batch-index', type='integer', default=1, + help='column number of the batch to remove') +parser$add_argument('-m', '--method', type='character', default="deseq2", + choices=c('deseq2', 'edger_exact', 'edger_glmqlf', 'edger_glmlrt', 'wilcox', 'limma', 'ttest'), + help='differential expression method to use') +parser$add_argument('--norm-method', type='character', default='TMM', + choices=c('RLE', 'CPM', 'TMM', 'upperquartile'), + help='normalization method for count-based methods') +parser$add_argument('--pseudo-count', type='double', default=1.0, + help='pseudo-count added to log2 transform in ttest') +parser$add_argument('-o', '--output-file', type='character', required=TRUE, + help='output file') +args <- parser$parse_args() + +message('read count matrix: ', args$matrix) +mat <- read.table(args$matrix, header = TRUE, row.names=1, check.names=FALSE, sep='\t') +message('read class information: ', args$classes) +class_info <- read.table(args$classes, row.names=1, check.names=FALSE, header = TRUE, sep='\t', as.is=TRUE) +class_info <- class_info[colnames(mat),] +names(class_info) <- colnames(mat) +if(!is.null(args$samples)){ + message('read sample ids: ', args$samples) + samples <- read.table(args$samples, check.names=FALSE, header=FALSE, as.is=TRUE)[,1] + mat <- mat[, samples] + class_info <- class_info[samples] +} +# get positive and negative class +for(cls in strsplit(args$positive_class, ',')[[1]]){ + class_info[class_info == cls] <- 'positive' +} +positive_samples <- names(class_info)[class_info == 'positive'] +if(length(positive_samples) == 0){ + stop('No positive samples found') +} +message('Number of positive samples: ', length(positive_samples)) +negative_samples <- NULL +for(cls in strsplit(args$negative_class, ',')[[1]]){ + class_info[class_info == cls] <- 'negative' +} +negative_samples <- names(class_info)[class_info == 'negative'] +if(length(negative_samples) == 0){ + stop('No negative samples found') +} +message('Number of negative samples: ', length(negative_samples)) + +samples <- c(positive_samples, negative_samples) + +group <- class_info[samples] +mat <- as.matrix(mat[,samples]) +#class_info <- as.matrix(class_info) +#colnames(class_info) <- 'label' +mat <- as.matrix(mat) + +# read batch information +if(!is.null(args$batch)){ + message('read batch information from: ', args$batch) + batch <- read.table(args$batch, check.names=FALSE, header=TRUE, as.is=TRUE, row.names=1, sep='\t') + if((args$batch_index < 1) || (args$batch_index > ncol(batch))){ + stop('Batch index out of bound') + } + batch <- batch[samples, args$batch_index] +}else{ + batch <- NULL +} + +# set normalization method +if(args$norm_method == 'CPM'){ + norm_method <- 'none' +} else{ + norm_method <- args$norm_method +} +message('perform differential expression using ', args$method) +# Required columns for a differential expression file: baseMean, log2FoldChange, pvalue, padj +if(args$method == 'deseq2'){ + suppressPackageStartupMessages(library(DESeq2)) + if(!is.null(batch)){ + # include batch into regression + dds <- DESeqDataSetFromMatrix(countData = mat, + colData = as.matrix(data.frame(group=group, batch=batch)), + design = ~group + batch) + } else { + dds <- DESeqDataSetFromMatrix(countData = mat, + colData = as.matrix(data.frame(group=group)), + design = ~group) + } + dds <- DESeq(dds) + res <- results(dds, contrast=c('group', 'positive', 'negative')) + #res <- res[order(res$padj)] + write.table(as.data.frame(res), args$output_file, sep='\t', quote=FALSE, row.names=TRUE) +} else if(grepl('^edger_', args$method)) { + suppressPackageStartupMessages(library(edgeR)) + y <- DGEList(counts=mat, samples=samples, group=group) + y <- calcNormFactors(y, method=norm_method) + if(!is.null(batch)){ + # regress out batch information as an additive term + design <- model.matrix(~group + batch) + } else { + design <- model.matrix(~group) + } + y <- estimateDisp(y, design) + if(args$method == 'edger_glmqlf'){ + fit <- glmQLFit(y, design) + test <- glmQLFTest(fit, coef=2) + res <- topTags(test, n=nrow(mat), sort.by='none') + } else if(args$method == 'edger_glmlrt'){ + fit <- glmFit(y, design) + test <- glmLRT(fit, coef=2) + res <- topTags(test, n=nrow(mat), sort.by='none') + } else if(args$method == 'edger_exact'){ + if(!is.null(batch)) message('ignoring batch information for exact text') + test <- exactTest(y) + res <- topTags(test, n=nrow(mat), sort.by='none') + } + res <- cbind(res$table, baseMean=2^(res$table$logCPM)) + # rename columns + mapped_names <- colnames(res) + for(i in 1:ncol(res)){ + if(colnames(res)[i] == 'logFC'){ + mapped_names[i] <- 'log2FoldChange' + }else if(colnames(res)[i] == 'PValue'){ + mapped_names[i] <- 'pvalue' + }else if(colnames(res)[i] == 'FDR') { + mapped_names[i] <- 'padj' + }else{ + mapped_names[i] <- colnames(res)[i] + } + } + colnames(res) <- mapped_names + + # write results to file + message('Write results to output file: ', args$output_file) + write.table(res, args$output_file, sep='\t', quote=FALSE, row.names=TRUE) +} else if(args$method == 'wilcox') { + suppressPackageStartupMessages(library(edgeR)) + # normalize + matrix_cpm <- cpm(mat, method=norm_method) + test_func <- function(x){ + wilcox.test(x[group == 'negative'], x[group == 'positive'], alternative='two.sided')$p.value + } + pvalues <- apply(matrix_cpm, 1, test_func) + matrix_logcpm = log2(matrix + args$pseudo_count) + logFC <- apply(matrix_logcpm[,which(group == 'positive')], 1, mean) - + apply(matrix_logcpm[,which(group == 'negative')], 1, mean) + res <- data.frame(log2FoldChange=logFC, + pvalue=pvalues, + padj=p.adjust(pvalues, method='BH'), + baseMean=apply(matrix_cpm, 1, mean)) + message('Write results to output file: ', args$output_file) + write.table(res, args$output_file, sep='\t', quote=FALSE, row.names=TRUE) +} else if(args$method == 'limma'){ + suppressPackageStartupMessages(library(limma)) + suppressPackageStartupMessages(library(edgeR)) + y <- DGEList(counts=mat, samples=samples, group=group) + y <- calcNormFactors(y, method=norm_method) + if(!is.null(batch)){ + model <- model.matrix(~group + batch) + } else { + model <- model.matrix(~group) + } + y <- voom(y, model, plot=FALSE) + fit <- lmFit(y, model) + fit <- eBayes(fit, robust=TRUE, trend=TRUE) + #fit2 <- contrasts.ft(fit) + #fit2 <- eBayes(fit2, robust=TRUE, trend=TRUE) + #top_table <- topTable(fit2, sort.by='none', n=Inf) + top_table <- topTable(fit, coef=2, sort.by='none', n=Inf) + # rename columns + mapped_names <- colnames(top_table) + for(i in 1:ncol(top_table)){ + if(colnames(top_table)[i] == 'logFC'){ + mapped_names[i] <- 'log2FoldChange' + }else if(colnames(top_table)[i] == 'P.Value'){ + mapped_names[i] <- 'pvalue' + }else if(colnames(top_table)[i] == 'adj.P.Val') { + mapped_names[i] <- 'padj' + }else{ + mapped_names[i] <- colnames(top_table)[i] + } + } + colnames(top_table) <- mapped_names + + # write results to file + message('Write results to output file: ', args$output_file) + write.table(top_table, args$output_file, sep='\t', quote=FALSE, row.names=TRUE) +} else if(args$method == "ttest") { + suppressPackageStartupMessages(library(genefilter)) + #mat <- log2(mat + args$pseudo_count) + res <- rowttests(mat, as.factor(group)) + res$padj <- p.adjust(res$p.value, method='BH') + res$log2FoldChange <- rowMeans(mat[, group == 'positive']) - rowMeans(mat[, group == 'negative']) + write.table(res, args$output_file, sep='\t', quote=FALSE, row.names=TRUE) +} else { + stop('unknown differential expression method: ', args$method) +}