exSEEK / Git / [4c33d4] /exseek/scripts/differential

Models:
DanielG/
exSEEK
Downloads: 1
[4c33d4]: / exseek / scripts / differential_expression.R
History
Download this file
211 lines (202 with data), 9.0 kB

#! /usr/bin/env Rscript

suppressPackageStartupMessages(library(argparse))
parser <- ArgumentParser(description='Differential expression')
parser$add_argument('-i', '--matrix', type='character', required=TRUE,
    help='input count matrix. Rows are genes. Columns are samples.')
parser$add_argument('-c', '--classes', type='character', required=TRUE,
    help='input sample class information. Column 1: sample_id, Column 2: class')
parser$add_argument('-s', '--samples', type='character',
    help='input file containing sample ids for differential expression')
parser$add_argument('-p', '--positive-class', type='character', required=TRUE,
    help='comma-separated class names to use as positive class')
parser$add_argument('-n', '--negative-class', type='character', required=TRUE,
    help='comma-separated class names to use as negative class')
parser$add_argument('-b', '--batch', type='character', required=FALSE,
    help='batch information to remove')
parser$add_argument('--batch-index', type='integer', default=1,
    help='column number of the batch to remove')
parser$add_argument('-m', '--method', type='character', default="deseq2",
    choices=c('deseq2', 'edger_exact', 'edger_glmqlf', 'edger_glmlrt', 'wilcox', 'limma', 'ttest'),
    help='differential expression method to use')
parser$add_argument('--norm-method', type='character', default='TMM',
    choices=c('RLE', 'CPM', 'TMM', 'upperquartile'),
    help='normalization method for count-based methods')
parser$add_argument('--pseudo-count', type='double', default=1.0,
    help='pseudo-count added to log2 transform in ttest')
parser$add_argument('-o', '--output-file', type='character', required=TRUE,
    help='output file')
args <- parser$parse_args()

message('read count matrix: ', args$matrix)
mat <- read.table(args$matrix, header = TRUE, row.names=1, check.names=FALSE, sep='\t')
message('read class information: ', args$classes)
class_info <- read.table(args$classes, row.names=1, check.names=FALSE, header = TRUE, sep='\t', as.is=TRUE)
class_info <- class_info[colnames(mat),]
names(class_info) <- colnames(mat)
if(!is.null(args$samples)){
    message('read sample ids: ', args$samples)
    samples <- read.table(args$samples, check.names=FALSE, header=FALSE, as.is=TRUE)[,1]
    mat <- mat[, samples]
    class_info <- class_info[samples]
}
# get positive and negative class
for(cls in strsplit(args$positive_class, ',')[[1]]){
    class_info[class_info == cls] <- 'positive'
}
positive_samples <- names(class_info)[class_info == 'positive']
if(length(positive_samples) == 0){
    stop('No positive samples found')
}
message('Number of positive samples: ', length(positive_samples))
negative_samples <- NULL
for(cls in strsplit(args$negative_class, ',')[[1]]){
    class_info[class_info == cls] <- 'negative'
}
negative_samples <- names(class_info)[class_info == 'negative']
if(length(negative_samples) == 0){
    stop('No negative samples found')
}
message('Number of negative samples: ', length(negative_samples))

samples <- c(positive_samples, negative_samples)

group <- class_info[samples]
mat <- as.matrix(mat[,samples])
#class_info <- as.matrix(class_info)
#colnames(class_info) <- 'label'
mat <- as.matrix(mat)

# read batch information
if(!is.null(args$batch)){
    message('read batch information from: ', args$batch)
    batch <- read.table(args$batch, check.names=FALSE, header=TRUE, as.is=TRUE, row.names=1, sep='\t')
    if((args$batch_index < 1) || (args$batch_index > ncol(batch))){
        stop('Batch index out of bound')
    }
    batch <- batch[samples, args$batch_index]
}else{
    batch <- NULL
}

# set normalization method
if(args$norm_method == 'CPM'){
    norm_method <- 'none'
} else{
    norm_method <- args$norm_method
}
message('perform differential expression using ', args$method)
# Required columns for a differential expression file: baseMean, log2FoldChange, pvalue, padj
if(args$method == 'deseq2'){
    suppressPackageStartupMessages(library(DESeq2))
    if(!is.null(batch)){
        # include batch into regression
        dds <- DESeqDataSetFromMatrix(countData = mat,
                                    colData = as.matrix(data.frame(group=group, batch=batch)),
                                    design = ~group + batch)
    } else {
        dds <- DESeqDataSetFromMatrix(countData = mat,
                                    colData = as.matrix(data.frame(group=group)),
                                    design = ~group)
    }
    dds <- DESeq(dds)
    res <- results(dds, contrast=c('group', 'positive', 'negative'))
    #res <- res[order(res$padj)]
    write.table(as.data.frame(res), args$output_file, sep='\t', quote=FALSE, row.names=TRUE)
} else if(grepl('^edger_', args$method)) {
    suppressPackageStartupMessages(library(edgeR))
    y <- DGEList(counts=mat, samples=samples, group=group)
    y <- calcNormFactors(y, method=norm_method)
    if(!is.null(batch)){
        # regress out batch information as an additive term
        design <- model.matrix(~group + batch)
    } else {
        design <- model.matrix(~group)
    }
    y <- estimateDisp(y, design)
    if(args$method == 'edger_glmqlf'){
        fit <- glmQLFit(y, design)
        test <- glmQLFTest(fit, coef=2)
        res <- topTags(test, n=nrow(mat), sort.by='none')
    } else if(args$method == 'edger_glmlrt'){
        fit <- glmFit(y, design)
        test <- glmLRT(fit, coef=2)
        res <- topTags(test, n=nrow(mat), sort.by='none')
    } else if(args$method == 'edger_exact'){
        if(!is.null(batch)) message('ignoring batch information for exact text')
        test <- exactTest(y)
        res <- topTags(test, n=nrow(mat), sort.by='none')
    }
    res <- cbind(res$table, baseMean=2^(res$table$logCPM))
    # rename columns
    mapped_names <- colnames(res)
    for(i in 1:ncol(res)){
        if(colnames(res)[i] == 'logFC'){
            mapped_names[i] <- 'log2FoldChange'
        }else if(colnames(res)[i] == 'PValue'){
            mapped_names[i] <- 'pvalue'
        }else if(colnames(res)[i] == 'FDR') {
            mapped_names[i] <- 'padj'
        }else{
            mapped_names[i] <- colnames(res)[i]
        }
    }
    colnames(res) <- mapped_names

    # write results to file
    message('Write results to output file: ', args$output_file)
    write.table(res, args$output_file, sep='\t', quote=FALSE, row.names=TRUE)
} else if(args$method == 'wilcox') {
    suppressPackageStartupMessages(library(edgeR))
    # normalize
    matrix_cpm <- cpm(mat, method=norm_method)
    test_func <- function(x){
        wilcox.test(x[group == 'negative'], x[group == 'positive'], alternative='two.sided')$p.value
    }
    pvalues <- apply(matrix_cpm, 1, test_func)
    matrix_logcpm = log2(matrix + args$pseudo_count)
    logFC <- apply(matrix_logcpm[,which(group == 'positive')], 1, mean) -
        apply(matrix_logcpm[,which(group == 'negative')], 1, mean)
    res <- data.frame(log2FoldChange=logFC,
        pvalue=pvalues, 
        padj=p.adjust(pvalues, method='BH'),
        baseMean=apply(matrix_cpm, 1, mean))
    message('Write results to output file: ', args$output_file)
    write.table(res, args$output_file, sep='\t', quote=FALSE, row.names=TRUE)
} else if(args$method == 'limma'){
    suppressPackageStartupMessages(library(limma))
    suppressPackageStartupMessages(library(edgeR))
    y <- DGEList(counts=mat, samples=samples, group=group)
    y <- calcNormFactors(y, method=norm_method)
    if(!is.null(batch)){
        model <- model.matrix(~group + batch)
    } else {
        model <- model.matrix(~group)
    }
    y <- voom(y, model, plot=FALSE)
    fit <- lmFit(y, model)
    fit <- eBayes(fit, robust=TRUE, trend=TRUE)
    #fit2 <- contrasts.ft(fit)
    #fit2 <- eBayes(fit2, robust=TRUE, trend=TRUE)
    #top_table <- topTable(fit2, sort.by='none', n=Inf)
    top_table <- topTable(fit, coef=2, sort.by='none', n=Inf)
    # rename columns
    mapped_names <- colnames(top_table)
    for(i in 1:ncol(top_table)){
        if(colnames(top_table)[i] == 'logFC'){
            mapped_names[i] <- 'log2FoldChange'
        }else if(colnames(top_table)[i] == 'P.Value'){
            mapped_names[i] <- 'pvalue'
        }else if(colnames(top_table)[i] == 'adj.P.Val') {
            mapped_names[i] <- 'padj'
        }else{
            mapped_names[i] <- colnames(top_table)[i]
        }
    }
    colnames(top_table) <- mapped_names

    # write results to file
    message('Write results to output file: ', args$output_file)
    write.table(top_table, args$output_file, sep='\t', quote=FALSE, row.names=TRUE)
} else if(args$method == "ttest") {
    suppressPackageStartupMessages(library(genefilter))
    #mat <- log2(mat + args$pseudo_count)
    res <- rowttests(mat, as.factor(group))
    res$padj <- p.adjust(res$p.value, method='BH')
    res$log2FoldChange <- rowMeans(mat[, group == 'positive']) - rowMeans(mat[, group == 'negative'])
    write.table(res, args$output_file, sep='\t', quote=FALSE, row.names=TRUE)
} else {
    stop('unknown differential expression method: ', args$method)
}