a b/exseek/scripts/call_domains.R
1
#! /usr/bin/env Rscript
2
suppressPackageStartupMessages(library("argparse"))
3
parser <- ArgumentParser(description='Call domains with significant coverage')
4
parser$add_argument('-p', '--pvalue', type='double', default=0.05,
5
    help='adjusted p-value threshold for defining significant domains', metavar='NUMBER')
6
parser$add_argument('-i', '--input-file', type='character', default='-',
7
    help='input BED file with covearge in the 5th column')
8
parser$add_argument('-o', '--output-file', type='character', default='-',
9
    help='output BED file with adjusted p-value in the last column')
10
parser$add_argument('-b', '--background', type='double', default=0.99,
11
    help='set coverage quantile below this quantile as background regions')
12
args <- parser$parse_args()
13
14
library(countreg)
15
16
ztnb.test <- function(x, mu, theta, max_x=2000) {
17
    # build a p-value table
18
    pvalue_table <- dztnbinom(1:max_x, mu=mu, theta=theta)
19
    pvalue_table <- rev(cumsum(rev(pvalue_table)))
20
    # calculate pvalues
21
    pvalues <- rep(pvalue_table[max_x], length(x))
22
    pvalues[x <= max_x] <- pvalue_table[x[x <= max_x]]
23
    qvalues <- p.adjust(pvalues, method='BH')
24
    return(qvalues)
25
}
26
message('read coverage file:', args$input_file)
27
if(args$input_file == '-'){
28
    bed <- read.table(file('stdin'), sep='\t', header=FALSE)
29
} else {
30
    bed <- read.table(args$input_file, sep='\t', header=FALSE)
31
}
32
message('number of bins:', nrow(bed))
33
# remove bins with 1 read
34
#bed <- bed[bed[, 5] > 1,]
35
x <- as.integer(bed[, 5])
36
bg = x[x < quantile(x, args$background)]
37
if(length(bg) > 0){
38
    message('fit ZTNB distribution')
39
    fit <- zerotrunc(bg ~ 1, dist='negbin')
40
    qvalues <- ztnb.test(x, mu=exp(fit$coefficients), theta=fit$theta)
41
    bg <- x[qvalues >= args$pvalue]
42
    message('fit ZTNB distribution')
43
    fit <- zerotrunc(bg ~ 1, dist='negbin')
44
    qvalues <- ztnb.test(x, mu=exp(fit$coefficients), theta=fit$theta)
45
    bed_sig <- bed[qvalues < args$pvalue,]
46
    bed_sig <- cbind(bed_sig, qvalues[qvalues < args$pvalue])
47
} else {
48
    # empty file
49
    bed_sig <- data.frame()
50
}
51
message('number of significant bins:', nrow(bed_sig))
52
message('write significant domains')
53
if(args$output_file == '-'){
54
    write.table(bed_sig, row.names=FALSE, col.names=FALSE, sep='\t', quote=FALSE)
55
} else {
56
    write.table(bed_sig, args$output_file, row.names=FALSE, col.names=FALSE, sep='\t', quote=FALSE)
57
}