|
a |
|
b/exseek/scripts/call_domains.R |
|
|
1 |
#! /usr/bin/env Rscript |
|
|
2 |
suppressPackageStartupMessages(library("argparse")) |
|
|
3 |
parser <- ArgumentParser(description='Call domains with significant coverage') |
|
|
4 |
parser$add_argument('-p', '--pvalue', type='double', default=0.05, |
|
|
5 |
help='adjusted p-value threshold for defining significant domains', metavar='NUMBER') |
|
|
6 |
parser$add_argument('-i', '--input-file', type='character', default='-', |
|
|
7 |
help='input BED file with covearge in the 5th column') |
|
|
8 |
parser$add_argument('-o', '--output-file', type='character', default='-', |
|
|
9 |
help='output BED file with adjusted p-value in the last column') |
|
|
10 |
parser$add_argument('-b', '--background', type='double', default=0.99, |
|
|
11 |
help='set coverage quantile below this quantile as background regions') |
|
|
12 |
args <- parser$parse_args() |
|
|
13 |
|
|
|
14 |
library(countreg) |
|
|
15 |
|
|
|
16 |
ztnb.test <- function(x, mu, theta, max_x=2000) { |
|
|
17 |
# build a p-value table |
|
|
18 |
pvalue_table <- dztnbinom(1:max_x, mu=mu, theta=theta) |
|
|
19 |
pvalue_table <- rev(cumsum(rev(pvalue_table))) |
|
|
20 |
# calculate pvalues |
|
|
21 |
pvalues <- rep(pvalue_table[max_x], length(x)) |
|
|
22 |
pvalues[x <= max_x] <- pvalue_table[x[x <= max_x]] |
|
|
23 |
qvalues <- p.adjust(pvalues, method='BH') |
|
|
24 |
return(qvalues) |
|
|
25 |
} |
|
|
26 |
message('read coverage file:', args$input_file) |
|
|
27 |
if(args$input_file == '-'){ |
|
|
28 |
bed <- read.table(file('stdin'), sep='\t', header=FALSE) |
|
|
29 |
} else { |
|
|
30 |
bed <- read.table(args$input_file, sep='\t', header=FALSE) |
|
|
31 |
} |
|
|
32 |
message('number of bins:', nrow(bed)) |
|
|
33 |
# remove bins with 1 read |
|
|
34 |
#bed <- bed[bed[, 5] > 1,] |
|
|
35 |
x <- as.integer(bed[, 5]) |
|
|
36 |
bg = x[x < quantile(x, args$background)] |
|
|
37 |
if(length(bg) > 0){ |
|
|
38 |
message('fit ZTNB distribution') |
|
|
39 |
fit <- zerotrunc(bg ~ 1, dist='negbin') |
|
|
40 |
qvalues <- ztnb.test(x, mu=exp(fit$coefficients), theta=fit$theta) |
|
|
41 |
bg <- x[qvalues >= args$pvalue] |
|
|
42 |
message('fit ZTNB distribution') |
|
|
43 |
fit <- zerotrunc(bg ~ 1, dist='negbin') |
|
|
44 |
qvalues <- ztnb.test(x, mu=exp(fit$coefficients), theta=fit$theta) |
|
|
45 |
bed_sig <- bed[qvalues < args$pvalue,] |
|
|
46 |
bed_sig <- cbind(bed_sig, qvalues[qvalues < args$pvalue]) |
|
|
47 |
} else { |
|
|
48 |
# empty file |
|
|
49 |
bed_sig <- data.frame() |
|
|
50 |
} |
|
|
51 |
message('number of significant bins:', nrow(bed_sig)) |
|
|
52 |
message('write significant domains') |
|
|
53 |
if(args$output_file == '-'){ |
|
|
54 |
write.table(bed_sig, row.names=FALSE, col.names=FALSE, sep='\t', quote=FALSE) |
|
|
55 |
} else { |
|
|
56 |
write.table(bed_sig, args$output_file, row.names=FALSE, col.names=FALSE, sep='\t', quote=FALSE) |
|
|
57 |
} |