Switch to unified view

a b/exseek/scripts/statistics.py
1
#! /usr/bin/env python
2
from __future__ import print_function
3
import argparse, sys, os, errno
4
import logging
5
logging.basicConfig(level=logging.INFO, format='[%(asctime)s] [%(levelname)s] %(name)s: %(message)s')
6
7
command_handlers = {}
8
def command_handler(f):
9
    command_handlers[f.__name__] = f
10
    return f
11
12
@command_handler
13
def read_length_hist(args):
14
    import pysam
15
    import numpy as np
16
    from ioutils import open_file_or_stdout
17
18
    logger.info('read input BAM/SAM file: ' + args.input_file)
19
    sam = pysam.AlignmentFile(args.input_file, "rb")
20
    counts_ref = np.zeros(args.max_length, dtype=np.int64)
21
    counts_query = np.zeros(args.max_length, dtype=np.int64)
22
    max_length = args.max_length
23
    for read in sam:
24
        counts_query[read.query_length] += 1
25
        counts_ref[min(read.reference_length, max_length - 1)] += 1
26
    
27
    logger.info('create output file: ' + args.output_file)
28
    with open_file_or_stdout(args.output_file) as f:
29
        f.write('length\tquery\treference\n')
30
        for i in range(args.max_length):
31
            f.write('{}\t{}\t{}\n'.format(i, counts_query[i], counts_ref[i]))
32
33
@command_handler
34
def read_duplicate_hist(args):
35
    import pysam
36
    import numpy as np
37
    from ioutils import open_file_or_stdout
38
39
    bin_size = args.bin_size
40
    max_length = args.max_length
41
    n = max_length//bin_size
42
    bounds = np.arange(0, (n + 1)*bin_size, bin_size)
43
44
    logger.info('read chrom sizes: ' + args.chrom_sizes_file)
45
    chrom_sizes = {}
46
    with open(args.chrom_sizes_file, 'r') as f:
47
        for line in f:
48
            c = line.strip().split('\t')
49
            chrom_sizes[c[0]] = int(c[1])
50
    logger.info('read input BAM/SAM file: ' + args.input_file)
51
    sam = pysam.AlignmentFile(args.input_file, "rb")
52
    dup_counts = np.zeros(n + 1, dtype=np.int64)
53
    tot_counts = np.zeros(n + 1, dtype=np.int64)
54
    max_length = args.max_length
55
    for read in sam:
56
        index = min(chrom_sizes[read.reference_name]//bin_size, n)
57
        if read.is_duplicate:
58
            dup_counts[index] += 1
59
        tot_counts[index] += 1
60
    
61
    logger.info('create output file: ' + args.output_file)
62
    with open_file_or_stdout(args.output_file) as f:
63
        f.write('bin\tduplicates\ttotal\n')
64
        for i in range(n + 1):
65
            f.write('{}\t{}\t{}\n'.format(bounds[i], dup_counts[i], tot_counts[i]))
66
67
@command_handler
68
def fragment_length_hist(args):
69
    import pysam
70
    import numpy as np
71
    from ioutils import open_file_or_stdout
72
73
    logger.info('read input BAM/SAM file: ' + args.input_file)
74
    sam = pysam.AlignmentFile(args.input_file, "rb")
75
    max_length = args.max_length
76
    counts = np.zeros(max_length + 1, dtype=np.int64)
77
    read1 = None
78
    for read in sam:
79
        if (not read.is_paired) or (not read.is_proper_pair):
80
            continue
81
        if read.is_read1:
82
            read1 = read
83
        elif read.is_read2:
84
            if read.query_name != read1.query_name:
85
                continue
86
            length = read.reference_end - read1.reference_start
87
            counts[min(length, max_length)] += 1
88
    
89
    with open_file_or_stdout(args.output_file) as f:
90
        f.write('fragment_length\tcounts\n')
91
        for i in range(max_length + 1):
92
            f.write('{}\t{}\n'.format(i, counts[i]))
93
        
94
95
if __name__ == '__main__':
96
    main_parser = argparse.ArgumentParser(description='Statistics of exRNA datasets')
97
    subparsers = main_parser.add_subparsers(dest='command')
98
99
    parser = subparsers.add_parser('read_length_hist', 
100
        help='calculate a histogram for read length distribution')
101
    parser.add_argument('--input-file', '-i', type=str, required=True, help='input BAM/SAM file')
102
    parser.add_argument('--output-file', '-o', type=str, default='-', help='output histogram file')
103
    parser.add_argument('--max-length', '-l', type=int, default=10000)
104
105
    parser = subparsers.add_parser('read_duplicate_hist', 
106
        help='calculate a histogram for duplicate reads fraction vs read length')
107
    parser.add_argument('--input-file', '-i', type=str, required=True, help='input BAM/SAM file')
108
    parser.add_argument('--output-file', '-o', type=str, default='-', help='output histogram file')
109
    parser.add_argument('--chrom-sizes-file', type=str, required=True, 
110
        help='file containing chromosome sizes')
111
    parser.add_argument('--bin-size', type=int, default=10, 
112
        help='bin size for read length')
113
    parser.add_argument('--max-length', type=int, default=10000,
114
        help='upper bound for bins')
115
116
    parser = subparsers.add_parser('fragment_length_hist',
117
        help='calculate a histogram for fragment length in a paired-end BAM/SAM file')
118
    parser.add_argument('--input-file', '-i', type=str, required=True, help='input BAM/SAM file')
119
    parser.add_argument('--output-file', '-o', type=str, default='-', help='output histogram file')
120
    parser.add_argument('--max-length', type=int, default=1000,
121
        help='upper bound for fragment lengths')
122
    
123
    args = main_parser.parse_args()
124
    if args.command is None:
125
        main_parser.print_help()
126
        sys.exit(1)
127
    logger = logging.getLogger('statistics.' + args.command)
128
129
    command_handlers.get(args.command)(args)