[41c1e8]: / exseek / scripts / statistics.py

Download this file

129 lines (111 with data), 5.2 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#! /usr/bin/env python
from __future__ import print_function
import argparse, sys, os, errno
import logging
logging.basicConfig(level=logging.INFO, format='[%(asctime)s] [%(levelname)s] %(name)s: %(message)s')
command_handlers = {}
def command_handler(f):
command_handlers[f.__name__] = f
return f
@command_handler
def read_length_hist(args):
import pysam
import numpy as np
from ioutils import open_file_or_stdout
logger.info('read input BAM/SAM file: ' + args.input_file)
sam = pysam.AlignmentFile(args.input_file, "rb")
counts_ref = np.zeros(args.max_length, dtype=np.int64)
counts_query = np.zeros(args.max_length, dtype=np.int64)
max_length = args.max_length
for read in sam:
counts_query[read.query_length] += 1
counts_ref[min(read.reference_length, max_length - 1)] += 1
logger.info('create output file: ' + args.output_file)
with open_file_or_stdout(args.output_file) as f:
f.write('length\tquery\treference\n')
for i in range(args.max_length):
f.write('{}\t{}\t{}\n'.format(i, counts_query[i], counts_ref[i]))
@command_handler
def read_duplicate_hist(args):
import pysam
import numpy as np
from ioutils import open_file_or_stdout
bin_size = args.bin_size
max_length = args.max_length
n = max_length//bin_size
bounds = np.arange(0, (n + 1)*bin_size, bin_size)
logger.info('read chrom sizes: ' + args.chrom_sizes_file)
chrom_sizes = {}
with open(args.chrom_sizes_file, 'r') as f:
for line in f:
c = line.strip().split('\t')
chrom_sizes[c[0]] = int(c[1])
logger.info('read input BAM/SAM file: ' + args.input_file)
sam = pysam.AlignmentFile(args.input_file, "rb")
dup_counts = np.zeros(n + 1, dtype=np.int64)
tot_counts = np.zeros(n + 1, dtype=np.int64)
max_length = args.max_length
for read in sam:
index = min(chrom_sizes[read.reference_name]//bin_size, n)
if read.is_duplicate:
dup_counts[index] += 1
tot_counts[index] += 1
logger.info('create output file: ' + args.output_file)
with open_file_or_stdout(args.output_file) as f:
f.write('bin\tduplicates\ttotal\n')
for i in range(n + 1):
f.write('{}\t{}\t{}\n'.format(bounds[i], dup_counts[i], tot_counts[i]))
@command_handler
def fragment_length_hist(args):
import pysam
import numpy as np
from ioutils import open_file_or_stdout
logger.info('read input BAM/SAM file: ' + args.input_file)
sam = pysam.AlignmentFile(args.input_file, "rb")
max_length = args.max_length
counts = np.zeros(max_length + 1, dtype=np.int64)
read1 = None
for read in sam:
if (not read.is_paired) or (not read.is_proper_pair):
continue
if read.is_read1:
read1 = read
elif read.is_read2:
if read.query_name != read1.query_name:
continue
length = read.reference_end - read1.reference_start
counts[min(length, max_length)] += 1
with open_file_or_stdout(args.output_file) as f:
f.write('fragment_length\tcounts\n')
for i in range(max_length + 1):
f.write('{}\t{}\n'.format(i, counts[i]))
if __name__ == '__main__':
main_parser = argparse.ArgumentParser(description='Statistics of exRNA datasets')
subparsers = main_parser.add_subparsers(dest='command')
parser = subparsers.add_parser('read_length_hist',
help='calculate a histogram for read length distribution')
parser.add_argument('--input-file', '-i', type=str, required=True, help='input BAM/SAM file')
parser.add_argument('--output-file', '-o', type=str, default='-', help='output histogram file')
parser.add_argument('--max-length', '-l', type=int, default=10000)
parser = subparsers.add_parser('read_duplicate_hist',
help='calculate a histogram for duplicate reads fraction vs read length')
parser.add_argument('--input-file', '-i', type=str, required=True, help='input BAM/SAM file')
parser.add_argument('--output-file', '-o', type=str, default='-', help='output histogram file')
parser.add_argument('--chrom-sizes-file', type=str, required=True,
help='file containing chromosome sizes')
parser.add_argument('--bin-size', type=int, default=10,
help='bin size for read length')
parser.add_argument('--max-length', type=int, default=10000,
help='upper bound for bins')
parser = subparsers.add_parser('fragment_length_hist',
help='calculate a histogram for fragment length in a paired-end BAM/SAM file')
parser.add_argument('--input-file', '-i', type=str, required=True, help='input BAM/SAM file')
parser.add_argument('--output-file', '-o', type=str, default='-', help='output histogram file')
parser.add_argument('--max-length', type=int, default=1000,
help='upper bound for fragment lengths')
args = main_parser.parse_args()
if args.command is None:
main_parser.print_help()
sys.exit(1)
logger = logging.getLogger('statistics.' + args.command)
command_handlers.get(args.command)(args)