--- a +++ b/exseek/scripts/statistics.py @@ -0,0 +1,129 @@ +#! /usr/bin/env python +from __future__ import print_function +import argparse, sys, os, errno +import logging +logging.basicConfig(level=logging.INFO, format='[%(asctime)s] [%(levelname)s] %(name)s: %(message)s') + +command_handlers = {} +def command_handler(f): + command_handlers[f.__name__] = f + return f + +@command_handler +def read_length_hist(args): + import pysam + import numpy as np + from ioutils import open_file_or_stdout + + logger.info('read input BAM/SAM file: ' + args.input_file) + sam = pysam.AlignmentFile(args.input_file, "rb") + counts_ref = np.zeros(args.max_length, dtype=np.int64) + counts_query = np.zeros(args.max_length, dtype=np.int64) + max_length = args.max_length + for read in sam: + counts_query[read.query_length] += 1 + counts_ref[min(read.reference_length, max_length - 1)] += 1 + + logger.info('create output file: ' + args.output_file) + with open_file_or_stdout(args.output_file) as f: + f.write('length\tquery\treference\n') + for i in range(args.max_length): + f.write('{}\t{}\t{}\n'.format(i, counts_query[i], counts_ref[i])) + +@command_handler +def read_duplicate_hist(args): + import pysam + import numpy as np + from ioutils import open_file_or_stdout + + bin_size = args.bin_size + max_length = args.max_length + n = max_length//bin_size + bounds = np.arange(0, (n + 1)*bin_size, bin_size) + + logger.info('read chrom sizes: ' + args.chrom_sizes_file) + chrom_sizes = {} + with open(args.chrom_sizes_file, 'r') as f: + for line in f: + c = line.strip().split('\t') + chrom_sizes[c[0]] = int(c[1]) + logger.info('read input BAM/SAM file: ' + args.input_file) + sam = pysam.AlignmentFile(args.input_file, "rb") + dup_counts = np.zeros(n + 1, dtype=np.int64) + tot_counts = np.zeros(n + 1, dtype=np.int64) + max_length = args.max_length + for read in sam: + index = min(chrom_sizes[read.reference_name]//bin_size, n) + if read.is_duplicate: + dup_counts[index] += 1 + tot_counts[index] += 1 + + logger.info('create output file: ' + args.output_file) + with open_file_or_stdout(args.output_file) as f: + f.write('bin\tduplicates\ttotal\n') + for i in range(n + 1): + f.write('{}\t{}\t{}\n'.format(bounds[i], dup_counts[i], tot_counts[i])) + +@command_handler +def fragment_length_hist(args): + import pysam + import numpy as np + from ioutils import open_file_or_stdout + + logger.info('read input BAM/SAM file: ' + args.input_file) + sam = pysam.AlignmentFile(args.input_file, "rb") + max_length = args.max_length + counts = np.zeros(max_length + 1, dtype=np.int64) + read1 = None + for read in sam: + if (not read.is_paired) or (not read.is_proper_pair): + continue + if read.is_read1: + read1 = read + elif read.is_read2: + if read.query_name != read1.query_name: + continue + length = read.reference_end - read1.reference_start + counts[min(length, max_length)] += 1 + + with open_file_or_stdout(args.output_file) as f: + f.write('fragment_length\tcounts\n') + for i in range(max_length + 1): + f.write('{}\t{}\n'.format(i, counts[i])) + + +if __name__ == '__main__': + main_parser = argparse.ArgumentParser(description='Statistics of exRNA datasets') + subparsers = main_parser.add_subparsers(dest='command') + + parser = subparsers.add_parser('read_length_hist', + help='calculate a histogram for read length distribution') + parser.add_argument('--input-file', '-i', type=str, required=True, help='input BAM/SAM file') + parser.add_argument('--output-file', '-o', type=str, default='-', help='output histogram file') + parser.add_argument('--max-length', '-l', type=int, default=10000) + + parser = subparsers.add_parser('read_duplicate_hist', + help='calculate a histogram for duplicate reads fraction vs read length') + parser.add_argument('--input-file', '-i', type=str, required=True, help='input BAM/SAM file') + parser.add_argument('--output-file', '-o', type=str, default='-', help='output histogram file') + parser.add_argument('--chrom-sizes-file', type=str, required=True, + help='file containing chromosome sizes') + parser.add_argument('--bin-size', type=int, default=10, + help='bin size for read length') + parser.add_argument('--max-length', type=int, default=10000, + help='upper bound for bins') + + parser = subparsers.add_parser('fragment_length_hist', + help='calculate a histogram for fragment length in a paired-end BAM/SAM file') + parser.add_argument('--input-file', '-i', type=str, required=True, help='input BAM/SAM file') + parser.add_argument('--output-file', '-o', type=str, default='-', help='output histogram file') + parser.add_argument('--max-length', type=int, default=1000, + help='upper bound for fragment lengths') + + args = main_parser.parse_args() + if args.command is None: + main_parser.print_help() + sys.exit(1) + logger = logging.getLogger('statistics.' + args.command) + + command_handlers.get(args.command)(args) \ No newline at end of file