#!/usr/bin/env python
from __future__ import print_function
import argparse, sys, os, errno
import logging
logging.basicConfig(level=logging.INFO, format='[%(asctime)s] [%(levelname)s] %(name)s: %(message)s')
# module-level default logger; __main__ replaces it with a per-command child logger
logger = logging.getLogger('preprocess')
command_handlers = {}
def command_handler(f):
command_handlers[f.__name__] = f
return f
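# A GTF line has 9 tab-separated columns: seqname, source, feature, start, end,
# score, strand, frame, attributes. The attribute column is a list of
# 'key "value";' pairs, e.g.: gene_id "ENSG00000223972"; transcript_id "ENST00000456328";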
def read_gtf(filename):
from ioutils import open_file_or_stdin
with open_file_or_stdin(filename) as fin:
        for lineno, line in enumerate(fin, start=1):
            c = line.strip().split('\t')
            # skip comment lines and any line without the 9 GTF columns
            if (len(c) < 9) or c[0].startswith('#'):
                continue
attrs = {}
for a in c[8].split(';')[:-1]:
a = a.strip()
i = a.find(' ')
key = a[:i]
val = a[(i + 1):].strip('"')
attrs[key] = val
gene_id = attrs.get('gene_id')
if gene_id is None:
raise ValueError('gene_id not found in GTF file at line {}'.format(lineno))
yield (c, attrs, line)
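# GTF coordinates are 1-based and end-inclusive, while BED coordinates are
# 0-based and half-open, hence the `int(c[3]) - 1` start conversion below.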
@command_handler
def extract_gene(args):
from ioutils import open_file_or_stdout
feature = args.feature
genes = {}
logger.info('read GTF file: ' + args.input_file)
for c, attrs, line in read_gtf(args.input_file):
if (feature is not None) and (c[2] != feature):
continue
gene_id = attrs.get('gene_id')
gene = genes.get(gene_id)
if gene is None:
gene = [c[0], int(c[3]) - 1, int(c[4]), gene_id, 0, c[6]]
genes[gene_id] = gene
else:
gene[1] = min(gene[1], int(c[3]) - 1)
gene[2] = max(gene[2], int(c[4]))
logger.info('number of genes: {}'.format(len(genes)))
logger.info('write BED file: ' + args.output_file)
with open_file_or_stdout(args.output_file) as fout:
for gene_id, gene in genes.items():
fout.write('\t'.join(map(str, gene)))
fout.write('\n')
@command_handler
def fix_gtf(args):
from ioutils import open_file_or_stdout
from collections import defaultdict
# strand of exons grouped by transcript_id
strands = defaultdict(list)
    lines = []
    logger.info('read GTF file: ' + args.input_file)
    for c, attrs, line in read_gtf(args.input_file):
        # keep only transcript and exon records; exon strands are collected to
        # detect transcripts with missing or inconsistent strand information
        if c[2] not in ('transcript', 'exon'):
            continue
        transcript_id = attrs.get('transcript_id')
        if transcript_id is None:
            raise ValueError('transcript_id not found in GTF line: {}'.format(line.strip()))
        lines.append((transcript_id, line))
        if c[2] == 'exon':
            strands[transcript_id].append(c[6])
invalid_transcripts = set()
for transcript_id, strands_tx in strands.items():
strands_tx = set(strands_tx)
# remove transcripts without strand information
if '.' in strands_tx:
invalid_transcripts.add(transcript_id)
# remove transcripts with exon on different strands
elif len(strands_tx) != 1:
invalid_transcripts.add(transcript_id)
logger.info('number of transcripts: {}'.format(len(strands)))
logger.info('number of invalid transcripts: {}'.format(len(invalid_transcripts)))
logger.info('write GTF file: ' + args.output_file)
with open_file_or_stdout(args.output_file) as fout:
for transcript_id, line in lines:
if transcript_id not in invalid_transcripts:
fout.write(line)
@command_handler
def transcript_counts(args):
import pysam
import numpy as np
from ioutils import open_file_or_stdout
from collections import defaultdict
logger.info('read input transcript BAM file: ' + args.input_file)
sam = pysam.AlignmentFile(args.input_file, "rb")
counts = defaultdict(int)
for read in sam:
counts[read.reference_name] += 1
logger.info('create output file: ' + args.output_file)
with open_file_or_stdout(args.output_file) as fout:
for key, val in counts.items():
fout.write('{}\t{}\n'.format(key, val))
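# example invocation (hypothetical file names):
#   preprocess.py transcript_counts -i transcriptome_aligned.bam -o counts.txt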
@command_handler
def extract_longest_transcript(args):
from ioutils import open_file_or_stdin, open_file_or_stdout
from collections import defaultdict
from functools import partial
feature = args.feature
genes = defaultdict(partial(defaultdict, int))
lines = []
logger.info('read gtf file: ' + args.input_file)
with open_file_or_stdin(args.input_file) as fin:
        for lineno, line in enumerate(fin, start=1):
            c = line.strip().split('\t')
            if (len(c) < 9) or c[0].startswith('#'):
                continue
if c[2] != feature:
lines.append(('#other#', line))
continue
attrs = {}
for a in c[8].split(';')[:-1]:
a = a.strip()
i = a.find(' ')
key = a[:i]
val = a[(i + 1):].strip('"')
attrs[key] = val
transcript_id = attrs.get('transcript_id')
if transcript_id is None:
raise ValueError('transcript_id not found in GTF file at line {}'.format(lineno))
gene_id = attrs.get('gene_id')
if gene_id is None:
raise ValueError('gene_id not found in GTF file at line {}'.format(lineno))
lines.append((transcript_id, line))
genes[gene_id][transcript_id] += int(c[4]) - int(c[3]) + 1
kept_transcripts = set()
kept_transcripts.add('#other#')
    for gene_id, gene in genes.items():
        max_length = 0
        max_transcript = None
        for transcript_id, length in gene.items():
            if length > max_length:
                max_length = length
                max_transcript = transcript_id
        # keep only the longest isoform of each gene
        kept_transcripts.add(max_transcript)
logger.info('number of genes: {}'.format(len(genes)))
logger.info('number of transcripts: {}'.format(sum(map(len, genes.values()))))
logger.info('number of longest transcripts: {}'.format(len(kept_transcripts) - 1))
logger.info('write output gtf file: ' + args.output_file)
with open_file_or_stdout(args.output_file) as fout:
for transcript_id, line in lines:
if transcript_id in kept_transcripts:
fout.write(line)
@command_handler
def gtf_to_transcript_table(args):
from ioutils import open_file_or_stdin, open_file_or_stdout
from collections import OrderedDict
feature = args.feature
default_transcript_type = args.transcript_type
default_gene_type = args.gene_type
fout = open_file_or_stdout(args.output_file)
with open_file_or_stdin(args.input_file) as fin:
transcripts = OrderedDict()
for line in fin:
# get GTF columns
            c = line.strip().split('\t')
            if (len(c) < 9) or c[0].startswith('#'):
                continue
if c[2] != feature:
continue
# GTF attributes
attrs = {}
for a in c[8].split(';')[:-1]:
a = a.strip()
i = a.find(' ')
key = a[:i]
val = a[(i + 1):].strip('"')
attrs[key] = val
# mitranscriptome
if c[1] == 'mitranscriptome':
if attrs['tcat'] == 'lncrna':
attrs['gene_type'] = 'lncRNA'
attrs['transcript_type'] = 'lncRNA'
elif attrs['tcat'] == 'tucp':
attrs['gene_type'] = 'tucpRNA'
attrs['transcript_type'] = 'tucpRNA'
# use transcript_id as transcript_name
if 'transcript_name' not in attrs:
attrs['transcript_name'] = attrs['transcript_id']
# use gene_id as gene_name
if 'gene_name' not in attrs:
attrs['gene_name'] = attrs['gene_id']
# set transcript_type if given
if default_transcript_type is not None:
attrs['transcript_type'] = default_transcript_type
else:
if 'transcript_type' not in attrs:
attrs['transcript_type'] = 'unknown'
# set gene_type if given
if default_gene_type is not None:
attrs['gene_type'] = default_gene_type
else:
if 'gene_type' not in attrs:
attrs['gene_type'] = 'unknown'
exon = [c[0], int(c[3]) - 1, int(c[4]), attrs['gene_id'], 0, c[6],
attrs['gene_id'], attrs['transcript_id'],
attrs['gene_name'], attrs['transcript_name'],
attrs['gene_type'], attrs['transcript_type'], c[1]]
transcript = transcripts.get(attrs['transcript_id'])
if transcript is None:
transcripts[attrs['transcript_id']] = exon
else:
if c[2] == 'exon':
transcript[1] = min(transcript[1], exon[1])
transcript[2] = max(transcript[2], exon[2])
header = ['chrom', 'start', 'end', 'name', 'score', 'strand',
'gene_id', 'transcript_id',
'gene_name', 'transcript_name',
'gene_type', 'transcript_type', 'source'
]
print('\t'.join(header), file=fout)
for transcript in transcripts.values():
print('\t'.join(str(a) for a in transcript), file=fout)
fout.close()
@command_handler
def extract_circrna_junction(args):
from Bio import SeqIO
from ioutils import open_file_or_stdin, open_file_or_stdout
anchor_size = args.anchor_size
logger.info('read sequence file: ' + args.input_file)
logger.info('create output file: ' + args.output_file)
fout = open_file_or_stdout(args.output_file)
with open_file_or_stdin(args.input_file) as fin:
for record in SeqIO.parse(fin, 'fasta'):
seq = str(record.seq)
if len(seq) < args.min_length:
continue
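            # simulate the back-splice junction by concatenating the last `s`
            # bases with the first `s` bases of the (circular) spliced sequence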
s = min(len(seq), anchor_size)
seq_id = record.id.split('|')[0]
fout.write('>{}\n'.format(seq_id))
fout.write(seq[-s:] + seq[:s])
fout.write('\n')
fout.close()
@command_handler
def filter_circrna_reads(args):
import pysam
import numpy as np
from ioutils import open_file_or_stdout, open_file_or_stdin
from collections import defaultdict
from copy import deepcopy
logger.info('read input SAM file: ' + args.input_file)
fin = open_file_or_stdin(args.input_file)
sam_in = pysam.AlignmentFile(fin, "r")
if sam_in.header is None:
raise ValueError('requires SAM header to get junction positions')
# get junction positions (middle of the sequences)
junction_positions = {}
for sq in sam_in.header['SQ']:
junction_positions[sq['SN']] = sq['LN']//2
logger.info('create output SAM file: ' + args.output_file)
fout = open_file_or_stdout(args.output_file)
sam_out = pysam.AlignmentFile(fout, 'w', template=sam_in)
sam_filtered = None
if args.filtered_file is not None:
logger.info('create filtered SAM file: ' + args.filtered_file)
sam_filtered = pysam.AlignmentFile(args.filtered_file, 'w', template=sam_in)
for read in sam_in:
filtered = False
if read.is_unmapped:
filtered = True
elif read.is_reverse:
filtered = True
else:
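            # a read supports the junction only if its alignment spans the
            # junction position (starts before it and ends at or after it)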
pos = junction_positions[read.reference_name]
if not (read.reference_start < pos <= read.reference_end):
                filtered = True
if not filtered:
sam_out.write(read)
elif sam_filtered is not None:
sam_filtered.write(read)
fin.close()
fout.close()
if sam_filtered is not None:
sam_filtered.close()
@command_handler
def chrom_sizes(args):
from Bio import SeqIO
from ioutils import open_file_or_stdin, open_file_or_stdout
    fout = open_file_or_stdout(args.output_file)
    with open_file_or_stdin(args.input_file) as fin:
        for record in SeqIO.parse(fin, 'fasta'):
            fout.write('{}\t{}\n'.format(record.id, len(record.seq)))
    fout.close()
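# feature names used by extract_domain_sequence/extract_feature_sequence are
# '|'-separated: gene_id|gene_type|gene_name|domain_id|transcript_id|start|end;
# for genomic features, gene_id itself encodes chrom_start_end_strand.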
@command_handler
def extract_domain_sequence(args):
from pyfaidx import Fasta
from Bio.Seq import Seq
from ioutils import open_file_or_stdout
import pandas as pd
fout = open_file_or_stdout(args.output_file)
fastas = {}
with open(args.input_file, 'r') as fin:
for lineno, line in enumerate(fin):
            feature = line.strip().split('\t')[0]
gene_id, gene_type, gene_name, domain_id, transcript_id, start, end = feature.split('|')
start = int(start)
end = int(end)
if gene_type == 'genomic':
gene_type = 'genome'
if gene_type not in fastas:
fastas[gene_type] = Fasta(os.path.join(args.genome_dir, 'fasta', gene_type + '.fa'))
if gene_type == 'genome':
chrom, gstart, gend, strand = gene_id.split('_')
gstart = int(gstart)
gend = int(gend)
seq = fastas[gene_type][chrom][gstart:gend].seq
if strand == '-':
seq = str(Seq(seq).reverse_complement())
else:
seq = fastas[gene_type][transcript_id][start:end].seq
seq = seq.upper()
fout.write('>{}\n'.format(feature))
fout.write(seq)
fout.write('\n')
fout.close()
@command_handler
def extract_feature_sequence(args):
from pyfaidx import Fasta
from Bio.Seq import Seq
from ioutils import open_file_or_stdout
    def pad_range(start, end, chrom_size, max_length):
        # symmetrically extend [start, end) to max_length bases, clipped to
        # [0, chrom_size); note: not currently called anywhere in this handler
        if (end - start) >= max_length:
            return start, end
        padding_left = (max_length - (end - start))//2
        new_start = max(0, start - padding_left)
        new_end = min(new_start + max_length, chrom_size)
        return new_start, new_end
fout = open_file_or_stdout(args.output_file)
fastas = {}
with open(args.input_file, 'r') as fin:
for lineno, line in enumerate(fin):
if lineno == 0:
continue
            feature = line.strip().split('\t')[0]
gene_id, gene_type, gene_name, domain_id, transcript_id, start, end = feature.split('|')
start = int(start)
end = int(end)
if gene_type == 'genomic':
gene_type = 'genome'
# load FASTA file
if gene_type not in fastas:
fastas[gene_type] = Fasta(os.path.join(args.genome_dir, 'fasta', gene_type + '.fa'))
if gene_type == 'genome':
chrom, gstart, gend, strand = gene_id.split('_')
gstart = int(gstart)
gend = int(gend)
seq = fastas[gene_type][chrom][gstart:gend].seq
if strand == '-':
seq = str(Seq(seq).reverse_complement())
else:
seq = fastas[gene_type][transcript_id][start:end].seq
seq = seq.upper()
fout.write('>{}\n'.format(feature))
fout.write(seq)
fout.write('\n')
fout.close()
@command_handler
def calculate_star_parameters(args):
from math import log2
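    # recommended formulas from the STAR manual:
    #   --genomeSAindexNbases = min(14, log2(GenomeLength)/2 - 1)
    #   --genomeChrBinNbits   = min(18, log2(GenomeLength/NumberOfReferences))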
genome_length = 0
n_seqs = 0
with open(args.input_file, 'r') as f:
for line in f:
genome_length += int(line.split('\t')[1])
n_seqs += 1
if args.parameter == 'genomeSAindexNbases':
print(min(14, int(log2(genome_length)//2) - 1))
elif args.parameter == 'genomeChrBinNbits':
print(min(18, int(log2(genome_length/n_seqs))))
@command_handler
def highlight_mature_mirna_location(args):
from Bio import SeqIO
from collections import defaultdict
from ioutils import open_file_or_stdout
logger.info('read hairpin sequences: ' + args.hairpin)
hairpin_seqs = {}
with open(args.hairpin, 'r') as f:
for record in SeqIO.parse(f, 'fasta'):
if (args.species is None) or (args.species == record.id.split('-')[0]):
hairpin_seqs[record.id] = str(record.seq)
logger.info('read mature sequences: ' + args.mature)
fout = open_file_or_stdout(args.output_file)
mature_positions = defaultdict(dict)
with open(args.mature, 'r') as f:
for record in SeqIO.parse(f, 'fasta'):
if not ((args.species is None) or (args.species == record.id.split('-')[0])):
continue
if record.id.endswith('-5p') or record.id.endswith('-3p'):
end_type = record.id.split('-')[-1]
hairpin_id = record.id[:-3].lower()
hairpin_seq = hairpin_seqs.get(hairpin_id)
if hairpin_seq is None:
                    logger.warning('hairpin sequence {} is not found in the hairpin file'.format(hairpin_id))
continue
mature_id = record.id
mature_seq = str(record.seq)
                pos = hairpin_seq.find(mature_seq)
                if pos < 0:
                    logger.warning('mature sequence {} is not found in its hairpin sequence'.format(mature_id))
                    continue
                mature_positions[hairpin_id][end_type] = (pos, pos + len(mature_seq))
for hairpin_id, hairpin_seq in hairpin_seqs.items():
mature_position = mature_positions.get(hairpin_id)
if mature_position is None:
mature_position = {}
fout.write('>{}\n'.format(hairpin_id))
offset = 0
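        # ANSI escape sequences: \x1B[31;1m = bold red (5p arm),
        # \x1B[32;1m = bold green (3p arm), \x1B[0m = reset attributes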
if '5p' in mature_position:
start, end = mature_position['5p']
fout.write('{0}\x1B[31;1m{1}\x1B[0m'.format(hairpin_seq[offset:start], hairpin_seq[start:end]))
offset = end
if '3p' in mature_position:
start, end = mature_position['3p']
fout.write('{0}\x1B[32;1m{1}\x1B[0m'.format(hairpin_seq[offset:start], hairpin_seq[start:end]))
offset = end
fout.write('{}\n'.format(hairpin_seq[offset:]))
fout.close()
@command_handler
def extract_mature_mirna_location(args):
from utils import read_gff, GFFRecord
from ioutils import open_file_or_stdin, open_file_or_stdout
from collections import OrderedDict, defaultdict
logger.info('read input GFF file: ' + args.input_file)
fin = open_file_or_stdin(args.input_file)
    logger.info('open output BED file: ' + args.output_file)
fout = open_file_or_stdout(args.output_file)
# key: precursor_id, value: precursor record
precursors = OrderedDict()
# key: precursor_id, value: list of mature records
matures = defaultdict(list)
# read features from GFF file
for record in read_gff(fin):
if record.feature == 'miRNA_primary_transcript':
precursors[record.attr['ID']] = record
elif record.feature == 'miRNA':
matures[record.attr['Derives_from']].append(record)
# get locations of mature miRNAs
for precursor_id, precursor in precursors.items():
for mature in matures[precursor_id]:
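            # convert 1-based inclusive GFF coordinates to 0-based half-open BED
            # offsets relative to the precursor; on the minus strand the offset
            # is measured from the precursor's 3' end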
if mature.strand == '+':
fout.write('{}\t{}\t{}\t{}\t0\t+\n'.format(
precursor.attr['Name'],
mature.start - precursor.start,
mature.end - precursor.start + 1,
mature.attr['Name']))
else:
fout.write('{}\t{}\t{}\t{}\t0\t+\n'.format(
precursor.attr['Name'],
precursor.end - mature.end,
precursor.end - mature.start + 1,
mature.attr['Name']
))
fin.close()
fout.close()
@command_handler
def gtf_to_bed(args):
from ioutils import open_file_or_stdin, open_file_or_stdout
    # honor --feature if given; default to 'exon'
    exon_feature = args.feature if args.feature is not None else 'exon'
# use transcript_id attribute as key
transcripts = {}
logger.info('read input GTF file: ' + args.input_file)
for lineno, record in enumerate(read_gtf(args.input_file)):
c, attrs, line = record
if c[2] == exon_feature:
gene_id = attrs.get('gene_id')
if gene_id is None:
raise ValueError('gene_id attribute not found in GTF file {}:{}'.format(args.input_file, lineno))
transcript_id = attrs.get('transcript_id')
if transcript_id is None:
raise ValueError('transcript_id attribute not found in GTF file {}:{}'.format(args.input_file, lineno))
transcript = transcripts.get(transcript_id)
if transcript is None:
# new transcript
transcript = {
'chrom': c[0],
'strand': c[6],
'gene_id': gene_id,
'gene_name': attrs.get('gene_name', gene_id),
'transcript_name': attrs.get('transcript_name', transcript_id),
'exons': []
}
transcripts[transcript_id] = transcript
# add a new exon
transcript['exons'].append((int(c[3]) - 1, int(c[4])))
fout = open_file_or_stdout(args.output_file)
bed_template = '{chrom}\t{start}\t{end}\t{name}\t0\t{strand}\t0\t0\t0\t{n_exons}\t{exon_sizes}\t{exon_starts}\n'
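    # BED12 conventions: exon_starts are offsets relative to the transcript
    # start, and exon_sizes/exon_starts are comma-separated, one entry per exon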
for transcript_id, transcript in transcripts.items():
# sort exons by start position
transcript['exons'] = sorted(transcript['exons'], key=lambda x: x[0])
transcript['n_exons'] = len(transcript['exons'])
transcript['start'] = transcript['exons'][0][0]
transcript['end'] = transcript['exons'][-1][1]
transcript['exon_starts'] = ','.join(str(e[0] - transcript['start']) for e in transcript['exons'])
transcript['exon_sizes'] = ','.join(str(e[1] - e[0]) for e in transcript['exons'])
        transcript['name'] = transcript['gene_id']
fout.write(bed_template.format(**transcript))
@command_handler
def calculate_gene_length(args):
import HTSeq
from collections import defaultdict
from functools import partial
import numpy as np
    from ioutils import open_file_or_stdin, open_file_or_stdout
    from tqdm import tqdm
    fin = open_file_or_stdin(args.input_file)
    gff = HTSeq.GFF_Reader(fin)
    # sum of exon lengths per transcript, grouped by gene
    exons = defaultdict(partial(defaultdict, int))
    for feature in tqdm(gff, unit='feature'):
        if feature.type == 'exon':
            exons[feature.attr['gene_id']][feature.attr['transcript_id']] += feature.iv.length
    # summarize per-gene length from isoform lengths ('merged_exon_length' would
    # require interval merging, which the per-transcript totals cannot provide)
    with open_file_or_stdout(args.output_file) as fout:
        for gene_id, transcripts in exons.items():
            lengths = np.array(list(transcripts.values()))
            if args.method == 'isoform_length_mean':
                gene_length = int(np.mean(lengths))
            elif args.method == 'isoform_length_median':
                gene_length = int(np.median(lengths))
            else:
                gene_length = int(lengths.max())
            fout.write('{}\t{}\n'.format(gene_id, gene_length))
@command_handler
def merge_data_frames(args):
import pandas as pd
import numpy as np
from ioutils import open_file_or_stdout
if (not args.on_index) and (args.on is None):
raise ValueError('argument --on is required if --on-index is not specified')
merged = None
for input_file in args.input_file:
logger.info('read input file: ' + input_file)
df = pd.read_table(input_file, sep=args.sep)
if merged is None:
merged = df
else:
if args.on_index:
merged = pd.merge(merged, df, how=args.how, left_index=True, right_index=True)
else:
merged = pd.merge(merged, df, how=args.how, on=args.on)
if args.fillna is not None:
merged.fillna(args.fillna, inplace=True)
logger.info('open output file: ' + args.output_file)
with open_file_or_stdout(args.output_file) as f:
merged.to_csv(f, sep=args.sep, header=True, index=args.on_index)
@command_handler
def genomecov(args):
import pysam
import h5py
import numpy as np
from tqdm import tqdm
logger.info('read input BAM/SAM file: ' + args.input_file)
sam = pysam.AlignmentFile(args.input_file, 'r')
logger.info('open output HDF5 file: ' + args.output_file)
fout = h5py.File(args.output_file, 'w')
    # per-base coverage over the whole alignment span of each read
    # (splices/deletions are not subtracted)
    coverage = {sq['SN']: np.zeros(sq['LN'], dtype=np.int32) for sq in sam.header['SQ']}
    for read in tqdm(sam, unit='read'):
        if read.is_unmapped or (read.reference_end is None):
            continue
        coverage[read.reference_name][read.reference_start:read.reference_end] += 1
    for name, data in coverage.items():
        fout.create_dataset(name, data=data, compression='gzip')
    fout.close()
@command_handler
def calc_rpkm(args):
import pandas as pd
import numpy as np
from ioutils import open_file_or_stdin, open_file_or_stdout
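    # RPKM = RPM * 1000 / feature_length (reads per kilobase per million);
    # the input matrix is assumed to contain RPM values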
matrix = pd.read_table(open_file_or_stdin(args.input_file), index_col=0, sep='\t')
feature_info = matrix.index.to_series().str.split('|', expand=True)
feature_info.columns = ['gene_id', 'gene_type', 'gene_name', 'feature_id', 'transcript_id', 'start', 'end']
feature_info['start'] = feature_info['start'].astype('int')
feature_info['end'] = feature_info['end'].astype('int')
feature_info['length'] = feature_info['end'] - feature_info['start']
matrix = 1000.0*matrix.div(feature_info['length'], axis=0)
matrix.to_csv(open_file_or_stdout(args.output_file), index=True, header=True, sep='\t', na_rep='NA')
@command_handler
def create_pseudo_genome(args):
from pyfaidx import Fasta, Faidx
import numpy as np
src = Fasta(args.input_file)
lengths = np.array([len(r) for r in src])
padded_lengths = lengths + args.padding
padded_seq = ''.join(['N']*args.padding)
cum_lengths = np.cumsum(padded_lengths)
chrom_indices = cum_lengths//args.max_chrom_size
chrom_starts = cum_lengths%args.max_chrom_size - padded_lengths
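    # sequences are concatenated (each followed by `padding` Ns) and packed into
    # pseudo-chromosomes of at most max_chrom_size bases; chrom_starts holds each
    # sequence's 0-based offset within its pseudo-chromosome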
prev_chrom_index = -1
# write FASTA file
logger.info('write FASTA file: ' + args.output_fasta)
with open(args.output_fasta, 'w') as fout:
for i, record in enumerate(src):
# new chromosome
if chrom_indices[i] != prev_chrom_index:
if i > 0:
fout.write('\n')
                fout.write('>c{}\n'.format(chrom_indices[i] + 1))
prev_chrom_index = chrom_indices[i]
# write sequence
fout.write(str(record))
fout.write(padded_seq)
# build fasta index
logger.info('build FASTA index')
dst_index = Faidx(args.output_fasta)
dst_index.build_index()
# chrom sizes
logger.info('write chrom sizes: ' + args.output_chrom_sizes)
fout = open(args.output_chrom_sizes, 'w')
with open(args.output_fasta + '.fai', 'r') as f:
for line in f:
c = line.strip().split('\t')
fout.write(c[0] + '\t' + c[1] + '\n')
fout.close()
# cytoband file
logger.info('write cytoband file: ' + args.output_cytoband)
fout = open(args.output_cytoband, 'w')
with open(args.output_fasta + '.fai', 'r') as f:
for i, line in enumerate(f):
c = line.strip().split('\t')
            fout.write('{0}\t0\t{1}\tp{2}.1\tgneg\n'.format(c[0], c[1], i + 1))
fout.close()
# annotation file
logger.info('write annotation file: ' + args.output_annotation)
with open(args.output_annotation, 'w') as fout:
for i, record in enumerate(src):
record = ['c%d'%(chrom_indices[i] + 1),
str(chrom_starts[i]),
str(chrom_starts[i] + lengths[i]),
record.name,
'0',
'+']
fout.write('\t'.join(record))
fout.write('\n')
# create junction annotation
if args.circular_rna:
junction_pos = (int(record[1]) + int(record[2]))//2
record[1] = str(junction_pos - 1)
record[2] = str(junction_pos + 1)
record[3] = record[3] + '|junction'
fout.write('\t'.join(record))
fout.write('\n')
@command_handler
def map_bam_to_pseudo_genome(args):
import pysam
def as_paired_reads(sam):
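        # assumes mates of a pair appear consecutively in the input
        # (e.g. name-sorted, with read1 immediately followed by read2)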
read1 = None
for read in sam:
if read.is_read1:
read1 = read
elif read.is_read2:
yield (read1, read)
else:
raise ValueError('input SAM is not paired-end')
chrom_sizes = {}
logger.info('read chrom sizes: ' + args.chrom_sizes)
with open(args.chrom_sizes, 'r') as fin:
for line in fin:
c = line.strip().split('\t')
chrom_sizes[c[0]] = int(c[1])
chrom_starts = {}
chrom_names = {}
with open(args.bed, 'r') as fin:
for line in fin:
c = line.strip().split('\t')
c[1] = int(c[1])
c[2] = int(c[2])
chrom_names[c[3]] = c[0]
chrom_starts[c[3]] = c[1]
logger.info('read input BAM file: ' + args.input_file)
sam = pysam.AlignmentFile(args.input_file, 'rb')
if args.circular_rna:
junction_positions = {sq['SN']:sq['LN']//2 for sq in sam.header['SQ']}
output_header = {
'HD': sam.header['HD'],
'SQ': [{'SN': chrom, 'LN': size} for chrom, size in chrom_sizes.items()],
'PG': [{'ID': 'map_bam_to_pseudo_genome',
'PN': 'map_bam_to_pseudo_genome',
'VN': '1.0',
'CL': ' '.join(sys.argv)}
]
}
logger.info('create output BAM file: ' + args.output_file)
output_sam = pysam.AlignmentFile(args.output_file, 'wb', header=output_header)
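    # lift a read from transcript coordinates to pseudo-genome coordinates by
    # renaming its reference and shifting ref_pos by the transcript's offset
    # (ref_pos from to_dict() is 1-based SAM text; chrom_starts are 0-based
    # offsets, so the sum remains a valid 1-based position)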
def map_read(read):
d = read.to_dict()
ref_name = d['ref_name']
d['ref_name'] = chrom_names[ref_name]
d['ref_pos'] = str(int(d['ref_pos']) + chrom_starts[ref_name])
return pysam.AlignedSegment.from_dict(d, output_sam.header)
strandness = args.strandness
is_circular_rna = args.circular_rna
n_reads = 0
if args.paired_end:
for read1, read2 in as_paired_reads(sam):
if (read1.is_unmapped) or (read2.is_unmapped):
continue
if read1.reference_name != read2.reference_name:
continue
if is_circular_rna:
                # keep only pairs whose orientation matches the library strandness
                # (forward: read1 forward / read2 reverse; reverse: the opposite)
                if (strandness == 'forward') and not ((not read1.is_reverse) and read2.is_reverse):
                    continue
                if (strandness == 'reverse') and not ((not read2.is_reverse) and read1.is_reverse):
                    continue
pos = junction_positions[read1.reference_name]
if not (read1.reference_start < pos <= read2.reference_end):
continue
output_sam.write(map_read(read1))
output_sam.write(map_read(read2))
n_reads += 1
else:
for read in sam:
if read.is_unmapped:
continue
output_sam.write(map_read(read))
n_reads += 1
logger.info('number of written reads: {}'.format(n_reads))
output_sam.close()
if __name__ == '__main__':
main_parser = argparse.ArgumentParser(description='Preprocessing module')
subparsers = main_parser.add_subparsers(dest='command')
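    # each subcommand is dispatched to the handler registered via @command_handler,
    # e.g. (hypothetical file names): preprocess.py extract_gene -i genes.gtf -o genes.bed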
parser = subparsers.add_parser('transcript_counts',
        help='count reads that belong to each transcript')
parser.add_argument('--input-file', '-i', type=str, required=True,
help='input transcript BAM file')
parser.add_argument('--output-file', '-o', type=str, default='-',
help='output transcript counts file')
parser = subparsers.add_parser('gtf_to_transcript_table')
parser.add_argument('--input-file', '-i', type=str, default='-',
help='input GTF file')
parser.add_argument('--output-file', '-o', type=str, default='-',
help='output table file')
parser.add_argument('--feature', type=str,
help='feature to use in input GTF file (Column 3)')
parser.add_argument('--gene-type', type=str,
help='gene type to set if "gene_type" attribute is not available in GTF file')
    parser.add_argument('--transcript-type', type=str,
        help='transcript type to set if "transcript_type" attribute is not available in GTF file')
parser = subparsers.add_parser('extract_longest_transcript')
parser.add_argument('--input-file', '-i', type=str, default='-',
help='input GTF file')
parser.add_argument('--output-file', '-o', type=str, default='-',
        help='output GTF file')
parser.add_argument('--feature', type=str, default='exon',
help='feature to use in input GTF file (Column 3)')
parser = subparsers.add_parser('fix_gtf',
help='fix problems in a GTF file by removing invalid transcripts')
parser.add_argument('--input-file', '-i', type=str, default='-',
help='input GTF file')
parser.add_argument('--output-file', '-o', type=str, default='-',
        help='output GTF file')
parser.add_argument('--feature', type=str, default='exon',
help='feature to use in input GTF file (Column 3)')
parser = subparsers.add_parser('extract_gene',
help='extract gene regions from a GTF file')
parser.add_argument('--input-file', '-i', type=str, default='-',
help='input GTF file')
parser.add_argument('--output-file', '-o', type=str, default='-',
help='output BED file')
parser.add_argument('--feature', type=str,
help='feature to use in input GTF file (Column 3)')
parser = subparsers.add_parser('gtf_to_bed',
help='convert transcripts from GTF to BED')
parser.add_argument('--input-file', '-i', type=str, default='-',
help='input GTF file')
parser.add_argument('--output-file', '-o', type=str, default='-',
help='output BED file')
parser.add_argument('--feature', type=str,
help='features to treat as exons in input GTF file (Column 3)')
parser = subparsers.add_parser('extract_circrna_junction',
help='extract circular RNA junction sequences from spliced sequences')
parser.add_argument('--input-file', '-i', type=str, default='-',
help='spliced sequence file')
parser.add_argument('--output-file', '-o', type=str, default='-',
help='junction sequence file')
parser.add_argument('--anchor-size', '-s', type=int, default=50,
help='anchor length from both ends')
parser.add_argument('--min-length', type=int, default=50,
help='minimal length to keep a spliced sequence')
parser = subparsers.add_parser('filter_circrna_reads',
help='filter out reads not crossing circRNA junctions')
parser.add_argument('--input-file', '-i', type=str, default='-',
help='input SAM file')
parser.add_argument('--output-file', '-o', type=str, default='-',
help='output SAM file')
parser.add_argument('--filtered-file', '-u', type=str,
help='write filtered SAM records to file')
parser = subparsers.add_parser('chrom_sizes',
help='create chrom sizes file from FASTA file')
parser.add_argument('--input-file', '-i', type=str, default='-',
help='input FASTA')
parser.add_argument('--output-file', '-o', type=str, default='-',
help='output chrom sizes file')
parser = subparsers.add_parser('extract_feature_sequence',
help='extract sequences using feature names')
parser.add_argument('--input-file', '-i', type=str, required=True,
help='feature file with feature names in the first column')
parser.add_argument('--genome-dir', '-g', type=str, required=True,
help='genome directory where fasta/${rna_type}.fa contains sequences')
parser.add_argument('--output-file', '-o', type=str, default='-',
help='output file in FASTA format')
parser.add_argument('--padding', type=int, default=200,
        help='pad features on both ends up to this total length')
parser = subparsers.add_parser('extract_domain_sequence',
help='extract sequences using feature names')
parser.add_argument('--input-file', '-i', type=str, required=True,
help='feature file with feature names in the first column')
parser.add_argument('--genome-dir', '-g', type=str, required=True,
help='genome directory where fasta/${rna_type}.fa contains sequences')
parser.add_argument('--flanking', type=int, default=100)
parser.add_argument('--output-file', '-o', type=str, default='-',
help='output file in FASTA format')
parser = subparsers.add_parser('calculate_star_parameters',
help='calculate --genomeSAindexNbases and --genomeChrBinNbits for STAR')
parser.add_argument('--input-file', '-i', type=str, required=True,
help='FASTA index (.fai) file')
parser.add_argument('--parameter', '-p', type=str, required=True,
choices=('genomeSAindexNbases', 'genomeChrBinNbits'),
help='parameter to calculate')
parser = subparsers.add_parser('highlight_mature_mirna_location',
help='get mature miRNA location in precursor miRNA in miRBase')
parser.add_argument('--mature', type=str, required=True,
help='FASTA file of mature sequences')
parser.add_argument('--hairpin', type=str, required=True,
help='FASTA file of hairpin sequences')
parser.add_argument('--output-file', '-o', type=str, default='-')
parser.add_argument('--species', type=str)
parser = subparsers.add_parser('extract_mature_mirna_location',
help='Extract mature miRNA location in precursor miRNA in miRBase')
parser.add_argument('--input-file', '-i', type=str, required=True,
help='miRBase GFF3 file')
parser.add_argument('--output-file', '-o', type=str, default='-',
help='BED file with 6 columns: pre-miRNA, start, end, mature miRNA, 0, +')
parser = subparsers.add_parser('calculate_gene_length',
help='calculate effective gene length')
parser.add_argument('--input-file', '-i', type=str, default='-',
help='GTF/GFF file')
parser.add_argument('--method', '-m', type=str,
choices=('isoform_length_mean', 'isoform_length_median', 'isoform_length_max', 'merged_exon_length'))
parser.add_argument('--output-file', '-o', type=str,
        help='output tab-delimited file with two columns: gene_id, length')
parser = subparsers.add_parser('merge_data_frames')
parser.add_argument('--input-file', '-i', type=str, action='append', required=True,
help='input data matrix')
parser.add_argument('--sep', type=str, default='\t', help='column delimiter')
parser.add_argument('--how', type=str, default='inner',
choices=('inner', 'outer', 'left', 'right'))
parser.add_argument('--on', type=str, help='column name to join on')
parser.add_argument('--on-index', action='store_true', default=False, help='join on index')
parser.add_argument('--fillna', type=str)
parser.add_argument('--output-file', '-o', type=str, default='-',
help='output merged data frame')
parser = subparsers.add_parser('genomecov')
parser.add_argument('--input-file', '-i', type=str, required=True,
help='input BAM/SAM file')
parser.add_argument('--output-file', '-o', type=str, required=True,
help='output HDF5 file')
parser = subparsers.add_parser('calc_rpkm')
parser.add_argument('--input-file', '-i', type=str, default='-',
help='input expression (RPM) matrix file')
parser.add_argument('--output-file', '-o', type=str, default='-',
help='output expression (RPKM) matrix file')
parser = subparsers.add_parser('create_pseudo_genome')
parser.add_argument('--input-file', '-i', type=str, required=True,
help='input transcript FASTA file (with FASTA index)')
parser.add_argument('--max-chrom-size', type=int, default=100000000,
help='maximum size for each chromosome')
parser.add_argument('--padding', type=int, default=50,
help='number of N bases padded after each sequence')
parser.add_argument('--circular-rna', action='store_true',
help='write circular RNA junction to annotation file')
parser.add_argument('--output-fasta', type=str, required=True,
help='output FASTA and index file')
parser.add_argument('--output-chrom-sizes', type=str, required=True,
help='output chrom sizes file')
parser.add_argument('--output-annotation', type=str, required=True,
help='output annotation BED file')
parser.add_argument('--output-cytoband', type=str, required=True,
help='output cytoband file')
parser = subparsers.add_parser('map_bam_to_pseudo_genome')
parser.add_argument('--input-file', '-i', type=str, required=True,
help='input BAM file')
parser.add_argument('--bed', type=str, required=True,
help='input BED file of genes in the pseudo-genome')
parser.add_argument('--chrom-sizes', type=str, required=True,
help='input chrom sizes of the pseudo-genome')
parser.add_argument('--strandness', '-s', type=str, default='forward',
help='strandness for circular RNA')
parser.add_argument('--paired-end', action='store_true',
help='input is paired-end reads')
parser.add_argument('--circular-rna', action='store_true',
help='requires the read to be mapped to circular RNA junctions')
parser.add_argument('--output-file', '-o', type=str, required=True,
help='output BAM file')
args = main_parser.parse_args()
if args.command is None:
main_parser.print_help()
sys.exit(1)
logger = logging.getLogger('preprocess.' + args.command)
try:
command_handlers.get(args.command)(args)
except BrokenPipeError:
pass
except KeyboardInterrupt:
pass