#!/usr/bin/env python
from __future__ import print_function
import argparse, sys, os, errno
import logging
logging.basicConfig(level=logging.INFO, format='[%(asctime)s] [%(levelname)s] %(name)s: %(message)s')
# module-level default logger; __main__ replaces it with a per-command child logger
logger = logging.getLogger('preprocess')
command_handlers = {}
def command_handler(f):
command_handlers[f.__name__] = f
return f
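# A GTF line has 9 tab-separated columns: seqname, source, feature, start, end,
# score, strand, frame, attributes. The attribute column is a list of
# 'key "value";' pairs, e.g.: gene_id "ENSG00000223972"; transcript_id "ENST00000456328";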
def read_gtf(filename):
from ioutils import open_file_or_stdin
with open_file_or_stdin(filename) as fin:
        for lineno, line in enumerate(fin, start=1):
            c = line.strip().split('\t')
            # skip comment lines and any line without the 9 GTF columns
            if (len(c) < 9) or c[0].startswith('#'):
                continue
attrs = {}
for a in c[8].split(';')[:-1]:
a = a.strip()
i = a.find(' ')
key = a[:i]
val = a[(i + 1):].strip('"')
attrs[key] = val
gene_id = attrs.get('gene_id')
if gene_id is None:
raise ValueError('gene_id not found in GTF file at line {}'.format(lineno))
yield (c, attrs, line)
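# GTF coordinates are 1-based and end-inclusive, while BED coordinates are
# 0-based and half-open, hence the `int(c[3]) - 1` start conversion below.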
@command_handler
def extract_gene(args):
from ioutils import open_file_or_stdout
feature = args.feature
genes = {}
logger.info('read GTF file: ' + args.input_file)
for c, attrs, line in read_gtf(args.input_file):
if (feature is not None) and (c[2] != feature):
continue
gene_id = attrs.get('gene_id')
gene = genes.get(gene_id)
if gene is None:
gene = [c[0], int(c[3]) - 1, int(c[4]), gene_id, 0, c[6]]
genes[gene_id] = gene
else:
gene[1] = min(gene[1], int(c[3]) - 1)
gene[2] = max(gene[2], int(c[4]))
logger.info('number of genes: {}'.format(len(genes)))
logger.info('write BED file: ' + args.output_file)
with open_file_or_stdout(args.output_file) as fout:
for gene_id, gene in genes.items():
fout.write('\t'.join(map(str, gene)))
fout.write('\n')
@command_handler
def fix_gtf(args):
from ioutils import open_file_or_stdout
from collections import defaultdict
# strand of exons grouped by transcript_id
strands = defaultdict(list)
    lines = []
    logger.info('read GTF file: ' + args.input_file)
    for c, attrs, line in read_gtf(args.input_file):
        # keep only transcript and exon records; exon strands are collected to
        # detect transcripts with missing or inconsistent strand information
        if c[2] not in ('transcript', 'exon'):
            continue
        transcript_id = attrs.get('transcript_id')
        if transcript_id is None:
            raise ValueError('transcript_id not found in GTF line: {}'.format(line.strip()))
        lines.append((transcript_id, line))
        if c[2] == 'exon':
            strands[transcript_id].append(c[6])
invalid_transcripts = set()
for transcript_id, strands_tx in strands.items():
strands_tx = set(strands_tx)
# remove transcripts without strand information
if '.' in strands_tx:
invalid_transcripts.add(transcript_id)
# remove transcripts with exon on different strands
elif len(strands_tx) != 1:
invalid_transcripts.add(transcript_id)
logger.info('number of transcripts: {}'.format(len(strands)))
logger.info('number of invalid transcripts: {}'.format(len(invalid_transcripts)))
logger.info('write GTF file: ' + args.output_file)
with open_file_or_stdout(args.output_file) as fout:
for transcript_id, line in lines:
if transcript_id not in invalid_transcripts:
fout.write(line)
@command_handler
def transcript_counts(args):
import pysam
import numpy as np
from ioutils import open_file_or_stdout
from collections import defaultdict
logger.info('read input transcript BAM file: ' + args.input_file)
sam = pysam.AlignmentFile(args.input_file, "rb")
counts = defaultdict(int)
for read in sam:
counts[read.reference_name] += 1
logger.info('create output file: ' + args.output_file)
with open_file_or_stdout(args.output_file) as fout:
for key, val in counts.items():
fout.write('{}\t{}\n'.format(key, val))
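# example invocation (hypothetical file names):
#   preprocess.py transcript_counts -i transcriptome_aligned.bam -o counts.txt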
@command_handler
def extract_longest_transcript(args):
from ioutils import open_file_or_stdin, open_file_or_stdout
from collections import defaultdict
from functools import partial
feature = args.feature
genes = defaultdict(partial(defaultdict, int))
lines = []
logger.info('read gtf file: ' + args.input_file)
with open_file_or_stdin(args.input_file) as fin:
        for lineno, line in enumerate(fin, start=1):
            c = line.strip().split('\t')
            if (len(c) < 9) or c[0].startswith('#'):
                continue
if c[2] != feature:
lines.append(('#other#', line))
continue
attrs = {}
for a in c[8].split(';')[:-1]:
a = a.strip()
i = a.find(' ')
key = a[:i]
val = a[(i + 1):].strip('"')
attrs[key] = val
transcript_id = attrs.get('transcript_id')
if transcript_id is None:
raise ValueError('transcript_id not found in GTF file at line {}'.format(lineno))
gene_id = attrs.get('gene_id')
if gene_id is None:
raise ValueError('gene_id not found in GTF file at line {}'.format(lineno))
lines.append((transcript_id, line))
genes[gene_id][transcript_id] += int(c[4]) - int(c[3]) + 1
kept_transcripts = set()
kept_transcripts.add('#other#')
    for gene_id, gene in genes.items():
        max_length = 0
        max_transcript = None
        for transcript_id, length in gene.items():
            if length > max_length:
                max_length = length
                max_transcript = transcript_id
        # keep only the longest isoform of each gene
        kept_transcripts.add(max_transcript)
logger.info('number of genes: {}'.format(len(genes)))
logger.info('number of transcripts: {}'.format(sum(map(len, genes.values()))))
logger.info('number of longest transcripts: {}'.format(len(kept_transcripts) - 1))
logger.info('write output gtf file: ' + args.output_file)
with open_file_or_stdout(args.output_file) as fout:
for transcript_id, line in lines:
if transcript_id in kept_transcripts:
fout.write(line)
@command_handler
def gtf_to_transcript_table(args):
from ioutils import open_file_or_stdin, open_file_or_stdout
from collections import OrderedDict
feature = args.feature
default_transcript_type = args.transcript_type
default_gene_type = args.gene_type
fout = open_file_or_stdout(args.output_file)
with open_file_or_stdin(args.input_file) as fin:
transcripts = OrderedDict()
for line in fin:
# get GTF columns
            c = line.strip().split('\t')
            if (len(c) < 9) or c[0].startswith('#'):
                continue
if c[2] != feature:
continue
# GTF attributes
attrs = {}
for a in c[8].split(';')[:-1]:
a = a.strip()
i = a.find(' ')
key = a[:i]
val = a[(i + 1):].strip('"')
attrs[key] = val
# mitranscriptome
if c[1] == 'mitranscriptome':
if attrs['tcat'] == 'lncrna':
attrs['gene_type'] = 'lncRNA'
attrs['transcript_type'] = 'lncRNA'
elif attrs['tcat'] == 'tucp':
attrs['gene_type'] = 'tucpRNA'
attrs['transcript_type'] = 'tucpRNA'
# use transcript_id as transcript_name
if 'transcript_name' not in attrs:
attrs['transcript_name'] = attrs['transcript_id']
# use gene_id as gene_name
if 'gene_name' not in attrs:
attrs['gene_name'] = attrs['gene_id']
# set transcript_type if given
if default_transcript_type is not None:
attrs['transcript_type'] = default_transcript_type
else:
if 'transcript_type' not in attrs:
attrs['transcript_type'] = 'unknown'
# set gene_type if given
if default_gene_type is not None:
attrs['gene_type'] = default_gene_type
else:
if 'gene_type' not in attrs:
attrs['gene_type'] = 'unknown'
exon = [c[0], int(c[3]) - 1, int(c[4]), attrs['gene_id'], 0, c[6],
attrs['gene_id'], attrs['transcript_id'],
attrs['gene_name'], attrs['transcript_name'],
attrs['gene_type'], attrs['transcript_type'], c[1]]
transcript = transcripts.get(attrs['transcript_id'])
if transcript is None:
transcripts[attrs['transcript_id']] = exon
else:
if c[2] == 'exon':
transcript[1] = min(transcript[1], exon[1])
transcript[2] = max(transcript[2], exon[2])
header = ['chrom', 'start', 'end', 'name', 'score', 'strand',
'gene_id', 'transcript_id',
'gene_name', 'transcript_name',
'gene_type', 'transcript_type', 'source'
]
print('\t'.join(header), file=fout)
for transcript in transcripts.values():
print('\t'.join(str(a) for a in transcript), file=fout)
fout.close()
@command_handler
def extract_circrna_junction(args):
from Bio import SeqIO
from ioutils import open_file_or_stdin, open_file_or_stdout
anchor_size = args.anchor_size
logger.info('read sequence file: ' + args.input_file)
logger.info('create output file: ' + args.output_file)
fout = open_file_or_stdout(args.output_file)
with open_file_or_stdin(args.input_file) as fin:
for record in SeqIO.parse(fin, 'fasta'):
seq = str(record.seq)
if len(seq) < args.min_length:
continue
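            # simulate the back-splice junction by concatenating the last `s`
            # bases with the first `s` bases of the (circular) spliced sequence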
s = min(len(seq), anchor_size)
seq_id = record.id.split('|')[0]
fout.write('>{}\n'.format(seq_id))
fout.write(seq[-s:] + seq[:s])
fout.write('\n')
fout.close()
@command_handler
def filter_circrna_reads(args):
import pysam
import numpy as np
from ioutils import open_file_or_stdout, open_file_or_stdin
from collections import defaultdict
from copy import deepcopy
logger.info('read input SAM file: ' + args.input_file)
fin = open_file_or_stdin(args.input_file)
sam_in = pysam.AlignmentFile(fin, "r")
if sam_in.header is None:
raise ValueError('requires SAM header to get junction positions')
# get junction positions (middle of the sequences)
junction_positions = {}
for sq in sam_in.header['SQ']:
junction_positions[sq['SN']] = sq['LN']//2
logger.info('create output SAM file: ' + args.output_file)
fout = open_file_or_stdout(args.output_file)
sam_out = pysam.AlignmentFile(fout, 'w', template=sam_in)
sam_filtered = None
if args.filtered_file is not None:
logger.info('create filtered SAM file: ' + args.filtered_file)
sam_filtered = pysam.AlignmentFile(args.filtered_file, 'w', template=sam_in)
for read in sam_in:
filtered = False
if read.is_unmapped:
filtered = True
elif read.is_reverse:
filtered = True
else:
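            # a read supports the junction only if its alignment spans the
            # junction position (starts before it and ends at or after it)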
pos = junction_positions[read.reference_name]
if not (read.reference_start < pos <= read.reference_end):
                filtered = True
if not filtered:
sam_out.write(read)
elif sam_filtered is not None:
sam_filtered.write(read)
fin.close()
fout.close()
if sam_filtered is not None:
sam_filtered.close()
@command_handler
def chrom_sizes(args):
from Bio import SeqIO
from ioutils import open_file_or_stdin, open_file_or_stdout
    fout = open_file_or_stdout(args.output_file)
    with open_file_or_stdin(args.input_file) as fin:
        for record in SeqIO.parse(fin, 'fasta'):
            fout.write('{}\t{}\n'.format(record.id, len(record.seq)))
    fout.close()
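# feature names used by extract_domain_sequence/extract_feature_sequence are
# '|'-separated: gene_id|gene_type|gene_name|domain_id|transcript_id|start|end;
# for genomic features, gene_id itself encodes chrom_start_end_strand.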
@command_handler
def extract_domain_sequence(args):
from pyfaidx import Fasta
from Bio.Seq import Seq
from ioutils import open_file_or_stdout
import pandas as pd
fout = open_file_or_stdout(args.output_file)
fastas = {}
with open(args.input_file, 'r') as fin:
for lineno, line in enumerate(fin):
            feature = line.strip().split('\t')[0]
gene_id, gene_type, gene_name, domain_id, transcript_id, start, end = feature.split('|')
start = int(start)
end = int(end)
if gene_type == 'genomic':
gene_type = 'genome'
if gene_type not in fastas:
fastas[gene_type] = Fasta(os.path.join(args.genome_dir, 'fasta', gene_type + '.fa'))
if gene_type == 'genome':
chrom, gstart, gend, strand = gene_id.split('_')
gstart = int(gstart)
gend = int(gend)
seq = fastas[gene_type][chrom][gstart:gend].seq
if strand == '-':
seq = str(Seq(seq).reverse_complement())
else:
seq = fastas[gene_type][transcript_id][start:end].seq
seq = seq.upper()
fout.write('>{}\n'.format(feature))
fout.write(seq)
fout.write('\n')
fout.close()
@command_handler
def extract_feature_sequence(args):
from pyfaidx import Fasta
from Bio.Seq import Seq
from ioutils import open_file_or_stdout
    def pad_range(start, end, chrom_size, max_length):
        # symmetrically extend [start, end) to max_length bases, clipped to
        # [0, chrom_size); note: not currently called anywhere in this handler
        if (end - start) >= max_length:
            return start, end
        padding_left = (max_length - (end - start))//2
        new_start = max(0, start - padding_left)
        new_end = min(new_start + max_length, chrom_size)
        return new_start, new_end
fout = open_file_or_stdout(args.output_file)
fastas = {}
with open(args.input_file, 'r') as fin:
for lineno, line in enumerate(fin):
if lineno == 0:
continue
            feature = line.strip().split('\t')[0]
gene_id, gene_type, gene_name, domain_id, transcript_id, start, end = feature.split('|')
start = int(start)
end = int(end)
if gene_type == 'genomic':
gene_type = 'genome'
# load FASTA file
if gene_type not in fastas:
fastas[gene_type] = Fasta(os.path.join(args.genome_dir, 'fasta', gene_type + '.fa'))
if gene_type == 'genome':
chrom, gstart, gend, strand = gene_id.split('_')
gstart = int(gstart)
gend = int(gend)
seq = fastas[gene_type][chrom][gstart:gend].seq
if strand == '-':
seq = str(Seq(seq).reverse_complement())
else:
seq = fastas[gene_type][transcript_id][start:end].seq
seq = seq.upper()
fout.write('>{}\n'.format(feature))
fout.write(seq)
fout.write('\n')
fout.close()
@command_handler
def calculate_star_parameters(args):
from math import log2
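    # recommended formulas from the STAR manual:
    #   --genomeSAindexNbases = min(14, log2(GenomeLength)/2 - 1)
    #   --genomeChrBinNbits   = min(18, log2(GenomeLength/NumberOfReferences))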
genome_length = 0
n_seqs = 0
with open(args.input_file, 'r') as f:
for line in f:
genome_length += int(line.split('\t')[1])
n_seqs += 1
if args.parameter == 'genomeSAindexNbases':
print(min(14, int(log2(genome_length)//2) - 1))
elif args.parameter == 'genomeChrBinNbits':
print(min(18, int(log2(genome_length/n_seqs))))
@command_handler
def highlight_mature_mirna_location(args):
from Bio import SeqIO
from collections import defaultdict
from ioutils import open_file_or_stdout
logger.info('read hairpin sequences: ' + args.hairpin)
hairpin_seqs = {}
with open(args.hairpin, 'r') as f:
for record in SeqIO.parse(f, 'fasta'):
if (args.species is None) or (args.species == record.id.split('-')[0]):
hairpin_seqs[record.id] = str(record.seq)
logger.info('read mature sequences: ' + args.mature)
fout = open_file_or_stdout(args.output_file)
mature_positions = defaultdict(dict)
with open(args.mature, 'r') as f:
for record in SeqIO.parse(f, 'fasta'):
if not ((args.species is None) or (args.species == record.id.split('-')[0])):
continue
if record.id.endswith('-5p') or record.id.endswith('-3p'):
end_type = record.id.split('-')[-1]
hairpin_id = record.id[:-3].lower()
hairpin_seq = hairpin_seqs.get(hairpin_id)
if hairpin_seq is None:
                    logger.warning('hairpin sequence {} is not found in the hairpin file'.format(hairpin_id))
continue
mature_id = record.id
mature_seq = str(record.seq)
                pos = hairpin_seq.find(mature_seq)
                if pos < 0:
                    logger.warning('mature sequence {} is not found in its hairpin sequence'.format(mature_id))
                    continue
                mature_positions[hairpin_id][end_type] = (pos, pos + len(mature_seq))
for hairpin_id, hairpin_seq in hairpin_seqs.items():
mature_position = mature_positions.get(hairpin_id)
if mature_position is None:
mature_position = {}
fout.write('>{}\n'.format(hairpin_id))
offset = 0
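        # ANSI escape sequences: \x1B[31;1m = bold red (5p arm),
        # \x1B[32;1m = bold green (3p arm), \x1B[0m = reset attributes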
if '5p' in mature_position:
start, end = mature_position['5p']
fout.write('{0}\x1B[31;1m{1}\x1B[0m'.format(hairpin_seq[offset:start], hairpin_seq[start:end]))
offset = end
if '3p' in mature_position:
start, end = mature_position['3p']
fout.write('{0}\x1B[32;1m{1}\x1B[0m'.format(hairpin_seq[offset:start], hairpin_seq[start:end]))
offset = end
fout.write('{}\n'.format(hairpin_seq[offset:]))
fout.close()
@command_handler
def extract_mature_mirna_location(args):
from utils import read_gff, GFFRecord
from ioutils import open_file_or_stdin, open_file_or_stdout
from collections import OrderedDict, defaultdict
logger.info('read input GFF file: ' + args.input_file)
fin = open_file_or_stdin(args.input_file)
    logger.info('open output BED file: ' + args.output_file)
fout = open_file_or_stdout(args.output_file)
# key: precursor_id, value: precursor record
precursors = OrderedDict()
# key: precursor_id, value: list of mature records
matures = defaultdict(list)
# read features from GFF file
for record in read_gff(fin):
if record.feature == 'miRNA_primary_transcript':
precursors[record.attr['ID']] = record
elif record.feature == 'miRNA':
matures[record.attr['Derives_from']].append(record)
# get locations of mature miRNAs
for precursor_id, precursor in precursors.items():
for mature in matures[precursor_id]:
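            # convert 1-based inclusive GFF coordinates to 0-based half-open BED
            # offsets relative to the precursor; on the minus strand the offset
            # is measured from the precursor's 3' end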
if mature.strand == '+':
fout.write('{}\t{}\t{}\t{}\t0\t+\n'.format(
precursor.attr['Name'],
mature.start - precursor.start,
mature.end - precursor.start + 1,
mature.attr['Name']))
else:
fout.write('{}\t{}\t{}\t{}\t0\t+\n'.format(
precursor.attr['Name'],
precursor.end - mature.end,
precursor.end - mature.start + 1,
mature.attr['Name']
))
fin.close()
fout.close()
@command_handler
def gtf_to_bed(args):
from ioutils import open_file_or_stdin, open_file_or_stdout
    # honor --feature if given; default to 'exon'
    exon_feature = args.feature if args.feature is not None else 'exon'
# use transcript_id attribute as key
transcripts = {}
logger.info('read input GTF file: ' + args.input_file)
for lineno, record in enumerate(read_gtf(args.input_file)):
c, attrs, line = record
if c[2] == exon_feature:
gene_id = attrs.get('gene_id')
if gene_id is None:
raise ValueError('gene_id attribute not found in GTF file {}:{}'.format(args.input_file, lineno))
transcript_id = attrs.get('transcript_id')
if transcript_id is None:
raise ValueError('transcript_id attribute not found in GTF file {}:{}'.format(args.input_file, lineno))
transcript = transcripts.get(transcript_id)
if transcript is None:
# new transcript
transcript = {
'chrom': c[0],
'strand': c[6],
'gene_id': gene_id,
'gene_name': attrs.get('gene_name', gene_id),
'transcript_name': attrs.get('transcript_name', transcript_id),
'exons': []
}
transcripts[transcript_id] = transcript
# add a new exon
transcript['exons'].append((int(c[3]) - 1, int(c[4])))
fout = open_file_or_stdout(args.output_file)
bed_template = '{chrom}\t{start}\t{end}\t{name}\t0\t{strand}\t0\t0\t0\t{n_exons}\t{exon_sizes}\t{exon_starts}\n'
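    # BED12 conventions: exon_starts are offsets relative to the transcript
    # start, and exon_sizes/exon_starts are comma-separated, one entry per exon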
for transcript_id, transcript in transcripts.items():
# sort exons by start position
transcript['exons'] = sorted(transcript['exons'], key=lambda x: x[0])
transcript['n_exons'] = len(transcript['exons'])
transcript['start'] = transcript['exons'][0][0]
transcript['end'] = transcript['exons'][-1][1]
transcript['exon_starts'] = ','.join(str(e[0] - transcript['start']) for e in transcript['exons'])
transcript['exon_sizes'] = ','.join(str(e[1] - e[0]) for e in transcript['exons'])
        transcript['name'] = transcript['gene_id']
fout.write(bed_template.format(**transcript))
@command_handler
def calculate_gene_length(args):
import HTSeq
from collections import defaultdict
from functools import partial
import numpy as np
    from ioutils import open_file_or_stdin, open_file_or_stdout
    from tqdm import tqdm
    fin = open_file_or_stdin(args.input_file)
    gff = HTSeq.GFF_Reader(fin)
    # sum of exon lengths per transcript, grouped by gene
    exons = defaultdict(partial(defaultdict, int))
    for feature in tqdm(gff, unit='feature'):
        if feature.type == 'exon':
            exons[feature.attr['gene_id']][feature.attr['transcript_id']] += feature.iv.length
    # summarize per-gene length from isoform lengths ('merged_exon_length' would
    # require interval merging, which the per-transcript totals cannot provide)
    with open_file_or_stdout(args.output_file) as fout:
        for gene_id, transcripts in exons.items():
            lengths = np.array(list(transcripts.values()))
            if args.method == 'isoform_length_mean':
                gene_length = int(np.mean(lengths))
            elif args.method == 'isoform_length_median':
                gene_length = int(np.median(lengths))
            else:
                gene_length = int(lengths.max())
            fout.write('{}\t{}\n'.format(gene_id, gene_length))
@command_handler
def merge_data_frames(args):
import pandas as pd
import numpy as np
from ioutils import open_file_or_stdout
if (not args.on_index) and (args.on is None):
raise ValueError('argument --on is required if --on-index is not specified')
merged = None
for input_file in args.input_file:
logger.info('read input file: ' + input_file)
df = pd.read_table(input_file, sep=args.sep)
if merged is None:
merged = df
else:
if args.on_index:
merged = pd.merge(merged, df, how=args.how, left_index=True, right_index=True)
else:
merged = pd.merge(merged, df, how=args.how, on=args.on)
if args.fillna is not None:
merged.fillna(args.fillna, inplace=True)
logger.info('open output file: ' + args.output_file)
with open_file_or_stdout(args.output_file) as f:
merged.to_csv(f, sep=args.sep, header=True, index=args.on_index)
@command_handler
def genomecov(args):
import pysam
import h5py
import numpy as np
from tqdm import tqdm
logger.info('read input BAM/SAM file: ' + args.input_file)
sam = pysam.AlignmentFile(args.input_file, 'r')
logger.info('open output HDF5 file: ' + args.output_file)
fout = h5py.File(args.output_file, 'w')
    # per-base coverage over the whole alignment span of each read
    # (splices/deletions are not subtracted)
    coverage = {sq['SN']: np.zeros(sq['LN'], dtype=np.int32) for sq in sam.header['SQ']}
    for read in tqdm(sam, unit='read'):
        if read.is_unmapped or (read.reference_end is None):
            continue
        coverage[read.reference_name][read.reference_start:read.reference_end] += 1
    for name, data in coverage.items():
        fout.create_dataset(name, data=data, compression='gzip')
    fout.close()
@command_handler
def calc_rpkm(args):
import pandas as pd
import numpy as np
from ioutils import open_file_or_stdin, open_file_or_stdout
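    # RPKM = RPM * 1000 / feature_length (reads per kilobase per million);
    # the input matrix is assumed to contain RPM values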
matrix = pd.read_table(open_file_or_stdin(args.input_file), index_col=0, sep='\t')
feature_info = matrix.index.to_series().str.split('|', expand=True)
feature_info.columns = ['gene_id', 'gene_type', 'gene_name', 'feature_id', 'transcript_id', 'start', 'end']
feature_info['start'] = feature_info['start'].astype('int')
feature_info['end'] = feature_info['end'].astype('int')
feature_info['length'] = feature_info['end'] - feature_info['start']
matrix = 1000.0*matrix.div(feature_info['length'], axis=0)
matrix.to_csv(open_file_or_stdout(args.output_file), index=True, header=True, sep='\t', na_rep='NA')
@command_handler
def create_pseudo_genome(args):
from pyfaidx import Fasta, Faidx
import numpy as np
src = Fasta(args.input_file)
lengths = np.array([len(r) for r in src])
padded_lengths = lengths + args.padding
padded_seq = ''.join(['N']*args.padding)
cum_lengths = np.cumsum(padded_lengths)
chrom_indices = cum_lengths//args.max_chrom_size
chrom_starts = cum_lengths%args.max_chrom_size - padded_lengths
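    # sequences are concatenated (each followed by `padding` Ns) and packed into
    # pseudo-chromosomes of at most max_chrom_size bases; chrom_starts holds each
    # sequence's 0-based offset within its pseudo-chromosome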
prev_chrom_index = -1
# write FASTA file
logger.info('write FASTA file: ' + args.output_fasta)
with open(args.output_fasta, 'w') as fout:
for i, record in enumerate(src):
# new chromosome
if chrom_indices[i] != prev_chrom_index:
if i > 0:
fout.write('\n')
                fout.write('>c{}\n'.format(chrom_indices[i] + 1))
prev_chrom_index = chrom_indices[i]
# write sequence
fout.write(str(record))
fout.write(padded_seq)
# build fasta index
logger.info('build FASTA index')
dst_index = Faidx(args.output_fasta)
dst_index.build_index()
# chrom sizes
logger.info('write chrom sizes: ' + args.output_chrom_sizes)
fout = open(args.output_chrom_sizes, 'w')
with open(args.output_fasta + '.fai', 'r') as f:
for line in f:
c = line.strip().split('\t')
fout.write(c[0] + '\t' + c[1] + '\n')
fout.close()
# cytoband file
logger.info('write cytoband file: ' + args.output_cytoband)
fout = open(args.output_cytoband, 'w')
with open(args.output_fasta + '.fai', 'r') as f:
for i, line in enumerate(f):
c = line.strip().split('\t')
            fout.write('{0}\t0\t{1}\tp{2}.1\tgneg\n'.format(c[0], c[1], i + 1))
fout.close()
# annotation file
logger.info('write annotation file: ' + args.output_annotation)
with open(args.output_annotation, 'w') as fout:
for i, record in enumerate(src):
record = ['c%d'%(chrom_indices[i] + 1),
str(chrom_starts[i]),
str(chrom_starts[i] + lengths[i]),
record.name,
'0',
'+']
fout.write('\t'.join(record))
fout.write('\n')
# create junction annotation
if args.circular_rna:
junction_pos = (int(record[1]) + int(record[2]))//2
record[1] = str(junction_pos - 1)
record[2] = str(junction_pos + 1)
record[3] = record[3] + '|junction'
fout.write('\t'.join(record))
fout.write('\n')
@command_handler
def map_bam_to_pseudo_genome(args):
import pysam
def as_paired_reads(sam):
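        # assumes mates of a pair appear consecutively in the input
        # (e.g. name-sorted, with read1 immediately followed by read2)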
read1 = None
for read in sam:
if read.is_read1:
read1 = read
elif read.is_read2:
yield (read1, read)
else:
raise ValueError('input SAM is not paired-end')
chrom_sizes = {}
logger.info('read chrom sizes: ' + args.chrom_sizes)
with open(args.chrom_sizes, 'r') as fin:
for line in fin:
c = line.strip().split('\t')
chrom_sizes[c[0]] = int(c[1])
chrom_starts = {}
chrom_names = {}
with open(args.bed, 'r') as fin:
for line in fin:
c = line.strip().split('\t')
c[1] = int(c[1])
c[2] = int(c[2])
chrom_names[c[3]] = c[0]
chrom_starts[c[3]] = c[1]
logger.info('read input BAM file: ' + args.input_file)
sam = pysam.AlignmentFile(args.input_file, 'rb')
if args.circular_rna:
junction_positions = {sq['SN']:sq['LN']//2 for sq in sam.header['SQ']}
output_header = {
'HD': sam.header['HD'],
'SQ': [{'SN': chrom, 'LN': size} for chrom, size in chrom_sizes.items()],
'PG': [{'ID': 'map_bam_to_pseudo_genome',
'PN': 'map_bam_to_pseudo_genome',
'VN': '1.0',
'CL': ' '.join(sys.argv)}
]
}
logger.info('create output BAM file: ' + args.output_file)
output_sam = pysam.AlignmentFile(args.output_file, 'wb', header=output_header)
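    # lift a read from transcript coordinates to pseudo-genome coordinates by
    # renaming its reference and shifting ref_pos by the transcript's offset
    # (ref_pos from to_dict() is 1-based SAM text; chrom_starts are 0-based
    # offsets, so the sum remains a valid 1-based position)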
def map_read(read):
d = read.to_dict()
ref_name = d['ref_name']
d['ref_name'] = chrom_names[ref_name]
d['ref_pos'] = str(int(d['ref_pos']) + chrom_starts[ref_name])
return pysam.AlignedSegment.from_dict(d, output_sam.header)
strandness = args.strandness
is_circular_rna = args.circular_rna
n_reads = 0
if args.paired_end:
for read1, read2 in as_paired_reads(sam):
if (read1.is_unmapped) or (read2.is_unmapped):
continue
if read1.reference_name != read2.reference_name:
continue
if is_circular_rna:
                # keep only pairs whose orientation matches the library strandness
                # (forward: read1 forward / read2 reverse; reverse: the opposite)
                if (strandness == 'forward') and not ((not read1.is_reverse) and read2.is_reverse):
                    continue
                if (strandness == 'reverse') and not ((not read2.is_reverse) and read1.is_reverse):
                    continue
pos = junction_positions[read1.reference_name]
if not (read1.reference_start < pos <= read2.reference_end):
continue
output_sam.write(map_read(read1))
output_sam.write(map_read(read2))
n_reads += 1
else:
for read in sam:
if read.is_unmapped:
continue
output_sam.write(map_read(read))
n_reads += 1
logger.info('number of written reads: {}'.format(n_reads))
output_sam.close()
if __name__ == '__main__':
main_parser = argparse.ArgumentParser(description='Preprocessing module')
subparsers = main_parser.add_subparsers(dest='command')
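    # each subcommand is dispatched to the handler registered via @command_handler,
    # e.g. (hypothetical file names): preprocess.py extract_gene -i genes.gtf -o genes.bed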
parser = subparsers.add_parser('transcript_counts',
        help='count reads that belong to each transcript')
parser.add_argument('--input-file', '-i', type=str, required=True,
help='input transcript BAM file')
parser.add_argument('--output-file', '-o', type=str, default='-',
help='output transcript counts file')
parser = subparsers.add_parser('gtf_to_transcript_table')
parser.add_argument('--input-file', '-i', type=str, default='-',
help='input GTF file')
parser.add_argument('--output-file', '-o', type=str, default='-',
help='output table file')
parser.add_argument('--feature', type=str,
help='feature to use in input GTF file (Column 3)')
parser.add_argument('--gene-type', type=str,
help='gene type to set if "gene_type" attribute is not available in GTF file')
    parser.add_argument('--transcript-type', type=str,
        help='transcript type to set if "transcript_type" attribute is not available in GTF file')
parser = subparsers.add_parser('extract_longest_transcript')
parser.add_argument('--input-file', '-i', type=str, default='-',
help='input GTF file')
parser.add_argument('--output-file', '-o', type=str, default='-',
        help='output GTF file')
parser.add_argument('--feature', type=str, default='exon',
help='feature to use in input GTF file (Column 3)')
parser = subparsers.add_parser('fix_gtf',
help='fix problems in a GTF file by removing invalid transcripts')
parser.add_argument('--input-file', '-i', type=str, default='-',
help='input GTF file')
parser.add_argument('--output-file', '-o', type=str, default='-',
        help='output GTF file')
parser.add_argument('--feature', type=str, default='exon',
help='feature to use in input GTF file (Column 3)')
parser = subparsers.add_parser('extract_gene',
help='extract gene regions from a GTF file')
parser.add_argument('--input-file', '-i', type=str, default='-',
help='input GTF file')
parser.add_argument('--output-file', '-o', type=str, default='-',
help='output BED file')
parser.add_argument('--feature', type=str,
help='feature to use in input GTF file (Column 3)')
parser = subparsers.add_parser('gtf_to_bed',
help='convert transcripts from GTF to BED')
parser.add_argument('--input-file', '-i', type=str, default='-',
help='input GTF file')
parser.add_argument('--output-file', '-o', type=str, default='-',
help='output BED file')
parser.add_argument('--feature', type=str,
help='features to treat as exons in input GTF file (Column 3)')
parser = subparsers.add_parser('extract_circrna_junction',
help='extract circular RNA junction sequences from spliced sequences')
parser.add_argument('--input-file', '-i', type=str, default='-',
help='spliced sequence file')
parser.add_argument('--output-file', '-o', type=str, default='-',
help='junction sequence file')
parser.add_argument('--anchor-size', '-s', type=int, default=50,
help='anchor length from both ends')
parser.add_argument('--min-length', type=int, default=50,
help='minimal length to keep a spliced sequence')
parser = subparsers.add_parser('filter_circrna_reads',
help='filter out reads not crossing circRNA junctions')
parser.add_argument('--input-file', '-i', type=str, default='-',
help='input SAM file')
parser.add_argument('--output-file', '-o', type=str, default='-',
help='output SAM file')
parser.add_argument('--filtered-file', '-u', type=str,
help='write filtered SAM records to file')
parser = subparsers.add_parser('chrom_sizes',
help='create chrom sizes file from FASTA file')
parser.add_argument('--input-file', '-i', type=str, default='-',
help='input FASTA')
parser.add_argument('--output-file', '-o', type=str, default='-',
help='output chrom sizes file')
parser = subparsers.add_parser('extract_feature_sequence',
help='extract sequences using feature names')
parser.add_argument('--input-file', '-i', type=str, required=True,
help='feature file with feature names in the first column')
parser.add_argument('--genome-dir', '-g', type=str, required=True,
help='genome directory where fasta/${rna_type}.fa contains sequences')
parser.add_argument('--output-file', '-o', type=str, default='-',
help='output file in FASTA format')
parser.add_argument('--padding', type=int, default=200,
        help='pad features on both ends up to this total length')
parser = subparsers.add_parser('extract_domain_sequence',
help='extract sequences using feature names')
parser.add_argument('--input-file', '-i', type=str, required=True,
help='feature file with feature names in the first column')
parser.add_argument('--genome-dir', '-g', type=str, required=True,
help='genome directory where fasta/${rna_type}.fa contains sequences')
parser.add_argument('--flanking', type=int, default=100)
parser.add_argument('--output-file', '-o', type=str, default='-',
help='output file in FASTA format')
parser = subparsers.add_parser('calculate_star_parameters',
help='calculate --genomeSAindexNbases and --genomeChrBinNbits for STAR')
parser.add_argument('--input-file', '-i', type=str, required=True,
help='FASTA index (.fai) file')
parser.add_argument('--parameter', '-p', type=str, required=True,
choices=('genomeSAindexNbases', 'genomeChrBinNbits'),
help='parameter to calculate')
parser = subparsers.add_parser('highlight_mature_mirna_location',
help='get mature miRNA location in precursor miRNA in miRBase')
parser.add_argument('--mature', type=str, required=True,
help='FASTA file of mature sequences')
parser.add_argument('--hairpin', type=str, required=True,
help='FASTA file of hairpin sequences')
parser.add_argument('--output-file', '-o', type=str, default='-')
parser.add_argument('--species', type=str)
parser = subparsers.add_parser('extract_mature_mirna_location',
help='Extract mature miRNA location in precursor miRNA in miRBase')
parser.add_argument('--input-file', '-i', type=str, required=True,
help='miRBase GFF3 file')
parser.add_argument('--output-file', '-o', type=str, default='-',
help='BED file with 6 columns: pre-miRNA, start, end, mature miRNA, 0, +')
parser = subparsers.add_parser('calculate_gene_length',
help='calculate effective gene length')
parser.add_argument('--input-file', '-i', type=str, default='-',
help='GTF/GFF file')
parser.add_argument('--method', '-m', type=str,
choices=('isoform_length_mean', 'isoform_length_median', 'isoform_length_max', 'merged_exon_length'))
parser.add_argument('--output-file', '-o', type=str,
        help='output tab-delimited file with two columns: gene_id, length')
parser = subparsers.add_parser('merge_data_frames')
parser.add_argument('--input-file', '-i', type=str, action='append', required=True,
help='input data matrix')
parser.add_argument('--sep', type=str, default='\t', help='column delimiter')
parser.add_argument('--how', type=str, default='inner',
choices=('inner', 'outer', 'left', 'right'))
parser.add_argument('--on', type=str, help='column name to join on')
parser.add_argument('--on-index', action='store_true', default=False, help='join on index')
parser.add_argument('--fillna', type=str)
parser.add_argument('--output-file', '-o', type=str, default='-',
help='output merged data frame')
parser = subparsers.add_parser('genomecov')
parser.add_argument('--input-file', '-i', type=str, required=True,
help='input BAM/SAM file')
parser.add_argument('--output-file', '-o', type=str, required=True,
help='output HDF5 file')
parser = subparsers.add_parser('calc_rpkm')
parser.add_argument('--input-file', '-i', type=str, default='-',
help='input expression (RPM) matrix file')
parser.add_argument('--output-file', '-o', type=str, default='-',
help='output expression (RPKM) matrix file')
parser = subparsers.add_parser('create_pseudo_genome')
parser.add_argument('--input-file', '-i', type=str, required=True,
help='input transcript FASTA file (with FASTA index)')
parser.add_argument('--max-chrom-size', type=int, default=100000000,
help='maximum size for each chromosome')
parser.add_argument('--padding', type=int, default=50,
help='number of N bases padded after each sequence')
parser.add_argument('--circular-rna', action='store_true',
help='write circular RNA junction to annotation file')
parser.add_argument('--output-fasta', type=str, required=True,
help='output FASTA and index file')
parser.add_argument('--output-chrom-sizes', type=str, required=True,
help='output chrom sizes file')
parser.add_argument('--output-annotation', type=str, required=True,
help='output annotation BED file')
parser.add_argument('--output-cytoband', type=str, required=True,
help='output cytoband file')
parser = subparsers.add_parser('map_bam_to_pseudo_genome')
parser.add_argument('--input-file', '-i', type=str, required=True,
help='input BAM file')
parser.add_argument('--bed', type=str, required=True,
help='input BED file of genes in the pseudo-genome')
parser.add_argument('--chrom-sizes', type=str, required=True,
help='input chrom sizes of the pseudo-genome')
parser.add_argument('--strandness', '-s', type=str, default='forward',
help='strandness for circular RNA')
parser.add_argument('--paired-end', action='store_true',
help='input is paired-end reads')
parser.add_argument('--circular-rna', action='store_true',
help='requires the read to be mapped to circular RNA junctions')
parser.add_argument('--output-file', '-o', type=str, required=True,
help='output BAM file')
args = main_parser.parse_args()
if args.command is None:
main_parser.print_help()
sys.exit(1)
logger = logging.getLogger('preprocess.' + args.command)
try:
command_handlers.get(args.command)(args)
except BrokenPipeError:
pass
except KeyboardInterrupt:
pass