Switch to unified view

a b/exseek/scripts/preprocess.py
1
#! /usr/bin/env python
2
from __future__ import print_function
3
import argparse, sys, os, errno
4
import logging
5
logging.basicConfig(level=logging.INFO, format='[%(asctime)s] [%(levelname)s] %(name)s: %(message)s')
6
7
command_handlers = {}
8
def command_handler(f):
9
    command_handlers[f.__name__] = f
10
    return f
11
    
12
def read_gtf(filename):
13
    from ioutils import open_file_or_stdin
14
15
    with open_file_or_stdin(filename) as fin:
16
        lineno = 0
17
        for line in fin:
18
            lineno += 1
19
            c = line.strip().split('\t')
20
            if c[0].startswith('#'):
21
                continue
22
            attrs = {}
23
            for a in c[8].split(';')[:-1]:
24
                a = a.strip()
25
                i = a.find(' ')
26
                key = a[:i]
27
                val = a[(i + 1):].strip('"')
28
                attrs[key] = val
29
            gene_id = attrs.get('gene_id')
30
            if gene_id is None:
31
                raise ValueError('gene_id not found in GTF file at line {}'.format(lineno))
32
            yield (c, attrs, line)
33
34
@command_handler
35
def extract_gene(args):
36
    from ioutils import open_file_or_stdout
37
38
    feature = args.feature
39
    genes = {}
40
    logger.info('read GTF file: ' + args.input_file)
41
    for c, attrs, line in read_gtf(args.input_file):
42
        if (feature is not None) and (c[2] != feature):
43
            continue
44
        gene_id = attrs.get('gene_id')
45
        gene = genes.get(gene_id)
46
        if gene is None:
47
            gene = [c[0], int(c[3]) - 1, int(c[4]), gene_id, 0, c[6]]
48
            genes[gene_id] = gene
49
        else:
50
            gene[1] = min(gene[1], int(c[3]) - 1)
51
            gene[2] = max(gene[2], int(c[4]))
52
    
53
    logger.info('number of genes: {}'.format(len(genes)))
54
    logger.info('write BED file: ' + args.output_file)
55
    with open_file_or_stdout(args.output_file) as fout:
56
        for gene_id, gene in genes.items():
57
            fout.write('\t'.join(map(str, gene)))
58
            fout.write('\n')
59
60
61
@command_handler
62
def fix_gtf(args):
63
    from ioutils import open_file_or_stdout
64
    from collections import defaultdict
65
    # strand of exons grouped by transcript_id
66
    strands = defaultdict(list)
67
68
    feature = args.feature
69
    lines = []
70
    logger.info('read GTF file: ' + args.input_file)
71
    for c, attrs, line in read_gtf(args.input_file):
72
        if c[2] in ('transcript', 'exon'):
73
            transcript_id = attrs.get('transcript_id')
74
            if transcript_id is None:
75
                raise ValueError('transcript_id not found in GTF file at line {}'.format(lineno))
76
            lines.append((transcript_id, line))
77
        else:
78
            transcript_id = attrs.get('transcript_id')
79
            if transcript_id is None:
80
                raise ValueError('transcript_id not found in GTF file at line {}'.format(lineno))
81
            strands[transcript_id].append(c[6])
82
    invalid_transcripts = set()
83
    for transcript_id, strands_tx in strands.items():
84
        strands_tx = set(strands_tx)
85
        # remove transcripts without strand information
86
        if '.' in strands_tx:
87
            invalid_transcripts.add(transcript_id)
88
        # remove transcripts with exon on different strands
89
        elif len(strands_tx) != 1:
90
            invalid_transcripts.add(transcript_id)
91
    
92
    logger.info('number of transcripts: {}'.format(len(strands)))
93
    logger.info('number of invalid transcripts: {}'.format(len(invalid_transcripts)))
94
    logger.info('write GTF file: ' + args.output_file)
95
    with open_file_or_stdout(args.output_file) as fout:
96
        for transcript_id, line in lines:
97
            if transcript_id not in invalid_transcripts:
98
                fout.write(line)
99
100
101
@command_handler
102
def transcript_counts(args):
103
    import pysam
104
    import numpy as np
105
    from ioutils import open_file_or_stdout
106
    from collections import defaultdict
107
108
    logger.info('read input transcript BAM file: ' + args.input_file)
109
    sam = pysam.AlignmentFile(args.input_file, "rb")
110
    counts = defaultdict(int)
111
    for read in sam:
112
        counts[read.reference_name] += 1
113
    
114
    logger.info('create output file: ' + args.output_file)
115
    with open_file_or_stdout(args.output_file) as fout:
116
        for key, val in counts.items():
117
            fout.write('{}\t{}\n'.format(key, val))
118
119
@command_handler
120
def extract_longest_transcript(args):
121
    from ioutils import open_file_or_stdin, open_file_or_stdout
122
    from collections import defaultdict
123
    from functools import partial
124
125
    feature = args.feature
126
    genes = defaultdict(partial(defaultdict, int))
127
    lines = []
128
    logger.info('read gtf file: ' + args.input_file)
129
    with open_file_or_stdin(args.input_file) as fin:
130
        lineno = 0
131
        for line in fin:
132
            lineno += 1
133
            c = line.strip().split('\t')
134
            if c[0].startswith('#'):
135
                continue
136
            if c[2] != feature:
137
                lines.append(('#other#', line))
138
                continue
139
            attrs = {}
140
            for a in c[8].split(';')[:-1]:
141
                a = a.strip()
142
                i = a.find(' ')
143
                key = a[:i]
144
                val = a[(i + 1):].strip('"')
145
                attrs[key] = val
146
            transcript_id = attrs.get('transcript_id')
147
            if transcript_id is None:
148
                raise ValueError('transcript_id not found in GTF file at line {}'.format(lineno))
149
            gene_id = attrs.get('gene_id')
150
            if gene_id is None:
151
                raise ValueError('gene_id not found in GTF file at line {}'.format(lineno))
152
            lines.append((transcript_id, line))
153
            genes[gene_id][transcript_id] += int(c[4]) - int(c[3]) + 1
154
    kept_transcripts = set()
155
    kept_transcripts.add('#other#')
156
    for gene_id, gene in genes.items():
157
        max_length = 0
158
        max_transcript = None
159
        for transcript_id, length in gene.items():
160
            if length > max_length:
161
                max_length = length
162
                max_transcript = transcript_id
163
        kept_transcripts.add(transcript_id)
164
165
    logger.info('number of genes: {}'.format(len(genes)))
166
    logger.info('number of transcripts: {}'.format(sum(map(len, genes.values()))))
167
    logger.info('number of longest transcripts: {}'.format(len(kept_transcripts) - 1))
168
    logger.info('write output gtf file: ' + args.output_file)
169
    with open_file_or_stdout(args.output_file) as fout:
170
        for transcript_id, line in lines:
171
            if transcript_id in kept_transcripts:
172
                fout.write(line)
173
174
@command_handler
175
def gtf_to_transcript_table(args):
176
    from ioutils import open_file_or_stdin, open_file_or_stdout
177
    from collections import OrderedDict
178
179
    feature = args.feature
180
    default_transcript_type = args.transcript_type
181
    default_gene_type = args.gene_type
182
183
    fout = open_file_or_stdout(args.output_file)
184
    with open_file_or_stdin(args.input_file) as fin:
185
        transcripts = OrderedDict()
186
        for line in fin:
187
            # get GTF columns
188
            c = line.strip().split('\t')
189
            if c[0].startswith('#'):
190
                continue
191
            if c[2] != feature:
192
                continue
193
            # GTF attributes
194
            attrs = {}
195
            for a in c[8].split(';')[:-1]:
196
                a = a.strip()
197
                i = a.find(' ')
198
                key = a[:i]
199
                val = a[(i + 1):].strip('"')
200
                attrs[key] = val
201
            # mitranscriptome
202
            if c[1] == 'mitranscriptome':
203
                if attrs['tcat'] == 'lncrna':
204
                    attrs['gene_type'] = 'lncRNA'
205
                    attrs['transcript_type'] = 'lncRNA'
206
                elif attrs['tcat'] == 'tucp':
207
                    attrs['gene_type'] = 'tucpRNA'
208
                    attrs['transcript_type'] = 'tucpRNA'
209
            # use transcript_id as transcript_name
210
            if 'transcript_name' not in attrs:
211
                attrs['transcript_name'] = attrs['transcript_id']
212
            # use gene_id as gene_name
213
            if 'gene_name' not in attrs:
214
                attrs['gene_name'] = attrs['gene_id']
215
            # set transcript_type if given
216
            if default_transcript_type is not None:
217
                attrs['transcript_type'] = default_transcript_type
218
            else:
219
                if 'transcript_type' not in attrs:
220
                    attrs['transcript_type'] = 'unknown'
221
            # set gene_type if given
222
            if default_gene_type is not None:
223
                attrs['gene_type'] = default_gene_type
224
            else:
225
                if 'gene_type' not in attrs:
226
                    attrs['gene_type'] = 'unknown'
227
            exon = [c[0], int(c[3]) - 1, int(c[4]), attrs['gene_id'], 0, c[6],
228
                attrs['gene_id'], attrs['transcript_id'], 
229
                attrs['gene_name'], attrs['transcript_name'],
230
                attrs['gene_type'], attrs['transcript_type'], c[1]]
231
            transcript = transcripts.get(attrs['transcript_id'])
232
            if transcript is None:
233
                transcripts[attrs['transcript_id']] = exon
234
            else:
235
                if c[2] == 'exon':
236
                    transcript[1] = min(transcript[1], exon[1])
237
                    transcript[2] = max(transcript[2], exon[2])
238
        header = ['chrom', 'start', 'end', 'name', 'score', 'strand',
239
            'gene_id', 'transcript_id', 
240
            'gene_name', 'transcript_name',
241
            'gene_type', 'transcript_type', 'source'
242
        ]
243
        print('\t'.join(header), file=fout)
244
        for transcript in transcripts.values():
245
            print('\t'.join(str(a) for a in transcript), file=fout)
246
    fout.close()
247
248
@command_handler
249
def extract_circrna_junction(args):
250
    from Bio import SeqIO
251
    from ioutils import open_file_or_stdin, open_file_or_stdout
252
253
    anchor_size = args.anchor_size
254
    logger.info('read sequence file: ' + args.input_file)
255
    logger.info('create output file: ' + args.output_file)
256
    fout = open_file_or_stdout(args.output_file)
257
    with open_file_or_stdin(args.input_file) as fin:
258
        for record in SeqIO.parse(fin, 'fasta'):
259
            seq = str(record.seq)
260
            if len(seq) < args.min_length:
261
                continue
262
            s = min(len(seq), anchor_size)
263
            seq_id = record.id.split('|')[0]
264
            fout.write('>{}\n'.format(seq_id))
265
            fout.write(seq[-s:] + seq[:s])
266
            fout.write('\n')
267
    fout.close()
268
269
@command_handler
270
def filter_circrna_reads(args):
271
    import pysam
272
    import numpy as np
273
    from ioutils import open_file_or_stdout, open_file_or_stdin
274
    from collections import defaultdict
275
    from copy import deepcopy
276
277
    logger.info('read input SAM file: ' + args.input_file)
278
    fin = open_file_or_stdin(args.input_file)
279
    sam_in = pysam.AlignmentFile(fin, "r")
280
    if sam_in.header is None:
281
        raise ValueError('requires SAM header to get junction positions')
282
    # get junction positions (middle of the sequences)
283
    junction_positions = {}
284
    for sq in sam_in.header['SQ']:
285
        junction_positions[sq['SN']] = sq['LN']//2
286
287
    logger.info('create output SAM file: ' + args.output_file)
288
    fout = open_file_or_stdout(args.output_file)
289
    sam_out = pysam.AlignmentFile(fout, 'w', template=sam_in)
290
291
    sam_filtered = None
292
    if args.filtered_file is not None:
293
        logger.info('create filtered SAM file: ' + args.filtered_file)
294
        sam_filtered = pysam.AlignmentFile(args.filtered_file, 'w', template=sam_in)
295
296
    for read in sam_in:
297
        filtered = False
298
        if read.is_unmapped:
299
            filtered = True
300
        elif read.is_reverse:
301
            filtered = True
302
        else:
303
            pos = junction_positions[read.reference_name]
304
            if not (read.reference_start < pos <= read.reference_end):
305
                filterd = True
306
        if not filtered:
307
            sam_out.write(read)
308
        elif sam_filtered is not None:
309
            sam_filtered.write(read)
310
    
311
    fin.close()
312
    fout.close()
313
    if sam_filtered is not None:
314
        sam_filtered.close()
315
316
@command_handler
317
def chrom_sizes(args):
318
    from Bio import SeqIO
319
    from ioutils import open_file_or_stdin, open_file_or_stdout
320
321
    fout = open_file_or_stdout(args.output_file)
322
    with open_file_or_stdin(args.input_file) as fin:
323
        for record in SeqIO.parse(fin, 'fasta'):
324
            fout.write('{}\t{}\n'.format(record.id, len(record.seq)))
325
326
@command_handler
327
def extract_domain_sequence(args):
328
    from pyfaidx import Fasta
329
    from Bio.Seq import Seq
330
    from ioutils import open_file_or_stdout
331
    import pandas as pd
332
333
    fout = open_file_or_stdout(args.output_file)
334
    fastas = {}
335
    with open(args.input_file, 'r') as fin:
336
        for lineno, line in enumerate(fin):
337
            feature = line.split('\t')[0]
338
            gene_id, gene_type, gene_name, domain_id, transcript_id, start, end = feature.split('|')
339
            start = int(start)
340
            end = int(end)
341
            if gene_type == 'genomic':
342
                gene_type = 'genome'
343
            if gene_type not in fastas:
344
                fastas[gene_type] = Fasta(os.path.join(args.genome_dir, 'fasta', gene_type + '.fa'))
345
            if gene_type == 'genome':
346
                chrom, gstart, gend, strand = gene_id.split('_')
347
                gstart = int(gstart)
348
                gend = int(gend)
349
                seq = fastas[gene_type][chrom][gstart:gend].seq
350
                if strand == '-':
351
                    seq = str(Seq(seq).reverse_complement())
352
            else:
353
                seq = fastas[gene_type][transcript_id][start:end].seq
354
            seq = seq.upper()
355
            fout.write('>{}\n'.format(feature))
356
            fout.write(seq)
357
            fout.write('\n')
358
    fout.close()
359
    
360
@command_handler
361
def extract_feature_sequence(args):
362
    from pyfaidx import Fasta
363
    from Bio.Seq import Seq
364
    from ioutils import open_file_or_stdout
365
366
    def pad_range(start, end, chrom_size, max_length):
367
        if (end - start) >= max_length:
368
            return
369
        padding_left = (max_length - (end - start))//2
370
        new_start = max(0, start - padding_left)
371
        new_end = min(new_start + max_length, chrom_size)
372
        return new_start, new_end
373
374
    fout = open_file_or_stdout(args.output_file)
375
    fastas = {}
376
    with open(args.input_file, 'r') as fin:
377
        for lineno, line in enumerate(fin):
378
            if lineno == 0:
379
                continue
380
            feature = line.split('\t')[0]
381
            gene_id, gene_type, gene_name, domain_id, transcript_id, start, end = feature.split('|')
382
            start = int(start)
383
            end = int(end)
384
            if gene_type == 'genomic':
385
                gene_type = 'genome'
386
            # load FASTA file
387
            if gene_type not in fastas:
388
                fastas[gene_type] = Fasta(os.path.join(args.genome_dir, 'fasta', gene_type + '.fa'))
389
            if gene_type == 'genome':
390
                chrom, gstart, gend, strand = gene_id.split('_')
391
                gstart = int(gstart)
392
                gend = int(gend)
393
                seq = fastas[gene_type][chrom][gstart:gend].seq
394
                if strand == '-':
395
                    seq = str(Seq(seq).reverse_complement())
396
            else:
397
                seq = fastas[gene_type][transcript_id][start:end].seq
398
            seq = seq.upper()
399
            fout.write('>{}\n'.format(feature))
400
            fout.write(seq)
401
            fout.write('\n')
402
    fout.close()
403
404
@command_handler
405
def calculate_star_parameters(args):
406
    from math import log2
407
408
    genome_length = 0
409
    n_seqs = 0
410
    with open(args.input_file, 'r') as f:
411
        for line in f:
412
            genome_length += int(line.split('\t')[1])
413
            n_seqs += 1
414
    if args.parameter == 'genomeSAindexNbases':
415
        print(min(14, int(log2(genome_length)//2) - 1))
416
    elif args.parameter == 'genomeChrBinNbits':
417
        print(min(18, int(log2(genome_length/n_seqs))))
418
419
@command_handler
420
def highlight_mature_mirna_location(args):
421
    import gzip
422
    from Bio import SeqIO
423
    from collections import defaultdict
424
    from ioutils import open_file_or_stdout
425
426
    logger.info('read hairpin sequences: ' + args.hairpin)
427
    hairpin_seqs = {}
428
    with open(args.hairpin, 'r') as f:
429
        for record in SeqIO.parse(f, 'fasta'):
430
            if (args.species is None) or (args.species == record.id.split('-')[0]):
431
                hairpin_seqs[record.id] = str(record.seq)
432
    
433
    logger.info('read mature sequences: ' + args.mature)
434
    fout = open_file_or_stdout(args.output_file)
435
    mature_positions = defaultdict(dict)
436
    with open(args.mature, 'r') as f:
437
        for record in SeqIO.parse(f, 'fasta'):
438
            if not ((args.species is None) or (args.species == record.id.split('-')[0])):
439
                continue
440
            if record.id.endswith('-5p') or record.id.endswith('-3p'):
441
                end_type = record.id.split('-')[-1]
442
                hairpin_id = record.id[:-3].lower()
443
                hairpin_seq = hairpin_seqs.get(hairpin_id)
444
                if hairpin_seq is None:
445
                    logger.warn('hairpin sequence id {} is not found in hairpin file'.format(hairpin_id))
446
                    continue
447
                mature_id = record.id
448
                mature_seq = str(record.seq)
449
                pos = hairpin_seq.find(mature_seq)
450
                if pos < 0:
451
                    logger.warn('mature sequence {} is not found in hairpin file'.format(mature_id))
452
                    continue
453
                else:
454
                    mature_positions[hairpin_id][end_type] = (pos, pos + len(mature_seq))
455
456
    for hairpin_id, hairpin_seq in hairpin_seqs.items():
457
        mature_position = mature_positions.get(hairpin_id)
458
        if mature_position is None:
459
            mature_position = {}
460
        fout.write('>{}\n'.format(hairpin_id))
461
        offset = 0
462
        if '5p' in mature_position:
463
            start, end = mature_position['5p']
464
            fout.write('{0}\x1B[31;1m{1}\x1B[0m'.format(hairpin_seq[offset:start], hairpin_seq[start:end]))
465
            offset = end
466
        if '3p' in mature_position:
467
            start, end = mature_position['3p']
468
            fout.write('{0}\x1B[32;1m{1}\x1B[0m'.format(hairpin_seq[offset:start], hairpin_seq[start:end]))
469
            offset = end
470
        fout.write('{}\n'.format(hairpin_seq[offset:]))
471
    fout.close()
472
473
@command_handler
474
def extract_mature_mirna_location(args):
475
    from utils import read_gff, GFFRecord
476
    from ioutils import open_file_or_stdin, open_file_or_stdout
477
    from collections import OrderedDict, defaultdict
478
479
    logger.info('read input GFF file: ' + args.input_file)
480
    fin = open_file_or_stdin(args.input_file)
481
    logger.info('open output BED file: ' + args.input_file)
482
    fout = open_file_or_stdout(args.output_file)
483
    # key: precursor_id, value: precursor record
484
    precursors = OrderedDict()
485
    # key: precursor_id, value: list of mature records
486
    matures = defaultdict(list)
487
    # read features from GFF file
488
    for record in read_gff(fin):
489
        if record.feature == 'miRNA_primary_transcript':
490
            precursors[record.attr['ID']] = record
491
        elif record.feature == 'miRNA':
492
            matures[record.attr['Derives_from']].append(record)
493
    # get locations of mature miRNAs
494
    for precursor_id, precursor in precursors.items():
495
        for mature in matures[precursor_id]:
496
            if mature.strand == '+':
497
                fout.write('{}\t{}\t{}\t{}\t0\t+\n'.format(
498
                    precursor.attr['Name'], 
499
                    mature.start - precursor.start,
500
                    mature.end - precursor.start + 1,
501
                    mature.attr['Name']))
502
            else:
503
                fout.write('{}\t{}\t{}\t{}\t0\t+\n'.format(
504
                    precursor.attr['Name'],
505
                    precursor.end - mature.end,
506
                    precursor.end - mature.start + 1,
507
                    mature.attr['Name']
508
                ))
509
    fin.close()
510
    fout.close()
511
512
@command_handler
513
def gtf_to_bed(args):
514
    from ioutils import open_file_or_stdin, open_file_or_stdout
515
516
    exon_feature = 'exon'
517
    # use transcript_id attribute as key
518
    transcripts = {}
519
    logger.info('read input GTF file: ' + args.input_file)
520
    for lineno, record in enumerate(read_gtf(args.input_file)):
521
        c, attrs, line = record
522
        if c[2] == exon_feature:
523
            gene_id = attrs.get('gene_id')
524
            if gene_id is None:
525
                raise ValueError('gene_id attribute not found in GTF file {}:{}'.format(args.input_file, lineno))
526
            transcript_id = attrs.get('transcript_id')
527
            if transcript_id is None:
528
                raise ValueError('transcript_id attribute not found in GTF file {}:{}'.format(args.input_file, lineno))
529
            transcript = transcripts.get(transcript_id)
530
            if transcript is None:
531
                # new transcript
532
                transcript = {
533
                    'chrom': c[0],
534
                    'strand': c[6],
535
                    'gene_id': gene_id,
536
                    'gene_name': attrs.get('gene_name', gene_id),
537
                    'transcript_name': attrs.get('transcript_name', transcript_id),
538
                    'exons': []
539
                }
540
                transcripts[transcript_id] = transcript
541
            # add a new exon
542
            transcript['exons'].append((int(c[3]) - 1, int(c[4])))
543
    
544
    fout = open_file_or_stdout(args.output_file)
545
    bed_template = '{chrom}\t{start}\t{end}\t{name}\t0\t{strand}\t0\t0\t0\t{n_exons}\t{exon_sizes}\t{exon_starts}\n'
546
    for transcript_id, transcript in transcripts.items():
547
        # sort exons by start position
548
        transcript['exons'] = sorted(transcript['exons'], key=lambda x: x[0])
549
        transcript['n_exons'] = len(transcript['exons'])
550
        transcript['start'] = transcript['exons'][0][0]
551
        transcript['end'] = transcript['exons'][-1][1]
552
        transcript['exon_starts'] = ','.join(str(e[0] - transcript['start']) for e in transcript['exons'])
553
        transcript['exon_sizes'] = ','.join(str(e[1] - e[0]) for e in transcript['exons'])
554
        transcript['name'] = '{gene_id}'.format(**transcript)
555
        fout.write(bed_template.format(**transcript))
556
557
558
@command_handler
559
def calculate_gene_length(args):
560
    import HTSeq
561
    from collections import defaultdict
562
    from functools import partial
563
    import numpy as np
564
    from ioutils import open_file_or_stdin
565
    from tqdm import tqdm
566
567
    fin = open_file_or_stdin(args.input_file)
568
    gff = HTSeq.GFF_Reader(fin)
569
    exons = defaultdict(partial(defaultdict, int))
570
    for feature in tqdm(gff, unit='feature'):
571
        if feature.type == 'exon':
572
            exons[feature.attr['gene_id']][feature.attr['transcript_id']] += feature.iv.length
573
574
@command_handler
575
def merge_data_frames(args):
576
    import pandas as pd
577
    import numpy as np
578
    from ioutils import open_file_or_stdout
579
580
    if (not args.on_index) and (args.on is None):
581
        raise ValueError('argument --on is required if --on-index is not specified')
582
    merged = None
583
    for input_file in args.input_file:
584
        logger.info('read input file: ' + input_file)
585
        df = pd.read_table(input_file, sep=args.sep)
586
        if merged is None:
587
            merged = df
588
        else:
589
            if args.on_index:
590
                merged = pd.merge(merged, df, how=args.how, left_index=True, right_index=True)
591
            else:
592
                merged = pd.merge(merged, df, how=args.how, on=args.on)
593
    if args.fillna is not None:
594
        merged.fillna(args.fillna, inplace=True)
595
    logger.info('open output file: ' + args.output_file)
596
    with open_file_or_stdout(args.output_file) as f:
597
        merged.to_csv(f, sep=args.sep, header=True, index=args.on_index)
598
    
599
@command_handler
600
def genomecov(args):
601
    import pysam
602
    import h5py
603
    import numpy as np
604
    from tqdm import tqdm
605
606
    logger.info('read input BAM/SAM file: ' + args.input_file)
607
    sam = pysam.AlignmentFile(args.input_file, 'r')
608
    logger.info('open output HDF5 file: ' + args.output_file)
609
    fout = h5py.File(args.output_file, 'w')
610
    for sq in tqdm(sam.header['SQ'], unit='seq'):
611
        data = np.zeros(sq['LN'], dtype=np.int32)
612
        fout.create_dataset(sq['SN'], data=data, compression='gzip')
613
    fout.close()
614
615
@command_handler
616
def calc_rpkm(args):
617
    import pandas as pd
618
    import numpy as np
619
    from ioutils import open_file_or_stdin, open_file_or_stdout
620
621
    matrix = pd.read_table(open_file_or_stdin(args.input_file), index_col=0, sep='\t')
622
    feature_info = matrix.index.to_series().str.split('|', expand=True)
623
    feature_info.columns = ['gene_id', 'gene_type', 'gene_name', 'feature_id', 'transcript_id', 'start', 'end']
624
    feature_info['start'] = feature_info['start'].astype('int')
625
    feature_info['end'] = feature_info['end'].astype('int')
626
    feature_info['length'] = feature_info['end'] - feature_info['start']
627
    matrix = 1000.0*matrix.div(feature_info['length'], axis=0)
628
    matrix.to_csv(open_file_or_stdout(args.output_file), index=True, header=True, sep='\t', na_rep='NA')
629
630
@command_handler
631
def create_pseudo_genome(args):
632
    from pyfaidx import Fasta, Faidx
633
    import numpy as np
634
635
    src = Fasta(args.input_file)
636
    lengths = np.array([len(r) for r in src])
637
    padded_lengths = lengths + args.padding
638
    padded_seq = ''.join(['N']*args.padding)
639
    cum_lengths = np.cumsum(padded_lengths)
640
    chrom_indices = cum_lengths//args.max_chrom_size
641
    chrom_starts = cum_lengths%args.max_chrom_size - padded_lengths
642
    prev_chrom_index = -1
643
644
    # write FASTA file
645
    logger.info('write FASTA file: ' + args.output_fasta)
646
    with open(args.output_fasta, 'w') as fout:
647
        for i, record in enumerate(src):
648
            # new chromosome
649
            if chrom_indices[i] != prev_chrom_index:
650
                if i > 0:
651
                    fout.write('\n')
652
                fout.write('>c{}\n'.format(i + 1))
653
                prev_chrom_index = chrom_indices[i]
654
            # write sequence
655
            fout.write(str(record))
656
            fout.write(padded_seq)
657
    # build fasta index
658
    logger.info('build FASTA index')
659
    dst_index = Faidx(args.output_fasta)
660
    dst_index.build_index()
661
    # chrom sizes
662
    logger.info('write chrom sizes: ' + args.output_chrom_sizes)
663
    fout = open(args.output_chrom_sizes, 'w')
664
    with open(args.output_fasta + '.fai', 'r') as f:
665
        for line in f:
666
            c = line.strip().split('\t')
667
            fout.write(c[0] + '\t' + c[1] + '\n')
668
    fout.close()
669
    # cytoband file
670
    logger.info('write cytoband file: ' + args.output_cytoband)
671
    fout = open(args.output_cytoband, 'w')
672
    with open(args.output_fasta + '.fai', 'r') as f:
673
        for i, line in enumerate(f):
674
            c = line.strip().split('\t')
675
            fout.write('{0}\t0\t{1}\tp{2}.1\tgned\n'.format(c[0], c[1], i + 1))
676
    fout.close()
677
678
    # annotation file 
679
    logger.info('write annotation file: ' + args.output_annotation)
680
    with open(args.output_annotation, 'w') as fout:
681
        for i, record in enumerate(src):
682
            record = ['c%d'%(chrom_indices[i] + 1), 
683
                str(chrom_starts[i]), 
684
                str(chrom_starts[i] + lengths[i]),
685
                record.name,
686
                '0',
687
                '+']
688
            fout.write('\t'.join(record))
689
            fout.write('\n')
690
            # create junction annotation
691
            if args.circular_rna:
692
                junction_pos = (int(record[1]) + int(record[2]))//2
693
                record[1] = str(junction_pos - 1)
694
                record[2] = str(junction_pos + 1)
695
                record[3] = record[3] + '|junction'
696
                fout.write('\t'.join(record))
697
                fout.write('\n')
698
699
@command_handler
700
def map_bam_to_pseudo_genome(args):
701
    import pysam
702
703
    def as_paired_reads(sam):
704
        read1 = None
705
        for read in sam:
706
            if read.is_read1:
707
                read1 = read
708
            elif read.is_read2:
709
                yield (read1, read)
710
            else:
711
                raise ValueError('input SAM is not paired-end')
712
    
713
    chrom_sizes = {}
714
    logger.info('read chrom sizes: ' + args.chrom_sizes)
715
    with open(args.chrom_sizes, 'r') as fin:
716
        for line in fin:
717
            c = line.strip().split('\t')
718
            chrom_sizes[c[0]] = int(c[1])
719
    
720
    chrom_starts = {}
721
    chrom_names = {}
722
    with open(args.bed, 'r') as fin:
723
        for line in fin:
724
            c = line.strip().split('\t')
725
            c[1] = int(c[1])
726
            c[2] = int(c[2])
727
            chrom_names[c[3]] = c[0]
728
            chrom_starts[c[3]] = c[1]
729
    
730
    logger.info('read input BAM file: ' + args.input_file)
731
    sam = pysam.AlignmentFile(args.input_file, 'rb')
732
733
    if args.circular_rna:
734
        junction_positions = {sq['SN']:sq['LN']//2 for sq in sam.header['SQ']}
735
736
    output_header = {
737
    'HD': sam.header['HD'],
738
    'SQ': [{'SN': chrom, 'LN': size} for chrom, size in chrom_sizes.items()],
739
    'PG': [{'ID': 'map_bam_to_pseudo_genome', 
740
            'PN': 'map_bam_to_pseudo_genome',
741
            'VN': '1.0',
742
            'CL': ' '.join(sys.argv)}
743
          ]
744
    }
745
    logger.info('create output BAM file: ' + args.output_file)
746
    output_sam = pysam.AlignmentFile(args.output_file, 'wb', header=output_header)
747
748
    def map_read(read):
749
        d = read.to_dict()
750
        ref_name = d['ref_name']
751
        d['ref_name'] = chrom_names[ref_name]
752
        d['ref_pos'] = str(int(d['ref_pos']) + chrom_starts[ref_name])
753
        return pysam.AlignedSegment.from_dict(d, output_sam.header)
754
755
    strandness = args.strandness
756
    is_circular_rna = args.circular_rna
757
758
    n_reads = 0
759
    if args.paired_end:
760
        for read1, read2 in as_paired_reads(sam):
761
            if (read1.is_unmapped) or (read2.is_unmapped):
762
                continue
763
            if read1.reference_name != read2.reference_name:
764
                continue
765
            if is_circular_rna:
766
                if (strandness == 'forward') and ((not read1.is_reverse) and read2.is_reverse):
767
                    continue
768
                if (strandness == 'reverse') and ((not read2.is_reverse) and read1.is_reverse):
769
                    continue
770
                pos = junction_positions[read1.reference_name]
771
                if not (read1.reference_start < pos <= read2.reference_end):
772
                    continue
773
            output_sam.write(map_read(read1))
774
            output_sam.write(map_read(read2))
775
            n_reads += 1
776
    else:
777
        for read in sam:
778
            if read.is_unmapped:
779
                continue
780
            output_sam.write(map_read(read))
781
            n_reads += 1
782
    logger.info('number of written reads: {}'.format(n_reads))
783
784
    output_sam.close()
785
786
787
788
if __name__ == '__main__':
789
    main_parser = argparse.ArgumentParser(description='Preprocessing module')
790
    subparsers = main_parser.add_subparsers(dest='command')
791
792
    parser = subparsers.add_parser('transcript_counts', 
793
        help='count reads that belongs to a transcript')
794
    parser.add_argument('--input-file', '-i', type=str, required=True,
795
        help='input transcript BAM file')
796
    parser.add_argument('--output-file', '-o', type=str, default='-',
797
        help='output transcript counts file')
798
    
799
    parser = subparsers.add_parser('gtf_to_transcript_table')
800
    parser.add_argument('--input-file', '-i', type=str, default='-',
801
        help='input GTF file')
802
    parser.add_argument('--output-file', '-o', type=str, default='-',
803
        help='output table file')
804
    parser.add_argument('--feature', type=str,
805
        help='feature to use in input GTF file (Column 3)')
806
    parser.add_argument('--gene-type', type=str,
807
        help='gene type to set if "gene_type" attribute is not available in GTF file')
808
    parser.add_argument('--transcript-type', type=str, default='unknown',
809
        help='gene type to set if "transcript_type" attribute is not available in GTF file')
810
    
811
    parser = subparsers.add_parser('extract_longest_transcript')
812
    parser.add_argument('--input-file', '-i', type=str, default='-',
813
        help='input GTF file')
814
    parser.add_argument('--output-file', '-o', type=str, default='-',
815
        help='output table file')
816
    parser.add_argument('--feature', type=str, default='exon',
817
        help='feature to use in input GTF file (Column 3)')
818
    
819
    parser = subparsers.add_parser('fix_gtf',
820
        help='fix problems in a GTF file by removing invalid transcripts')
821
    parser.add_argument('--input-file', '-i', type=str, default='-',
822
        help='input GTF file')
823
    parser.add_argument('--output-file', '-o', type=str, default='-',
824
        help='output table file')
825
    parser.add_argument('--feature', type=str, default='exon',
826
        help='feature to use in input GTF file (Column 3)')
827
    
828
    parser = subparsers.add_parser('extract_gene',
829
        help='extract gene regions from a GTF file')
830
    parser.add_argument('--input-file', '-i', type=str, default='-',
831
        help='input GTF file')
832
    parser.add_argument('--output-file', '-o', type=str, default='-',
833
        help='output BED file')
834
    parser.add_argument('--feature', type=str,
835
        help='feature to use in input GTF file (Column 3)')
836
    
837
    parser = subparsers.add_parser('gtf_to_bed',
838
        help='convert transcripts from GTF to BED')
839
    parser.add_argument('--input-file', '-i', type=str, default='-',
840
        help='input GTF file')
841
    parser.add_argument('--output-file', '-o', type=str, default='-',
842
        help='output BED file')
843
    parser.add_argument('--feature', type=str,
844
        help='features to treat as exons in input GTF file (Column 3)')
845
    
846
    parser = subparsers.add_parser('extract_circrna_junction',
847
        help='extract circular RNA junction sequences from spliced sequences')
848
    parser.add_argument('--input-file', '-i', type=str, default='-',
849
        help='spliced sequence file')
850
    parser.add_argument('--output-file', '-o', type=str, default='-',
851
        help='junction sequence file')
852
    parser.add_argument('--anchor-size', '-s', type=int, default=50,
853
        help='anchor length from both ends')
854
    parser.add_argument('--min-length', type=int, default=50,
855
        help='minimal length to keep a spliced sequence')
856
    
857
    parser = subparsers.add_parser('filter_circrna_reads',
858
        help='filter out reads not crossing circRNA junctions')
859
    parser.add_argument('--input-file', '-i', type=str, default='-',
860
        help='input SAM file')
861
    parser.add_argument('--output-file', '-o', type=str, default='-',
862
        help='output SAM file')
863
    parser.add_argument('--filtered-file', '-u', type=str,
864
        help='write filtered SAM records to file')
865
    
866
    parser = subparsers.add_parser('chrom_sizes',
867
        help='create chrom sizes file from FASTA file')
868
    parser.add_argument('--input-file', '-i', type=str, default='-',
869
        help='input FASTA')
870
    parser.add_argument('--output-file', '-o', type=str, default='-',
871
        help='output chrom sizes file')
872
    
873
    parser = subparsers.add_parser('extract_feature_sequence',
874
        help='extract sequences using feature names')
875
    parser.add_argument('--input-file', '-i', type=str, required=True,
876
        help='feature file with feature names in the first column')
877
    parser.add_argument('--genome-dir', '-g', type=str, required=True,
878
        help='genome directory where fasta/${rna_type}.fa contains sequences')
879
    parser.add_argument('--output-file', '-o', type=str, default='-',
880
        help='output file in FASTA format')
881
    parser.add_argument('--padding', type=int, default=200,
882
        help='extend on both ends to maximum in length')
883
884
    parser = subparsers.add_parser('extract_domain_sequence',
885
        help='extract sequences using feature names')
886
    parser.add_argument('--input-file', '-i', type=str, required=True,
887
        help='feature file with feature names in the first column')
888
    parser.add_argument('--genome-dir', '-g', type=str, required=True,
889
        help='genome directory where fasta/${rna_type}.fa contains sequences')
890
    parser.add_argument('--flanking', type=int, default=100)
891
    parser.add_argument('--output-file', '-o', type=str, default='-',
892
        help='output file in FASTA format')
893
    
894
    parser = subparsers.add_parser('calculate_star_parameters',
895
        help='calculate --genomeSAindexNbases and --genomeChrBinNbits for STAR')
896
    parser.add_argument('--input-file', '-i', type=str, required=True,
897
        help='FASTA index (.fai) file')
898
    parser.add_argument('--parameter', '-p', type=str, required=True,
899
        choices=('genomeSAindexNbases', 'genomeChrBinNbits'),
900
        help='parameter to calculate')
901
902
    parser = subparsers.add_parser('highlight_mature_mirna_location',
903
        help='get mature miRNA location in precursor miRNA in miRBase')
904
    parser.add_argument('--mature', type=str, required=True,
905
        help='FASTA file of mature sequences')
906
    parser.add_argument('--hairpin', type=str, required=True,
907
        help='FASTA file of hairpin sequences')
908
    parser.add_argument('--output-file', '-o', type=str, default='-')
909
    parser.add_argument('--species', type=str)
910
911
    parser = subparsers.add_parser('extract_mature_mirna_location',
912
        help='Extract mature miRNA location in precursor miRNA in miRBase')
913
    parser.add_argument('--input-file', '-i', type=str, required=True,
914
        help='miRBase GFF3 file')
915
    parser.add_argument('--output-file', '-o', type=str, default='-',
916
        help='BED file with 6 columns: pre-miRNA, start, end, mature miRNA, 0, +')
917
918
    parser = subparsers.add_parser('calculate_gene_length',
919
        help='calculate effective gene length')
920
    parser.add_argument('--input-file', '-i', type=str, default='-',
921
        help='GTF/GFF file')
922
    parser.add_argument('--method', '-m', type=str,
923
        choices=('isoform_length_mean', 'isoform_length_median', 'isoform_length_max', 'merged_exon_length'))
924
    parser.add_argument('--output-file', '-o', type=str,
925
        help='output tab-deliminated file with two columns: gene_id, length')
926
    
927
    parser = subparsers.add_parser('merge_data_frames')
928
    parser.add_argument('--input-file', '-i', type=str, action='append', required=True,
929
        help='input data matrix')
930
    parser.add_argument('--sep', type=str, default='\t', help='column delimiter')
931
    parser.add_argument('--how', type=str, default='inner',
932
        choices=('inner', 'outer', 'left', 'right'))
933
    parser.add_argument('--on', type=str, help='column name to join on')
934
    parser.add_argument('--on-index', action='store_true', default=False, help='join on index')
935
    parser.add_argument('--fillna', type=str)
936
    parser.add_argument('--output-file', '-o', type=str, default='-',
937
        help='output merged data frame')
938
    
939
    parser = subparsers.add_parser('genomecov')
940
    parser.add_argument('--input-file', '-i', type=str,  required=True,
941
        help='input BAM/SAM file')
942
    parser.add_argument('--output-file', '-o', type=str, required=True,
943
        help='output HDF5 file')
944
    
945
    parser = subparsers.add_parser('calc_rpkm')
946
    parser.add_argument('--input-file', '-i', type=str,  default='-',
947
        help='input expression (RPM) matrix file')
948
    parser.add_argument('--output-file', '-o', type=str, default='-',
949
        help='output expression (RPKM) matrix file')
950
    
951
    parser = subparsers.add_parser('create_pseudo_genome')
952
    parser.add_argument('--input-file', '-i', type=str, required=True,
953
        help='input transcript FASTA file (with FASTA index)')
954
    parser.add_argument('--max-chrom-size', type=int, default=100000000,
955
        help='maximum size for each chromosome')
956
    parser.add_argument('--padding', type=int, default=50,
957
        help='number of N bases padded after each sequence')
958
    parser.add_argument('--circular-rna', action='store_true',
959
        help='write circular RNA junction to annotation file')
960
    parser.add_argument('--output-fasta', type=str, required=True,
961
        help='output FASTA and index file')
962
    parser.add_argument('--output-chrom-sizes', type=str, required=True,
963
        help='output chrom sizes file')
964
    parser.add_argument('--output-annotation', type=str, required=True,
965
        help='output annotation BED file')
966
    parser.add_argument('--output-cytoband', type=str, required=True,
967
        help='output cytoband file')
968
    
969
    parser = subparsers.add_parser('map_bam_to_pseudo_genome')
970
    parser.add_argument('--input-file', '-i', type=str, required=True,
971
        help='input BAM file')
972
    parser.add_argument('--bed', type=str, required=True,
973
        help='input BED file of genes in the pseudo-genome')
974
    parser.add_argument('--chrom-sizes', type=str, required=True,
975
        help='input chrom sizes of the pseudo-genome')
976
    parser.add_argument('--strandness', '-s', type=str, default='forward',
977
        help='strandness for circular RNA')
978
    parser.add_argument('--paired-end', action='store_true',
979
        help='input is paired-end reads')
980
    parser.add_argument('--circular-rna', action='store_true',
981
        help='requires the read to be mapped to circular RNA junctions')
982
    parser.add_argument('--output-file', '-o', type=str, required=True,
983
        help='output BAM file')
984
    
985
    args = main_parser.parse_args()
986
    if args.command is None:
987
        main_parser.print_help()
988
        sys.exit(1)
989
    logger = logging.getLogger('preprocess.' + args.command)
990
991
    try:
992
        command_handlers.get(args.command)(args)
993
    except BrokenPipeError:
994
        pass
995
    except KeyboardInterrupt:
996
        pass