Switch to unified view

a b/singlecellmultiomics/features/exonGTFtoIntronGTF.py
1
#!/usr/bin/env python3
2
import collections
3
import os
4
import sys
5
import glob
6
import argparse
7
import itertools
8
import gzip
9
10
11
def decodeKvPairs(kv):
12
    keyValues = {}
13
    for part in kv.split(';'):
14
        kv = part.strip().split()
15
        if len(kv) == 2:
16
            key = kv[0]
17
            value = kv[1].replace('"', '')
18
            keyValues[key] = value
19
    return keyValues
20
21
22
def exonGTF_to_intronGTF(exon_path, id):
23
    # (gene,chrom,strand)-> [range(start,end), range(start,end)]
24
    geneToExonRanges = collections.defaultdict(list)
25
    prevChrom = None
26
    capture = list(set(['gene_id',
27
                        'gene_version',
28
                        'transcript_version',
29
                        'gene_name',
30
                        'gene_source',
31
                        'gene_biotype',
32
                        'transcript_name',
33
                        'transcript_source',
34
                        'transcript_biotype',
35
                        'transcript_support_level']) - set([id]))
36
    print(capture)
37
38
    id_to_features = {}
39
40
    with gzip.open(exon_path, 'rt') if exon_path.endswith('.gz') else open(exon_path) as f:
41
        for line in f:
42
            if line.startswith('#'):
43
                yield line
44
                continue
45
            chromosome, source, feature_type, feature_start, feature_end, at5, strand, at7, kv = line.split(
46
                None, 8)
47
48
            if feature_type != 'exon':
49
                continue
50
51
            meta = decodeKvPairs(kv)
52
            id_to_features[meta[id]] = {
53
                c: meta.get(c) for c in capture if c in meta
54
            }
55
            geneToExonRanges[(
56
                meta[id], chromosome, strand, source
57
            )
58
            ].append(range(int(feature_start), int(feature_end) + 1))
59
60
            if prevChrom is not None and prevChrom != chromosome:
61
                for intr in generate_introns(
62
                        geneToExonRanges, id, id_to_features):
63
                    yield intr
64
                geneToExonRanges = collections.defaultdict(list)
65
66
            prevChrom = chromosome
67
68
    if prevChrom is not None:
69
        for intr in generate_introns(geneToExonRanges, id, id_to_features):
70
            yield intr
71
        geneToExonRanges = collections.defaultdict(list)
72
73
74
def generate_introns(geneToExonRanges, id, id_to_features):
75
    for g in geneToExonRanges:
76
        if len(geneToExonRanges[g]) == 0:
77
            continue
78
        geneStart = min([min(r) for r in geneToExonRanges[g]])
79
        geneEnd = max([max(r) for r in geneToExonRanges[g]])
80
        gene_identifier_or_other, chromosome, strand, source = g
81
82
        introns = []
83
        in_intron = None
84
        current_intron_start = None
85
        rest_data = id_to_features.get(gene_identifier_or_other, {})
86
87
        if len(rest_data) > 0:
88
            rest = '; '.join(
89
                [f'{key} {value}' for key, value in rest_data.items()])
90
        else:
91
            rest = ''
92
93
        for i in sorted(
94
95
            [geneStart, geneEnd] +
96
            [i.start for i in geneToExonRanges[g]] +
97
                [i.stop for i in geneToExonRanges[g]]):
98
            if any([i in r for r in geneToExonRanges[g]]):
99
                if current_intron_start is not None and in_intron is True:
100
                    introns.append([current_intron_start, i])
101
102
                    yield '\t'.join([str(x) for x in [
103
                        chromosome,
104
                        source,
105
                        'intron',
106
                        current_intron_start,
107
                        i - 1,
108
                        '.',
109
                        strand,
110
                        '.', f'{id} "{gene_identifier_or_other}";{rest}']]) + '\n'
111
112
                in_intron = False
113
            else:
114
                if in_intron is False:
115
                    in_intron = True
116
                    current_intron_start = i
117
118
119
if __name__ == '__main__':
120
    argparser = argparse.ArgumentParser(
121
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
122
        description="Convert exonic GTF file to intronic GTF file")
123
    argparser.add_argument('gffin')
124
    argparser.add_argument('-id', help='identifier', default='transcript_id')
125
    argparser.add_argument(
126
        '-o',
127
        help="Output GTF file",
128
        type=str,
129
        required=True)
130
    args = argparser.parse_args()
131
132
    with gzip.open(args.o, 'wt') if args.o.endswith('.gz') else open(args.o, 'w') as o:
133
        for intron in exonGTF_to_intronGTF(args.gffin, args.id):
134
            if intron is not None:
135
                o.write(intron)