|
a |
|
b/singlecellmultiomics/features/exonGTFtoIntronGTF.py |
|
|
1 |
#!/usr/bin/env python3 |
|
|
2 |
import collections |
|
|
3 |
import os |
|
|
4 |
import sys |
|
|
5 |
import glob |
|
|
6 |
import argparse |
|
|
7 |
import itertools |
|
|
8 |
import gzip |
|
|
9 |
|
|
|
10 |
|
|
|
11 |
def decodeKvPairs(kv): |
|
|
12 |
keyValues = {} |
|
|
13 |
for part in kv.split(';'): |
|
|
14 |
kv = part.strip().split() |
|
|
15 |
if len(kv) == 2: |
|
|
16 |
key = kv[0] |
|
|
17 |
value = kv[1].replace('"', '') |
|
|
18 |
keyValues[key] = value |
|
|
19 |
return keyValues |
|
|
20 |
|
|
|
21 |
|
|
|
22 |
def exonGTF_to_intronGTF(exon_path, id): |
|
|
23 |
# (gene,chrom,strand)-> [range(start,end), range(start,end)] |
|
|
24 |
geneToExonRanges = collections.defaultdict(list) |
|
|
25 |
prevChrom = None |
|
|
26 |
capture = list(set(['gene_id', |
|
|
27 |
'gene_version', |
|
|
28 |
'transcript_version', |
|
|
29 |
'gene_name', |
|
|
30 |
'gene_source', |
|
|
31 |
'gene_biotype', |
|
|
32 |
'transcript_name', |
|
|
33 |
'transcript_source', |
|
|
34 |
'transcript_biotype', |
|
|
35 |
'transcript_support_level']) - set([id])) |
|
|
36 |
print(capture) |
|
|
37 |
|
|
|
38 |
id_to_features = {} |
|
|
39 |
|
|
|
40 |
with gzip.open(exon_path, 'rt') if exon_path.endswith('.gz') else open(exon_path) as f: |
|
|
41 |
for line in f: |
|
|
42 |
if line.startswith('#'): |
|
|
43 |
yield line |
|
|
44 |
continue |
|
|
45 |
chromosome, source, feature_type, feature_start, feature_end, at5, strand, at7, kv = line.split( |
|
|
46 |
None, 8) |
|
|
47 |
|
|
|
48 |
if feature_type != 'exon': |
|
|
49 |
continue |
|
|
50 |
|
|
|
51 |
meta = decodeKvPairs(kv) |
|
|
52 |
id_to_features[meta[id]] = { |
|
|
53 |
c: meta.get(c) for c in capture if c in meta |
|
|
54 |
} |
|
|
55 |
geneToExonRanges[( |
|
|
56 |
meta[id], chromosome, strand, source |
|
|
57 |
) |
|
|
58 |
].append(range(int(feature_start), int(feature_end) + 1)) |
|
|
59 |
|
|
|
60 |
if prevChrom is not None and prevChrom != chromosome: |
|
|
61 |
for intr in generate_introns( |
|
|
62 |
geneToExonRanges, id, id_to_features): |
|
|
63 |
yield intr |
|
|
64 |
geneToExonRanges = collections.defaultdict(list) |
|
|
65 |
|
|
|
66 |
prevChrom = chromosome |
|
|
67 |
|
|
|
68 |
if prevChrom is not None: |
|
|
69 |
for intr in generate_introns(geneToExonRanges, id, id_to_features): |
|
|
70 |
yield intr |
|
|
71 |
geneToExonRanges = collections.defaultdict(list) |
|
|
72 |
|
|
|
73 |
|
|
|
74 |
def generate_introns(geneToExonRanges, id, id_to_features): |
|
|
75 |
for g in geneToExonRanges: |
|
|
76 |
if len(geneToExonRanges[g]) == 0: |
|
|
77 |
continue |
|
|
78 |
geneStart = min([min(r) for r in geneToExonRanges[g]]) |
|
|
79 |
geneEnd = max([max(r) for r in geneToExonRanges[g]]) |
|
|
80 |
gene_identifier_or_other, chromosome, strand, source = g |
|
|
81 |
|
|
|
82 |
introns = [] |
|
|
83 |
in_intron = None |
|
|
84 |
current_intron_start = None |
|
|
85 |
rest_data = id_to_features.get(gene_identifier_or_other, {}) |
|
|
86 |
|
|
|
87 |
if len(rest_data) > 0: |
|
|
88 |
rest = '; '.join( |
|
|
89 |
[f'{key} {value}' for key, value in rest_data.items()]) |
|
|
90 |
else: |
|
|
91 |
rest = '' |
|
|
92 |
|
|
|
93 |
for i in sorted( |
|
|
94 |
|
|
|
95 |
[geneStart, geneEnd] + |
|
|
96 |
[i.start for i in geneToExonRanges[g]] + |
|
|
97 |
[i.stop for i in geneToExonRanges[g]]): |
|
|
98 |
if any([i in r for r in geneToExonRanges[g]]): |
|
|
99 |
if current_intron_start is not None and in_intron is True: |
|
|
100 |
introns.append([current_intron_start, i]) |
|
|
101 |
|
|
|
102 |
yield '\t'.join([str(x) for x in [ |
|
|
103 |
chromosome, |
|
|
104 |
source, |
|
|
105 |
'intron', |
|
|
106 |
current_intron_start, |
|
|
107 |
i - 1, |
|
|
108 |
'.', |
|
|
109 |
strand, |
|
|
110 |
'.', f'{id} "{gene_identifier_or_other}";{rest}']]) + '\n' |
|
|
111 |
|
|
|
112 |
in_intron = False |
|
|
113 |
else: |
|
|
114 |
if in_intron is False: |
|
|
115 |
in_intron = True |
|
|
116 |
current_intron_start = i |
|
|
117 |
|
|
|
118 |
|
|
|
119 |
if __name__ == '__main__': |
|
|
120 |
argparser = argparse.ArgumentParser( |
|
|
121 |
formatter_class=argparse.ArgumentDefaultsHelpFormatter, |
|
|
122 |
description="Convert exonic GTF file to intronic GTF file") |
|
|
123 |
argparser.add_argument('gffin') |
|
|
124 |
argparser.add_argument('-id', help='identifier', default='transcript_id') |
|
|
125 |
argparser.add_argument( |
|
|
126 |
'-o', |
|
|
127 |
help="Output GTF file", |
|
|
128 |
type=str, |
|
|
129 |
required=True) |
|
|
130 |
args = argparser.parse_args() |
|
|
131 |
|
|
|
132 |
with gzip.open(args.o, 'wt') if args.o.endswith('.gz') else open(args.o, 'w') as o: |
|
|
133 |
for intron in exonGTF_to_intronGTF(args.gffin, args.id): |
|
|
134 |
if intron is not None: |
|
|
135 |
o.write(intron) |