[45ad7e]: / singlecellmultiomics / features / exonGTFtoIntronGTF.py

Download this file

136 lines (113 with data), 4.5 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/usr/bin/env python3
import collections
import os
import sys
import glob
import argparse
import itertools
import gzip
def decodeKvPairs(kv):
keyValues = {}
for part in kv.split(';'):
kv = part.strip().split()
if len(kv) == 2:
key = kv[0]
value = kv[1].replace('"', '')
keyValues[key] = value
return keyValues
def exonGTF_to_intronGTF(exon_path, id):
# (gene,chrom,strand)-> [range(start,end), range(start,end)]
geneToExonRanges = collections.defaultdict(list)
prevChrom = None
capture = list(set(['gene_id',
'gene_version',
'transcript_version',
'gene_name',
'gene_source',
'gene_biotype',
'transcript_name',
'transcript_source',
'transcript_biotype',
'transcript_support_level']) - set([id]))
print(capture)
id_to_features = {}
with gzip.open(exon_path, 'rt') if exon_path.endswith('.gz') else open(exon_path) as f:
for line in f:
if line.startswith('#'):
yield line
continue
chromosome, source, feature_type, feature_start, feature_end, at5, strand, at7, kv = line.split(
None, 8)
if feature_type != 'exon':
continue
meta = decodeKvPairs(kv)
id_to_features[meta[id]] = {
c: meta.get(c) for c in capture if c in meta
}
geneToExonRanges[(
meta[id], chromosome, strand, source
)
].append(range(int(feature_start), int(feature_end) + 1))
if prevChrom is not None and prevChrom != chromosome:
for intr in generate_introns(
geneToExonRanges, id, id_to_features):
yield intr
geneToExonRanges = collections.defaultdict(list)
prevChrom = chromosome
if prevChrom is not None:
for intr in generate_introns(geneToExonRanges, id, id_to_features):
yield intr
geneToExonRanges = collections.defaultdict(list)
def generate_introns(geneToExonRanges, id, id_to_features):
for g in geneToExonRanges:
if len(geneToExonRanges[g]) == 0:
continue
geneStart = min([min(r) for r in geneToExonRanges[g]])
geneEnd = max([max(r) for r in geneToExonRanges[g]])
gene_identifier_or_other, chromosome, strand, source = g
introns = []
in_intron = None
current_intron_start = None
rest_data = id_to_features.get(gene_identifier_or_other, {})
if len(rest_data) > 0:
rest = '; '.join(
[f'{key} {value}' for key, value in rest_data.items()])
else:
rest = ''
for i in sorted(
[geneStart, geneEnd] +
[i.start for i in geneToExonRanges[g]] +
[i.stop for i in geneToExonRanges[g]]):
if any([i in r for r in geneToExonRanges[g]]):
if current_intron_start is not None and in_intron is True:
introns.append([current_intron_start, i])
yield '\t'.join([str(x) for x in [
chromosome,
source,
'intron',
current_intron_start,
i - 1,
'.',
strand,
'.', f'{id} "{gene_identifier_or_other}";{rest}']]) + '\n'
in_intron = False
else:
if in_intron is False:
in_intron = True
current_intron_start = i
if __name__ == '__main__':
argparser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
description="Convert exonic GTF file to intronic GTF file")
argparser.add_argument('gffin')
argparser.add_argument('-id', help='identifier', default='transcript_id')
argparser.add_argument(
'-o',
help="Output GTF file",
type=str,
required=True)
args = argparser.parse_args()
with gzip.open(args.o, 'wt') if args.o.endswith('.gz') else open(args.o, 'w') as o:
for intron in exonGTF_to_intronGTF(args.gffin, args.id):
if intron is not None:
o.write(intron)