a b/annotate_transcriptome.py
1
import re
2
in_genes="Homo_sapiens.GRCh38.82.with_tid.gtf"
3
# out_genes="Homo_sapiens.GRCh38.82.annotated.gtf"
4
out_genes="Homo_sapiens.GRCh38.82.annotated.gtf"
5
accepted_gene_biotypes_for_NA_transcripts = set(["IG_V_gene","IG_J_gene","protein_coding","TR_J_gene","TR_D_gene","TR_V_gene","IG_C_gene","IG_D_gene","TR_C_gene"])
6
with open(in_genes, 'r') as in_f, open(out_genes, 'w') as out_f:
7
    for line in in_f:
8
        chr_name = line.rstrip().split('\t')[0]
9
        # Check the transcript_support level
10
        # This should be faster than a regex
11
        # We need to support the case where see these two types of annotations:
12
        #   transcript_support_level "1"
13
        #   transcript_support_level "1 (assigned to previous version X)"
14
        #   transcript_support_level "2" <- Clear example of a gene like this is NKX6.1
15
        #   transcript_support_level "2 (assigned to previous version X)"
16
        line_valid_for_output = False
17
        if 'transcript_support_level "1"' in line or 'transcript_support_level "1 ' in line or 'transcript_support_level "2"' in line or 'transcript_support_level "2 ' in line:
18
            line_valid_for_output = True
19
        elif 'transcript_support_level "NA' in line:
20
            # Transcript Support Level Not Analysed. Pseudogenes, single exon transcripts, HLA, T-cell receptor and Ig transcripts are not analysed and therefore not given any of the TSL categories.
21
            # Keep only a few ones annotated as "IG_V_gene","IG_J_gene","protein_coding","TR_J_gene","TR_D_gene","TR_V_gene","IG_C_gene","IG_D_gene","TR_C_gene"
22
            gene_biotype = re.search(r'gene_biotype \"(.*?)\";', line)
23
            if gene_biotype and gene_biotype.group(1) in accepted_gene_biotypes_for_NA_transcripts:
24
                line_valid_for_output = True
25
        if line_valid_for_output:
26
            gene_name = re.search(r'gene_name \"(.*?)\";', line)
27
            if gene_name:
28
                gene_name = gene_name.group(1)
29
                out_line = re.sub(r'(?<=transcript_id ")(.*?)(?=";)', r'\1|'+gene_name, line)
30
                out_f.write(out_line)