a b/annotate_mouse_transcriptome.py
1
import re
2
in_genes="Mus_musculus.GRCm38.84.with_tid.gtf"
3
out_genes="Mus_musculus.GRCm38.84.annotated.gtf"
4
accepted_gene_biotypes_for_NA_transcripts = set(["IG_V_gene","IG_J_gene","protein_coding","TR_J_gene","TR_D_gene","TR_V_gene","IG_C_gene","IG_D_gene","TR_C_gene"])
5
with open(in_genes, 'r') as in_f, open(out_genes, 'w') as out_f:
6
    for line in in_f:
7
        chr_name = line.rstrip().split('\t')[0]
8
        # Check the transcript_support level
9
        # This should be faster than a regex
10
        # We need to support the case where see these two types of annotations:
11
        #   transcript_support_level "1"
12
        #   transcript_support_level "1 (assigned to previous version X)"
13
        #   transcript_support_level "2" <- Clear example of a gene like this is NKX6.1
14
        #   transcript_support_level "2 (assigned to previous version X)"
15
        line_valid_for_output = False
16
        if 'transcript_support_level "1"' in line or 'transcript_support_level "1 ' in line or 'transcript_support_level "2"' in line or 'transcript_support_level "2 ' in line:
17
            line_valid_for_output = True
18
        elif 'transcript_support_level "NA' in line:
19
            # Transcript Support Level Not Analysed. Pseudogenes, single exon transcripts, HLA, T-cell receptor and Ig transcripts are not analysed and therefore not given any of the TSL categories.
20
            # Keep only a few ones annotated as "IG_V_gene","IG_J_gene","protein_coding","TR_J_gene","TR_D_gene","TR_V_gene","IG_C_gene","IG_D_gene","TR_C_gene"
21
            gene_biotype = re.search(r'gene_biotype \"(.*?)\";', line)
22
            if gene_biotype and gene_biotype.group(1) in accepted_gene_biotypes_for_NA_transcripts:
23
                line_valid_for_output = True
24
        if line_valid_for_output:
25
            gene_name = re.search(r'gene_name \"(.*?)\";', line)
26
            if gene_name:
27
                gene_name = gene_name.group(1)
28
                out_line = re.sub(r'(?<=transcript_id ")(.*?)(?=";)', r'\1|'+gene_name, line)
29
                out_f.write(out_line)