|
a |
|
b/annotate_mouse_transcriptome.py |
|
|
1 |
import re |
|
|
2 |
in_genes="Mus_musculus.GRCm38.84.with_tid.gtf" |
|
|
3 |
out_genes="Mus_musculus.GRCm38.84.annotated.gtf" |
|
|
4 |
accepted_gene_biotypes_for_NA_transcripts = set(["IG_V_gene","IG_J_gene","protein_coding","TR_J_gene","TR_D_gene","TR_V_gene","IG_C_gene","IG_D_gene","TR_C_gene"]) |
|
|
5 |
with open(in_genes, 'r') as in_f, open(out_genes, 'w') as out_f: |
|
|
6 |
for line in in_f: |
|
|
7 |
chr_name = line.rstrip().split('\t')[0] |
|
|
8 |
# Check the transcript_support level |
|
|
9 |
# This should be faster than a regex |
|
|
10 |
# We need to support the case where see these two types of annotations: |
|
|
11 |
# transcript_support_level "1" |
|
|
12 |
# transcript_support_level "1 (assigned to previous version X)" |
|
|
13 |
# transcript_support_level "2" <- Clear example of a gene like this is NKX6.1 |
|
|
14 |
# transcript_support_level "2 (assigned to previous version X)" |
|
|
15 |
line_valid_for_output = False |
|
|
16 |
if 'transcript_support_level "1"' in line or 'transcript_support_level "1 ' in line or 'transcript_support_level "2"' in line or 'transcript_support_level "2 ' in line: |
|
|
17 |
line_valid_for_output = True |
|
|
18 |
elif 'transcript_support_level "NA' in line: |
|
|
19 |
# Transcript Support Level Not Analysed. Pseudogenes, single exon transcripts, HLA, T-cell receptor and Ig transcripts are not analysed and therefore not given any of the TSL categories. |
|
|
20 |
# Keep only a few ones annotated as "IG_V_gene","IG_J_gene","protein_coding","TR_J_gene","TR_D_gene","TR_V_gene","IG_C_gene","IG_D_gene","TR_C_gene" |
|
|
21 |
gene_biotype = re.search(r'gene_biotype \"(.*?)\";', line) |
|
|
22 |
if gene_biotype and gene_biotype.group(1) in accepted_gene_biotypes_for_NA_transcripts: |
|
|
23 |
line_valid_for_output = True |
|
|
24 |
if line_valid_for_output: |
|
|
25 |
gene_name = re.search(r'gene_name \"(.*?)\";', line) |
|
|
26 |
if gene_name: |
|
|
27 |
gene_name = gene_name.group(1) |
|
|
28 |
out_line = re.sub(r'(?<=transcript_id ")(.*?)(?=";)', r'\1|'+gene_name, line) |
|
|
29 |
out_f.write(out_line) |