|
a |
|
b/BioAid/MMBSearchTK/AnnoMMBS.py |
|
|
1 |
# This code was developed and authored by Jerzy Twarowski in Malkova Lab at the University of Iowa |
|
|
2 |
# Contact: jerzymateusz-twarowski@uiowa.edu, tvarovski1@gmail.com |
|
|
3 |
|
|
|
4 |
## Contents: |
|
|
5 |
## verifyImperfectHomology |
|
|
6 |
## add_gene |
|
|
7 |
## add_exon |
|
|
8 |
## createAnnotatedOutput |
|
|
9 |
|
|
|
10 |
import pandas as pd |
|
|
11 |
from pyensembl import EnsemblRelease |
|
|
12 |
from pyensembl.exon import Exon |
|
|
13 |
import logging |
|
|
14 |
import regex as re |
|
|
15 |
from ..base import createChrList |
|
|
16 |
from ..base import rev_compl |
|
|
17 |
from ..complexity import movingWindow |
|
|
18 |
|
|
|
19 |
|
|
|
20 |
def verifyImperfectHomology(ref, query, min_homology=0.8): |
|
|
21 |
|
|
|
22 |
mmbir = rev_compl(query) |
|
|
23 |
mmbir_errors = round(len(mmbir)*(1-min_homology)) |
|
|
24 |
if mmbir_errors > 10: |
|
|
25 |
mmbir_errors = 10 |
|
|
26 |
output_list = re.findall( '(' + mmbir + '){e<=' + str(mmbir_errors) + '}', ref) |
|
|
27 |
print(f'Found {len(output_list)} possible templates for {query}: {output_list}') |
|
|
28 |
|
|
|
29 |
if len(output_list) > 0: |
|
|
30 |
return(True) |
|
|
31 |
else: |
|
|
32 |
return(False) |
|
|
33 |
|
|
|
34 |
def add_gene(row): |
|
|
35 |
|
|
|
36 |
data = EnsemblRelease(104) |
|
|
37 |
#print("loaded genome! #addgene") |
|
|
38 |
output = data.gene_names_at_locus(contig=row['chr'].strip("chr"), position=int(row["iBirStart"])) |
|
|
39 |
print(output) |
|
|
40 |
#output = str(output) |
|
|
41 |
out_list = [] |
|
|
42 |
for gname in output: |
|
|
43 |
if gname != "": |
|
|
44 |
out_list.append(gname) |
|
|
45 |
#print("appended_gene!") |
|
|
46 |
|
|
|
47 |
return(out_list) #output |
|
|
48 |
|
|
|
49 |
def add_exon(row): |
|
|
50 |
|
|
|
51 |
data = EnsemblRelease(104) |
|
|
52 |
#print("loaded genome! #addexon") |
|
|
53 |
output = data.exons_at_locus(contig=row['chr'].strip("chr"), position=int(row["iBirStart"])) |
|
|
54 |
#output = str(output) |
|
|
55 |
out_list = [] |
|
|
56 |
for exon in output: |
|
|
57 |
if isinstance(exon, Exon): |
|
|
58 |
out_list.append(exon.exon_id) |
|
|
59 |
|
|
|
60 |
return(out_list) #output |
|
|
61 |
|
|
|
62 |
def createAnnotatedOutput(f_name, path, output_f_name): |
|
|
63 |
import os |
|
|
64 |
|
|
|
65 |
chrList=createChrList(25) |
|
|
66 |
df = pd.DataFrame() |
|
|
67 |
|
|
|
68 |
for chr in chrList: |
|
|
69 |
try: |
|
|
70 |
if os.stat(f"{path}{chr}/{f_name}").st_size == 0: |
|
|
71 |
print(f"WARNING! File {path}{chr}/{f_name} is empty. Skipping...") |
|
|
72 |
continue |
|
|
73 |
with open(f"{path}{chr}/{f_name}") as f: |
|
|
74 |
lines = f.readlines() |
|
|
75 |
except: |
|
|
76 |
print(f"WARNING! Couldn't read {path}{chr}/{f_name}. Make sure it's there.") |
|
|
77 |
lines = None |
|
|
78 |
continue |
|
|
79 |
|
|
|
80 |
if (type(lines) == None): |
|
|
81 |
print("WARNING! the type of lines is None. Skipping...") |
|
|
82 |
|
|
|
83 |
else: |
|
|
84 |
print(f"hello {chr}") |
|
|
85 |
for i in range(len(lines)): |
|
|
86 |
if "###" in lines[i]: |
|
|
87 |
i-=1 |
|
|
88 |
if "Microhomology Insertion:" in lines[i+11]: |
|
|
89 |
var0 = lines[i].strip() |
|
|
90 |
var1 = lines[i+1].strip() |
|
|
91 |
var2 = lines[i+2].strip("iBirStart:").strip() |
|
|
92 |
var3 = lines[i+3].strip("Consensus/cluster number:").strip() |
|
|
93 |
var4 = lines[i+4].strip("iDepth:").strip() |
|
|
94 |
var5 = lines[i+5].strip("sBir:").strip() |
|
|
95 |
var6 = lines[i+6].strip("sBirReversed:").strip() |
|
|
96 |
var7 = lines[i+7].strip() |
|
|
97 |
var8 = lines[i+8].strip("ref:").strip() |
|
|
98 |
var9 = lines[i+9].strip("bir:").strip() |
|
|
99 |
var10 = lines[i+10].strip("TEM:").strip() |
|
|
100 |
var11 = lines[i+11].strip("Microhomology Insertion:").strip() |
|
|
101 |
var12 = lines[i+12].strip("Microhomology template:").strip() |
|
|
102 |
var13 = lines[i+13].strip("readStart: ").strip() |
|
|
103 |
var14 = lines[i+14].strip("ref:").strip() |
|
|
104 |
var15 = lines[i+15].strip("bir:").strip() |
|
|
105 |
var16 = lines[i+16].strip() |
|
|
106 |
var17 = lines[i+17].strip() |
|
|
107 |
var18 = lines[i+18].strip() |
|
|
108 |
var19 = lines[i+19].strip() |
|
|
109 |
else: |
|
|
110 |
var0 = lines[i].strip() |
|
|
111 |
var1 = lines[i+1].strip() |
|
|
112 |
var2 = lines[i+2].strip("iBirStart:").strip() |
|
|
113 |
var3 = lines[i+3].strip("Consensus/cluster number:").strip() |
|
|
114 |
var4 = lines[i+4].strip("iDepth:").strip() |
|
|
115 |
var5 = lines[i+5].strip("sBir:").strip() |
|
|
116 |
var6 = lines[i+6].strip("sBirReversed:").strip() |
|
|
117 |
var7 = lines[i+7].strip() |
|
|
118 |
var8 = lines[i+8].strip("ref:").strip() |
|
|
119 |
var9 = lines[i+9].strip("bir:").strip() |
|
|
120 |
var10 = lines[i+10].strip("TEM:").strip() |
|
|
121 |
var11 = "Empty" |
|
|
122 |
var12 = "Empty" |
|
|
123 |
var13 = lines[i+11].strip("readStart: ").strip() |
|
|
124 |
var14 = lines[i+12].strip("ref:").strip() |
|
|
125 |
var15 = lines[i+13].strip("bir:").strip() |
|
|
126 |
var16 = lines[i+14].strip() |
|
|
127 |
var17 = lines[i+15].strip() |
|
|
128 |
var18 = lines[i+16].strip() |
|
|
129 |
|
|
|
130 |
event_dict = { |
|
|
131 |
"chr":chr, |
|
|
132 |
"iBirStart":var2, |
|
|
133 |
"Consensus/cluster_number":var3, |
|
|
134 |
"iDepth":var4, |
|
|
135 |
"sBir":var5, |
|
|
136 |
"sBirReversed":var6, |
|
|
137 |
"ref":var8, |
|
|
138 |
"bir":var9, |
|
|
139 |
"TEM":var10, |
|
|
140 |
"Microhomology_Insertion":var11, |
|
|
141 |
"Microhomology_template":var12, |
|
|
142 |
"readStart":var13, |
|
|
143 |
"var16":var16, |
|
|
144 |
"var17":var17, |
|
|
145 |
"var18":var18 |
|
|
146 |
} |
|
|
147 |
|
|
|
148 |
#make sure that the ref and bir are not empty, otherwise the movingWindow will fail |
|
|
149 |
if (len(var8) == 0) or (len(var9) == 0): |
|
|
150 |
print(f"ref or bir are empty for {path}{chr}/{f_name}") |
|
|
151 |
print("Cheeck if they run correctly in the original output file...") |
|
|
152 |
print("Skipping this event...") |
|
|
153 |
|
|
|
154 |
#print the same messages to the error log |
|
|
155 |
logging.basicConfig(filename='error.log', level=logging.DEBUG) |
|
|
156 |
logging.debug(f"ref or bir are empty for {path}{chr}/{f_name}") |
|
|
157 |
logging.debug("Cheeck if they run correctly in the original output file...") |
|
|
158 |
logging.debug("Skipping this event...") |
|
|
159 |
continue |
|
|
160 |
|
|
|
161 |
event_df = pd.DataFrame.from_records([event_dict]) |
|
|
162 |
df = pd.concat([df, event_df], ignore_index=True) |
|
|
163 |
|
|
|
164 |
print("Starting complexity") |
|
|
165 |
|
|
|
166 |
try: |
|
|
167 |
df[['ref_complexity_fail', 'ref_complexity_score', 'ref_complexity_tree']] = df.apply(lambda row: movingWindow(row.ref, complexity_threshold=0.2), axis=1, result_type ='expand') |
|
|
168 |
except: |
|
|
169 |
print(f"Error with ref_complexity for {path}{chr}/{f_name}") |
|
|
170 |
|
|
|
171 |
try: |
|
|
172 |
df[['bir_complexity_fail', 'bir_complexity_score', 'bir_complexity_tree']] = df.apply(lambda row: movingWindow(row.bir, complexity_threshold=0.2), axis=1, result_type ='expand') |
|
|
173 |
except: |
|
|
174 |
print(f"Error with bir_complexity for {path}{chr}/{f_name}") |
|
|
175 |
|
|
|
176 |
print("Starting homology check") |
|
|
177 |
df['homology_check_ref'] = df.apply(lambda row: verifyImperfectHomology(row.ref, row.sBir), axis=1) |
|
|
178 |
df['homology_check_bir'] = df.apply(lambda row: verifyImperfectHomology(row.bir, row.sBir), axis=1) |
|
|
179 |
print("Starting gene/exon annotation") |
|
|
180 |
df["genes"] = df.apply(lambda row: add_gene(row), axis=1) |
|
|
181 |
df["exones"] = df.apply(lambda row: add_exon(row), axis=1) |
|
|
182 |
|
|
|
183 |
df.to_csv(output_f_name, sep="\t",index=False) |
|
|
184 |
print(f"finished annotation... DF saved to {output_f_name}") |