Switch to unified view

a b/BioAid/MMBSearchTK/AnnoMMBS.py
1
# This code was developed and authored by Jerzy Twarowski in Malkova Lab at the University of Iowa 
2
# Contact: jerzymateusz-twarowski@uiowa.edu, tvarovski1@gmail.com
3
4
## Contents:
5
## verifyImperfectHomology
6
## add_gene
7
## add_exon
8
## createAnnotatedOutput
9
10
import pandas as pd
11
from pyensembl import EnsemblRelease
12
from pyensembl.exon import Exon
13
import logging
14
import regex as re
15
from ..base import createChrList
16
from ..base import rev_compl
17
from ..complexity import movingWindow
18
19
20
def verifyImperfectHomology(ref, query, min_homology=0.8):
21
22
  mmbir = rev_compl(query)
23
  mmbir_errors = round(len(mmbir)*(1-min_homology))
24
  if mmbir_errors > 10:
25
    mmbir_errors = 10
26
  output_list = re.findall( '(' + mmbir + '){e<=' + str(mmbir_errors) + '}', ref)
27
  print(f'Found {len(output_list)} possible templates for {query}: {output_list}')
28
29
  if len(output_list) > 0:
30
    return(True)
31
  else:
32
    return(False)
33
34
def add_gene(row):
35
36
  data = EnsemblRelease(104)
37
  #print("loaded genome! #addgene")
38
  output = data.gene_names_at_locus(contig=row['chr'].strip("chr"), position=int(row["iBirStart"]))
39
  print(output)
40
  #output = str(output)
41
  out_list = []
42
  for gname in output:
43
    if gname != "":
44
      out_list.append(gname)
45
      #print("appended_gene!")
46
47
  return(out_list) #output
48
49
def add_exon(row):
50
51
  data = EnsemblRelease(104)
52
  #print("loaded genome! #addexon")
53
  output = data.exons_at_locus(contig=row['chr'].strip("chr"), position=int(row["iBirStart"]))
54
  #output = str(output)
55
  out_list = []
56
  for exon in output:
57
    if isinstance(exon, Exon):
58
      out_list.append(exon.exon_id)
59
60
  return(out_list) #output
61
62
def createAnnotatedOutput(f_name, path, output_f_name):
63
  import os
64
65
  chrList=createChrList(25)
66
  df = pd.DataFrame()
67
68
  for chr in chrList:
69
    try:
70
      if os.stat(f"{path}{chr}/{f_name}").st_size == 0:
71
        print(f"WARNING! File {path}{chr}/{f_name} is empty. Skipping...")
72
        continue
73
      with open(f"{path}{chr}/{f_name}") as f:
74
        lines = f.readlines()
75
    except:
76
      print(f"WARNING! Couldn't read {path}{chr}/{f_name}. Make sure it's there.")
77
      lines = None
78
      continue
79
80
    if (type(lines) == None):
81
      print("WARNING! the type of lines is None. Skipping...")
82
83
    else:
84
      print(f"hello {chr}")
85
      for i in range(len(lines)):
86
        if "###" in lines[i]:
87
          i-=1
88
          if "Microhomology Insertion:" in lines[i+11]:
89
            var0 = lines[i].strip()
90
            var1 = lines[i+1].strip()
91
            var2 = lines[i+2].strip("iBirStart:").strip()
92
            var3 = lines[i+3].strip("Consensus/cluster number:").strip()
93
            var4 = lines[i+4].strip("iDepth:").strip()
94
            var5 = lines[i+5].strip("sBir:").strip()
95
            var6 = lines[i+6].strip("sBirReversed:").strip()
96
            var7 = lines[i+7].strip()
97
            var8 = lines[i+8].strip("ref:").strip()
98
            var9 = lines[i+9].strip("bir:").strip()
99
            var10 = lines[i+10].strip("TEM:").strip()
100
            var11 = lines[i+11].strip("Microhomology Insertion:").strip()
101
            var12 = lines[i+12].strip("Microhomology template:").strip()
102
            var13 = lines[i+13].strip("readStart: ").strip()
103
            var14 = lines[i+14].strip("ref:").strip()
104
            var15 = lines[i+15].strip("bir:").strip()
105
            var16 = lines[i+16].strip()
106
            var17 = lines[i+17].strip()
107
            var18 = lines[i+18].strip()
108
            var19 = lines[i+19].strip()
109
          else:
110
            var0 = lines[i].strip()
111
            var1 = lines[i+1].strip()
112
            var2 = lines[i+2].strip("iBirStart:").strip()
113
            var3 = lines[i+3].strip("Consensus/cluster number:").strip()
114
            var4 = lines[i+4].strip("iDepth:").strip()
115
            var5 = lines[i+5].strip("sBir:").strip()
116
            var6 = lines[i+6].strip("sBirReversed:").strip()
117
            var7 = lines[i+7].strip()
118
            var8 = lines[i+8].strip("ref:").strip()
119
            var9 = lines[i+9].strip("bir:").strip()
120
            var10 = lines[i+10].strip("TEM:").strip()
121
            var11 = "Empty"
122
            var12 = "Empty"
123
            var13 = lines[i+11].strip("readStart: ").strip()
124
            var14 = lines[i+12].strip("ref:").strip()
125
            var15 = lines[i+13].strip("bir:").strip()
126
            var16 = lines[i+14].strip()
127
            var17 = lines[i+15].strip()
128
            var18 = lines[i+16].strip()
129
130
          event_dict = {
131
                        "chr":chr,
132
                        "iBirStart":var2,
133
                        "Consensus/cluster_number":var3,
134
                        "iDepth":var4,
135
                        "sBir":var5,
136
                        "sBirReversed":var6,
137
                        "ref":var8,
138
                        "bir":var9,
139
                        "TEM":var10,
140
                        "Microhomology_Insertion":var11,
141
                        "Microhomology_template":var12,
142
                        "readStart":var13,
143
                        "var16":var16,
144
                        "var17":var17,
145
                        "var18":var18
146
                        }
147
          
148
          #make sure that the ref and bir are not empty, otherwise the movingWindow will fail
149
          if (len(var8) == 0) or (len(var9) == 0):
150
            print(f"ref or bir are empty for {path}{chr}/{f_name}")
151
            print("Cheeck if they run correctly in the original output file...")
152
            print("Skipping this event...")
153
154
            #print the same messages to the error log
155
            logging.basicConfig(filename='error.log', level=logging.DEBUG)
156
            logging.debug(f"ref or bir are empty for {path}{chr}/{f_name}")
157
            logging.debug("Cheeck if they run correctly in the original output file...")
158
            logging.debug("Skipping this event...")
159
            continue
160
161
          event_df = pd.DataFrame.from_records([event_dict])
162
          df = pd.concat([df, event_df], ignore_index=True)
163
164
  print("Starting complexity")
165
166
  try:
167
    df[['ref_complexity_fail', 'ref_complexity_score', 'ref_complexity_tree']] = df.apply(lambda row: movingWindow(row.ref, complexity_threshold=0.2), axis=1, result_type ='expand')
168
  except:
169
    print(f"Error with ref_complexity for {path}{chr}/{f_name}")
170
171
  try:
172
    df[['bir_complexity_fail', 'bir_complexity_score', 'bir_complexity_tree']] = df.apply(lambda row: movingWindow(row.bir, complexity_threshold=0.2), axis=1, result_type ='expand')
173
  except:
174
    print(f"Error with bir_complexity for {path}{chr}/{f_name}")
175
176
  print("Starting homology check")
177
  df['homology_check_ref'] = df.apply(lambda row: verifyImperfectHomology(row.ref, row.sBir), axis=1)
178
  df['homology_check_bir'] = df.apply(lambda row: verifyImperfectHomology(row.bir, row.sBir), axis=1)
179
  print("Starting gene/exon annotation")
180
  df["genes"] = df.apply(lambda row: add_gene(row), axis=1)
181
  df["exones"] = df.apply(lambda row: add_exon(row), axis=1)
182
183
  df.to_csv(output_f_name, sep="\t",index=False)
184
  print(f"finished annotation... DF saved to {output_f_name}")