|
a |
|
b/BioAid/repeatSearch.py |
|
|
1 |
# This code was developed and authored by Jerzy Twarowski in Malkova Lab at the University of Iowa |
|
|
2 |
# Contact: jerzymateusz-twarowski@uiowa.edu, tvarovski1@gmail.com |
|
|
3 |
|
|
|
4 |
## Contents: |
|
|
5 |
## validateDNASquence |
|
|
6 |
## imperfectHomologySearch |
|
|
7 |
## findInvertedRepeat |
|
|
8 |
## searchSequenceForRepeats |
|
|
9 |
## saveToJSON |
|
|
10 |
|
|
|
11 |
#from .base import validateSquence |
|
|
12 |
from .base import rev_compl |
|
|
13 |
|
|
|
14 |
|
|
|
15 |
def validateDNASquence(sequence: str) -> bool: |
|
|
16 |
''' |
|
|
17 |
Check if the given DNA sequence contains only valid nucleotides. |
|
|
18 |
|
|
|
19 |
Args: |
|
|
20 |
sequence (str): the DNA sequence to validate. |
|
|
21 |
|
|
|
22 |
Returns: |
|
|
23 |
bool: True if the sequence contains only valid nucleotides, False otherwise. |
|
|
24 |
''' |
|
|
25 |
|
|
|
26 |
bases = "ATGCatgc-" |
|
|
27 |
for i in sequence: |
|
|
28 |
if i not in bases: |
|
|
29 |
print("warning, sequence doesn't contain cannonical nucleotides") |
|
|
30 |
return(False) |
|
|
31 |
return(True) |
|
|
32 |
|
|
|
33 |
def imperfectHomologySearch(sequence: str, query: str, min_homology: float = 0.8, fixed_errors: bool = False, inverted: bool = True, silent: bool = False) -> list: |
|
|
34 |
''' |
|
|
35 |
Search for a query sequence in a given DNA sequence, allowing for a certain number of mismatches/imperfect homology. |
|
|
36 |
|
|
|
37 |
Args: |
|
|
38 |
sequence (str): the DNA sequence to search in. |
|
|
39 |
query (str): the query sequence to search for. |
|
|
40 |
min_homology (float, optional): the minimum homology (similarity) between the query and the found sequence, as a fraction between 0 and 1. Defaults to 0.8. |
|
|
41 |
fixed_errors (bool, optional): if True, use a fixed number of errors instead of calculating it from min_homology. Defaults to False. |
|
|
42 |
inverted (bool, optional): if True, search for the reverse complement of the query sequence as well. Defaults to True. |
|
|
43 |
silent (bool, optional): if True, don't print any messages. Defaults to False. |
|
|
44 |
|
|
|
45 |
Returns: |
|
|
46 |
list: a list of query-match pairs, where each pair is a list containing the query sequence and a list of matching sequences. |
|
|
47 |
''' |
|
|
48 |
import regex as re |
|
|
49 |
errors = 0 |
|
|
50 |
if min_homology: |
|
|
51 |
errors = round(len(query)*(1-min_homology)) |
|
|
52 |
if fixed_errors: |
|
|
53 |
errors = fixed_errors |
|
|
54 |
#print(f"searching with {errors} errors...") |
|
|
55 |
|
|
|
56 |
output_list = re.findall( '(' + query + '){e<=' + str(errors) + '}', sequence) |
|
|
57 |
|
|
|
58 |
if inverted == True: |
|
|
59 |
query=rev_compl(query) |
|
|
60 |
|
|
|
61 |
query_match_pairs=[query, output_list] |
|
|
62 |
if len(output_list) > 0: |
|
|
63 |
if len(output_list) > 1: |
|
|
64 |
if not silent: |
|
|
65 |
print(f"This is unusual... Found more than one match ({len(output_list)}) for the query...") |
|
|
66 |
|
|
|
67 |
if not silent: |
|
|
68 |
print(f'Found possible template(s) for {query}: {output_list}') |
|
|
69 |
return(query_match_pairs) |
|
|
70 |
else: |
|
|
71 |
return([]) |
|
|
72 |
|
|
|
73 |
def findInvertedRepeat(sequence: str, query_length: int = 4, min_spacer: int = 4, imperfect_homology: bool = False, |
|
|
74 |
min_homology: float = 0.8, fixed_errors: bool = False, inverted: bool = True) -> list: |
|
|
75 |
''' |
|
|
76 |
Search for inverted repeats in a given DNA sequence. |
|
|
77 |
|
|
|
78 |
Args: |
|
|
79 |
sequence (str): the DNA sequence to search in. |
|
|
80 |
query_length (int, optional): the length of the query sequence to search for. Defaults to 4. |
|
|
81 |
min_spacer (int, optional): the minimum number of non-matching nucleotides between the two halves of the inverted repeat. Defaults to 4. |
|
|
82 |
imperfect_homology (bool, optional): if True, allow for a certain number of errors in the inverted repeat. Defaults to False. |
|
|
83 |
min_homology (float, optional): the minimum homology (similarity) between the query and the found sequence, as a fraction between 0 and 1. Only used if imperfect_homology is True. Defaults to 0.8. |
|
|
84 |
fixed_errors (bool, optional): if True, use a fixed number of errors instead of calculating it from min_homology. Only used if imperfect_homology is True. Defaults to False. |
|
|
85 |
inverted (bool, optional): if True, search for the reverse complement of the query sequence as well. Defaults to True. |
|
|
86 |
|
|
|
87 |
Returns: |
|
|
88 |
list: a list of query-match pairs, where each pair is a list containing the query sequence and a list of matching sequences. |
|
|
89 |
''' |
|
|
90 |
query_string=sequence[:query_length] |
|
|
91 |
|
|
|
92 |
if inverted == True: |
|
|
93 |
query = rev_compl(query_string) |
|
|
94 |
elif inverted == False: |
|
|
95 |
query = query_string |
|
|
96 |
|
|
|
97 |
sequence=sequence[query_length+min_spacer:] |
|
|
98 |
query_complement = rev_compl(query) |
|
|
99 |
output_pair_list=[] |
|
|
100 |
|
|
|
101 |
for i in range(len(sequence)-query_length+1): |
|
|
102 |
#print(sequence[i:query_length+i]) |
|
|
103 |
if imperfect_homology: |
|
|
104 |
|
|
|
105 |
|
|
|
106 |
output_pair=imperfectHomologySearch(sequence[i:query_length+i], |
|
|
107 |
query, |
|
|
108 |
min_homology=min_homology, |
|
|
109 |
fixed_errors=fixed_errors, |
|
|
110 |
inverted=inverted) |
|
|
111 |
|
|
|
112 |
if output_pair != []: |
|
|
113 |
output_pair_list.append(output_pair) |
|
|
114 |
|
|
|
115 |
else: |
|
|
116 |
if sequence[i:query_length+i] == query: |
|
|
117 |
|
|
|
118 |
if inverted == True: |
|
|
119 |
print(f"Success {query_complement}, {sequence[i:query_length+i]}") |
|
|
120 |
output_pair_list.append([query_complement, [sequence[i:query_length+i]]]) |
|
|
121 |
|
|
|
122 |
if inverted == False: |
|
|
123 |
print(f"Success {query}, {sequence[i:query_length+i]}") |
|
|
124 |
output_pair_list.append([query, [sequence[i:query_length+i]]]) |
|
|
125 |
|
|
|
126 |
return(output_pair_list) |
|
|
127 |
|
|
|
128 |
def searchSequenceForRepeats(sequence: str, min_query_length: int = 4, max_query_length: int = 25, |
|
|
129 |
min_spacer: int = 0, window_size: int = 250, imperfect_homology:bool = False, |
|
|
130 |
min_homology:float = 0.8, fixed_errors:bool = False, inverted: bool = True) -> dict: |
|
|
131 |
''' |
|
|
132 |
Search a DNA sequence for inverted repeats of various lengths. |
|
|
133 |
|
|
|
134 |
Args: |
|
|
135 |
sequence (str): the DNA sequence to search in. |
|
|
136 |
min_query_length (int, optional): the minimum length of the query sequence to search for. Defaults to 4. |
|
|
137 |
max_query_length (int, optional): the maximum length of the query sequence to search for. Defaults to 25. |
|
|
138 |
min_spacer (int, optional): the minimum number of non-matching nucleotides between the two halves of the inverted repeat. Defaults to 0. |
|
|
139 |
window_size (int, optional): the size of the sliding window to use when searching for inverted repeats. Defaults to 250. |
|
|
140 |
imperfect_homology (bool, optional): if True, allow for a certain number of errors in the inverted repeat. Defaults to False. |
|
|
141 |
min_homology (float, optional): the minimum homology (similarity) between the query and the found sequence, as a fraction between 0 and 1. Only used if imperfect_homology is True. Defaults to 0.8. |
|
|
142 |
fixed_errors (bool, optional): if True, use a fixed number of errors instead of calculating it from min_homology. Only used if imperfect_homology is True. Defaults to False. |
|
|
143 |
inverted (bool, optional): if True, search for the reverse complement of the query sequence as well. Defaults to True. |
|
|
144 |
|
|
|
145 |
Returns: |
|
|
146 |
dict: a dictionary where the keys are the query sequences and the values are lists of matching sequences. |
|
|
147 |
''' |
|
|
148 |
if imperfect_homology: |
|
|
149 |
print(f"Search has been set to find quasi-pallindromes") |
|
|
150 |
if fixed_errors: |
|
|
151 |
print(f" Allowing up to {fixed_errors} errors/mismatches...") |
|
|
152 |
if not fixed_errors: |
|
|
153 |
print(f" Searching with a minimum of {min_homology} homology") |
|
|
154 |
if not imperfect_homology: |
|
|
155 |
print(f"Search has been set to find perfect pallindromes") |
|
|
156 |
|
|
|
157 |
|
|
|
158 |
list_of_all_pairs=[] |
|
|
159 |
for query_length in range(min_query_length, max_query_length+1): |
|
|
160 |
print(f"###Searching for inverted-repeating {query_length}bp-long fragments...") |
|
|
161 |
sequence_size=len(sequence) |
|
|
162 |
|
|
|
163 |
for i in range(sequence_size): #-window_size+1 |
|
|
164 |
seq=sequence[i:i+window_size] |
|
|
165 |
output_pair_list = findInvertedRepeat( seq, |
|
|
166 |
query_length=query_length, |
|
|
167 |
min_spacer=min_spacer, |
|
|
168 |
imperfect_homology=imperfect_homology, |
|
|
169 |
min_homology=min_homology, |
|
|
170 |
fixed_errors=fixed_errors, |
|
|
171 |
inverted=inverted) |
|
|
172 |
for pair in output_pair_list: |
|
|
173 |
list_of_all_pairs.append(pair) |
|
|
174 |
|
|
|
175 |
results_dictionary={} |
|
|
176 |
print("Consolodating Results...") |
|
|
177 |
for pair in list_of_all_pairs: |
|
|
178 |
query=pair[0] |
|
|
179 |
if query in results_dictionary: |
|
|
180 |
if pair[1][0] not in results_dictionary[query]: |
|
|
181 |
results_dictionary[query].append(pair[1][0]) |
|
|
182 |
else: |
|
|
183 |
results_dictionary[query] = [pair[1][0]] |
|
|
184 |
print("Results were consolidated...") |
|
|
185 |
return(results_dictionary) |
|
|
186 |
|
|
|
187 |
def saveToJSON(results_dictionary: dict, output_file_name: str = 'json_output.json') -> None: |
|
|
188 |
''' |
|
|
189 |
Save a dictionary to a JSON file. |
|
|
190 |
|
|
|
191 |
Args: |
|
|
192 |
results_dictionary (dict): the dictionary to save. |
|
|
193 |
output_file_name (str, optional): the name of the output file. Defaults to 'json_output.json'. |
|
|
194 |
|
|
|
195 |
Returns: |
|
|
196 |
None |
|
|
197 |
''' |
|
|
198 |
import json |
|
|
199 |
with open(output_file_name, 'w') as outfile: |
|
|
200 |
json.dump(results_dictionary, outfile) |
|
|
201 |
|