a b/BioAid/repeatSearch.py
1
# This code was developed and authored by Jerzy Twarowski in Malkova Lab at the University of Iowa 
2
# Contact: jerzymateusz-twarowski@uiowa.edu, tvarovski1@gmail.com
3
4
## Contents:
5
## validateDNASquence
6
## imperfectHomologySearch
7
## findInvertedRepeat
8
## searchSequenceForRepeats
9
## saveToJSON
10
11
#from .base import validateSquence
12
from .base import rev_compl
13
14
15
def validateDNASquence(sequence: str) -> bool:
16
    '''
17
    Check if the given DNA sequence contains only valid nucleotides.
18
19
    Args:
20
        sequence (str): the DNA sequence to validate.
21
22
    Returns:
23
        bool: True if the sequence contains only valid nucleotides, False otherwise.
24
    '''
25
26
    bases = "ATGCatgc-"
27
    for i in sequence:
28
        if i not in bases:
29
            print("warning, sequence doesn't contain cannonical nucleotides")
30
            return(False)
31
    return(True)
32
33
def imperfectHomologySearch(sequence: str, query: str, min_homology: float = 0.8, fixed_errors: bool = False, inverted: bool = True, silent: bool = False) -> list:
34
    '''
35
    Search for a query sequence in a given DNA sequence, allowing for a certain number of mismatches/imperfect homology.
36
37
    Args:
38
        sequence (str): the DNA sequence to search in.
39
        query (str): the query sequence to search for.
40
        min_homology (float, optional): the minimum homology (similarity) between the query and the found sequence, as a fraction between 0 and 1. Defaults to 0.8.
41
        fixed_errors (bool, optional): if True, use a fixed number of errors instead of calculating it from min_homology. Defaults to False.
42
        inverted (bool, optional): if True, search for the reverse complement of the query sequence as well. Defaults to True.
43
        silent (bool, optional): if True, don't print any messages. Defaults to False.
44
45
    Returns:
46
        list: a list of query-match pairs, where each pair is a list containing the query sequence and a list of matching sequences.
47
    '''
48
    import regex as re
49
    errors = 0
50
    if min_homology:
51
        errors = round(len(query)*(1-min_homology))
52
    if fixed_errors:
53
        errors = fixed_errors
54
    #print(f"searching with {errors} errors...")
55
56
    output_list = re.findall( '(' + query + '){e<=' + str(errors) + '}', sequence)
57
58
    if inverted == True:
59
        query=rev_compl(query)
60
    
61
    query_match_pairs=[query, output_list]
62
    if len(output_list) > 0:
63
        if len(output_list) > 1:
64
            if not silent:
65
                print(f"This is unusual... Found more than one match ({len(output_list)}) for the query...")
66
        
67
        if not silent:
68
            print(f'Found possible template(s) for {query}: {output_list}')
69
        return(query_match_pairs)
70
    else:
71
        return([])
72
73
def findInvertedRepeat(sequence: str, query_length: int = 4, min_spacer: int = 4, imperfect_homology: bool = False,
74
                        min_homology: float = 0.8, fixed_errors: bool = False, inverted: bool = True) -> list:
75
    '''
76
    Search for inverted repeats in a given DNA sequence.
77
78
    Args:
79
        sequence (str): the DNA sequence to search in.
80
        query_length (int, optional): the length of the query sequence to search for. Defaults to 4.
81
        min_spacer (int, optional): the minimum number of non-matching nucleotides between the two halves of the inverted repeat. Defaults to 4.
82
        imperfect_homology (bool, optional): if True, allow for a certain number of errors in the inverted repeat. Defaults to False.
83
        min_homology (float, optional): the minimum homology (similarity) between the query and the found sequence, as a fraction between 0 and 1. Only used if imperfect_homology is True. Defaults to 0.8.
84
        fixed_errors (bool, optional): if True, use a fixed number of errors instead of calculating it from min_homology. Only used if imperfect_homology is True. Defaults to False.
85
        inverted (bool, optional): if True, search for the reverse complement of the query sequence as well. Defaults to True.
86
87
    Returns:
88
        list: a list of query-match pairs, where each pair is a list containing the query sequence and a list of matching sequences.
89
    '''
90
    query_string=sequence[:query_length]
91
92
    if inverted == True:
93
        query = rev_compl(query_string)
94
    elif inverted == False:
95
        query = query_string
96
        
97
    sequence=sequence[query_length+min_spacer:]
98
    query_complement = rev_compl(query)
99
    output_pair_list=[]
100
101
    for i in range(len(sequence)-query_length+1):
102
        #print(sequence[i:query_length+i])
103
        if imperfect_homology:
104
            
105
            
106
            output_pair=imperfectHomologySearch(sequence[i:query_length+i],
107
                                                query,
108
                                                min_homology=min_homology,
109
                                                fixed_errors=fixed_errors,
110
                                                inverted=inverted)
111
112
            if output_pair != []:
113
                output_pair_list.append(output_pair)
114
115
        else:
116
            if sequence[i:query_length+i] == query:
117
118
                if inverted == True:
119
                    print(f"Success {query_complement}, {sequence[i:query_length+i]}")
120
                    output_pair_list.append([query_complement, [sequence[i:query_length+i]]])
121
122
                if inverted == False:
123
                    print(f"Success {query}, {sequence[i:query_length+i]}")
124
                    output_pair_list.append([query, [sequence[i:query_length+i]]])
125
126
    return(output_pair_list)
127
        
128
def searchSequenceForRepeats(sequence: str, min_query_length: int = 4, max_query_length: int = 25,
129
                            min_spacer: int = 0, window_size: int = 250, imperfect_homology:bool = False,
130
                            min_homology:float = 0.8, fixed_errors:bool = False, inverted: bool = True) -> dict:
131
    '''
132
    Search a DNA sequence for inverted repeats of various lengths.
133
134
    Args:
135
        sequence (str): the DNA sequence to search in.
136
        min_query_length (int, optional): the minimum length of the query sequence to search for. Defaults to 4.
137
        max_query_length (int, optional): the maximum length of the query sequence to search for. Defaults to 25.
138
        min_spacer (int, optional): the minimum number of non-matching nucleotides between the two halves of the inverted repeat. Defaults to 0.
139
        window_size (int, optional): the size of the sliding window to use when searching for inverted repeats. Defaults to 250.
140
        imperfect_homology (bool, optional): if True, allow for a certain number of errors in the inverted repeat. Defaults to False.
141
        min_homology (float, optional): the minimum homology (similarity) between the query and the found sequence, as a fraction between 0 and 1. Only used if imperfect_homology is True. Defaults to 0.8.
142
        fixed_errors (bool, optional): if True, use a fixed number of errors instead of calculating it from min_homology. Only used if imperfect_homology is True. Defaults to False.
143
        inverted (bool, optional): if True, search for the reverse complement of the query sequence as well. Defaults to True.
144
145
    Returns:
146
        dict: a dictionary where the keys are the query sequences and the values are lists of matching sequences.
147
    '''
148
    if imperfect_homology:
149
        print(f"Search has been set to find quasi-pallindromes")
150
        if fixed_errors:
151
            print(f"        Allowing up to {fixed_errors} errors/mismatches...")
152
        if not fixed_errors:
153
            print(f"        Searching with a minimum of {min_homology} homology")
154
    if not imperfect_homology:
155
        print(f"Search has been set to find perfect pallindromes")
156
157
    
158
    list_of_all_pairs=[]
159
    for query_length in range(min_query_length, max_query_length+1):
160
        print(f"###Searching for inverted-repeating {query_length}bp-long fragments...")
161
        sequence_size=len(sequence)
162
        
163
        for i in range(sequence_size): #-window_size+1
164
            seq=sequence[i:i+window_size]
165
            output_pair_list = findInvertedRepeat(  seq,
166
                                                    query_length=query_length,
167
                                                    min_spacer=min_spacer,
168
                                                    imperfect_homology=imperfect_homology,
169
                                                    min_homology=min_homology,
170
                                                    fixed_errors=fixed_errors,
171
                                                    inverted=inverted)
172
            for pair in output_pair_list:
173
                list_of_all_pairs.append(pair)
174
175
    results_dictionary={}
176
    print("Consolodating Results...")
177
    for pair in list_of_all_pairs:
178
        query=pair[0]
179
        if query in results_dictionary:
180
            if pair[1][0] not in results_dictionary[query]:
181
                results_dictionary[query].append(pair[1][0])
182
        else:
183
            results_dictionary[query] = [pair[1][0]]
184
    print("Results were consolidated...")
185
    return(results_dictionary)
186
187
def saveToJSON(results_dictionary: dict, output_file_name: str = 'json_output.json') -> None:
188
    '''
189
    Save a dictionary to a JSON file.
190
191
    Args:
192
        results_dictionary (dict): the dictionary to save.
193
        output_file_name (str, optional): the name of the output file. Defaults to 'json_output.json'.
194
195
    Returns:
196
        None
197
    '''
198
    import json
199
    with open(output_file_name, 'w') as outfile:
200
            json.dump(results_dictionary, outfile)
201