Diff of /BioAid/repeatSearch.py [000000] .. [deb8e5]

Switch to side-by-side view

--- a
+++ b/BioAid/repeatSearch.py
@@ -0,0 +1,201 @@
+# This code was developed and authored by Jerzy Twarowski in Malkova Lab at the University of Iowa 
+# Contact: jerzymateusz-twarowski@uiowa.edu, tvarovski1@gmail.com
+
+## Contents:
+## validateDNASquence
+## imperfectHomologySearch
+## findInvertedRepeat
+## searchSequenceForRepeats
+## saveToJSON
+
+#from .base import validateSquence
+from .base import rev_compl
+
+
+def validateDNASquence(sequence: str) -> bool:
+    '''
+    Check if the given DNA sequence contains only valid nucleotides.
+
+    Args:
+        sequence (str): the DNA sequence to validate.
+
+    Returns:
+        bool: True if the sequence contains only valid nucleotides, False otherwise.
+    '''
+
+    bases = "ATGCatgc-"
+    for i in sequence:
+        if i not in bases:
+            print("warning, sequence doesn't contain cannonical nucleotides")
+            return(False)
+    return(True)
+
+def imperfectHomologySearch(sequence: str, query: str, min_homology: float = 0.8, fixed_errors: bool = False, inverted: bool = True, silent: bool = False) -> list:
+    '''
+    Search for a query sequence in a given DNA sequence, allowing for a certain number of mismatches/imperfect homology.
+
+    Args:
+        sequence (str): the DNA sequence to search in.
+        query (str): the query sequence to search for.
+        min_homology (float, optional): the minimum homology (similarity) between the query and the found sequence, as a fraction between 0 and 1. Defaults to 0.8.
+        fixed_errors (bool, optional): if True, use a fixed number of errors instead of calculating it from min_homology. Defaults to False.
+        inverted (bool, optional): if True, search for the reverse complement of the query sequence as well. Defaults to True.
+        silent (bool, optional): if True, don't print any messages. Defaults to False.
+
+    Returns:
+        list: a list of query-match pairs, where each pair is a list containing the query sequence and a list of matching sequences.
+    '''
+    import regex as re
+    errors = 0
+    if min_homology:
+        errors = round(len(query)*(1-min_homology))
+    if fixed_errors:
+        errors = fixed_errors
+    #print(f"searching with {errors} errors...")
+
+    output_list = re.findall( '(' + query + '){e<=' + str(errors) + '}', sequence)
+
+    if inverted == True:
+        query=rev_compl(query)
+    
+    query_match_pairs=[query, output_list]
+    if len(output_list) > 0:
+        if len(output_list) > 1:
+            if not silent:
+                print(f"This is unusual... Found more than one match ({len(output_list)}) for the query...")
+        
+        if not silent:
+            print(f'Found possible template(s) for {query}: {output_list}')
+        return(query_match_pairs)
+    else:
+        return([])
+
+def findInvertedRepeat(sequence: str, query_length: int = 4, min_spacer: int = 4, imperfect_homology: bool = False,
+                        min_homology: float = 0.8, fixed_errors: bool = False, inverted: bool = True) -> list:
+    '''
+    Search for inverted repeats in a given DNA sequence.
+
+    Args:
+        sequence (str): the DNA sequence to search in.
+        query_length (int, optional): the length of the query sequence to search for. Defaults to 4.
+        min_spacer (int, optional): the minimum number of non-matching nucleotides between the two halves of the inverted repeat. Defaults to 4.
+        imperfect_homology (bool, optional): if True, allow for a certain number of errors in the inverted repeat. Defaults to False.
+        min_homology (float, optional): the minimum homology (similarity) between the query and the found sequence, as a fraction between 0 and 1. Only used if imperfect_homology is True. Defaults to 0.8.
+        fixed_errors (bool, optional): if True, use a fixed number of errors instead of calculating it from min_homology. Only used if imperfect_homology is True. Defaults to False.
+        inverted (bool, optional): if True, search for the reverse complement of the query sequence as well. Defaults to True.
+
+    Returns:
+        list: a list of query-match pairs, where each pair is a list containing the query sequence and a list of matching sequences.
+    '''
+    query_string=sequence[:query_length]
+
+    if inverted == True:
+        query = rev_compl(query_string)
+    elif inverted == False:
+        query = query_string
+        
+    sequence=sequence[query_length+min_spacer:]
+    query_complement = rev_compl(query)
+    output_pair_list=[]
+
+    for i in range(len(sequence)-query_length+1):
+        #print(sequence[i:query_length+i])
+        if imperfect_homology:
+            
+            
+            output_pair=imperfectHomologySearch(sequence[i:query_length+i],
+                                                query,
+                                                min_homology=min_homology,
+                                                fixed_errors=fixed_errors,
+                                                inverted=inverted)
+
+            if output_pair != []:
+                output_pair_list.append(output_pair)
+
+        else:
+            if sequence[i:query_length+i] == query:
+
+                if inverted == True:
+                    print(f"Success {query_complement}, {sequence[i:query_length+i]}")
+                    output_pair_list.append([query_complement, [sequence[i:query_length+i]]])
+
+                if inverted == False:
+                    print(f"Success {query}, {sequence[i:query_length+i]}")
+                    output_pair_list.append([query, [sequence[i:query_length+i]]])
+
+    return(output_pair_list)
+        
+def searchSequenceForRepeats(sequence: str, min_query_length: int = 4, max_query_length: int = 25,
+                            min_spacer: int = 0, window_size: int = 250, imperfect_homology:bool = False,
+                            min_homology:float = 0.8, fixed_errors:bool = False, inverted: bool = True) -> dict:
+    '''
+    Search a DNA sequence for inverted repeats of various lengths.
+
+    Args:
+        sequence (str): the DNA sequence to search in.
+        min_query_length (int, optional): the minimum length of the query sequence to search for. Defaults to 4.
+        max_query_length (int, optional): the maximum length of the query sequence to search for. Defaults to 25.
+        min_spacer (int, optional): the minimum number of non-matching nucleotides between the two halves of the inverted repeat. Defaults to 0.
+        window_size (int, optional): the size of the sliding window to use when searching for inverted repeats. Defaults to 250.
+        imperfect_homology (bool, optional): if True, allow for a certain number of errors in the inverted repeat. Defaults to False.
+        min_homology (float, optional): the minimum homology (similarity) between the query and the found sequence, as a fraction between 0 and 1. Only used if imperfect_homology is True. Defaults to 0.8.
+        fixed_errors (bool, optional): if True, use a fixed number of errors instead of calculating it from min_homology. Only used if imperfect_homology is True. Defaults to False.
+        inverted (bool, optional): if True, search for the reverse complement of the query sequence as well. Defaults to True.
+
+    Returns:
+        dict: a dictionary where the keys are the query sequences and the values are lists of matching sequences.
+    '''
+    if imperfect_homology:
+        print(f"Search has been set to find quasi-pallindromes")
+        if fixed_errors:
+            print(f"        Allowing up to {fixed_errors} errors/mismatches...")
+        if not fixed_errors:
+            print(f"        Searching with a minimum of {min_homology} homology")
+    if not imperfect_homology:
+        print(f"Search has been set to find perfect pallindromes")
+
+    
+    list_of_all_pairs=[]
+    for query_length in range(min_query_length, max_query_length+1):
+        print(f"###Searching for inverted-repeating {query_length}bp-long fragments...")
+        sequence_size=len(sequence)
+        
+        for i in range(sequence_size): #-window_size+1
+            seq=sequence[i:i+window_size]
+            output_pair_list = findInvertedRepeat(  seq,
+                                                    query_length=query_length,
+                                                    min_spacer=min_spacer,
+                                                    imperfect_homology=imperfect_homology,
+                                                    min_homology=min_homology,
+                                                    fixed_errors=fixed_errors,
+                                                    inverted=inverted)
+            for pair in output_pair_list:
+                list_of_all_pairs.append(pair)
+
+    results_dictionary={}
+    print("Consolodating Results...")
+    for pair in list_of_all_pairs:
+        query=pair[0]
+        if query in results_dictionary:
+            if pair[1][0] not in results_dictionary[query]:
+                results_dictionary[query].append(pair[1][0])
+        else:
+            results_dictionary[query] = [pair[1][0]]
+    print("Results were consolidated...")
+    return(results_dictionary)
+
+def saveToJSON(results_dictionary: dict, output_file_name: str = 'json_output.json') -> None:
+    '''
+    Save a dictionary to a JSON file.
+
+    Args:
+        results_dictionary (dict): the dictionary to save.
+        output_file_name (str, optional): the name of the output file. Defaults to 'json_output.json'.
+
+    Returns:
+        None
+    '''
+    import json
+    with open(output_file_name, 'w') as outfile:
+            json.dump(results_dictionary, outfile)
+