--- a +++ b/BioAid/repeatSearch.py @@ -0,0 +1,201 @@ +# This code was developed and authored by Jerzy Twarowski in Malkova Lab at the University of Iowa +# Contact: jerzymateusz-twarowski@uiowa.edu, tvarovski1@gmail.com + +## Contents: +## validateDNASquence +## imperfectHomologySearch +## findInvertedRepeat +## searchSequenceForRepeats +## saveToJSON + +#from .base import validateSquence +from .base import rev_compl + + +def validateDNASquence(sequence: str) -> bool: + ''' + Check if the given DNA sequence contains only valid nucleotides. + + Args: + sequence (str): the DNA sequence to validate. + + Returns: + bool: True if the sequence contains only valid nucleotides, False otherwise. + ''' + + bases = "ATGCatgc-" + for i in sequence: + if i not in bases: + print("warning, sequence doesn't contain cannonical nucleotides") + return(False) + return(True) + +def imperfectHomologySearch(sequence: str, query: str, min_homology: float = 0.8, fixed_errors: bool = False, inverted: bool = True, silent: bool = False) -> list: + ''' + Search for a query sequence in a given DNA sequence, allowing for a certain number of mismatches/imperfect homology. + + Args: + sequence (str): the DNA sequence to search in. + query (str): the query sequence to search for. + min_homology (float, optional): the minimum homology (similarity) between the query and the found sequence, as a fraction between 0 and 1. Defaults to 0.8. + fixed_errors (bool, optional): if True, use a fixed number of errors instead of calculating it from min_homology. Defaults to False. + inverted (bool, optional): if True, search for the reverse complement of the query sequence as well. Defaults to True. + silent (bool, optional): if True, don't print any messages. Defaults to False. + + Returns: + list: a list of query-match pairs, where each pair is a list containing the query sequence and a list of matching sequences. + ''' + import regex as re + errors = 0 + if min_homology: + errors = round(len(query)*(1-min_homology)) + if fixed_errors: + errors = fixed_errors + #print(f"searching with {errors} errors...") + + output_list = re.findall( '(' + query + '){e<=' + str(errors) + '}', sequence) + + if inverted == True: + query=rev_compl(query) + + query_match_pairs=[query, output_list] + if len(output_list) > 0: + if len(output_list) > 1: + if not silent: + print(f"This is unusual... Found more than one match ({len(output_list)}) for the query...") + + if not silent: + print(f'Found possible template(s) for {query}: {output_list}') + return(query_match_pairs) + else: + return([]) + +def findInvertedRepeat(sequence: str, query_length: int = 4, min_spacer: int = 4, imperfect_homology: bool = False, + min_homology: float = 0.8, fixed_errors: bool = False, inverted: bool = True) -> list: + ''' + Search for inverted repeats in a given DNA sequence. + + Args: + sequence (str): the DNA sequence to search in. + query_length (int, optional): the length of the query sequence to search for. Defaults to 4. + min_spacer (int, optional): the minimum number of non-matching nucleotides between the two halves of the inverted repeat. Defaults to 4. + imperfect_homology (bool, optional): if True, allow for a certain number of errors in the inverted repeat. Defaults to False. + min_homology (float, optional): the minimum homology (similarity) between the query and the found sequence, as a fraction between 0 and 1. Only used if imperfect_homology is True. Defaults to 0.8. + fixed_errors (bool, optional): if True, use a fixed number of errors instead of calculating it from min_homology. Only used if imperfect_homology is True. Defaults to False. + inverted (bool, optional): if True, search for the reverse complement of the query sequence as well. Defaults to True. + + Returns: + list: a list of query-match pairs, where each pair is a list containing the query sequence and a list of matching sequences. + ''' + query_string=sequence[:query_length] + + if inverted == True: + query = rev_compl(query_string) + elif inverted == False: + query = query_string + + sequence=sequence[query_length+min_spacer:] + query_complement = rev_compl(query) + output_pair_list=[] + + for i in range(len(sequence)-query_length+1): + #print(sequence[i:query_length+i]) + if imperfect_homology: + + + output_pair=imperfectHomologySearch(sequence[i:query_length+i], + query, + min_homology=min_homology, + fixed_errors=fixed_errors, + inverted=inverted) + + if output_pair != []: + output_pair_list.append(output_pair) + + else: + if sequence[i:query_length+i] == query: + + if inverted == True: + print(f"Success {query_complement}, {sequence[i:query_length+i]}") + output_pair_list.append([query_complement, [sequence[i:query_length+i]]]) + + if inverted == False: + print(f"Success {query}, {sequence[i:query_length+i]}") + output_pair_list.append([query, [sequence[i:query_length+i]]]) + + return(output_pair_list) + +def searchSequenceForRepeats(sequence: str, min_query_length: int = 4, max_query_length: int = 25, + min_spacer: int = 0, window_size: int = 250, imperfect_homology:bool = False, + min_homology:float = 0.8, fixed_errors:bool = False, inverted: bool = True) -> dict: + ''' + Search a DNA sequence for inverted repeats of various lengths. + + Args: + sequence (str): the DNA sequence to search in. + min_query_length (int, optional): the minimum length of the query sequence to search for. Defaults to 4. + max_query_length (int, optional): the maximum length of the query sequence to search for. Defaults to 25. + min_spacer (int, optional): the minimum number of non-matching nucleotides between the two halves of the inverted repeat. Defaults to 0. + window_size (int, optional): the size of the sliding window to use when searching for inverted repeats. Defaults to 250. + imperfect_homology (bool, optional): if True, allow for a certain number of errors in the inverted repeat. Defaults to False. + min_homology (float, optional): the minimum homology (similarity) between the query and the found sequence, as a fraction between 0 and 1. Only used if imperfect_homology is True. Defaults to 0.8. + fixed_errors (bool, optional): if True, use a fixed number of errors instead of calculating it from min_homology. Only used if imperfect_homology is True. Defaults to False. + inverted (bool, optional): if True, search for the reverse complement of the query sequence as well. Defaults to True. + + Returns: + dict: a dictionary where the keys are the query sequences and the values are lists of matching sequences. + ''' + if imperfect_homology: + print(f"Search has been set to find quasi-pallindromes") + if fixed_errors: + print(f" Allowing up to {fixed_errors} errors/mismatches...") + if not fixed_errors: + print(f" Searching with a minimum of {min_homology} homology") + if not imperfect_homology: + print(f"Search has been set to find perfect pallindromes") + + + list_of_all_pairs=[] + for query_length in range(min_query_length, max_query_length+1): + print(f"###Searching for inverted-repeating {query_length}bp-long fragments...") + sequence_size=len(sequence) + + for i in range(sequence_size): #-window_size+1 + seq=sequence[i:i+window_size] + output_pair_list = findInvertedRepeat( seq, + query_length=query_length, + min_spacer=min_spacer, + imperfect_homology=imperfect_homology, + min_homology=min_homology, + fixed_errors=fixed_errors, + inverted=inverted) + for pair in output_pair_list: + list_of_all_pairs.append(pair) + + results_dictionary={} + print("Consolodating Results...") + for pair in list_of_all_pairs: + query=pair[0] + if query in results_dictionary: + if pair[1][0] not in results_dictionary[query]: + results_dictionary[query].append(pair[1][0]) + else: + results_dictionary[query] = [pair[1][0]] + print("Results were consolidated...") + return(results_dictionary) + +def saveToJSON(results_dictionary: dict, output_file_name: str = 'json_output.json') -> None: + ''' + Save a dictionary to a JSON file. + + Args: + results_dictionary (dict): the dictionary to save. + output_file_name (str, optional): the name of the output file. Defaults to 'json_output.json'. + + Returns: + None + ''' + import json + with open(output_file_name, 'w') as outfile: + json.dump(results_dictionary, outfile) +