bio-aid / Git / [deb8e5] /BioAid/repeatSearch.py

Models:
MarcoTheBlack/
bio-aid
Downloads: 1
[deb8e5]: / BioAid / repeatSearch.py
History
Download this file
202 lines (163 with data), 9.7 kB

# This code was developed and authored by Jerzy Twarowski in Malkova Lab at the University of Iowa 
# Contact: jerzymateusz-twarowski@uiowa.edu, tvarovski1@gmail.com

## Contents:
## validateDNASquence
## imperfectHomologySearch
## findInvertedRepeat
## searchSequenceForRepeats
## saveToJSON

#from .base import validateSquence
from .base import rev_compl


def validateDNASquence(sequence: str) -> bool:
    '''
    Check if the given DNA sequence contains only valid nucleotides.

    Args:
        sequence (str): the DNA sequence to validate.

    Returns:
        bool: True if the sequence contains only valid nucleotides, False otherwise.
    '''

    bases = "ATGCatgc-"
    for i in sequence:
        if i not in bases:
            print("warning, sequence doesn't contain cannonical nucleotides")
            return(False)
    return(True)

def imperfectHomologySearch(sequence: str, query: str, min_homology: float = 0.8, fixed_errors: bool = False, inverted: bool = True, silent: bool = False) -> list:
    '''
    Search for a query sequence in a given DNA sequence, allowing for a certain number of mismatches/imperfect homology.

    Args:
        sequence (str): the DNA sequence to search in.
        query (str): the query sequence to search for.
        min_homology (float, optional): the minimum homology (similarity) between the query and the found sequence, as a fraction between 0 and 1. Defaults to 0.8.
        fixed_errors (bool, optional): if True, use a fixed number of errors instead of calculating it from min_homology. Defaults to False.
        inverted (bool, optional): if True, search for the reverse complement of the query sequence as well. Defaults to True.
        silent (bool, optional): if True, don't print any messages. Defaults to False.

    Returns:
        list: a list of query-match pairs, where each pair is a list containing the query sequence and a list of matching sequences.
    '''
    import regex as re
    errors = 0
    if min_homology:
        errors = round(len(query)*(1-min_homology))
    if fixed_errors:
        errors = fixed_errors
    #print(f"searching with {errors} errors...")

    output_list = re.findall( '(' + query + '){e<=' + str(errors) + '}', sequence)

    if inverted == True:
        query=rev_compl(query)
    
    query_match_pairs=[query, output_list]
    if len(output_list) > 0:
        if len(output_list) > 1:
            if not silent:
                print(f"This is unusual... Found more than one match ({len(output_list)}) for the query...")
        
        if not silent:
            print(f'Found possible template(s) for {query}: {output_list}')
        return(query_match_pairs)
    else:
        return([])

def findInvertedRepeat(sequence: str, query_length: int = 4, min_spacer: int = 4, imperfect_homology: bool = False,
                        min_homology: float = 0.8, fixed_errors: bool = False, inverted: bool = True) -> list:
    '''
    Search for inverted repeats in a given DNA sequence.

    Args:
        sequence (str): the DNA sequence to search in.
        query_length (int, optional): the length of the query sequence to search for. Defaults to 4.
        min_spacer (int, optional): the minimum number of non-matching nucleotides between the two halves of the inverted repeat. Defaults to 4.
        imperfect_homology (bool, optional): if True, allow for a certain number of errors in the inverted repeat. Defaults to False.
        min_homology (float, optional): the minimum homology (similarity) between the query and the found sequence, as a fraction between 0 and 1. Only used if imperfect_homology is True. Defaults to 0.8.
        fixed_errors (bool, optional): if True, use a fixed number of errors instead of calculating it from min_homology. Only used if imperfect_homology is True. Defaults to False.
        inverted (bool, optional): if True, search for the reverse complement of the query sequence as well. Defaults to True.

    Returns:
        list: a list of query-match pairs, where each pair is a list containing the query sequence and a list of matching sequences.
    '''
    query_string=sequence[:query_length]

    if inverted == True:
        query = rev_compl(query_string)
    elif inverted == False:
        query = query_string
        
    sequence=sequence[query_length+min_spacer:]
    query_complement = rev_compl(query)
    output_pair_list=[]

    for i in range(len(sequence)-query_length+1):
        #print(sequence[i:query_length+i])
        if imperfect_homology:
            
            
            output_pair=imperfectHomologySearch(sequence[i:query_length+i],
                                                query,
                                                min_homology=min_homology,
                                                fixed_errors=fixed_errors,
                                                inverted=inverted)

            if output_pair != []:
                output_pair_list.append(output_pair)

        else:
            if sequence[i:query_length+i] == query:

                if inverted == True:
                    print(f"Success {query_complement}, {sequence[i:query_length+i]}")
                    output_pair_list.append([query_complement, [sequence[i:query_length+i]]])

                if inverted == False:
                    print(f"Success {query}, {sequence[i:query_length+i]}")
                    output_pair_list.append([query, [sequence[i:query_length+i]]])

    return(output_pair_list)
        
def searchSequenceForRepeats(sequence: str, min_query_length: int = 4, max_query_length: int = 25,
                            min_spacer: int = 0, window_size: int = 250, imperfect_homology:bool = False,
                            min_homology:float = 0.8, fixed_errors:bool = False, inverted: bool = True) -> dict:
    '''
    Search a DNA sequence for inverted repeats of various lengths.

    Args:
        sequence (str): the DNA sequence to search in.
        min_query_length (int, optional): the minimum length of the query sequence to search for. Defaults to 4.
        max_query_length (int, optional): the maximum length of the query sequence to search for. Defaults to 25.
        min_spacer (int, optional): the minimum number of non-matching nucleotides between the two halves of the inverted repeat. Defaults to 0.
        window_size (int, optional): the size of the sliding window to use when searching for inverted repeats. Defaults to 250.
        imperfect_homology (bool, optional): if True, allow for a certain number of errors in the inverted repeat. Defaults to False.
        min_homology (float, optional): the minimum homology (similarity) between the query and the found sequence, as a fraction between 0 and 1. Only used if imperfect_homology is True. Defaults to 0.8.
        fixed_errors (bool, optional): if True, use a fixed number of errors instead of calculating it from min_homology. Only used if imperfect_homology is True. Defaults to False.
        inverted (bool, optional): if True, search for the reverse complement of the query sequence as well. Defaults to True.

    Returns:
        dict: a dictionary where the keys are the query sequences and the values are lists of matching sequences.
    '''
    if imperfect_homology:
        print(f"Search has been set to find quasi-pallindromes")
        if fixed_errors:
            print(f"        Allowing up to {fixed_errors} errors/mismatches...")
        if not fixed_errors:
            print(f"        Searching with a minimum of {min_homology} homology")
    if not imperfect_homology:
        print(f"Search has been set to find perfect pallindromes")

    
    list_of_all_pairs=[]
    for query_length in range(min_query_length, max_query_length+1):
        print(f"###Searching for inverted-repeating {query_length}bp-long fragments...")
        sequence_size=len(sequence)
        
        for i in range(sequence_size): #-window_size+1
            seq=sequence[i:i+window_size]
            output_pair_list = findInvertedRepeat(  seq,
                                                    query_length=query_length,
                                                    min_spacer=min_spacer,
                                                    imperfect_homology=imperfect_homology,
                                                    min_homology=min_homology,
                                                    fixed_errors=fixed_errors,
                                                    inverted=inverted)
            for pair in output_pair_list:
                list_of_all_pairs.append(pair)

    results_dictionary={}
    print("Consolodating Results...")
    for pair in list_of_all_pairs:
        query=pair[0]
        if query in results_dictionary:
            if pair[1][0] not in results_dictionary[query]:
                results_dictionary[query].append(pair[1][0])
        else:
            results_dictionary[query] = [pair[1][0]]
    print("Results were consolidated...")
    return(results_dictionary)

def saveToJSON(results_dictionary: dict, output_file_name: str = 'json_output.json') -> None:
    '''
    Save a dictionary to a JSON file.

    Args:
        results_dictionary (dict): the dictionary to save.
        output_file_name (str, optional): the name of the output file. Defaults to 'json_output.json'.

    Returns:
        None
    '''
    import json
    with open(output_file_name, 'w') as outfile:
            json.dump(results_dictionary, outfile)