--- a +++ b/BioAid/Kmers.py @@ -0,0 +1,107 @@ +# This code was developed and authored by Jerzy Twarowski in Malkova Lab at the University of Iowa +# Contact: jerzymateusz-twarowski@uiowa.edu, tvarovski1@gmail.com + +## Contents: +## findpairings +## findFrequencies +## plot +## runOligoFreqAnalysis + +import pandas as pd +from .base import extractSeqFromFastaToList +from .base import validateSquence + +def findpairings(sequence: str, pairing_length: int) -> list: + ''' + Find all oligonucleotides of a given length in a DNA sequence. + + Args: + sequence (str): the DNA sequence to search in. + pairing_length (int): the length of the oligonucleotides to find. + + Returns: + list: a list of all oligonucleotides of length 'pairing_length' found in 'sequence'. + ''' + + pairing_list = [] + for i in range(len(sequence)): + next_chunk = sequence.upper()[i:i+pairing_length] + if len(next_chunk) == pairing_length: + pairing_list.append(next_chunk) + return(pairing_list) + +def findFrequencies(alist: list) -> dict: + ''' + Count the frequency of each item in a list. + + Args: + alist (list): the list to count the frequency of. + + Returns: + dict: a dictionary where the keys are the items in the list and the values are their frequency. + ''' + + sequence_dict = {} + + for i in alist: + if i in sequence_dict: + sequence_dict[i]+=1 + else: + sequence_dict[i] = 1 + return(sequence_dict) + +def plot(oligo_dict: dict, title: str) -> None: + ''' + Plot a histogram of the frequency of each item in a dictionary. + + Args: + oligo_dict (dict): the dictionary to plot the frequency of. + title (str): the title of the plot. + + Returns: + None + ''' + freq_list = list(oligo_dict.items()) + freq_list.sort(key=lambda x:x[1], reverse=True) + df = pd.DataFrame(freq_list, columns=['oligo', 'frequency']) + total_oligos = df.frequency.sum() + df['frequency'] = df.frequency / total_oligos + df.plot(kind='bar', x='oligo', figsize=(15,8), title=title+" Oligonucleotide Frequency", xlabel='Oligonucleotide', ylabel='Frequency (%)') + +def runOligoFreqAnalysis(file_path: str) -> None: + ''' + Run oligonucleotide frequency analysis on all sequences in a FASTA file. + + Args: + file_path (str): the path to the FASTA file to analyze. + + Returns: + None + ''' + import matplotlib.pyplot as plt + fa_seq_list = extractSeqFromFastaToList(file_path) + + for fa_seq in fa_seq_list: + sequence = fa_seq[1] + title = fa_seq[0] + + if not validateSquence(sequence): + print(f'skipping "{title}", sequence validation failed') + continue + + pairing_list_1 = findpairings(sequence,2) + seqdeck_1 = findFrequencies(pairing_list_1) + + pairing_list_2 = findpairings(sequence,1) + seqdeck_2 = findFrequencies(pairing_list_2) + + plot(seqdeck_1, title) + plot(seqdeck_2, title) + plt.show() + + +if __name__ == "__main__": + import sys + file_path = sys.argv[1] + runOligoFreqAnalysis(file_path) +