bio-aid / Git / Diff of /BioAid/Kmers.py

Models:
MarcoTheBlack/
bio-aid
Downloads: 1
Diff of /BioAid/Kmers.py [000000] .. [deb8e5]
Switch to side-by-side view

--- a
+++ b/BioAid/Kmers.py
@@ -0,0 +1,107 @@
+# This code was developed and authored by Jerzy Twarowski in Malkova Lab at the University of Iowa 
+# Contact: jerzymateusz-twarowski@uiowa.edu, tvarovski1@gmail.com
+
+## Contents:
+## findpairings
+## findFrequencies
+## plot
+## runOligoFreqAnalysis
+
+import pandas as pd
+from .base import extractSeqFromFastaToList
+from .base import validateSquence
+
+def findpairings(sequence: str, pairing_length: int) -> list:
+    '''
+    Find all oligonucleotides of a given length in a DNA sequence.
+
+    Args:
+        sequence (str): the DNA sequence to search in.
+        pairing_length (int): the length of the oligonucleotides to find.
+
+    Returns:
+        list: a list of all oligonucleotides of length 'pairing_length' found in 'sequence'.
+    '''
+
+    pairing_list = []
+    for i in range(len(sequence)):
+        next_chunk = sequence.upper()[i:i+pairing_length]
+        if len(next_chunk) == pairing_length:
+            pairing_list.append(next_chunk)
+    return(pairing_list)
+
+def findFrequencies(alist: list) -> dict:
+    '''
+    Count the frequency of each item in a list.
+
+    Args:
+        alist (list): the list to count the frequency of.
+
+    Returns:
+        dict: a dictionary where the keys are the items in the list and the values are their frequency.
+    '''
+
+    sequence_dict = {}
+
+    for i in alist:
+        if i in sequence_dict:
+            sequence_dict[i]+=1
+        else:
+            sequence_dict[i] = 1
+    return(sequence_dict)
+
+def plot(oligo_dict: dict, title: str) -> None:
+    '''
+    Plot a histogram of the frequency of each item in a dictionary.
+
+    Args:
+        oligo_dict (dict): the dictionary to plot the frequency of.
+        title (str): the title of the plot.
+
+    Returns:
+        None
+    '''
+    freq_list = list(oligo_dict.items())
+    freq_list.sort(key=lambda x:x[1], reverse=True)
+    df = pd.DataFrame(freq_list, columns=['oligo', 'frequency'])
+    total_oligos = df.frequency.sum()
+    df['frequency'] = df.frequency / total_oligos
+    df.plot(kind='bar', x='oligo', figsize=(15,8), title=title+" Oligonucleotide Frequency", xlabel='Oligonucleotide', ylabel='Frequency (%)')
+
+def runOligoFreqAnalysis(file_path: str) -> None:
+    '''
+    Run oligonucleotide frequency analysis on all sequences in a FASTA file.
+
+    Args:
+        file_path (str): the path to the FASTA file to analyze.
+
+    Returns:
+        None
+    '''
+    import matplotlib.pyplot as plt
+    fa_seq_list = extractSeqFromFastaToList(file_path)
+
+    for fa_seq in fa_seq_list:
+        sequence = fa_seq[1]
+        title = fa_seq[0]
+
+        if not validateSquence(sequence):
+            print(f'skipping "{title}", sequence validation failed')
+            continue
+
+        pairing_list_1 = findpairings(sequence,2)
+        seqdeck_1 = findFrequencies(pairing_list_1)
+
+        pairing_list_2 = findpairings(sequence,1)
+        seqdeck_2 = findFrequencies(pairing_list_2)
+
+        plot(seqdeck_1, title)
+        plot(seqdeck_2, title)
+        plt.show()
+
+
+if __name__ == "__main__":
+    import sys
+    file_path = sys.argv[1]
+    runOligoFreqAnalysis(file_path)
+