a b/BioAid/Kmers.py
1
# This code was developed and authored by Jerzy Twarowski in Malkova Lab at the University of Iowa 
2
# Contact: jerzymateusz-twarowski@uiowa.edu, tvarovski1@gmail.com
3
4
## Contents:
5
## findpairings
6
## findFrequencies
7
## plot
8
## runOligoFreqAnalysis
9
10
import pandas as pd
11
from .base import extractSeqFromFastaToList
12
from .base import validateSquence
13
14
def findpairings(sequence: str, pairing_length: int) -> list:
15
    '''
16
    Find all oligonucleotides of a given length in a DNA sequence.
17
18
    Args:
19
        sequence (str): the DNA sequence to search in.
20
        pairing_length (int): the length of the oligonucleotides to find.
21
22
    Returns:
23
        list: a list of all oligonucleotides of length 'pairing_length' found in 'sequence'.
24
    '''
25
26
    pairing_list = []
27
    for i in range(len(sequence)):
28
        next_chunk = sequence.upper()[i:i+pairing_length]
29
        if len(next_chunk) == pairing_length:
30
            pairing_list.append(next_chunk)
31
    return(pairing_list)
32
33
def findFrequencies(alist: list) -> dict:
34
    '''
35
    Count the frequency of each item in a list.
36
37
    Args:
38
        alist (list): the list to count the frequency of.
39
40
    Returns:
41
        dict: a dictionary where the keys are the items in the list and the values are their frequency.
42
    '''
43
44
    sequence_dict = {}
45
46
    for i in alist:
47
        if i in sequence_dict:
48
            sequence_dict[i]+=1
49
        else:
50
            sequence_dict[i] = 1
51
    return(sequence_dict)
52
53
def plot(oligo_dict: dict, title: str) -> None:
54
    '''
55
    Plot a histogram of the frequency of each item in a dictionary.
56
57
    Args:
58
        oligo_dict (dict): the dictionary to plot the frequency of.
59
        title (str): the title of the plot.
60
61
    Returns:
62
        None
63
    '''
64
    freq_list = list(oligo_dict.items())
65
    freq_list.sort(key=lambda x:x[1], reverse=True)
66
    df = pd.DataFrame(freq_list, columns=['oligo', 'frequency'])
67
    total_oligos = df.frequency.sum()
68
    df['frequency'] = df.frequency / total_oligos
69
    df.plot(kind='bar', x='oligo', figsize=(15,8), title=title+" Oligonucleotide Frequency", xlabel='Oligonucleotide', ylabel='Frequency (%)')
70
71
def runOligoFreqAnalysis(file_path: str) -> None:
72
    '''
73
    Run oligonucleotide frequency analysis on all sequences in a FASTA file.
74
75
    Args:
76
        file_path (str): the path to the FASTA file to analyze.
77
78
    Returns:
79
        None
80
    '''
81
    import matplotlib.pyplot as plt
82
    fa_seq_list = extractSeqFromFastaToList(file_path)
83
84
    for fa_seq in fa_seq_list:
85
        sequence = fa_seq[1]
86
        title = fa_seq[0]
87
88
        if not validateSquence(sequence):
89
            print(f'skipping "{title}", sequence validation failed')
90
            continue
91
92
        pairing_list_1 = findpairings(sequence,2)
93
        seqdeck_1 = findFrequencies(pairing_list_1)
94
95
        pairing_list_2 = findpairings(sequence,1)
96
        seqdeck_2 = findFrequencies(pairing_list_2)
97
98
        plot(seqdeck_1, title)
99
        plot(seqdeck_2, title)
100
        plt.show()
101
102
103
if __name__ == "__main__":
104
    import sys
105
    file_path = sys.argv[1]
106
    runOligoFreqAnalysis(file_path)
107