|
a |
|
b/BioAid/Kmers.py |
|
|
1 |
# This code was developed and authored by Jerzy Twarowski in Malkova Lab at the University of Iowa |
|
|
2 |
# Contact: jerzymateusz-twarowski@uiowa.edu, tvarovski1@gmail.com |
|
|
3 |
|
|
|
4 |
## Contents: |
|
|
5 |
## findpairings |
|
|
6 |
## findFrequencies |
|
|
7 |
## plot |
|
|
8 |
## runOligoFreqAnalysis |
|
|
9 |
|
|
|
10 |
import pandas as pd |
|
|
11 |
from .base import extractSeqFromFastaToList |
|
|
12 |
from .base import validateSquence |
|
|
13 |
|
|
|
14 |
def findpairings(sequence: str, pairing_length: int) -> list: |
|
|
15 |
''' |
|
|
16 |
Find all oligonucleotides of a given length in a DNA sequence. |
|
|
17 |
|
|
|
18 |
Args: |
|
|
19 |
sequence (str): the DNA sequence to search in. |
|
|
20 |
pairing_length (int): the length of the oligonucleotides to find. |
|
|
21 |
|
|
|
22 |
Returns: |
|
|
23 |
list: a list of all oligonucleotides of length 'pairing_length' found in 'sequence'. |
|
|
24 |
''' |
|
|
25 |
|
|
|
26 |
pairing_list = [] |
|
|
27 |
for i in range(len(sequence)): |
|
|
28 |
next_chunk = sequence.upper()[i:i+pairing_length] |
|
|
29 |
if len(next_chunk) == pairing_length: |
|
|
30 |
pairing_list.append(next_chunk) |
|
|
31 |
return(pairing_list) |
|
|
32 |
|
|
|
33 |
def findFrequencies(alist: list) -> dict: |
|
|
34 |
''' |
|
|
35 |
Count the frequency of each item in a list. |
|
|
36 |
|
|
|
37 |
Args: |
|
|
38 |
alist (list): the list to count the frequency of. |
|
|
39 |
|
|
|
40 |
Returns: |
|
|
41 |
dict: a dictionary where the keys are the items in the list and the values are their frequency. |
|
|
42 |
''' |
|
|
43 |
|
|
|
44 |
sequence_dict = {} |
|
|
45 |
|
|
|
46 |
for i in alist: |
|
|
47 |
if i in sequence_dict: |
|
|
48 |
sequence_dict[i]+=1 |
|
|
49 |
else: |
|
|
50 |
sequence_dict[i] = 1 |
|
|
51 |
return(sequence_dict) |
|
|
52 |
|
|
|
53 |
def plot(oligo_dict: dict, title: str) -> None: |
|
|
54 |
''' |
|
|
55 |
Plot a histogram of the frequency of each item in a dictionary. |
|
|
56 |
|
|
|
57 |
Args: |
|
|
58 |
oligo_dict (dict): the dictionary to plot the frequency of. |
|
|
59 |
title (str): the title of the plot. |
|
|
60 |
|
|
|
61 |
Returns: |
|
|
62 |
None |
|
|
63 |
''' |
|
|
64 |
freq_list = list(oligo_dict.items()) |
|
|
65 |
freq_list.sort(key=lambda x:x[1], reverse=True) |
|
|
66 |
df = pd.DataFrame(freq_list, columns=['oligo', 'frequency']) |
|
|
67 |
total_oligos = df.frequency.sum() |
|
|
68 |
df['frequency'] = df.frequency / total_oligos |
|
|
69 |
df.plot(kind='bar', x='oligo', figsize=(15,8), title=title+" Oligonucleotide Frequency", xlabel='Oligonucleotide', ylabel='Frequency (%)') |
|
|
70 |
|
|
|
71 |
def runOligoFreqAnalysis(file_path: str) -> None: |
|
|
72 |
''' |
|
|
73 |
Run oligonucleotide frequency analysis on all sequences in a FASTA file. |
|
|
74 |
|
|
|
75 |
Args: |
|
|
76 |
file_path (str): the path to the FASTA file to analyze. |
|
|
77 |
|
|
|
78 |
Returns: |
|
|
79 |
None |
|
|
80 |
''' |
|
|
81 |
import matplotlib.pyplot as plt |
|
|
82 |
fa_seq_list = extractSeqFromFastaToList(file_path) |
|
|
83 |
|
|
|
84 |
for fa_seq in fa_seq_list: |
|
|
85 |
sequence = fa_seq[1] |
|
|
86 |
title = fa_seq[0] |
|
|
87 |
|
|
|
88 |
if not validateSquence(sequence): |
|
|
89 |
print(f'skipping "{title}", sequence validation failed') |
|
|
90 |
continue |
|
|
91 |
|
|
|
92 |
pairing_list_1 = findpairings(sequence,2) |
|
|
93 |
seqdeck_1 = findFrequencies(pairing_list_1) |
|
|
94 |
|
|
|
95 |
pairing_list_2 = findpairings(sequence,1) |
|
|
96 |
seqdeck_2 = findFrequencies(pairing_list_2) |
|
|
97 |
|
|
|
98 |
plot(seqdeck_1, title) |
|
|
99 |
plot(seqdeck_2, title) |
|
|
100 |
plt.show() |
|
|
101 |
|
|
|
102 |
|
|
|
103 |
if __name__ == "__main__": |
|
|
104 |
import sys |
|
|
105 |
file_path = sys.argv[1] |
|
|
106 |
runOligoFreqAnalysis(file_path) |
|
|
107 |
|