[deb8e5]: / BioAid / Kmers.py

Download this file

108 lines (82 with data), 3.2 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
# This code was developed and authored by Jerzy Twarowski in Malkova Lab at the University of Iowa
# Contact: jerzymateusz-twarowski@uiowa.edu, tvarovski1@gmail.com
## Contents:
## findpairings
## findFrequencies
## plot
## runOligoFreqAnalysis
import pandas as pd
from .base import extractSeqFromFastaToList
from .base import validateSquence
def findpairings(sequence: str, pairing_length: int) -> list:
'''
Find all oligonucleotides of a given length in a DNA sequence.
Args:
sequence (str): the DNA sequence to search in.
pairing_length (int): the length of the oligonucleotides to find.
Returns:
list: a list of all oligonucleotides of length 'pairing_length' found in 'sequence'.
'''
pairing_list = []
for i in range(len(sequence)):
next_chunk = sequence.upper()[i:i+pairing_length]
if len(next_chunk) == pairing_length:
pairing_list.append(next_chunk)
return(pairing_list)
def findFrequencies(alist: list) -> dict:
'''
Count the frequency of each item in a list.
Args:
alist (list): the list to count the frequency of.
Returns:
dict: a dictionary where the keys are the items in the list and the values are their frequency.
'''
sequence_dict = {}
for i in alist:
if i in sequence_dict:
sequence_dict[i]+=1
else:
sequence_dict[i] = 1
return(sequence_dict)
def plot(oligo_dict: dict, title: str) -> None:
'''
Plot a histogram of the frequency of each item in a dictionary.
Args:
oligo_dict (dict): the dictionary to plot the frequency of.
title (str): the title of the plot.
Returns:
None
'''
freq_list = list(oligo_dict.items())
freq_list.sort(key=lambda x:x[1], reverse=True)
df = pd.DataFrame(freq_list, columns=['oligo', 'frequency'])
total_oligos = df.frequency.sum()
df['frequency'] = df.frequency / total_oligos
df.plot(kind='bar', x='oligo', figsize=(15,8), title=title+" Oligonucleotide Frequency", xlabel='Oligonucleotide', ylabel='Frequency (%)')
def runOligoFreqAnalysis(file_path: str) -> None:
'''
Run oligonucleotide frequency analysis on all sequences in a FASTA file.
Args:
file_path (str): the path to the FASTA file to analyze.
Returns:
None
'''
import matplotlib.pyplot as plt
fa_seq_list = extractSeqFromFastaToList(file_path)
for fa_seq in fa_seq_list:
sequence = fa_seq[1]
title = fa_seq[0]
if not validateSquence(sequence):
print(f'skipping "{title}", sequence validation failed')
continue
pairing_list_1 = findpairings(sequence,2)
seqdeck_1 = findFrequencies(pairing_list_1)
pairing_list_2 = findpairings(sequence,1)
seqdeck_2 = findFrequencies(pairing_list_2)
plot(seqdeck_1, title)
plot(seqdeck_2, title)
plt.show()
if __name__ == "__main__":
import sys
file_path = sys.argv[1]
runOligoFreqAnalysis(file_path)