bio-aid / Git / [deb8e5] /BioAid/Kmers.py

Models:
MarcoTheBlack/
bio-aid
Downloads: 1
[deb8e5]: / BioAid / Kmers.py
History
Download this file
108 lines (82 with data), 3.2 kB

# This code was developed and authored by Jerzy Twarowski in Malkova Lab at the University of Iowa 
# Contact: jerzymateusz-twarowski@uiowa.edu, tvarovski1@gmail.com

## Contents:
## findpairings
## findFrequencies
## plot
## runOligoFreqAnalysis

import pandas as pd
from .base import extractSeqFromFastaToList
from .base import validateSquence

def findpairings(sequence: str, pairing_length: int) -> list:
    '''
    Find all oligonucleotides of a given length in a DNA sequence.

    Args:
        sequence (str): the DNA sequence to search in.
        pairing_length (int): the length of the oligonucleotides to find.

    Returns:
        list: a list of all oligonucleotides of length 'pairing_length' found in 'sequence'.
    '''

    pairing_list = []
    for i in range(len(sequence)):
        next_chunk = sequence.upper()[i:i+pairing_length]
        if len(next_chunk) == pairing_length:
            pairing_list.append(next_chunk)
    return(pairing_list)

def findFrequencies(alist: list) -> dict:
    '''
    Count the frequency of each item in a list.

    Args:
        alist (list): the list to count the frequency of.

    Returns:
        dict: a dictionary where the keys are the items in the list and the values are their frequency.
    '''

    sequence_dict = {}

    for i in alist:
        if i in sequence_dict:
            sequence_dict[i]+=1
        else:
            sequence_dict[i] = 1
    return(sequence_dict)

def plot(oligo_dict: dict, title: str) -> None:
    '''
    Plot a histogram of the frequency of each item in a dictionary.

    Args:
        oligo_dict (dict): the dictionary to plot the frequency of.
        title (str): the title of the plot.

    Returns:
        None
    '''
    freq_list = list(oligo_dict.items())
    freq_list.sort(key=lambda x:x[1], reverse=True)
    df = pd.DataFrame(freq_list, columns=['oligo', 'frequency'])
    total_oligos = df.frequency.sum()
    df['frequency'] = df.frequency / total_oligos
    df.plot(kind='bar', x='oligo', figsize=(15,8), title=title+" Oligonucleotide Frequency", xlabel='Oligonucleotide', ylabel='Frequency (%)')

def runOligoFreqAnalysis(file_path: str) -> None:
    '''
    Run oligonucleotide frequency analysis on all sequences in a FASTA file.

    Args:
        file_path (str): the path to the FASTA file to analyze.

    Returns:
        None
    '''
    import matplotlib.pyplot as plt
    fa_seq_list = extractSeqFromFastaToList(file_path)

    for fa_seq in fa_seq_list:
        sequence = fa_seq[1]
        title = fa_seq[0]

        if not validateSquence(sequence):
            print(f'skipping "{title}", sequence validation failed')
            continue

        pairing_list_1 = findpairings(sequence,2)
        seqdeck_1 = findFrequencies(pairing_list_1)

        pairing_list_2 = findpairings(sequence,1)
        seqdeck_2 = findFrequencies(pairing_list_2)

        plot(seqdeck_1, title)
        plot(seqdeck_2, title)
        plt.show()


if __name__ == "__main__":
    import sys
    file_path = sys.argv[1]
    runOligoFreqAnalysis(file_path)