bio-aid / Git / [deb8e5] /BioAid/complexity.py

Models:
MarcoTheBlack/
bio-aid
Downloads: 1
[deb8e5]: / BioAid / complexity.py
History
Download this file
196 lines (153 with data), 7.5 kB

# This code was developed and authored by Jerzy Twarowski in Malkova Lab at the University of Iowa 
# Contact: jerzymateusz-twarowski@uiowa.edu, tvarovski1@gmail.com

## Contents:
## wordsInSequence
## findComplexity
## movingWindow
## createLogFile
## createSequenceList
## createDataFrameFromLogFile

import pandas as pd
import ast
from pandas import DataFrame
from .base import unique

def wordsInSequence(sequence: str, treeLevel: int) -> list:
    """
    Extracts all possible words of a specified length from a genetic sequence.

    Args:
        sequence (str): The genetic sequence to extract words from.
        treeLevel (int): The length of the words to extract.

    Returns:
        list: A list of words, where each word is a string of length treeLevel.

    Examples:
        >>> wordsInSequence("ATCGATCG", 3)
        ['ATC', 'TCG', 'CGA', 'GAT', 'ATC', 'TCG']

        >>> wordsInSequence("ATCGATCG", 4)
        ['ATCG', 'TCGA', 'CGAT', 'GATC', 'ATCG']
    """
    length = len(sequence)
    max_possible_words_in_sequence = length-(treeLevel-1)
    wordList = []
    for i in range(max_possible_words_in_sequence):
        wordList.append(sequence[0+i:treeLevel+i])
    return wordList

def findComplexity(sequence, treeLevel, complexity_threshold):
    # takes a genetic sequence and calculates the linguistic complexity 
    # for that sequence for word length treeLevel

    wordList = wordsInSequence(sequence,treeLevel)
    wordList = unique(wordList)

    if len(sequence) > 4**treeLevel:
        complexity = len(wordList)/(4**treeLevel)
    else:
        complexity = len(wordList)/(len(sequence)-(treeLevel-1))

    if (complexity < complexity_threshold) & (complexity > 0.2):
        print("Complexity at tree level " + str(treeLevel) + " is " + str(complexity) + " for sequence: "+str(sequence))

    return ([complexity, treeLevel, sequence])

def movingWindow(sequence: str, treeLevel: int = 8, chunkSize: int = 20, complexity_threshold: float = 0.2, showGraph: bool = False) -> list:
    """
    Calculates the linguistic complexity scores for windows of a specified size and word lengths from 1 to a specified level.

    Args:
        sequence (str): The genetic sequence to calculate complexity scores for.
        treeLevel (int, optional): The maximum length of words to use in the complexity calculation. Defaults to 8.
        chunkSize (int, optional): The size of the windows to use in the complexity calculation. Defaults to 20.
        complexity_threshold (float, optional): The threshold below which a window is considered to have low complexity. Defaults to 0.2.
        showGraph (bool, optional): Whether to show a graph of the complexity scores. Defaults to False.

    Returns:
        list: A list containing the lowest complexity score window in the format [Boolean, complexity, treeLevel], where Boolean denotes if the complexity score is lower (True) than complexity_threshold.

    Raises:
        ValueError: If the sequence is too short to calculate complexity scores.

    Examples:
        >>> movingWindow("ATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG", treeLevel=4, chunkSize=10, complexity_threshold=0.1, showGraph=True)
        [True, 0.0, 4]

        >>> movingWindow("ATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG", treeLevel=10, chunkSize=5, complexity_threshold=0.5, showGraph=True)
        [False, 0.5, 10]
    """

    complexityList=[]

    for i in range(1, treeLevel+1):
        for chunk in range(len(sequence)-(chunkSize-1)):

            chunkSeq = sequence[chunk:(chunk+chunkSize)]
            complexityList.append(findComplexity(chunkSeq, i, complexity_threshold))

    try:
        lowest=min(complexityList, key=lambda x: x[0])

    except ValueError:
        print(f"ERROR with sequence: {sequence}. It is probably too short len={len(sequence)}. Skipping it.")
        print("complexityList: ", complexityList)
        lowest = [0,0,0]

    #print(complexityList)

    lowLvl=lowest[1]

    if showGraph:

        df = DataFrame(complexityList,columns=['complexity','tree','sequence'])
        df = df[['complexity','tree']]
        ax = df[df.tree == lowLvl].filter(items=["complexity"]).plot(fontsize=15, grid=True, figsize=(20,8))
        ax.axhline(complexity_threshold, color="red", linestyle="--")
        ax.axhline(1, color="green", linestyle="--")
        ax.set_ylim(ymin=0)
        ax.set_title(str(lowest[2]))

    if lowest[0] < complexity_threshold:
        return([True, lowest[0], lowest[1]])
        #return([True, lowest])
    else:
        return([False, lowest[0], lowest[1]])
        #return([False, lowest])

def createLogFile(seqlist: list, filename: str, complexity_threshold: float = 0.2, chunkSize: int = 20) -> None:
    """
    Creates a log file containing the lowest linguistic complexity score for each sequence in a list.

    Args:
        seqlist (list): A list of genetic sequences to calculate complexity scores for.
        filename (str): The name of the output file to write the log to.
        complexity_threshold (float, optional): The threshold below which a sequence is considered to have low complexity. Defaults to 0.2.
        chunkSize (int, optional): The size of the windows to use in the complexity calculation. Defaults to 20.

    Returns:
        None
    """
    outputf = open(filename, "w")
    truemmbir = 0
    for i in seqlist:
        seqScore = movingWindow(i, complexity_threshold = complexity_threshold, chunkSize = chunkSize)
        if seqScore[0] == False:
            truemmbir += 1
        outputf.write(str(seqScore[1])+"\n")
    print("sequences above threshold:", truemmbir)

    outputf.close()

def createSequenceList(filename: str) -> list:
    """
    Extracts genetic sequences from a file and returns them as a list.

    Args:
        filename (str): The name of the file to extract sequences from.

    Returns:
        list: A list of genetic sequences, where each sequence is a string.
    """
    seqfile = open(filename, "r")
    lines = seqfile.readlines()
    seqlist=[]
    for line in lines:
        if "bir:" in line:
            if len(line) > 100:
                seqlist.append(line.strip("\n").strip("bir:-"))
    seqlist = list(dict.fromkeys(seqlist))
    print(len(seqlist))

    longseq=0
    for i in seqlist:
        if len(i) > 300:
            longseq+=1
    print(longseq)

    return seqlist

def createDataFrameFromLogFile(logfile_path: str) -> DataFrame:
    """
    Creates a pandas DataFrame from a log file containing linguistic complexity scores.

    Args:
        logfile (str): The name of the log file to create the DataFrame from.

    Returns:
        DataFrame: A pandas DataFrame containing the complexity scores, where each row represents a sequence and contains the complexity score, the word length, and the sequence itself.
    """
    logfile = open(logfile_path, "r")
    lines = logfile.readlines()
    loglist=[]
    for line in lines:
        if "[" in line:
                loglist.append(line.strip("\n"))

    for i in range(len(loglist)):
        loglist[i] = ast.literal_eval(loglist[i]) 
    print("total sequences:", len(loglist))

    df = DataFrame(loglist,columns=['complexity', 'level', 'sequence'])
    
    return df