Diff of /BioAid/complexity.py [000000] .. [deb8e5]

Switch to side-by-side view

--- a
+++ b/BioAid/complexity.py
@@ -0,0 +1,196 @@
+# This code was developed and authored by Jerzy Twarowski in Malkova Lab at the University of Iowa 
+# Contact: jerzymateusz-twarowski@uiowa.edu, tvarovski1@gmail.com
+
+## Contents:
+## wordsInSequence
+## findComplexity
+## movingWindow
+## createLogFile
+## createSequenceList
+## createDataFrameFromLogFile
+
+import pandas as pd
+import ast
+from pandas import DataFrame
+from .base import unique
+
+def wordsInSequence(sequence: str, treeLevel: int) -> list:
+    """
+    Extracts all possible words of a specified length from a genetic sequence.
+
+    Args:
+        sequence (str): The genetic sequence to extract words from.
+        treeLevel (int): The length of the words to extract.
+
+    Returns:
+        list: A list of words, where each word is a string of length treeLevel.
+
+    Examples:
+        >>> wordsInSequence("ATCGATCG", 3)
+        ['ATC', 'TCG', 'CGA', 'GAT', 'ATC', 'TCG']
+
+        >>> wordsInSequence("ATCGATCG", 4)
+        ['ATCG', 'TCGA', 'CGAT', 'GATC', 'ATCG']
+    """
+    length = len(sequence)
+    max_possible_words_in_sequence = length-(treeLevel-1)
+    wordList = []
+    for i in range(max_possible_words_in_sequence):
+        wordList.append(sequence[0+i:treeLevel+i])
+    return wordList
+
+def findComplexity(sequence, treeLevel, complexity_threshold):
+    # takes a genetic sequence and calculates the linguistic complexity 
+    # for that sequence for word length treeLevel
+
+    wordList = wordsInSequence(sequence,treeLevel)
+    wordList = unique(wordList)
+
+    if len(sequence) > 4**treeLevel:
+        complexity = len(wordList)/(4**treeLevel)
+    else:
+        complexity = len(wordList)/(len(sequence)-(treeLevel-1))
+
+    if (complexity < complexity_threshold) & (complexity > 0.2):
+        print("Complexity at tree level " + str(treeLevel) + " is " + str(complexity) + " for sequence: "+str(sequence))
+
+    return ([complexity, treeLevel, sequence])
+
+def movingWindow(sequence: str, treeLevel: int = 8, chunkSize: int = 20, complexity_threshold: float = 0.2, showGraph: bool = False) -> list:
+    """
+    Calculates the linguistic complexity scores for windows of a specified size and word lengths from 1 to a specified level.
+
+    Args:
+        sequence (str): The genetic sequence to calculate complexity scores for.
+        treeLevel (int, optional): The maximum length of words to use in the complexity calculation. Defaults to 8.
+        chunkSize (int, optional): The size of the windows to use in the complexity calculation. Defaults to 20.
+        complexity_threshold (float, optional): The threshold below which a window is considered to have low complexity. Defaults to 0.2.
+        showGraph (bool, optional): Whether to show a graph of the complexity scores. Defaults to False.
+
+    Returns:
+        list: A list containing the lowest complexity score window in the format [Boolean, complexity, treeLevel], where Boolean denotes if the complexity score is lower (True) than complexity_threshold.
+
+    Raises:
+        ValueError: If the sequence is too short to calculate complexity scores.
+
+    Examples:
+        >>> movingWindow("ATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG", treeLevel=4, chunkSize=10, complexity_threshold=0.1, showGraph=True)
+        [True, 0.0, 4]
+
+        >>> movingWindow("ATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG", treeLevel=10, chunkSize=5, complexity_threshold=0.5, showGraph=True)
+        [False, 0.5, 10]
+    """
+
+    complexityList=[]
+
+    for i in range(1, treeLevel+1):
+        for chunk in range(len(sequence)-(chunkSize-1)):
+
+            chunkSeq = sequence[chunk:(chunk+chunkSize)]
+            complexityList.append(findComplexity(chunkSeq, i, complexity_threshold))
+
+    try:
+        lowest=min(complexityList, key=lambda x: x[0])
+
+    except ValueError:
+        print(f"ERROR with sequence: {sequence}. It is probably too short len={len(sequence)}. Skipping it.")
+        print("complexityList: ", complexityList)
+        lowest = [0,0,0]
+
+    #print(complexityList)
+
+    lowLvl=lowest[1]
+
+    if showGraph:
+
+        df = DataFrame(complexityList,columns=['complexity','tree','sequence'])
+        df = df[['complexity','tree']]
+        ax = df[df.tree == lowLvl].filter(items=["complexity"]).plot(fontsize=15, grid=True, figsize=(20,8))
+        ax.axhline(complexity_threshold, color="red", linestyle="--")
+        ax.axhline(1, color="green", linestyle="--")
+        ax.set_ylim(ymin=0)
+        ax.set_title(str(lowest[2]))
+
+    if lowest[0] < complexity_threshold:
+        return([True, lowest[0], lowest[1]])
+        #return([True, lowest])
+    else:
+        return([False, lowest[0], lowest[1]])
+        #return([False, lowest])
+
+def createLogFile(seqlist: list, filename: str, complexity_threshold: float = 0.2, chunkSize: int = 20) -> None:
+    """
+    Creates a log file containing the lowest linguistic complexity score for each sequence in a list.
+
+    Args:
+        seqlist (list): A list of genetic sequences to calculate complexity scores for.
+        filename (str): The name of the output file to write the log to.
+        complexity_threshold (float, optional): The threshold below which a sequence is considered to have low complexity. Defaults to 0.2.
+        chunkSize (int, optional): The size of the windows to use in the complexity calculation. Defaults to 20.
+
+    Returns:
+        None
+    """
+    outputf = open(filename, "w")
+    truemmbir = 0
+    for i in seqlist:
+        seqScore = movingWindow(i, complexity_threshold = complexity_threshold, chunkSize = chunkSize)
+        if seqScore[0] == False:
+            truemmbir += 1
+        outputf.write(str(seqScore[1])+"\n")
+    print("sequences above threshold:", truemmbir)
+
+    outputf.close()
+
+def createSequenceList(filename: str) -> list:
+    """
+    Extracts genetic sequences from a file and returns them as a list.
+
+    Args:
+        filename (str): The name of the file to extract sequences from.
+
+    Returns:
+        list: A list of genetic sequences, where each sequence is a string.
+    """
+    seqfile = open(filename, "r")
+    lines = seqfile.readlines()
+    seqlist=[]
+    for line in lines:
+        if "bir:" in line:
+            if len(line) > 100:
+                seqlist.append(line.strip("\n").strip("bir:-"))
+    seqlist = list(dict.fromkeys(seqlist))
+    print(len(seqlist))
+
+    longseq=0
+    for i in seqlist:
+        if len(i) > 300:
+            longseq+=1
+    print(longseq)
+
+    return seqlist
+
+def createDataFrameFromLogFile(logfile_path: str) -> DataFrame:
+    """
+    Creates a pandas DataFrame from a log file containing linguistic complexity scores.
+
+    Args:
+        logfile (str): The name of the log file to create the DataFrame from.
+
+    Returns:
+        DataFrame: A pandas DataFrame containing the complexity scores, where each row represents a sequence and contains the complexity score, the word length, and the sequence itself.
+    """
+    logfile = open(logfile_path, "r")
+    lines = logfile.readlines()
+    loglist=[]
+    for line in lines:
+        if "[" in line:
+                loglist.append(line.strip("\n"))
+
+    for i in range(len(loglist)):
+        loglist[i] = ast.literal_eval(loglist[i]) 
+    print("total sequences:", len(loglist))
+
+    df = DataFrame(loglist,columns=['complexity', 'level', 'sequence'])
+    
+    return df
\ No newline at end of file