--- a +++ b/BioAid/complexity.py @@ -0,0 +1,196 @@ +# This code was developed and authored by Jerzy Twarowski in Malkova Lab at the University of Iowa +# Contact: jerzymateusz-twarowski@uiowa.edu, tvarovski1@gmail.com + +## Contents: +## wordsInSequence +## findComplexity +## movingWindow +## createLogFile +## createSequenceList +## createDataFrameFromLogFile + +import pandas as pd +import ast +from pandas import DataFrame +from .base import unique + +def wordsInSequence(sequence: str, treeLevel: int) -> list: + """ + Extracts all possible words of a specified length from a genetic sequence. + + Args: + sequence (str): The genetic sequence to extract words from. + treeLevel (int): The length of the words to extract. + + Returns: + list: A list of words, where each word is a string of length treeLevel. + + Examples: + >>> wordsInSequence("ATCGATCG", 3) + ['ATC', 'TCG', 'CGA', 'GAT', 'ATC', 'TCG'] + + >>> wordsInSequence("ATCGATCG", 4) + ['ATCG', 'TCGA', 'CGAT', 'GATC', 'ATCG'] + """ + length = len(sequence) + max_possible_words_in_sequence = length-(treeLevel-1) + wordList = [] + for i in range(max_possible_words_in_sequence): + wordList.append(sequence[0+i:treeLevel+i]) + return wordList + +def findComplexity(sequence, treeLevel, complexity_threshold): + # takes a genetic sequence and calculates the linguistic complexity + # for that sequence for word length treeLevel + + wordList = wordsInSequence(sequence,treeLevel) + wordList = unique(wordList) + + if len(sequence) > 4**treeLevel: + complexity = len(wordList)/(4**treeLevel) + else: + complexity = len(wordList)/(len(sequence)-(treeLevel-1)) + + if (complexity < complexity_threshold) & (complexity > 0.2): + print("Complexity at tree level " + str(treeLevel) + " is " + str(complexity) + " for sequence: "+str(sequence)) + + return ([complexity, treeLevel, sequence]) + +def movingWindow(sequence: str, treeLevel: int = 8, chunkSize: int = 20, complexity_threshold: float = 0.2, showGraph: bool = False) -> list: + """ + Calculates the linguistic complexity scores for windows of a specified size and word lengths from 1 to a specified level. + + Args: + sequence (str): The genetic sequence to calculate complexity scores for. + treeLevel (int, optional): The maximum length of words to use in the complexity calculation. Defaults to 8. + chunkSize (int, optional): The size of the windows to use in the complexity calculation. Defaults to 20. + complexity_threshold (float, optional): The threshold below which a window is considered to have low complexity. Defaults to 0.2. + showGraph (bool, optional): Whether to show a graph of the complexity scores. Defaults to False. + + Returns: + list: A list containing the lowest complexity score window in the format [Boolean, complexity, treeLevel], where Boolean denotes if the complexity score is lower (True) than complexity_threshold. + + Raises: + ValueError: If the sequence is too short to calculate complexity scores. + + Examples: + >>> movingWindow("ATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG", treeLevel=4, chunkSize=10, complexity_threshold=0.1, showGraph=True) + [True, 0.0, 4] + + >>> movingWindow("ATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG", treeLevel=10, chunkSize=5, complexity_threshold=0.5, showGraph=True) + [False, 0.5, 10] + """ + + complexityList=[] + + for i in range(1, treeLevel+1): + for chunk in range(len(sequence)-(chunkSize-1)): + + chunkSeq = sequence[chunk:(chunk+chunkSize)] + complexityList.append(findComplexity(chunkSeq, i, complexity_threshold)) + + try: + lowest=min(complexityList, key=lambda x: x[0]) + + except ValueError: + print(f"ERROR with sequence: {sequence}. It is probably too short len={len(sequence)}. Skipping it.") + print("complexityList: ", complexityList) + lowest = [0,0,0] + + #print(complexityList) + + lowLvl=lowest[1] + + if showGraph: + + df = DataFrame(complexityList,columns=['complexity','tree','sequence']) + df = df[['complexity','tree']] + ax = df[df.tree == lowLvl].filter(items=["complexity"]).plot(fontsize=15, grid=True, figsize=(20,8)) + ax.axhline(complexity_threshold, color="red", linestyle="--") + ax.axhline(1, color="green", linestyle="--") + ax.set_ylim(ymin=0) + ax.set_title(str(lowest[2])) + + if lowest[0] < complexity_threshold: + return([True, lowest[0], lowest[1]]) + #return([True, lowest]) + else: + return([False, lowest[0], lowest[1]]) + #return([False, lowest]) + +def createLogFile(seqlist: list, filename: str, complexity_threshold: float = 0.2, chunkSize: int = 20) -> None: + """ + Creates a log file containing the lowest linguistic complexity score for each sequence in a list. + + Args: + seqlist (list): A list of genetic sequences to calculate complexity scores for. + filename (str): The name of the output file to write the log to. + complexity_threshold (float, optional): The threshold below which a sequence is considered to have low complexity. Defaults to 0.2. + chunkSize (int, optional): The size of the windows to use in the complexity calculation. Defaults to 20. + + Returns: + None + """ + outputf = open(filename, "w") + truemmbir = 0 + for i in seqlist: + seqScore = movingWindow(i, complexity_threshold = complexity_threshold, chunkSize = chunkSize) + if seqScore[0] == False: + truemmbir += 1 + outputf.write(str(seqScore[1])+"\n") + print("sequences above threshold:", truemmbir) + + outputf.close() + +def createSequenceList(filename: str) -> list: + """ + Extracts genetic sequences from a file and returns them as a list. + + Args: + filename (str): The name of the file to extract sequences from. + + Returns: + list: A list of genetic sequences, where each sequence is a string. + """ + seqfile = open(filename, "r") + lines = seqfile.readlines() + seqlist=[] + for line in lines: + if "bir:" in line: + if len(line) > 100: + seqlist.append(line.strip("\n").strip("bir:-")) + seqlist = list(dict.fromkeys(seqlist)) + print(len(seqlist)) + + longseq=0 + for i in seqlist: + if len(i) > 300: + longseq+=1 + print(longseq) + + return seqlist + +def createDataFrameFromLogFile(logfile_path: str) -> DataFrame: + """ + Creates a pandas DataFrame from a log file containing linguistic complexity scores. + + Args: + logfile (str): The name of the log file to create the DataFrame from. + + Returns: + DataFrame: A pandas DataFrame containing the complexity scores, where each row represents a sequence and contains the complexity score, the word length, and the sequence itself. + """ + logfile = open(logfile_path, "r") + lines = logfile.readlines() + loglist=[] + for line in lines: + if "[" in line: + loglist.append(line.strip("\n")) + + for i in range(len(loglist)): + loglist[i] = ast.literal_eval(loglist[i]) + print("total sequences:", len(loglist)) + + df = DataFrame(loglist,columns=['complexity', 'level', 'sequence']) + + return df \ No newline at end of file