[deb8e5]: / BioAid / complexity.py

Download this file

196 lines (153 with data), 7.5 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
# This code was developed and authored by Jerzy Twarowski in Malkova Lab at the University of Iowa
# Contact: jerzymateusz-twarowski@uiowa.edu, tvarovski1@gmail.com
## Contents:
## wordsInSequence
## findComplexity
## movingWindow
## createLogFile
## createSequenceList
## createDataFrameFromLogFile
import pandas as pd
import ast
from pandas import DataFrame
from .base import unique
def wordsInSequence(sequence: str, treeLevel: int) -> list:
"""
Extracts all possible words of a specified length from a genetic sequence.
Args:
sequence (str): The genetic sequence to extract words from.
treeLevel (int): The length of the words to extract.
Returns:
list: A list of words, where each word is a string of length treeLevel.
Examples:
>>> wordsInSequence("ATCGATCG", 3)
['ATC', 'TCG', 'CGA', 'GAT', 'ATC', 'TCG']
>>> wordsInSequence("ATCGATCG", 4)
['ATCG', 'TCGA', 'CGAT', 'GATC', 'ATCG']
"""
length = len(sequence)
max_possible_words_in_sequence = length-(treeLevel-1)
wordList = []
for i in range(max_possible_words_in_sequence):
wordList.append(sequence[0+i:treeLevel+i])
return wordList
def findComplexity(sequence, treeLevel, complexity_threshold):
# takes a genetic sequence and calculates the linguistic complexity
# for that sequence for word length treeLevel
wordList = wordsInSequence(sequence,treeLevel)
wordList = unique(wordList)
if len(sequence) > 4**treeLevel:
complexity = len(wordList)/(4**treeLevel)
else:
complexity = len(wordList)/(len(sequence)-(treeLevel-1))
if (complexity < complexity_threshold) & (complexity > 0.2):
print("Complexity at tree level " + str(treeLevel) + " is " + str(complexity) + " for sequence: "+str(sequence))
return ([complexity, treeLevel, sequence])
def movingWindow(sequence: str, treeLevel: int = 8, chunkSize: int = 20, complexity_threshold: float = 0.2, showGraph: bool = False) -> list:
"""
Calculates the linguistic complexity scores for windows of a specified size and word lengths from 1 to a specified level.
Args:
sequence (str): The genetic sequence to calculate complexity scores for.
treeLevel (int, optional): The maximum length of words to use in the complexity calculation. Defaults to 8.
chunkSize (int, optional): The size of the windows to use in the complexity calculation. Defaults to 20.
complexity_threshold (float, optional): The threshold below which a window is considered to have low complexity. Defaults to 0.2.
showGraph (bool, optional): Whether to show a graph of the complexity scores. Defaults to False.
Returns:
list: A list containing the lowest complexity score window in the format [Boolean, complexity, treeLevel], where Boolean denotes if the complexity score is lower (True) than complexity_threshold.
Raises:
ValueError: If the sequence is too short to calculate complexity scores.
Examples:
>>> movingWindow("ATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG", treeLevel=4, chunkSize=10, complexity_threshold=0.1, showGraph=True)
[True, 0.0, 4]
>>> movingWindow("ATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG", treeLevel=10, chunkSize=5, complexity_threshold=0.5, showGraph=True)
[False, 0.5, 10]
"""
complexityList=[]
for i in range(1, treeLevel+1):
for chunk in range(len(sequence)-(chunkSize-1)):
chunkSeq = sequence[chunk:(chunk+chunkSize)]
complexityList.append(findComplexity(chunkSeq, i, complexity_threshold))
try:
lowest=min(complexityList, key=lambda x: x[0])
except ValueError:
print(f"ERROR with sequence: {sequence}. It is probably too short len={len(sequence)}. Skipping it.")
print("complexityList: ", complexityList)
lowest = [0,0,0]
#print(complexityList)
lowLvl=lowest[1]
if showGraph:
df = DataFrame(complexityList,columns=['complexity','tree','sequence'])
df = df[['complexity','tree']]
ax = df[df.tree == lowLvl].filter(items=["complexity"]).plot(fontsize=15, grid=True, figsize=(20,8))
ax.axhline(complexity_threshold, color="red", linestyle="--")
ax.axhline(1, color="green", linestyle="--")
ax.set_ylim(ymin=0)
ax.set_title(str(lowest[2]))
if lowest[0] < complexity_threshold:
return([True, lowest[0], lowest[1]])
#return([True, lowest])
else:
return([False, lowest[0], lowest[1]])
#return([False, lowest])
def createLogFile(seqlist: list, filename: str, complexity_threshold: float = 0.2, chunkSize: int = 20) -> None:
"""
Creates a log file containing the lowest linguistic complexity score for each sequence in a list.
Args:
seqlist (list): A list of genetic sequences to calculate complexity scores for.
filename (str): The name of the output file to write the log to.
complexity_threshold (float, optional): The threshold below which a sequence is considered to have low complexity. Defaults to 0.2.
chunkSize (int, optional): The size of the windows to use in the complexity calculation. Defaults to 20.
Returns:
None
"""
outputf = open(filename, "w")
truemmbir = 0
for i in seqlist:
seqScore = movingWindow(i, complexity_threshold = complexity_threshold, chunkSize = chunkSize)
if seqScore[0] == False:
truemmbir += 1
outputf.write(str(seqScore[1])+"\n")
print("sequences above threshold:", truemmbir)
outputf.close()
def createSequenceList(filename: str) -> list:
"""
Extracts genetic sequences from a file and returns them as a list.
Args:
filename (str): The name of the file to extract sequences from.
Returns:
list: A list of genetic sequences, where each sequence is a string.
"""
seqfile = open(filename, "r")
lines = seqfile.readlines()
seqlist=[]
for line in lines:
if "bir:" in line:
if len(line) > 100:
seqlist.append(line.strip("\n").strip("bir:-"))
seqlist = list(dict.fromkeys(seqlist))
print(len(seqlist))
longseq=0
for i in seqlist:
if len(i) > 300:
longseq+=1
print(longseq)
return seqlist
def createDataFrameFromLogFile(logfile_path: str) -> DataFrame:
"""
Creates a pandas DataFrame from a log file containing linguistic complexity scores.
Args:
logfile (str): The name of the log file to create the DataFrame from.
Returns:
DataFrame: A pandas DataFrame containing the complexity scores, where each row represents a sequence and contains the complexity score, the word length, and the sequence itself.
"""
logfile = open(logfile_path, "r")
lines = logfile.readlines()
loglist=[]
for line in lines:
if "[" in line:
loglist.append(line.strip("\n"))
for i in range(len(loglist)):
loglist[i] = ast.literal_eval(loglist[i])
print("total sequences:", len(loglist))
df = DataFrame(loglist,columns=['complexity', 'level', 'sequence'])
return df