[deb8e5]: / BioAid / base.py

Download this file

295 lines (235 with data), 10.8 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
# This code was developed and authored by Jerzy Twarowski in Malkova Lab at the University of Iowa
# Contact: jerzymateusz-twarowski@uiowa.edu, tvarovski1@gmail.com
## Contents:
## extractSeqFromFastaToList
## validateSquence
## compl
## rev_compl
## unique
## createChrList
## dataFrameImport
import logging
def extractSeqFromFastaToList(fasta_file_path: str) -> list:
'''Extracts sequence information from a FASTA file and returns it as a nested list.
Args:
fasta_file_path (str): The path to the FASTA file.
Returns:
list: A nested list of the form [[title_0, sequence_0], [title_1, sequence_1], ...].
Each element of the list is a list containing the title and sequence of a sequence in the FASTA file.
'''
fasta_file = open(fasta_file_path, 'r')
contents = fasta_file.readlines()
fasta_file.close()
fasta_list=[]
for i in contents:
if '>' in i:
fasta_list.append([i.strip('>').strip(),''])
else:
fasta_list[-1][1] = fasta_list[-1][1]+i.strip()
logging.info(f"Extraction of sequence information from {fasta_file_path} finished.")
return fasta_list
def validateSquence(sequence: str) -> bool:
'''Checks if a DNA sequence contains only valid nucleotides.
Args:
sequence (str): A DNA sequence to be validated.
Returns:
bool: True if the sequence contains only valid nucleotides, False otherwise.
If the sequence contains non-canonical nucleotides, a log message is generated.
If the sequence contains 'N's, a warning log message is generated.
The log messages are written to the default logger.
'''
bases = "ATGCatgcN"
for i in sequence:
if i not in bases:
logging.info(f"Sequence doesn't contain cannonical nucleotides: {i}")
return(False)
if i == "N":
logging.warning(f"Warning, sequence contains 'N's")
return(True)
def compl(base: str) -> str:
"""
Returns the complementary base for a given DNA base.
Args:
base (str): A single character representing a DNA base. Must be one of 'A', 'T', 'G', 'C', or '-'.
Returns:
str: The complementary base for the given DNA base. Returns '-' if the input is '-'.
Raises:
ValueError: If the input is not one of 'A', 'T', 'G', 'C', or '-'.
"""
if base == "A":
return('T')
elif base == "T":
return('A')
elif base == "G":
return('C')
elif base == "C":
return('G')
elif base == "-":
return('-')
else:
logging.error(f"Invalid base: {base}")
raise ValueError(f"Invalid base: {base}")
def rev_compl(seq: str) -> str:
"""
Returns the reverse complement of a given DNA sequence.
Args:
seq (str): A string representing a DNA sequence. Must only contain characters 'A', 'T', 'G', 'C', or '-'.
Returns:
str: The reverse complement of the given DNA sequence.
Raises:
ValueError: If the input sequence contains characters other than 'A', 'T', 'G', 'C', or '-' (Through compl())
"""
new_seq = ""
for base in seq:
new_base = compl(base)
new_seq = new_base + new_seq
return(new_seq)
def unique(list1: list) -> list:
"""
Returns a new list containing only the unique elements of the input list.
Args:
list1 (list): A list of elements.
Returns:
list: A new list containing only the unique elements of the input list.
Examples:
>>> unique([1, 2, 3, 2, 1])
[1, 2, 3]
>>> unique(['a', 'b', 'c', 'b', 'a'])
['a', 'b', 'c']
"""
list_set = set(list1)
unique_list = (list(list_set))
return unique_list
def createChrList(chrNum: int) -> list:
"""
Creates a list of chromosome names.
Args:
chrNum (int): The number of chromosomes to include in the list.
Returns:
list: A list of chromosome names, where each name is a string of the form "chrX", where X is the chromosome number.
Examples:
>>> createChrList(3)
['chr1', 'chr2', 'chr3']
>>> createChrList(5)
['chr1', 'chr2', 'chr3', 'chr4', 'chr5']
"""
chrList = []
for i in range(chrNum):
chrList.append(f"chr{i+1}")
return(chrList)
def dataFrameImport(directory: str) -> tuple:
"""
Imports all TSV files in a directory as Pandas dataframes.
Args:
directory (str): The path to the directory containing the TSV files.
Returns:
tuple: A tuple containing two elements:
- A list of Pandas dataframes, where each dataframe corresponds to a TSV file in the directory.
- A list of sample names, where each name is a string representing the name of the corresponding TSV file.
"""
import os
import pandas as pd
frames_list_samples = []
sample_names = []
counter=1
for filename in os.listdir(directory):
if filename.endswith(".tsv"):
sample_names.append(filename[:10].strip())
path = os.path.join(directory, filename)
globals()[f"sample{counter}"] = pd.read_csv(path, sep='\t')
print(f'dataframe {sample_names[-1]} has {len(globals()[f"sample{counter}"])} total rows')
frames_list_samples.append(globals()[f"sample{counter}"])
counter+=1
logging.info(f"found {len(frames_list_samples)} samples in {directory}")
logging.info(len(sample_names), sample_names)
return(frames_list_samples, sample_names)
def pullGenomicContext(list_of_positions: list, fasta_file_path: str, context_flank: int = 5) -> list:
"""
Extracts genomic context of a specified length around each position in a list of positions.
Args:
list_of_positions (list): A list of positions, where each position is a list of two elements: the chromosome name and the position.
fasta_file_path (str): The path to the FASTA file containing the reference genome.
context_flank (int, optional): The length of the context to extract on either side of each position. Defaults to 5.
Returns:
list: A list of genomic contexts, where each context is a list of four elements:
- The sequence of bases to the left of the position.
- The query base at the position.
- The sequence of bases to the right of the position.
- The chromosome name.
"""
fasta_list = extractSeqFromFastaToList(fasta_file_path)
context_list = []
for i in fasta_list:
logging.debug(f"extracting context from {i[0]}. It has {len(i[1])} bases.")
sequence = i[1]
for j in list_of_positions:
#check if it is the same chromosome
if j[0] != i[0]:
continue
#set j to be the position
j = j[1]
#find the position in the fasta list
#remember that the fasta sequences start with 1, not 0, so we need to subtract 1 from the position
try:
query_base = sequence[j-1]
except:
logging.warning(f"position {j} not found in {i[0]}. Skipping...")
continue
#extract the context
context_left = sequence[j-1-context_flank:j-1]
context_right = sequence[j:j+context_flank]
context_list.append([context_left, query_base, context_right, i[0]])
return(context_list)
def drawGenomicContext(context_list: list, show: bool = False, **kwargs: dict) -> None:
'''
Draws a sequence logo of a list of genomic contexts.
Args:
context_list (list): A list of genomic contexts, where each context is a list of four elements:
- The sequence of bases to the left of the position.
- The query base at the position.
- The sequence of bases to the right of the position.
- The chromosome name.
show (bool, optional): Whether to show the plot. Defaults to False.
**kwargs (dict): Additional keyword arguments to pass to the function. Supported arguments include:
- save_path (str): The path to save the plot to.
Returns:
None
Raises:
ImportError: If the required packages matplotlib.pyplot and pandas are not installed.
'''
import matplotlib.pyplot as plt
import pandas as pd
#create a dataframe from the context list
df = pd.DataFrame(context_list, columns=['context_left', '0', 'context_right', 'chromosome'])
#split the context into columns by base
split_left = df['context_left'].apply(lambda x: pd.Series(list(x)))
split_right = df['context_right'].apply(lambda x: pd.Series(list(x)))
#rename the columns such that they are numbered. e.g. -3,-2,-1, 0, 1, 2, 3
split_left.columns = [f"-{i}" for i in range(len(split_left.columns), 0, -1)]
split_right.columns = [f"{i+1}" for i in range(len(split_right.columns))]
#make a new dataframe with the context split into columns, query still in the middle
df_context = pd.concat([split_left, df['0'], split_right], axis=1)
df_context_freq = df_context.apply(pd.value_counts).fillna(0) #get the frequencies of each base in each column
df_context_freq = df_context_freq.div(df_context_freq.sum(axis=0), axis=1) #normalize the frequencies
df_context_freq = df_context_freq.transpose()
df_context_freq = df_context_freq*100 #set the frequencies to %
#set color scheme for the bases so that they are consistent
colors = {
'A':'tab:green',
'T':'tab:red',
'G':'tab:orange',
'C':'tab:blue',
'N':'tab:gray',
}
#plot the dataframe as a stacked barplot
column_count = len(df_context_freq.columns) #count the number of columns
df_context_freq.plot.bar(stacked=True, figsize=(column_count+3,4), color = list(colors.values()))
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.ylabel('Frequency (%)')
plt.xlabel('Relative Position')
if 'save_path' in kwargs:
plt.savefig(kwargs['save_path'], bbox_inches='tight', dpi=300)
logging.info(f"saved plot to {kwargs['save_path']}")
if show == True:
plt.show()
plt.close()