|
a |
|
b/BioAid/makeMMBSearchRef.py |
|
|
1 |
# This code was developed and authored by Jerzy Twarowski in Malkova Lab at the University of Iowa |
|
|
2 |
# Contact: jerzymateusz-twarowski@uiowa.edu, tvarovski1@gmail.com |
|
|
3 |
|
|
|
4 |
import regex as re |
|
|
5 |
|
|
|
6 |
def _mmb_header(header: str) -> "str | None":
    """Translate a FASTA header line into its MMBSearch form.

    Args:
        header (str): A FASTA header line (starting with ">"), with or
            without its trailing newline.

    Returns:
        The replacement header (">chrNN\\n" / ">chrM\\n"), or None when the
        record is neither a primary-assembly chromosome nor the
        mitochondrial genome and should therefore be skipped.
    """
    if re.search(r"chromosome.*Primary.Assembly$", header) is not None:
        # The chromosome name is taken from the fifth whitespace-separated
        # token of the header, with any trailing comma removed.
        name = header.split()[4].strip(",")
        # Only chromosomes 1 and 2 are zero-padded; everything else
        # (3..22, X, Y) is used as-is.
        if name in ("1", "2"):
            return f">chr0{name}\n"
        return f">chr{name}\n"
    # Fix for the mitochondrial chromosome, whose header has no
    # "chromosome ... Primary Assembly" part.
    if re.search(r"mitochondrion, complete genome$", header) is not None:
        return ">chrM\n"
    return None


def createMMBSearchReference(file_path_in: str, file_path_out: str) -> None:
    """
    Reads a FASTA file containing genome data and writes a new file with a modified header line for each chromosome.
    Its purpose is to create a FASTA file that can be used as a reference for MMBSearch.

    Only records whose header ends in "chromosome ... Primary Assembly" or
    "mitochondrion, complete genome" are kept; every other record (header and
    sequence lines) is dropped. Kept headers are rewritten to ">chr<name>"
    (">chr01"/">chr02" for chromosomes 1 and 2, ">chrM" for the mitochondrion).
    Each original and rewritten header is echoed to stdout.

    Args:
        file_path_in (str): The path to the input FASTA file.
        file_path_out (str): The path to the output file. Output is appended
            if the file already exists.

    Returns:
        None
    """
    # Good for the human genome.
    save = False  # True while the current record is one we keep
    # Iterating the file object stops cleanly at EOF (readline() returns ""
    # there instead of raising), and the context managers close both files.
    with open(file_path_in, "r") as file_in, open(file_path_out, "a") as file_out:
        for line in file_in:
            if line.startswith(">"):
                new_header = _mmb_header(line)
                save = new_header is not None
                if save:
                    print(line)
                    line = new_header
                    print(line)
            if save:
                file_out.write(line)