a b/BioAid/makeMMBSearchRef.py
1
# This code was developed and authored by Jerzy Twarowski in Malkova Lab at the University of Iowa 
2
# Contact: jerzymateusz-twarowski@uiowa.edu, tvarovski1@gmail.com
3
4
import regex as re
5
6
def createMMBSearchReference(file_path_in: str, file_path_out: str) -> None:
    """
    Create an MMBSearch reference FASTA from a genome FASTA file.

    The input is read line by line and only selected records are copied to
    the output, with each kept header replaced by a short name:

    * headers matching ``chromosome.*Primary.Assembly$`` become ``>chr<N>``,
      where ``<N>`` is the fifth whitespace-separated word of the header with
      surrounding commas stripped (chromosomes "1" and "2" are written as
      ``>chr01`` and ``>chr02``);
    * headers matching ``mitochondrion, complete genome$`` become ``>chrM``;
    * every other record (its header and the sequence lines that follow it)
      is dropped, as are any lines that precede the first header.

    Each matched header is printed to stdout before and after rewriting.
    The output file is opened in append mode, so content already present in
    ``file_path_out`` is preserved and new records are added after it.

    Args:
        file_path_in (str): The path to the input FASTA file.
        file_path_out (str): The path to the output file to be created
            (or appended to, if it already exists).

    Returns:
        None
    """
    # Header patterns tuned for the human genome; compiled once, outside the loop.
    chromosome_header = re.compile(r"chromosome.*Primary.Assembly$")
    mitochondrion_header = re.compile(r"mitochondrion, complete genome$")

    # Tracks whether the record currently being read should be copied.
    save = False

    # Both files are opened once and closed automatically. Iterating the file
    # object stops cleanly at EOF (readline() returns "" there, it does not raise).
    with open(file_path_in, "r") as file_in, open(file_path_out, "a") as file_out:
        for line in file_in:
            if line.startswith(">"):
                if chromosome_header.search(line) is not None:
                    print(line)
                    # e.g. ">NC_000001.11 Homo sapiens chromosome 1, ..." -> "1"
                    chromosome = line.split()[4].strip(",")
                    # NOTE(review): only "1" and "2" are zero-padded (3-9 are
                    # not); kept as in the original -- confirm this is intended.
                    if chromosome in ("1", "2"):
                        line = f">chr0{chromosome}\n"
                    else:
                        # Covers the remaining autosomes as well as X and Y.
                        line = f">chr{chromosome}\n"
                    print(line)
                    save = True
                # fix for mitochondrial chr
                elif mitochondrion_header.search(line) is not None:
                    print(line)
                    line = ">chrM\n"
                    print(line)
                    save = True
                else:
                    # Any other record (scaffolds, patches, ...) is skipped.
                    save = False

            if save:
                file_out.write(line)