[98867e]: / util / preprocess_fasta_for_dna2vec.py

Download this file

31 lines (22 with data), 1.1 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import os
import configargparse
from Bio.SeqRecord import SeqRecord
from faigen.data.sequence import Dna2VecList, GSFileProcessor
def preprocess_for_dna2vec_training(out_path, **kwargs):
    """Write each sequence of a processed FASTA folder to its own file.

    The keyword arguments are forwarded unchanged to
    ``Dna2VecList.from_folder``. The resulting list is passed through
    ``GSFileProcessor().process`` and every item is then written, in FASTA
    format, to ``<out_path>/<id>.fasta``.

    Args:
        out_path: Output directory. A ``str`` is converted to a
            ``pathlib.Path``; any other value is used as given and must
            support the ``/`` operator.
        **kwargs: Passed straight to ``Dna2VecList.from_folder``.
    """
    # Imported here because the module header never imports ``Path``; without
    # it the conversion below raised NameError for every string ``out_path``.
    from pathlib import Path

    out_dir = Path(out_path) if isinstance(out_path, str) else out_path

    data = Dna2VecList.from_folder(**kwargs)
    GSFileProcessor().process(data)

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(str(out_dir), exist_ok=True)

    for i, seq in enumerate(data.items):
        seq_id = data.ids[i]
        record = SeqRecord(
            seq,
            id=seq_id,
            name=data.names[i],
            description=data.descriptions[i],
        )
        with open(out_dir / f"{seq_id}.fasta", "w") as output:
            output.write(record.format("fasta"))
def main():
    """Parse the ``-i`` / ``-o`` command-line options and run preprocessing."""
    parser = configargparse.get_argument_parser()
    parser.add_argument(
        '-i',
        help='input folder with Fasta files',
        type=str,
        default='.',
    )
    parser.add_argument(
        '-o',
        help='output folder',
        type=str,
        default="../d2v_dataset",
    )
    options = parser.parse_args()
    # ``-i`` is handed on as ``path``, which the preprocessing function
    # forwards to ``Dna2VecList.from_folder`` through its keyword arguments.
    preprocess_for_dna2vec_training(path=options.i, out_path=options.o)
# Run the command-line entry point only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()