# preprocess_dataset.py
import os
import numpy as np
from Bio import SeqIO
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def preprocess_dataset(dataset_path, max_sequence_length):
    """Load FASTA files grouped into per-class subdirectories, tokenize the
    sequences at the character level, and return padded train/test splits."""
    sequences = []
    labels = []
    # Each subdirectory of dataset_path is treated as one class; skip any
    # stray files at the top level and sort for a stable class order.
    classes = sorted(
        d for d in os.listdir(dataset_path)
        if os.path.isdir(os.path.join(dataset_path, d))
    )

    for class_name in classes:
        class_path = os.path.join(dataset_path, class_name)
        for file_name in os.listdir(class_path):
            file_path = os.path.join(class_path, file_name)
            # SeqIO.read expects exactly one record per FASTA file and
            # raises a ValueError otherwise.
            sequence = str(SeqIO.read(file_path, "fasta").seq)
            sequences.append(sequence)
            labels.append(class_name)

    # char_level=True is required here: FASTA sequences contain no whitespace,
    # so the default word-level Tokenizer would treat each whole sequence as a
    # single token. Character-level tokenization maps each residue to an integer.
    tokenizer = Tokenizer(char_level=True)
    tokenizer.fit_on_texts(sequences)
    sequences = tokenizer.texts_to_sequences(sequences)
    sequences = pad_sequences(sequences, maxlen=max_sequence_length)

    # Sort the class names so the label mapping is deterministic across runs
    # (set iteration order is not guaranteed).
    label_mapping = {class_name: i for i, class_name in enumerate(sorted(set(labels)))}
    labels = [label_mapping[label] for label in labels]

    sequences = np.array(sequences)
    labels = np.array(labels)

    # Stratify on the labels so class proportions are preserved in both splits.
    train_sequences, test_sequences, train_labels, test_labels = train_test_split(
        sequences, labels, test_size=0.2, random_state=42, stratify=labels)

    return train_sequences, test_sequences, train_labels, test_labels, label_mapping
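
# --- Usage sketch (not part of the original file) ---
# A minimal example of calling preprocess_dataset. The path and the
# max_sequence_length value are hypothetical; it assumes a layout like
#   data/fasta/<class_name>/<record>.fasta
# with one record per file.
if __name__ == "__main__":
    train_X, test_X, train_y, test_y, mapping = preprocess_dataset(
        "data/fasta", max_sequence_length=512)
    print("train:", train_X.shape, "test:", test_X.shape)
    print("label mapping:", mapping)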