|
a |
|
b/preprocess_dataset.py |
|
|
1 |
import os |
|
|
2 |
import numpy as np |
|
|
3 |
from Bio import SeqIO |
|
|
4 |
from sklearn.model_selection import train_test_split |
|
|
5 |
from tensorflow.keras.preprocessing.text import Tokenizer |
|
|
6 |
from tensorflow.keras.preprocessing.sequence import pad_sequences |
|
|
7 |
|
|
|
8 |
def preprocess_dataset(dataset_path, max_sequence_length):
    """Load a directory-per-class FASTA dataset and return train/test splits.

    Expects ``dataset_path`` to contain one subdirectory per class, each
    holding single-record FASTA files. Sequences are integer-encoded with a
    Keras ``Tokenizer``, padded/truncated to ``max_sequence_length``, and
    split 80/20 with a fixed random seed.

    Parameters
    ----------
    dataset_path : str
        Root directory; each subdirectory name is used as the class label.
    max_sequence_length : int
        Length every encoded sequence is padded or truncated to.

    Returns
    -------
    tuple
        ``(train_sequences, test_sequences, train_labels, test_labels,
        label_mapping)`` where the arrays are NumPy arrays and
        ``label_mapping`` maps class name -> integer id.

    Raises
    ------
    ValueError
        If no FASTA files are found, or a file does not contain exactly
        one record (raised by ``SeqIO.read``).
    """
    sequences = []
    labels = []
    # Sort and keep only directories: os.listdir order is filesystem-dependent,
    # and stray files (e.g. .DS_Store) would crash the inner listdir.
    classes = sorted(
        entry for entry in os.listdir(dataset_path)
        if os.path.isdir(os.path.join(dataset_path, entry))
    )

    for class_name in classes:
        class_path = os.path.join(dataset_path, class_name)
        # Sorted for deterministic sample order across runs/machines.
        for file_name in sorted(os.listdir(class_path)):
            file_path = os.path.join(class_path, file_name)
            # SeqIO.read raises ValueError unless the file holds exactly one record.
            sequence = str(SeqIO.read(file_path, "fasta").seq)
            sequences.append(sequence)
            labels.append(class_name)

    if not sequences:
        # Fail early with a clear message instead of a confusing
        # downstream error from the tokenizer / train_test_split.
        raise ValueError(f"No FASTA files found under {dataset_path!r}")

    # Integer-encode the raw sequence strings, then pad/truncate to a
    # fixed length so they stack into a rectangular array.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(sequences)
    sequences = tokenizer.texts_to_sequences(sequences)
    sequences = pad_sequences(sequences, maxlen=max_sequence_length)

    # BUG FIX: the original enumerated `set(labels)`, whose iteration order
    # is nondeterministic across interpreter runs, so the class->int mapping
    # (and hence the saved labels) changed between runs. Sorting makes the
    # mapping reproducible, matching the intent of random_state=42 below.
    label_mapping = {class_name: i for i, class_name in enumerate(sorted(set(labels)))}
    labels = np.array([label_mapping[label] for label in labels])
    sequences = np.array(sequences)

    # Fixed seed so the train/test partition is reproducible.
    train_sequences, test_sequences, train_labels, test_labels = train_test_split(
        sequences, labels, test_size=0.2, random_state=42)

    return train_sequences, test_sequences, train_labels, test_labels, label_mapping