Diff of /preprocess_dataset.py [000000] .. [6ab127]

Switch to side-by-side view

--- a
+++ b/preprocess_dataset.py
@@ -0,0 +1,35 @@
+import os
+import numpy as np
+from Bio import SeqIO
+from sklearn.model_selection import train_test_split
+from tensorflow.keras.preprocessing.text import Tokenizer
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+
+def preprocess_dataset(dataset_path, max_sequence_length):
+    sequences = []
+    labels = []
+    classes = os.listdir(dataset_path)
+    
+    for class_name in classes:
+        class_path = os.path.join(dataset_path, class_name)
+        for file_name in os.listdir(class_path):
+            file_path = os.path.join(class_path, file_name)
+            sequence = str(SeqIO.read(file_path, "fasta").seq)
+            sequences.append(sequence)
+            labels.append(class_name)
+    
+    tokenizer = Tokenizer()
+    tokenizer.fit_on_texts(sequences)
+    sequences = tokenizer.texts_to_sequences(sequences)
+    sequences = pad_sequences(sequences, maxlen=max_sequence_length)
+    
+    label_mapping = {class_name: i for i, class_name in enumerate(set(labels))}
+    labels = [label_mapping[label] for label in labels]
+    
+    sequences = np.array(sequences)
+    labels = np.array(labels)
+    
+    train_sequences, test_sequences, train_labels, test_labels = train_test_split(
+        sequences, labels, test_size=0.2, random_state=42)
+    
+    return train_sequences, test_sequences, train_labels, test_labels, label_mapping