Diff of /scripts/combine-data.py [000000] .. [248dc9]

Switch to side-by-side view

--- a
+++ b/scripts/combine-data.py
@@ -0,0 +1,64 @@
+import os
+import json
+import random
+
+def genrate_train_test_data(folder_path = 'data/entity_extraction_reports/'):
+    """
+    Generate train and test data from the JSON files in the given folder path.
+
+    Args:
+        folder_path (str): Path to the folder containing JSON files.
+    
+    Returns:
+        None
+    """
+
+    # Create the folder if it doesn't exist
+    if not os.path.exists("data/entity_extraction"):
+        os.makedirs("data/entity_extraction")
+    
+    # Initialize an empty list to store JSON objects
+    json_objects = []
+
+    # Loop through each file in the folder
+    for filename in os.listdir(folder_path):
+        # Check if the file is a text file
+        if filename.endswith('.txt'):
+            print(f"[INFO] Processing file: {filename}")
+
+            # Read the file and load the JSON object
+            file_path = os.path.join(folder_path, filename)
+            
+            with open(file_path, 'r') as file:
+                try:
+                    json_object = json.loads(file.read())
+                    
+                    # Convert the 'output' field from a JSON object to a JSON string
+                    for item in json_object:
+                        item['output'] = json.dumps(item['output']) 
+                    
+                    json_objects.extend(json_object)
+                except json.JSONDecodeError:
+                    print(f"Error reading file: {file_path}")
+
+    # Shuffle the JSON objects
+    random.shuffle(json_objects)
+
+    # Split the data into train and test
+    train_data = json_objects[:700]  # First 700 objects for training
+    test_data = json_objects[700:]   # Last 59 objects for testing
+
+    # Write the train data to a file
+    with open('data/entity_extraction/entity-extraction-train-data.json', 'w') as file:
+        json.dump(train_data, file, indent=4)
+
+    # Write the test data to a file
+    with open('data/entity_extraction/entity-extraction-test-data.json', 'w') as file:
+        json.dump(test_data, file, indent=4)
+
+if __name__ == '__main__':
+    # Replace 'folder_path' with the actual path to your folder containing text files
+    folder_path = 'data/entity_extraction_reports/'
+
+    # Call the function
+    genrate_train_test_data(folder_path)
\ No newline at end of file