a b/scripts/combine-data.py
1
import os
2
import json
3
import random
4
5
def genrate_train_test_data(folder_path='data/entity_extraction_reports/',
                            train_size=700,
                            output_dir='data/entity_extraction',
                            seed=None):
    """
    Combine per-report JSON files into shuffled train and test datasets.

    Every ``.txt`` file in ``folder_path`` is parsed as a JSON array of
    records. Each record's ``'output'`` value is re-serialised to a JSON
    string, all records are pooled and shuffled, and the pool is split into
    ``entity-extraction-train-data.json`` and
    ``entity-extraction-test-data.json`` inside ``output_dir``.

    Args:
        folder_path (str): Path to the folder containing the ``.txt`` files.
        train_size (int): Number of shuffled records written to the train
            file; all remaining records go to the test file.
        output_dir (str): Folder the two output files are written to. It is
            created if it does not exist.
        seed (int | None): Seed for a private random generator, making the
            split reproducible. When ``None`` the module-level ``random``
            state is used.

    Returns:
        None

    Raises:
        ValueError: If ``train_size`` is negative.
    """
    if train_size < 0:
        raise ValueError(f"train_size must be non-negative, got {train_size}")

    # exist_ok avoids the check-then-create race and also creates parents.
    os.makedirs(output_dir, exist_ok=True)

    json_objects = []

    # os.listdir() order is arbitrary; sorting makes a seeded shuffle
    # reproducible across machines and file systems.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.txt'):
            continue

        print(f"[INFO] Processing file: {filename}")
        file_path = os.path.join(folder_path, filename)

        # Only decoding/parsing is guarded: a file that is not valid UTF-8
        # JSON is reported and skipped instead of aborting the whole run.
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                records = json.load(file)
        except (json.JSONDecodeError, UnicodeDecodeError):
            print(f"Error reading file: {file_path}")
            continue

        # Store 'output' as a JSON string rather than a nested object.
        for item in records:
            item['output'] = json.dumps(item['output'])

        json_objects.extend(records)

    # A private generator keeps a seeded run from disturbing global state.
    if seed is None:
        random.shuffle(json_objects)
    else:
        random.Random(seed).shuffle(json_objects)

    train_data = json_objects[:train_size]
    test_data = json_objects[train_size:]

    if not test_data:
        print(f"[WARN] Only {len(json_objects)} records found; "
              f"test split is empty (train_size={train_size})")

    train_path = os.path.join(output_dir, 'entity-extraction-train-data.json')
    with open(train_path, 'w', encoding='utf-8') as file:
        json.dump(train_data, file, indent=4)

    test_path = os.path.join(output_dir, 'entity-extraction-test-data.json')
    with open(test_path, 'w', encoding='utf-8') as file:
        json.dump(test_data, file, indent=4)
58
59
if __name__ == '__main__':
    # Script entry point: point this at the folder that holds the report
    # text files, then build the train/test datasets from it.
    reports_dir = 'data/entity_extraction_reports/'
    genrate_train_test_data(reports_dir)