diff --git a/src/Matcher/database_builder.py b/src/Matcher/database_builder.py
# Import the load_dotenv function
from dotenv import load_dotenv
import os
import json

from langchain.docstore.document import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

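# Embedding model shared by both builders; this checkpoint is a negation-aware
# variant of all-mpnet-base-v2, which should help it distinguish negated
# criteria from their affirmative counterparts.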
embeddings = HuggingFaceEmbeddings(model_name="dmlls/all-mpnet-base-v2-negation")


class TrialDatabaseBuilder:
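    """Builds a Chroma collection with one document per trial.

    The page content is a concatenation of the configured free-text fields;
    structured fields (id, gender, condition, phase, age limits) go into the
    document metadata.
    """
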
    def __init__(self, json_directory, desired_fields, fields_to_concatenate):
        self.json_directory = json_directory
        self.desired_fields = desired_fields
        self.fields_to_concatenate = fields_to_concatenate
        self.docs = []
        self.ids = []

    def load_json_files(self):
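        """Read every .json file in the directory and convert it into a Document."""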
        for filename in os.listdir(self.json_directory):
            if filename.endswith('.json'):
                file_path = os.path.join(self.json_directory, filename)
                with open(file_path, 'r') as file:
                    json_data = json.load(file)
                    extracted_data = {field: json_data.get(field) for field in self.desired_fields}
                    self.ids.append(extracted_data["nct_id"])

                    metadata = {
                        "id": extracted_data.get("nct_id", ""),
                        "gender": extracted_data.get("gender", ""),
                        "condition": extracted_data.get("condition", ""),
                        "phase": extracted_data.get("phase", ""),
                        "minimum_age": extracted_data.get("minimum_age", ""),
                        "maximum_age": extracted_data.get("maximum_age", ""),
                    }
                    # Drop fields whose value is missing (None)
                    metadata = {k: v for k, v in metadata.items() if v is not None}

                    concatenated_string = ', '.join(str(extracted_data[field]) for field in self.fields_to_concatenate)
                    doc = Document(page_content=concatenated_string, metadata=metadata)
                    self.docs.append(doc)

    def build_vectorstore(self):
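        """Embed the collected documents and persist them as the "trials" collection."""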
        vectorstore = Chroma.from_documents(
            self.docs,
            embeddings,
            persist_directory="../../data/db/",
            collection_name="trials",
        )
        vectorstore.persist()
        vectorstore = None


class CriteriaDatabaseBuilder:
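    """Builds a Chroma collection with one document per eligibility criterion.

    Each criterion's text becomes the page content; the trial's nct_id, the
    criterion index, and the flattened entity attributes become metadata.
    """
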
    def __init__(self, json_directory, desired_fields):
        self.json_directory = json_directory
        self.desired_fields = desired_fields
        self.docs = []

    def load_json_files(self):
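        """Read every .json file and create one Document per eligibility criterion."""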
        for filename in os.listdir(self.json_directory):
            if filename.endswith('.json'):
                file_path = os.path.join(self.json_directory, filename)
                with open(file_path, 'r') as file:
                    json_data = json.load(file)
                    extracted_data = {field: json_data.get(field) for field in self.desired_fields}
                    eligibility_criteria = json_data.get("eligibility")
                    if eligibility_criteria is not None:
                        for index, criterion in enumerate(eligibility_criteria):
                            metadata = {
                                "nct_id": extracted_data['nct_id'],
                                "idx": index + 1,
                            }
                            # criteria_type is taken from the first entity's "field" attribute
                            metadata["criteria_type"] = criterion["entities_data"][0]["field"]
                            # Flatten the remaining entity attributes into numbered metadata keys (key_1, key_2, ...)
                            for i, entity in enumerate(criterion["entities_data"]):
                                for key, value in entity.items():
                                    if key != "field":
                                        metadata[f"{key}_{i + 1}"] = value
                            doc = Document(page_content=criterion["text"], metadata=metadata)
                            self.docs.append(doc)

    def build_vectorstore(self):
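        """Embed the collected documents and persist them as the "criteria" collection."""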
        vectorstore = Chroma.from_documents(
            self.docs,
            embeddings,
            persist_directory="../../data/db/",
            collection_name="criteria",
        )
        vectorstore.persist()
        vectorstore = None


def main():
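    """Build and persist the "criteria" and "trials" Chroma collections."""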
    load_dotenv('../.env')
    # Keys are read from the .env file; neither is used directly in this script.
    openai_access_key = os.getenv('OPENAI_ACCESS_KEY')
    huggingface_token = os.getenv('HUGGINGFACEHUB_API_TOKEN')
    json_directory = '../../data/trials_jsons/'
    desired_fields_criteria = ["nct_id", "eligibility"]

    criteriadb_builder = CriteriaDatabaseBuilder(json_directory, desired_fields_criteria)
    criteriadb_builder.load_json_files()
    criteriadb_builder.build_vectorstore()

    desired_fields_trials = ["nct_id", "brief_title", "brief_summary", "condition", "gender", "minimum_age", "maximum_age", "phase"]
    # TrialDatabaseBuilder also needs the fields whose text gets embedded; the
    # free-text fields chosen here are an assumption and can be adjusted.
    fields_to_concatenate_trials = ["brief_title", "brief_summary", "condition"]
    trialdb_builder = TrialDatabaseBuilder(json_directory, desired_fields_trials, fields_to_concatenate_trials)
    trialdb_builder.load_json_files()
    trialdb_builder.build_vectorstore()

if __name__ == "__main__":
    main()
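
# A minimal sketch of how the persisted collections could be reloaded later
# (e.g. by the matching code). Paths and collection names mirror the ones used
# above; the query string is only an illustrative placeholder.
#
#   trials_db = Chroma(
#       persist_directory="../../data/db/",
#       collection_name="trials",
#       embedding_function=embeddings,
#   )
#   hits = trials_db.similarity_search("female, breast cancer, phase 2", k=5)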