--- a +++ b/src/Matcher/database_builder.py @@ -0,0 +1,108 @@ +# Import the load_dotenv function +from dotenv import load_dotenv +import os +import json +from langchain.docstore.document import Document +from langchain_community.embeddings import HuggingFaceEmbeddings +from langchain_community.vectorstores import Chroma + + +import json +import os +from langchain.docstore.document import Document + +embeddings = HuggingFaceEmbeddings(model_name="dmlls/all-mpnet-base-v2-negation") + +class TrialDatabaseBuilder: + def __init__(self, json_directory, desired_fields, fields_to_concatenate): + self.json_directory = json_directory + self.desired_fields = desired_fields + self.fields_to_concatenate = fields_to_concatenate + self.docs = [] + self.ids = [] + + def load_json_files(self): + for filename in os.listdir(self.json_directory): + if filename.endswith('.json'): + file_path = os.path.join(self.json_directory, filename) + with open(file_path, 'r') as file: + json_data = json.load(file) + extracted_data = {field: json_data.get(field) for field in self.desired_fields} + self.ids.append(extracted_data["nct_id"]) + + metadata = { + "id": extracted_data.get("nct_id", ""), + "gender": extracted_data.get("gender", ""), + "condition": extracted_data.get("condition", ""), + "phase": extracted_data.get("phase", ""), + "minimum_age": extracted_data.get("minimum_age", ""), + "maximum_age": extracted_data.get("maximum_age", ""), + + } + metadata = {k: v for k, v in metadata.items() if v is not None} + + concatenated_string = ', '.join(str(extracted_data[field]) for field in self.fields_to_concatenate) + doc = Document(page_content=concatenated_string, metadata=metadata) + self.docs.append(doc) + + def build_vectorstore(self): + vectorstore = Chroma.from_documents(self.docs, embeddings, persist_directory="../../data/db/", collection_name="trials") + vectorstore.persist() + vectorstore = None + + +class CriteriaDatabaseBuilder: + def __init__(self, json_directory, desired_fields): + self.json_directory = json_directory + self.desired_fields = desired_fields + self.docs = [] + + def load_json_files(self): + for filename in os.listdir(self.json_directory): + if filename.endswith('.json'): + file_path = os.path.join(self.json_directory, filename) + with open(file_path, 'r') as file: + json_data = json.load(file) + extracted_data = {field: json_data.get(field) for field in self.desired_fields} + eligibility_criteria = json_data.get("eligibility") + if eligibility_criteria is not None: + for index, criterion in enumerate(eligibility_criteria): + metadata = { + "nct_id": extracted_data['nct_id'], + "idx": index + 1, + } + metadata["criteria_type"] = criterion["entities_data"][0]["field"] + for i, entity in enumerate(criterion["entities_data"]): + for key, value in entity.items(): + if key != "field": + metadata[f"{key}_{i + 1}"] = value + doc = Document(page_content=criterion["text"], metadata=metadata) + self.docs.append(doc) + + def build_vectorstore(self): + vectorstore = Chroma.from_documents(self.docs, embeddings, persist_directory="../../data/db/", collection_name="criteria") + vectorstore.persist() + vectorstore = None + + +def main(): + load_dotenv('../.env') + openai_access_key = os.getenv('OPENAI_ACCESS_KEY') + huggingface_token = os.getenv('HUGGINGFACEHUB_API_TOKEN') + json_directory = '../../data/trials_jsons/' + desired_fields_criteria = ["nct_id", "eligibility"] + + criteriadb_builder = CriteriaDatabaseBuilder(json_directory, desired_fields_criteria) + criteriadb_builder.load_json_files() + criteriadb_builder.build_vectorstore() + + desired_fields_trials=["nct_id", "brief_title", "brief_summary", "condition", "gender", "minimum_age", "maximum_age", "phase"] + trialdb_builder = TrialDatabaseBuilder(json_directory, desired_fields_trials) + trialdb_builder.load_json_files() + trialdb_builder.build_vectorstore() + + +if __name__ == "__main__": + main() + + \ No newline at end of file