[f87529]: src/Matcher/database_builder.py


# Import load_dotenv so API tokens can be read from a .env file.
from dotenv import load_dotenv
import os
import json

from langchain.docstore.document import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

# Shared embedding model (negation-aware variant of all-mpnet-base-v2) used by both builders.
embeddings = HuggingFaceEmbeddings(model_name="dmlls/all-mpnet-base-v2-negation")
class TrialDatabaseBuilder:
    """Builds the trial-level Chroma collection, one document per trial JSON file."""

    def __init__(self, json_directory, desired_fields, fields_to_concatenate):
        self.json_directory = json_directory
        self.desired_fields = desired_fields
        self.fields_to_concatenate = fields_to_concatenate
        self.docs = []
        self.ids = []

    def load_json_files(self):
        for filename in os.listdir(self.json_directory):
            if filename.endswith('.json'):
                file_path = os.path.join(self.json_directory, filename)
                with open(file_path, 'r') as file:
                    json_data = json.load(file)
                extracted_data = {field: json_data.get(field) for field in self.desired_fields}
                self.ids.append(extracted_data["nct_id"])
                metadata = {
                    "id": extracted_data.get("nct_id", ""),
                    "gender": extracted_data.get("gender", ""),
                    "condition": extracted_data.get("condition", ""),
                    "phase": extracted_data.get("phase", ""),
                    "minimum_age": extracted_data.get("minimum_age", ""),
                    "maximum_age": extracted_data.get("maximum_age", ""),
                }
                # Drop any None values before handing the metadata to Chroma.
                metadata = {k: v for k, v in metadata.items() if v is not None}
                # The embedded page content is the concatenation of the selected fields.
                concatenated_string = ', '.join(str(extracted_data[field]) for field in self.fields_to_concatenate)
                doc = Document(page_content=concatenated_string, metadata=metadata)
                self.docs.append(doc)

    def build_vectorstore(self):
        vectorstore = Chroma.from_documents(self.docs, embeddings,
                                            persist_directory="../../data/db/",
                                            collection_name="trials")
        vectorstore.persist()
        vectorstore = None  # drop the reference after persisting
class CriteriaDatabaseBuilder:
    """Builds the criteria-level Chroma collection, one document per eligibility criterion."""

    def __init__(self, json_directory, desired_fields):
        self.json_directory = json_directory
        self.desired_fields = desired_fields
        self.docs = []

    def load_json_files(self):
        for filename in os.listdir(self.json_directory):
            if filename.endswith('.json'):
                file_path = os.path.join(self.json_directory, filename)
                with open(file_path, 'r') as file:
                    json_data = json.load(file)
                extracted_data = {field: json_data.get(field) for field in self.desired_fields}
                eligibility_criteria = json_data.get("eligibility")
                if eligibility_criteria is not None:
                    for index, criterion in enumerate(eligibility_criteria):
                        metadata = {
                            "nct_id": extracted_data['nct_id'],
                            "idx": index + 1,
                        }
                        # The criterion type is taken from the first entity's "field" value.
                        metadata["criteria_type"] = criterion["entities_data"][0]["field"]
                        # Flatten every other entity attribute into numbered metadata keys.
                        for i, entity in enumerate(criterion["entities_data"]):
                            for key, value in entity.items():
                                if key != "field":
                                    metadata[f"{key}_{i + 1}"] = value
                        doc = Document(page_content=criterion["text"], metadata=metadata)
                        self.docs.append(doc)

    def build_vectorstore(self):
        vectorstore = Chroma.from_documents(self.docs, embeddings,
                                            persist_directory="../../data/db/",
                                            collection_name="criteria")
        vectorstore.persist()
        vectorstore = None  # drop the reference after persisting
def main():
    load_dotenv('../.env')
    # Tokens read from the environment; kept for authenticated model access downstream.
    openai_access_key = os.getenv('OPENAI_ACCESS_KEY')
    huggingface_token = os.getenv('HUGGINGFACEHUB_API_TOKEN')

    json_directory = '../../data/trials_jsons/'

    # Build the per-criterion collection.
    desired_fields_criteria = ["nct_id", "eligibility"]
    criteriadb_builder = CriteriaDatabaseBuilder(json_directory, desired_fields_criteria)
    criteriadb_builder.load_json_files()
    criteriadb_builder.build_vectorstore()

    # Build the per-trial collection.
    desired_fields_trials = ["nct_id", "brief_title", "brief_summary", "condition",
                             "gender", "minimum_age", "maximum_age", "phase"]
    # Fields concatenated into the embedded page content. The original call omitted this
    # required argument; the free-text fields below are an assumed choice -- adjust as needed.
    fields_to_concatenate_trials = ["brief_title", "brief_summary", "condition"]
    trialdb_builder = TrialDatabaseBuilder(json_directory, desired_fields_trials, fields_to_concatenate_trials)
    trialdb_builder.load_json_files()
    trialdb_builder.build_vectorstore()


if __name__ == "__main__":
    main()
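
A minimal usage sketch (not part of this file) showing how the persisted collections might be queried downstream. It assumes the same embedding model and persist directory used above; the query string is purely illustrative.

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

embeddings = HuggingFaceEmbeddings(model_name="dmlls/all-mpnet-base-v2-negation")

# Reopen the persisted "trials" collection and run a similarity search.
trials_db = Chroma(persist_directory="../../data/db/",
                   collection_name="trials",
                   embedding_function=embeddings)

# Example query text is illustrative only.
for doc in trials_db.similarity_search("metastatic breast cancer, phase 2", k=5):
    print(doc.metadata.get("id"), doc.page_content[:80])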