[eb2c92]: / create_clinical_trial_embeddings.py

Download this file

100 lines (84 with data), 3.7 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
import chromadb
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
def get_or_create_collection(client, collection_name):
    """Return the named ChromaDB collection, creating it if it is missing.

    Args:
        client (ChromaDB client): a chromaDB client
        collection_name (str): Name of your ChromaDB collection

    Returns:
        chromadb.collection: the collection object
    """
    # Delegate to the client's own get-or-create call instead of scanning
    # list_collections(): the element type returned by list_collections()
    # has varied between Chroma releases (objects vs. plain names), and a
    # separate "list, then create" sequence can race with another writer.
    return client.get_or_create_collection(collection_name)
def init(db_path="./chromadb_clinicaltrial",
         model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """Initialize the ChromaDB client, both criteria collections and the embedding model.

    Args:
        db_path (str, optional): Path handed to ``chromadb.PersistentClient``.
            Defaults to ``"./chromadb_clinicaltrial"``.
        model_name (str, optional): Name handed to ``SentenceTransformer``.
            Defaults to ``'sentence-transformers/all-MiniLM-L6-v2'``.

    Returns:
        tuple: (inclusion_collection, exclusion_collection, model)
    """
    # Initialize ChromaDB client
    client = chromadb.PersistentClient(path=db_path)
    # Separate collections for inclusion and exclusion criteria
    inclusion_collection = get_or_create_collection(client, "inclusion_criteria")
    exclusion_collection = get_or_create_collection(client, "exclusion_criteria")
    model = SentenceTransformer(model_name)
    return inclusion_collection, exclusion_collection, model
def embed_and_add_single_entry(collection, model, data, id, study_title=None):
    """Embed one piece of text and upsert it into a ChromaDB collection.

    Args:
        collection (chromadb.collection): ChromaDB collection object
        model (embedding model): The embedding model object
        data (str): The data to embed and store in chromadb
        id (str): ID of the data
        study_title (str, optional): Title of the study (clinical trial).
            When given, the entry is tagged as a trial; otherwise the ID is
            stored as a patient ID.

    Returns:
        None
    """
    # Entries with a study title are trials, everything else is a patient
    metadata = (
        {"patient_id": id}
        if study_title is None
        else {"trial_id": id, "study_title": study_title}
    )
    vector = model.encode(data, convert_to_tensor=False).tolist()
    collection.upsert(
        embeddings=[vector],
        documents=[data],
        metadatas=[metadata],
        ids=[id],
    )
def check_id_exists(collection, id_to_check):
    """Tell whether an item with the given ID is already in the collection.

    Args:
        collection (chromadb.collection): collection object
        id_to_check (str): the ID to look up

    Returns:
        bool: True if the lookup returns at least one ID, False otherwise
    """
    matched_ids = collection.get(ids=[id_to_check])['ids']
    return bool(matched_ids)
def embed_and_add_multiple_entry(data):
    """Embed and store the eligibility criteria of several clinical trials.

    For every trial, the inclusion criteria are upserted into the
    inclusion collection and the exclusion criteria into the exclusion
    collection returned by ``init()``, both under the trial ID.

    Args:
        data (dict): Dictionary keyed by trial ID; each value must provide
            the keys ``'Study Title'``, ``'Inclusion Criteria'`` and
            ``'Exclusion Criteria'``. Anything that is not a dict is ignored.

    Returns:
        None

    Raises:
        KeyError: if a trial entry lacks one of the three expected keys.
    """
    # init() returns (inclusion_collection, exclusion_collection, model);
    # unpacking it into only two names raised ValueError on every call.
    inclusion_collection, exclusion_collection, embedding_model = init()
    if not isinstance(data, dict):
        return
    for trial_id, trial in tqdm(data.items(), desc="Processing Trials: "):
        # Read every field up front so a malformed entry fails before
        # anything is written for that trial.
        study_title = trial['Study Title']
        inclusion_criteria = trial['Inclusion Criteria']
        exclusion_criteria = trial['Exclusion Criteria']
        # Each criteria type goes to its own collection, matching the
        # separate collections set up by init().
        embed_and_add_single_entry(
            inclusion_collection, embedding_model,
            inclusion_criteria, trial_id, study_title,
        )
        embed_and_add_single_entry(
            exclusion_collection, embedding_model,
            exclusion_criteria, trial_id, study_title,
        )