ai_clinical_trial / Git / [eb2c92] /create_clinical_trial

Models:
joseph-gordon/
ai_clinical_trial
Downloads: 1
[eb2c92]: / create_clinical_trial_embeddings.py
History
Download this file
100 lines (84 with data), 3.7 kB

import chromadb
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

def get_or_create_collection(client, collection_name):
    """Return the named ChromaDB collection, creating it if it is missing.

    Delegates to the client's own ``get_or_create_collection`` so that the
    lookup and the creation happen in a single call. This replaces a manual
    scan of ``client.list_collections()`` that read ``.name`` off every
    entry; that scan fails on ChromaDB releases where ``list_collections()``
    yields plain name strings, and it left a gap between the existence
    check and the create call.

    Args:
        client (ChromaDB client): a chromaDB client
        collection_name (str): Name of your ChromaDB collection

    Returns:
        chromadb.collection: the collection object
    """
    return client.get_or_create_collection(name=collection_name)

def init():
    """Open the on-disk ChromaDB store and load the embedding model.

    Returns:
        tuple: (inclusion collection, exclusion collection, model)
    """
    # Persistent client rooted at a directory relative to the working dir.
    db_client = chromadb.PersistentClient(path="./chromadb_clinicaltrial")

    # One collection per criteria type; the generator is consumed in order,
    # so inclusion is fetched before exclusion.
    inclusion, exclusion = (
        get_or_create_collection(db_client, name)
        for name in ("inclusion_criteria", "exclusion_criteria")
    )

    encoder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    return inclusion, exclusion, encoder

def embed_and_add_single_entry(collection, model, data, id, study_title=None):
    """Encode one text and upsert it, with metadata, into a collection.

    Args:
        collection (chromadb.collection): collection receiving the record
        model (embedding model): object whose ``encode`` produces the vector
        data (str): text to embed; it is also stored as the document
        id (str): record ID, also copied into the metadata
        study_title (str, optional): when given, the metadata holds
            ``trial_id`` and ``study_title``; when omitted, it holds only
            ``patient_id``

    Returns:
        None
    """
    # The presence of a study title decides which metadata shape is stored.
    if study_title is None:
        record_meta = {"patient_id": id}
    else:
        record_meta = {"trial_id": id, "study_title": study_title}

    # encode() output is converted to a plain list before being stored.
    vector = model.encode(data, convert_to_tensor=False).tolist()

    # upsert overwrites an existing record with the same ID.
    collection.upsert(
        ids=[id],
        documents=[data],
        embeddings=[vector],
        metadatas=[record_meta],
    )

def check_id_exists(collection, id_to_check):
    """Report whether the collection already holds a record with this ID.

    Args:
        collection (chromadb.collection): collection object
        id_to_check (str): the ID to look up

    Returns:
       bool: True if the lookup returned at least one ID, otherwise False
    """
    lookup = collection.get(ids=[id_to_check])
    matched_ids = lookup['ids']
    return len(matched_ids) != 0

def embed_and_add_multiple_entry(data):
    """Embed and store the eligibility criteria of several clinical trials.

    Each trial's inclusion criteria are written to the inclusion collection
    and its exclusion criteria to the exclusion collection, both under the
    trial's ID and tagged with its study title.

    NOTE(review): the previous code unpacked ``init()`` into two names even
    though it returns three objects, so every call failed with ValueError.
    Routing each criteria text to its own collection follows the "separate
    collections" setup in ``init()``; confirm this matches how the
    collections are queried.

    Args:
        data (dict): Maps trial ID to a dict holding the keys
            'Study Title', 'Inclusion Criteria' and 'Exclusion Criteria'.
            Any non-dict input is ignored and nothing is stored.

    Returns:
        None

    Raises:
        KeyError: if a trial entry lacks one of the three expected keys.
    """
    # Non-dict input stays a silent no-op; bail out before init() so the
    # embedding model is not loaded for nothing.
    if not isinstance(data, dict):
        return

    inclusion_collection, exclusion_collection, embedding_model = init()

    for trial_id, trial in tqdm(data.items(), desc="Processing Trials: "):
        # Read every field before writing so a missing key cannot leave a
        # trial stored in one collection but not the other.
        study_title = trial['Study Title']
        inclusion_criteria = trial['Inclusion Criteria']
        exclusion_criteria = trial['Exclusion Criteria']

        embed_and_add_single_entry(
            inclusion_collection,
            embedding_model,
            f'Inclusion Criteria: {inclusion_criteria}',
            trial_id,
            study_title,
        )
        embed_and_add_single_entry(
            exclusion_collection,
            embedding_model,
            f'Exclusion Criteria: {exclusion_criteria}',
            trial_id,
            study_title,
        )