# src/Embedder/chromaDB_embedding.py
1
import chromadb
2
import os
3
import pandas as pd
4
from chromadb.utils import embedding_functions
5
6
# Persistent Chroma client; no path argument is passed, so the library's
# default storage location is used.
client = chromadb.PersistentClient()

# Sentence-Transformers embedding function; device="cuda" means a CUDA-capable
# GPU is expected at runtime.
em = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2",
    device="cuda",
)

# Cosine distance is requested explicitly through the collection metadata.
# NOTE: Chroma's documented default for "hnsw:space" is "l2", not cosine, so
# this setting must stay if cosine similarity is intended.
collection = client.get_or_create_collection(
    "eligibility_criteria_collection",
    metadata={"hnsw:space": "cosine"},
    embedding_function=em,
)
9
10
# Folder containing the preprocessed clinical-trial CSV files. The path is
# relative to the current working directory, not to this file.
folder_path='../data/preprocessed_data/clinical_trials'

# Sorted so the ingestion order is reproducible (os.listdir order is arbitrary).
files = sorted(os.listdir(folder_path))

# Stems (names without extension) of the "*.csv" entries only. The loader
# re-appends ".csv" to each stem, so any other entry (hidden files,
# sub-folders, differently-cased extensions) would make it look for a file
# that does not exist.
filenames_without_extension = [
    stem
    for stem, extension in map(os.path.splitext, files)
    if extension == ".csv"
]
17
18
def add_documents_collection(filenames, collection, folder_path, batch_size=500):
    """Load eligibility-criteria CSV files and add their rows to a collection.

    For every name in ``filenames`` the file ``<folder_path>/<name>.csv`` is
    read, rows containing any missing value are dropped, and each remaining
    row is added to ``collection`` as one document:

    * document: the ``sentence`` column
    * id: ``"<id column>-<row index + 1>"``; the row index is the one assigned
      when the CSV was read, so dropped rows leave gaps in the numbering
    * metadata: ``{"criteria": <criteria>, "sub-criteria": <sub_criteria>}``

    Rows are sent in batches instead of one call per row, so the collection
    can process many sentences per call.

    Args:
        filenames: Iterable of CSV file names without the ``.csv`` extension.
        collection: Object exposing ``add(documents=..., ids=..., metadatas=...)``
            that accepts lists of equal length.
        folder_path: Directory that contains the CSV files.
        batch_size: Maximum number of rows passed to a single ``add`` call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    for f in filenames:
        print(f)
        df = pd.read_csv(os.path.join(folder_path, f"{f}.csv"))
        df = df.dropna()

        # A file with no complete rows yields an empty range, so ``add`` is
        # never called with empty lists.
        for start in range(0, len(df), batch_size):
            chunk = df.iloc[start:start + batch_size]
            row_labels = chunk.index.tolist()
            nct_ids = chunk["id"].tolist()
            criteria = chunk["criteria"].tolist()
            sub_criteria = chunk["sub_criteria"].tolist()

            collection.add(
                documents=chunk["sentence"].tolist(),
                ids=[
                    "{}-{}".format(nct_id, label + 1)
                    for nct_id, label in zip(nct_ids, row_labels)
                ],
                metadatas=[
                    {"criteria": crit, "sub-criteria": sub}
                    for crit, sub in zip(criteria, sub_criteria)
                ],
            )
36
# Ingest every discovered CSV file, then print the collection's document count.
add_documents_collection(filenames_without_extension, collection, folder_path)
print(collection.count())