TrialMatchAI / Git / Diff of /src/Embedder/chromaDB_embedding.py

Models:

MarcoTheBlack/

TrialMatchAI

Downloads: 1

Diff of /src/Embedder/chromaDB_embedding.py [000000] .. [f87529]

Switch to unified view

 b/src/Embedder/chromaDB_embedding.py
+import chromadb
+import os
+import pandas as pd
+from chromadb.utils import embedding_functions
# Persistent Chroma client; no path is given, so Chroma picks its default location.
client = chromadb.PersistentClient()
# Sentence-transformer embedding model, pinned to the CUDA device.
em = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2", device="cuda")
# Cosine distance is requested explicitly: Chroma's default HNSW space is "l2".
collection = client.get_or_create_collection(
    "eligibility_criteria_collection",
    metadata={"hnsw:space": "cosine"},
    embedding_function=em,
)
# Directory containing the preprocessed clinical-trial CSV files
# (relative to the current working directory).
folder_path = '../data/preprocessed_data/clinical_trials'
files = os.listdir(folder_path)
# Base names (extension stripped) of the CSV files only. Other directory
# entries are skipped because the loader re-appends ".csv" to each name and
# would fail on anything that is not a CSV file. Sorted so runs are repeatable.
filenames_without_extension = sorted(
    stem
    for stem, extension in map(os.path.splitext, files)
    if extension == ".csv"
)
def add_documents_collection(filenames, collection, folder_path, batch_size=1000):
    """Load eligibility-criteria CSV files and add their rows to a collection.

    For every name in ``filenames`` the file ``<folder_path>/<name>.csv`` is
    read, rows containing any missing value are dropped, and each remaining
    row becomes one document:

    * document text: the ``sentence`` column
    * id: ``"<id>-<row index + 1>"``, where the row index is the row's
      position in the CSV before rows were dropped
    * metadata: ``{"criteria": <criteria>, "sub-criteria": <sub_criteria>}``

    Documents are sent to ``collection.add`` in batches rather than one call
    per row, so the embedding function can process many sentences at once.

    Args:
        filenames: Iterable of CSV base names (without the ``.csv`` suffix).
        collection: Object exposing ``add(documents=..., ids=..., metadatas=...)``,
            e.g. a Chroma collection.
        folder_path: Directory that contains the CSV files.
        batch_size: Maximum number of documents per ``collection.add`` call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    for f in filenames:
        print(f)
        df = pd.read_csv(os.path.join(folder_path, f"{f}.csv"))
        df = df.dropna()
        if df.empty:
            # Nothing to add for this file; an empty batch must not be sent.
            continue

        documents = df["sentence"].tolist()
        # df.index still holds the pre-dropna row labels, so ids stay stable
        # and unique within a file even when rows were dropped.
        ids = [
            "{}-{}".format(nct_id, index + 1)
            for index, nct_id in zip(df.index, df["id"])
        ]
        metadatas = [
            {"criteria": criteria, "sub-criteria": sub_criteria}
            for criteria, sub_criteria in zip(df["criteria"], df["sub_criteria"])
        ]

        # Bounded batches keep each add() call at a manageable size.
        for start in range(0, len(documents), batch_size):
            stop = start + batch_size
            collection.add(
                documents=documents[start:stop],
                ids=ids[start:stop],
                metadatas=metadatas[start:stop],
            )
# Ingest every discovered file, then print how many documents the collection holds.
add_documents_collection(
    filenames=filenames_without_extension,
    collection=collection,
    folder_path=folder_path,
)
print(collection.count())