|
a |
|
b/src/Embedder/chromaDB_embedding.py |
|
|
1 |
import chromadb |
|
|
2 |
import os |
|
|
3 |
import pandas as pd |
|
|
4 |
from chromadb.utils import embedding_functions |
|
|
5 |
|
|
|
6 |
# Chroma client built by chromadb.PersistentClient() with no arguments, so the
# library's own default settings apply.
client = chromadb.PersistentClient()

# Embedding function backed by the "all-MiniLM-L6-v2" sentence-transformer
# model; device="cuda" asks for it to run on a CUDA device.
em = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2",
    device="cuda",
)

# Fetch the collection, creating it if it does not exist yet, and attach the
# embedding function above. The distance metric is requested explicitly as
# cosine via the "hnsw:space" metadata key.
# NOTE(review): an earlier remark here said cosine is the default; Chroma's
# documentation appears to list "l2" as the default -- confirm.
collection = client.get_or_create_collection(
    "eligibility_criteria_collection",
    metadata={"hnsw:space": "cosine"},
    embedding_function=em,
)
|
|
9 |
|
|
|
10 |
# Directory holding the preprocessed clinical-trial CSV files.
folder_path = '../data/preprocessed_data/clinical_trials'
files = os.listdir(folder_path)

# Base names (extension stripped) of the CSV files to ingest.
# Only entries whose extension is exactly ".csv" are kept:
# add_documents_collection re-appends ".csv" to every name it receives, so any
# other directory entry (a sub-directory, ".DS_Store", "notes.txt", ...) would
# point pd.read_csv at a file that does not exist.
filenames_without_extension = [
    filename
    for filename, file_extension in map(os.path.splitext, files)
    if file_extension == ".csv"
]
|
|
17 |
|
|
|
18 |
def add_documents_collection(filenames, collection, folder_path, *, batch_size=1000):
    """Load sentences from CSV files and add them to *collection*.

    For every name in *filenames* the file ``{folder_path}/{name}.csv`` is
    read with pandas and rows containing any missing value are dropped. Each
    remaining row becomes one document:

    * text: the ``sentence`` column;
    * id: ``"<id>-<label + 1>"`` built from the ``id`` column and the row's
      DataFrame index label (labels are the ones assigned before ``dropna``,
      so ids stay stable when rows are dropped);
    * metadata: ``{"criteria": <criteria>, "sub-criteria": <sub_criteria>}``.

    Rows are sent to ``collection.add`` in batches instead of one call per
    row, so the collection can embed and store many sentences at once.

    Args:
        filenames: Iterable of CSV base names, without the ``.csv`` suffix.
        collection: Object exposing ``add(documents=..., ids=...,
            metadatas=...)`` accepting parallel lists.
        folder_path: Directory that contains the CSV files.
        batch_size: Maximum number of rows passed to one ``collection.add``
            call. NOTE(review): the backend may cap the size of a single
            batch; the default is assumed to be below that cap -- confirm.

    Raises:
        ValueError: If *batch_size* is smaller than 1.
        KeyError: If a CSV lacks one of the ``sentence``, ``id``,
            ``criteria`` or ``sub_criteria`` columns.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    for f in filenames:
        print(f)
        df = pd.read_csv(f"{folder_path}/{f}.csv").dropna()

        # An empty frame yields an empty range, so no add() call is made.
        for start in range(0, len(df), batch_size):
            chunk = df.iloc[start:start + batch_size]
            collection.add(
                documents=chunk["sentence"].tolist(),
                # Series.items() yields (index label, value) pairs.
                ids=[
                    "{}-{}".format(nct_id, index + 1)
                    for index, nct_id in chunk["id"].items()
                ],
                metadatas=[
                    {"criteria": criteria, "sub-criteria": sub_criteria}
                    for criteria, sub_criteria in zip(
                        chunk["criteria"], chunk["sub_criteria"]
                    )
                ],
            )
|
|
36 |
# Ingest every discovered CSV, then print the number of items the collection
# reports via count().
add_documents_collection(
    filenames=filenames_without_extension,
    collection=collection,
    folder_path=folder_path,
)
print(collection.count())