[f87529]: / src / Embedder / mongodb_vector_embedded.py

Download this file

107 lines (90 with data), 4.6 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
import os
import pymongo
import requests
from typing import List, Dict
from tenacity import retry, wait_random_exponential, stop_after_attempt
import openai
# MongoDB connection string, read from the environment.
mongodb_link = os.getenv('MONGODB_LINK')
client = pymongo.MongoClient(mongodb_link)

# Handles to the "trialmatchai" database and its "clinicaltrials" collection.
db = client["trialmatchai"]
collection = db["clinicaltrials"]

# API key for the OpenAI client, also read from the environment.
openapi_token = os.getenv('OPENAPI_TOKEN')
openai.api_key = openapi_token
# hf_token = "<REDACTED>"  # credential removed from source; revoke the old token and load any replacement from the environment
# embedding_url = "https://api-inference.huggingface.co/pipeline/feature-extraction/sentence-transformers/all-MiniLM-L6-v2"
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(10))
def generate_embedding(text: str, model="text-embedding-ada-002") -> list[float]:
    """Return the embedding vector for ``text`` from the OpenAI embeddings endpoint.

    The function is wrapped by ``tenacity.retry``: a failed call is retried
    after a randomized exponential wait (configured with ``min=1``,
    ``max=20``), stopping after 10 attempts.

    Args:
        text: The string to embed; it is sent as a single-element batch.
        model: Name of the embedding model to request.

    Returns:
        The ``"embedding"`` value of the first item in the response's
        ``"data"`` list.
    """
    response = openai.Embedding.create(input=[text], model=model)
    first_result = response["data"][0]
    return first_result["embedding"]
def embed_text_fields(doc, fields):
    """Attach an embedding to ``doc`` for each non-empty string field in ``fields``.

    For every name in ``fields`` whose value in ``doc`` is a non-empty string,
    the vector returned by ``generate_embedding`` is stored in ``doc`` under
    the key ``"<field>_embedding_hf"``. Missing fields, non-string values and
    empty strings are left untouched.

    Args:
        doc: Dictionary representing one document; modified in place.
        fields: Iterable of field names to consider.

    Returns:
        None.
    """
    for field in fields:
        value = doc.get(field)
        # The embeddings endpoint rejects empty input, and the retry wrapper
        # around generate_embedding would keep repeating that doomed request,
        # so empty strings are skipped up front.
        if not isinstance(value, str) or not value:
            continue
        doc[f'{field}_embedding_hf'] = generate_embedding(value)
def embed_eligibility_text_and_entity_text(doc):
    """Embed the texts nested under ``doc['eligibility']``, in place.

    When ``doc['eligibility']`` is a list, each dictionary entry is processed:

    * a non-empty string under ``'text'`` gets its embedding stored in the
      entry as ``'text_embedding_hf'``;
    * every dictionary in the entry's ``'entities_data'`` list with a
      non-empty string under ``'entity_text'`` gets its embedding stored as
      ``'entity_text_embedding_hf'``.

    The two steps are independent: entities are embedded even when the entry
    has no usable ``'text'``. Entries or values of an unexpected shape are
    skipped rather than raising.

    Args:
        doc: Dictionary representing one document; modified in place.

    Returns:
        None.
    """
    # Progress trace only; .get keeps a document without an id from aborting
    # the whole run with a KeyError.
    print(doc.get("nct_id"))

    criteria = doc.get('eligibility')
    if not isinstance(criteria, list):
        return

    for entry in criteria:
        # On a plain string, `'text' in entry` would be a substring test and
        # the assignment below would fail, so only dict entries are handled.
        if not isinstance(entry, dict):
            continue

        text = entry.get('text')
        # Empty input is rejected by the embeddings endpoint, so skip it.
        if isinstance(text, str) and text:
            entry['text_embedding_hf'] = generate_embedding(text)

        entities = entry.get('entities_data')
        if not isinstance(entities, (list, tuple)):
            continue
        for entity_data in entities:
            if not isinstance(entity_data, dict):
                continue
            entity_text = entity_data.get('entity_text')
            if isinstance(entity_text, str) and entity_text:
                entity_data['entity_text_embedding_hf'] = generate_embedding(entity_text)
# Top-level document fields that should receive an embedding.
fields_to_embed = ['eligibility', 'brief_title']

# Walk the whole collection, enrich each document in memory, then write the
# enriched document back over the stored one.
for doc in collection.find({}):
    # Plain string fields first ...
    embed_text_fields(doc, fields_to_embed)
    # ... then the entries nested under 'eligibility'.
    embed_eligibility_text_and_entity_text(doc)
    collection.replace_one(filter={'_id': doc['_id']}, replacement=doc)
# Free-text patient description; its embedding is used as the query vector
# for the vector search that follows.
query = """The patient has mutational status of EGFR and ALK.
The patient suffers from stage 4 lung cancer. The patient has not received any prior treatment.
The patient has a life expectancy of at least 3 months.
The patient has an ECOG performance status of 0 or 1.
The patient has measurable disease.
The patient has adequate organ function.
The patient has a negative pregnancy test.
The patient has a history of interstitial lung disease.
The patient has a history of non-infectious pneumonitis.
The patient has no history of radiation pneumonitis.
The patient has no history of drug-induced pneumonitis.
The patient has no history of idiopathic pulmonary fibrosis.
The patient has no history of organizing pneumonia.
The patient has no history of autoimmune disease.
The patient has no history of systemic lupus erythematosus.
The patient has no history of sarcoidosis.
The patient has no history of vasculitis.
The patient has no history of hypophysitis.
The patient has no history of uveitis.
The patient has no history of iritis.
The patient has a history of hepatitis B.
The patient has no history of hepatitis C.
The patient has no history of human immunodeficiency virus.
The patient has no history of tuberculosis.
The patient has no history of active infection.
The patient has a history of severe infection.
The patient has no history of severe or uncontrolled cardiovascular disease.
The patient has no history of severe or uncontrolled hypertension.
The patient has no history of severe or uncontrolled diabetes.
The patient has no history of severe or uncontrolled hyperlipidemia.
The patient has no history of severe or uncontrolled hypertriglyceridemia.
The patient has no history of severe or uncontrolled hypercholesterolemia.
The patient has a history of severe or uncontrolled hypomagnesemia.
The patient has a history of severe or uncontrolled hypophosphatemia.
The patient has no history of severe or uncontrolled hypovitaminosis D.
The patient has a history of severe or uncontrolled hyperthyroidism"""
# NOTE(review): nothing in this file writes a "plot_embedding_hf" field; the
# embedding helpers store vectors under "<field>_embedding_hf" keys such as
# "brief_title_embedding_hf". Confirm that this path and the
# "PlotSemanticSearch" index actually match the collection.
query_vector = generate_embedding(query)
vector_search_stage = {
    "$vectorSearch": {
        "queryVector": query_vector,
        "path": "plot_embedding_hf",
        "numCandidates": 100,
        "limit": 4,
        "index": "PlotSemanticSearch",
    }
}
# Run a single-stage aggregation; the returned cursor is kept in `results`.
results = collection.aggregate([vector_search_stage])