In [None]:
import getpass
import os

from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
from langchain.text_splitter import RecursiveCharacterTextSplitter, TokenTextSplitter
from langchain_community.callbacks import get_openai_callback
from langchain_community.chat_models import ChatOpenAI
from langchain_community.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings
from langchain_community.vectorstores import AstraDB, Chroma
from langchain_core.prompts import ChatPromptTemplate

# Ask for the Hugging Face token only when it is not already present in the
# environment. This keeps "Restart & Run All" from blocking on input in a
# pre-configured environment and avoids re-prompting when the cell is re-run.
# To replace a token, remove the variable first, e.g.
# `os.environ.pop("HUGGINGFACEHUB_API_TOKEN", None)`.
if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass.getpass("HUGGINGFACEHUB_API_TOKEN:")

In [None]:
import json
import os
from langchain.docstore.document import Document
# Directory containing the trial JSON files (one file per trial).
json_directory = '../data/trials_jsons/'
# Fields read from every trial record; a missing field is stored as None.
desired_fields = ['nct_id', "brief_title", "eligibility"]
# Fields joined (in this order) into the text of each Document.
# Every entry here must also appear in `desired_fields`.
fields_to_concatenate = ['nct_id', "brief_title", "eligibility"]

# One Document per trial, plus the matching trial ids in the same order.
docs = []
ids = []

# os.listdir() returns entries in arbitrary order. Sorting makes the order of
# `docs` stable across runs and machines, so slices such as docs[0:100]
# always select the same trials.
for filename in sorted(os.listdir(json_directory)):
    if not filename.endswith('.json'):
        continue

    file_path = os.path.join(json_directory, filename)

    # Read with an explicit encoding instead of the platform default, and
    # close the file as soon as the JSON has been parsed.
    with open(file_path, 'r', encoding='utf-8') as file:
        json_data = json.load(file)

    extracted_data = {field: json_data.get(field) for field in desired_fields}
    ids.append(extracted_data["nct_id"])

    # str() is applied to each field, so non-string values are embedded
    # using their Python string form.
    concatenated_string = ', '.join(str(extracted_data[field]) for field in fields_to_concatenate)
    docs.append(Document(page_content=concatenated_string, metadata={"id": extracted_data["nct_id"]}))


In [None]:
# Question-answering prompt with two placeholders: {context} and {question}.
# It instructs the model to answer only from the supplied context.
prompt_template = """
Answer the question based only on the supplied context. If you don't know the answer, say you don't know the answer.
Context: {context}
Question: {question}
Your answer:
"""
# NOTE: the name `prompt` is re-bound to a plain string in a later cell, so this
# ChatPromptTemplate is only reachable until that cell runs.
prompt = ChatPromptTemplate.from_template(prompt_template)

# Questions iterated by `do_retrieval`.
# NOTE(review): these ask about a short story (Montresor / Fortunato), not about
# the clinical-trial documents loaded above; looks like tutorial leftovers, so
# confirm they are still wanted.
questions = [
    "What motivates the narrator, Montresor, to seek revenge against Fortunato?",
    "What are the major themes in this story?",
    "What is the significance of the story taking place during the carnival season?",
    "How is vivid and descriptive language used in the story?",
    "Is there any foreshadowing in the story? If yes, how is it used in the story?"
]

In [None]:
def do_retrieval(chain, question_list=None):
    """Run each question through ``chain`` and print the answer and token count.

    Args:
        chain: Object exposing ``invoke(question)``. Whatever it returns is
            handed to ``pprint_result`` for display.
        question_list: Optional iterable of question strings. When omitted,
            the notebook-level ``questions`` list is used, so existing
            ``do_retrieval(chain)`` calls behave as before.

    Returns:
        None. All results are printed.

    Note:
        ``get_openai_callback`` and ``pprint_result`` must be available in the
        notebook namespace when this function is called.
    """
    if question_list is None:
        question_list = questions

    for question_text in question_list:
        print("-" * 40)
        print(f"Question: {question_text}\n")
        # The invoke call runs inside the callback context so that
        # `cb.total_tokens` can be reported for this question.
        with get_openai_callback() as cb:
            pprint_result(chain.invoke(question_text))
            print(f'\nTotal Tokens: {cb.total_tokens}\n')

In [None]:
# Initialize the models
model = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.1)
embedding = OpenAIEmbeddings()

# Astra DB credentials come from the environment rather than being embedded in
# the notebook source. The application token is a secret: it is prompted for
# (without echo) only when the variable is not already set.
# SECURITY: a token was previously hard-coded in this cell. Treat it as
# compromised and revoke/rotate it.
if not os.environ.get("ASTRA_DB_APPLICATION_TOKEN"):
    os.environ["ASTRA_DB_APPLICATION_TOKEN"] = getpass.getpass("ASTRA_DB_APPLICATION_TOKEN:")

# The endpoint can be overridden via ASTRA_DB_API_ENDPOINT; the previous value
# is kept as the default so existing setups keep working.
astra_api_endpoint = os.environ.get(
    "ASTRA_DB_API_ENDPOINT",
    "https://2d6b7600-886e-4852-8f9a-1b59508dg040-us-east-2.apps.astra.datastax.com",
)

# Initialize a vector store for storing the child chunks
vstore = AstraDB(
    collection_name="full_documents",
    embedding=embedding,
    token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
    api_endpoint=astra_api_endpoint,
)

# Parent documents are kept in an in-memory store
parent_store = InMemoryStore()

# Splitter that cuts each parent document into child chunks.
# Child chunks should stay smaller than the parent documents they come from.
child_splitter = TokenTextSplitter(chunk_overlap=20, chunk_size=200)

# Retriever tying together the vector store (child chunks), the parent store
# and the child splitter
parent_retriever = ParentDocumentRetriever(
    child_splitter=child_splitter,
    docstore=parent_store,
    vectorstore=vstore,
)

# Split and load up to the first 100 documents into both stores
parent_retriever.add_documents(docs[:100])

In [None]:
import chromadb

# Persistent Chroma client (default settings); `client` is a second name for
# the same object
client = persistent_client = chromadb.PersistentClient()

# Vector store used to index the child chunks
vectorstore = Chroma(
    collection_name="documents",
    client=persistent_client,
    embedding_function=HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    ),
)

# Text splitter used to create the child documents.
# Note: this re-binds `child_splitter` from the earlier cell.
child_splitter = RecursiveCharacterTextSplitter(chunk_overlap=20, chunk_size=250)

# Storage layer for the parent documents
store = InMemoryStore()

retriever = ParentDocumentRetriever(
    child_splitter=child_splitter,
    docstore=store,
    vectorstore=vectorstore,
)

In [None]:
# Add the first ten documents one at a time, each stored under the id kept in
# its metadata
for doc in docs[:10]:
    doc_id = doc.metadata["id"]
    retriever.add_documents([doc], ids=[doc_id])

In [None]:
from langchain.prompts import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from langchain.memory import ConversationBufferMemory
from langchain_community.llms import HuggingFaceHub
from langchain.chains import (
    StuffDocumentsChain, LLMChain, ConversationalRetrievalChain
)
from langchain_core.prompts import PromptTemplate

# Task instructions followed by the full description of one patient case.
# NOTE: this re-binds `prompt`, which an earlier cell bound to a
# ChatPromptTemplate; from here on `prompt` is a plain string.
# The `template` defined further down contains a `{prompt}` placeholder.
prompt= """You are clinical assistant tasked with finding and ranking the top 5 clinical trials that are best fit for a patient with cancer to be recruited to. 
You have been provided with a large database of clinical trials, that include the trial id, brief title and eligibility criteria (both inclusion and exclusion). 
The eligibility criteria have been parsed and biomedical named entities have been extracted to aid you in the search. Each crtieria and entity is labelled as either an inclusion or an exclusion criteria. 
Each named entity is also tagged with the class of entity it belongs to and whether it is negated or not. Below are the details of the patient. 

Patient ID: 1012, Age: 58, Sex: F,
History of Present Condition: Presented with persistent abdominal pain, unintentional weight loss, and changes in bowel habits. Colonoscopy revealed a 6 cm mass in the ascending colon.
Past Medical History: No significant medical history. Regular screenings revealed no abnormalities.
Symptoms: Persistent abdominal pain (right-sided), unintentional weight loss, changes in bowel habits (diarrhea and constipation).
Allergies: Allergic to Penicillin and Codeine.
Past and Present Medications: No regular medications.
Genetic Mutations: Underwent genetic testing revealing wild-type KRAS, NRAS, and BRAF.
Diagnosis Staging: T4aN1bM0, Stage IIIB colorectal adenocarcinoma.

Imaging Reports:
1. **Colonoscopy:**
   - Findings: 6 cm mass in the ascending colon, biopsy confirmed adenocarcinoma.

2. **CT Scan of Abdomen and Pelvis:**
   - Localized tumor in the ascending colon with involvement of adjacent structures.

3. **PET-CT Scan:**
   - Detected hypermetabolic activity in the primary tumor and regional lymph nodes.

Pathological Findings:
- Tumor Size: 6 cm
- Margin Status: Positive
- Lymphovascular Invasion: Present

Immunohistochemistry Results:
- CK20: Positive
- CDX2: Positive
- Mismatch Repair Proteins: Intact

Molecular Testing:
- KRAS Mutation: Wild-type
- NRAS Mutation: Wild-type
- BRAF Mutation: Wild-type

Treatment History:
- Underwent laparoscopic right hemicolectomy with clear surgical margins.

Adjuvant Chemotherapy:
- Initiated adjuvant chemotherapy with FOLFOX regimen.

Medical Examinations:
1. **Blood Tests:**
   - Normal complete blood count, liver and kidney function.

2. **Colonoscopic Biopsy Pathology:**
   - Histology: Moderately differentiated adenocarcinoma.
   - Tumor Grade: G2.

Follow-up Imaging:
- Regular CT scans for surveillance post-surgery and during chemotherapy.

Treatment Outcomes:
- Good response to adjuvant chemotherapy with no evidence of disease recurrence.

Family History: Limited family history; paternal grandfather with colorectal cancer at an advanced age.
"""
# Question text passed to the chain as `question` when it is run.
question = """Find and rank the top 5 clinical trials that this patient could be eligible for."""


# Layout of the final prompt sent to the LLM. It uses four placeholders:
#   {prompt}       - task instructions and patient description
#   {context}      - the retrieved clinical-trial documents
#   {question}     - the user's question
#   {chat_history} - earlier conversation turns supplied by the memory
template = """
  {prompt}
  
  Clinical Trials:
  {context}
  
  QUESTION: 
  {question}

  CHAT HISTORY: 
  {chat_history}
  
  ANSWER:
  """

# Declare every placeholder that appears in `template`. The previous list
# omitted "prompt" and "chat_history", which does not match the template
# (and is rejected outright when template validation is enabled).
promptHist = PromptTemplate(
    input_variables=["prompt", "context", "question", "chat_history"],
    template=template
)

# Conversation memory handed to the retrieval chain.
# No `global` statement is needed: a name assigned at the top level of a cell
# is already module-level, so the declaration had no effect.
memory = ConversationBufferMemory(
    # Key under which the history is exposed; matches the {chat_history}
    # placeholder in `template`.
    memory_key="chat_history",
    # The chain is run with more than one input, so name the one to record.
    input_key="question",
    output_key='answer',
    return_messages=True,
)


# LLM used by the retrieval chain built in the next cell.
# NOTE(review): presumably authenticates with the HUGGINGFACEHUB_API_TOKEN set
# in the first cell; confirm.
llm = HuggingFaceHub(
    repo_id="HuggingFaceH4/zephyr-7b-beta",
    task="text-generation",
    # Generation settings passed through as `model_kwargs`.
    model_kwargs={
        "top_k": 5,
        "temperature": 0.5,
        "repetition_penalty": 1.03,
        "max_new_tokens":512,
    },
)

In [None]:
from langchain_community.chat_models.huggingface import ChatHuggingFace
from langchain.chains import RetrievalQA
# Chat wrapper around the LLM. It is constructed here, but the chain below is
# given `llm` directly.
chat_model = ChatHuggingFace(llm=llm)

query = "Find and rank the top 5 clinical trials that this patient could be eligible for."

# Conversational retrieval chain: documents come from `retriever`, the answer
# prompt is `promptHist`, and conversation turns are tracked in `memory`.
qa = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    memory=memory,
    combine_docs_chain_kwargs={"prompt": promptHist},
    verbose=True,
)


In [None]:
# Fill the template's {prompt} slot with the patient description string held in
# `prompt`. Passing the PromptTemplate object (`promptHist`) here would put that
# object's string form into the model input, and the patient details would
# never reach the model.
qa.run(question=question, prompt=prompt)

In [None]:
# Scratch cleanup: unbind the notebook-level name `prompt`. Any cell that reads
# `prompt` (e.g. the `qa.run(...)` call) needs it re-defined before re-running.
del prompt

In [None]:
# Scratch check: display the current value of `prompt`.
# NOTE(review): when cells run top to bottom this comes right after
# `del prompt`, so it raises NameError. Consider removing it before sharing.
prompt