--- a
+++ b/src/Matcher/ParentDocumentMatcher.ipynb
@@ -0,0 +1,349 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import getpass\n",
+    "import os\n",
+    "\n",
+    "# Prompt only when the token is not already set, so re-running this cell (or\n",
+    "# running the notebook non-interactively with the variable exported) neither\n",
+    "# asks again nor overwrites a token that is already in place.\n",
+    "if not os.environ.get(\"HUGGINGFACEHUB_API_TOKEN\"):\n",
+    "    os.environ[\"HUGGINGFACEHUB_API_TOKEN\"] = getpass.getpass(\"HUGGINGFACEHUB_API_TOKEN:\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "import os\n",
+    "from langchain.docstore.document import Document\n",
+    "# Directory holding the trial JSON files\n",
+    "json_directory = '../data/trials_jsons/'\n",
+    "desired_fields = ['nct_id', \"brief_title\", \"eligibility\"]\n",
+    "fields_to_concatenate = ['nct_id', \"brief_title\", \"eligibility\"]\n",
+    "# One Document per file goes into `docs`; the matching nct_id goes into `ids`\n",
+    "docs = []\n",
+    "ids = []\n",
+    "# os.listdir() gives no ordering guarantee; sorting keeps `docs`/`ids` (and any\n",
+    "# later slice of them) in the same order on every run and machine.\n",
+    "for filename in sorted(os.listdir(json_directory)):\n",
+    "    if not filename.endswith('.json'):\n",
+    "        continue\n",
+    "    file_path = os.path.join(json_directory, filename)\n",
+    "\n",
+    "    # Decode explicitly as UTF-8 rather than relying on the platform default\n",
+    "    with open(file_path, 'r', encoding='utf-8') as file:\n",
+    "        json_data = json.load(file)\n",
+    "\n",
+    "    # Fields absent from the JSON come back as None (rendered below as 'None')\n",
+    "    extracted_data = {field: json_data.get(field) for field in desired_fields}\n",
+    "    ids.append(extracted_data[\"nct_id\"])\n",
+    "    concatenated_string = ', '.join(str(extracted_data[field]) for field in fields_to_concatenate)\n",
+    "    docs.append(Document(page_content=concatenated_string, metadata={\"id\": extracted_data[\"nct_id\"]}))\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ChatPromptTemplate is not imported by any other cell of this notebook\n",
+    "from langchain_core.prompts import ChatPromptTemplate\n",
+    "\n",
+    "# Question-answering prompt with {context} and {question} placeholders\n",
+    "prompt_template = \"\"\"\n",
+    "Answer the question based only on the supplied context. If you don't know the answer, say you don't know the answer.\n",
+    "Context: {context}\n",
+    "Question: {question}\n",
+    "Your answer:\n",
+    "\"\"\"\n",
+    "prompt = ChatPromptTemplate.from_template(prompt_template)\n",
+    "\n",
+    "# NOTE(review): these questions are about a short story, not about the trial\n",
+    "# documents loaded from json_directory - presumably left over from a tutorial;\n",
+    "# confirm before relying on do_retrieval() output.\n",
+    "questions = [\n",
+    "    \"What motivates the narrator, Montresor, to seek revenge against Fortunato?\",\n",
+    "    \"What are the major themes in this story?\",\n",
+    "    \"What is the significance of the story taking place during the carnival season?\",\n",
+    "    \"How is vivid and descriptive language used in the story?\",\n",
+    "    \"Is there any foreshadowing in the story? If yes, how is it used in the story?\"\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def do_retrieval(chain):\n",
+    "    \"\"\"Send each entry of the module-level `questions` list through `chain`.\n",
+    "\n",
+    "    For every question this prints a separator and the question, invokes the\n",
+    "    chain inside a `get_openai_callback()` context, hands the result to\n",
+    "    `pprint_result`, and prints the context object's `total_tokens`.\n",
+    "\n",
+    "    Args:\n",
+    "        chain: object exposing `invoke(question)`.\n",
+    "    \"\"\"\n",
+    "    divider = \"-\" * 40\n",
+    "    for question_text in questions:\n",
+    "        print(divider)\n",
+    "        print(f\"Question: {question_text}\\n\")\n",
+    "        with get_openai_callback() as usage:\n",
+    "            result = chain.invoke(question_text)\n",
+    "            pprint_result(result)\n",
+    "            print(f'\\nTotal Tokens: {usage.total_tokens}\\n')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import getpass\n",
+    "import os\n",
+    "\n",
+    "# None of the classes below are imported by any other cell of this notebook\n",
+    "from langchain.retrievers import ParentDocumentRetriever\n",
+    "from langchain.storage import InMemoryStore\n",
+    "from langchain.text_splitter import TokenTextSplitter\n",
+    "from langchain_community.chat_models import ChatOpenAI\n",
+    "from langchain_community.embeddings import OpenAIEmbeddings\n",
+    "from langchain_community.vectorstores import AstraDB\n",
+    "\n",
+    "# Initialize the models\n",
+    "model = ChatOpenAI(model_name=\"gpt-3.5-turbo\", temperature=0.1)\n",
+    "embedding = OpenAIEmbeddings()\n",
+    "\n",
+    "# Astra DB credentials are read from the environment (prompting once if they\n",
+    "# are missing) and are never written into the notebook source. The token that\n",
+    "# was previously hard-coded in this cell should be considered leaked and revoked.\n",
+    "if not os.environ.get(\"ASTRA_DB_APPLICATION_TOKEN\"):\n",
+    "    os.environ[\"ASTRA_DB_APPLICATION_TOKEN\"] = getpass.getpass(\"ASTRA_DB_APPLICATION_TOKEN:\")\n",
+    "if not os.environ.get(\"ASTRA_DB_API_ENDPOINT\"):\n",
+    "    os.environ[\"ASTRA_DB_API_ENDPOINT\"] = input(\"ASTRA_DB_API_ENDPOINT:\")\n",
+    "\n",
+    "# Initialize a vector store for storing the child chunks\n",
+    "vstore = AstraDB(\n",
+    "    collection_name=\"full_documents\",\n",
+    "    embedding=embedding,\n",
+    "    token=os.environ[\"ASTRA_DB_APPLICATION_TOKEN\"],\n",
+    "    api_endpoint=os.environ[\"ASTRA_DB_API_ENDPOINT\"],\n",
+    ")\n",
+    "\n",
+    "# Initialize in-memory storage for the parent chunks\n",
+    "parent_store = InMemoryStore()\n",
+    "\n",
+    "\n",
+    "# Create a splitter for the child documents\n",
+    "# Note: child documents should be smaller than parent documents\n",
+    "child_splitter = TokenTextSplitter(chunk_size=200, chunk_overlap=20)\n",
+    "\n",
+    "# Create a parent document retriever\n",
+    "parent_retriever = ParentDocumentRetriever(\n",
+    "    vectorstore=vstore,\n",
+    "    docstore=parent_store,\n",
+    "    child_splitter=child_splitter,\n",
+    ")\n",
+    "# Split and load the documents into the vector and parent stores\n",
+    "parent_retriever.add_documents(docs[0:100])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import chromadb\n",
+    "# The classes below are not imported by any other cell of this notebook\n",
+    "from langchain.retrievers import ParentDocumentRetriever\n",
+    "from langchain.storage import InMemoryStore\n",
+    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
+    "from langchain_community.embeddings import HuggingFaceEmbeddings\n",
+    "from langchain_community.vectorstores import Chroma\n",
+    "\n",
+    "# Chroma client created via chromadb.PersistentClient() with default settings\n",
+    "persistent_client = chromadb.PersistentClient()\n",
+    "client=persistent_client\n",
+    "# The vectorstore to use to index the child chunks\n",
+    "vectorstore = Chroma(\n",
+    "    client= persistent_client, collection_name=\"documents\",\n",
+    "    embedding_function=HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\")\n",
+    ")\n",
+    "\n",
+    "# This text splitter is used to create the child documents\n",
+    "child_splitter = RecursiveCharacterTextSplitter(chunk_size=250, chunk_overlap=20)\n",
+    "# The storage layer for the parent documents\n",
+    "store = InMemoryStore()\n",
+    "retriever = ParentDocumentRetriever(\n",
+    "    vectorstore=vectorstore,\n",
+    "    docstore=store,\n",
+    "    child_splitter=child_splitter,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Index the first ten documents one at a time, using each document's\n",
+    "# metadata \"id\" as its id in the retriever.\n",
+    "for trial_doc in docs[:10]:\n",
+    "    trial_id = trial_doc.metadata[\"id\"]\n",
+    "    retriever.add_documents([trial_doc], ids=[trial_id])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.prompts import PromptTemplate\n",
+    "from langchain.chains.question_answering import load_qa_chain\n",
+    "from langchain.memory import ConversationBufferMemory\n",
+    "from langchain_community.llms import HuggingFaceHub\n",
+    "from langchain.chains import (\n",
+    "    StuffDocumentsChain, LLMChain, ConversationalRetrievalChain\n",
+    ")\n",
+    "from langchain_core.prompts import PromptTemplate\n",
+    "\n",
+    "prompt= \"\"\"You are clinical assistant tasked with finding and ranking the top 5 clinical trials that are best fit for a patient with cancer to be recruited to. \n",
+    "You have been provided with a large database of clinical trials, that include the trial id, brief title and eligibility criteria (both inclusion and exclusion). \n",
+    "The eligibility criteria have been parsed and biomedical named entities have been extracted to aid you in the search. Each crtieria and entity is labelled as either an inclusion or an exclusion criteria. \n",
+    "Each named entity is also tagged with the class of entity it belongs to and whether it is negated or not. Below are the details of the patient. \n",
+    "\n",
+    "Patient ID: 1012, Age: 58, Sex: F,\n",
+    "History of Present Condition: Presented with persistent abdominal pain, unintentional weight loss, and changes in bowel habits. Colonoscopy revealed a 6 cm mass in the ascending colon.\n",
+    "Past Medical History: No significant medical history. Regular screenings revealed no abnormalities.\n",
+    "Symptoms: Persistent abdominal pain (right-sided), unintentional weight loss, changes in bowel habits (diarrhea and constipation).\n",
+    "Allergies: Allergic to Penicillin and Codeine.\n",
+    "Past and Present Medications: No regular medications.\n",
+    "Genetic Mutations: Underwent genetic testing revealing wild-type KRAS, NRAS, and BRAF.\n",
+    "Diagnosis Staging: T4aN1bM0, Stage IIIB colorectal adenocarcinoma.\n",
+    "\n",
+    "Imaging Reports:\n",
+    "1. **Colonoscopy:**\n",
+    "   - Findings: 6 cm mass in the ascending colon, biopsy confirmed adenocarcinoma.\n",
+    "\n",
+    "2. **CT Scan of Abdomen and Pelvis:**\n",
+    "   - Localized tumor in the ascending colon with involvement of adjacent structures.\n",
+    "\n",
+    "3. **PET-CT Scan:**\n",
+    "   - Detected hypermetabolic activity in the primary tumor and regional lymph nodes.\n",
+    "\n",
+    "Pathological Findings:\n",
+    "- Tumor Size: 6 cm\n",
+    "- Margin Status: Positive\n",
+    "- Lymphovascular Invasion: Present\n",
+    "\n",
+    "Immunohistochemistry Results:\n",
+    "- CK20: Positive\n",
+    "- CDX2: Positive\n",
+    "- Mismatch Repair Proteins: Intact\n",
+    "\n",
+    "Molecular Testing:\n",
+    "- KRAS Mutation: Wild-type\n",
+    "- NRAS Mutation: Wild-type\n",
+    "- BRAF Mutation: Wild-type\n",
+    "\n",
+    "Treatment History:\n",
+    "- Underwent laparoscopic right hemicolectomy with clear surgical margins.\n",
+    "\n",
+    "Adjuvant Chemotherapy:\n",
+    "- Initiated adjuvant chemotherapy with FOLFOX regimen.\n",
+    "\n",
+    "Medical Examinations:\n",
+    "1. **Blood Tests:**\n",
+    "   - Normal complete blood count, liver and kidney function.\n",
+    "\n",
+    "2. **Colonoscopic Biopsy Pathology:**\n",
+    "   - Histology: Moderately differentiated adenocarcinoma.\n",
+    "   - Tumor Grade: G2.\n",
+    "\n",
+    "Follow-up Imaging:\n",
+    "- Regular CT scans for surveillance post-surgery and during chemotherapy.\n",
+    "\n",
+    "Treatment Outcomes:\n",
+    "- Good response to adjuvant chemotherapy with no evidence of disease recurrence.\n",
+    "\n",
+    "Family History: Limited family history; paternal grandfather with colorectal cancer at an advanced age.\n",
+    "\"\"\"\n",
+    "# Question asked of the retrieval chain\n",
+    "question = \"\"\"Find and rank the top 5 clinical trials that this patient could be eligible for.\"\"\"\n",
+    "\n",
+    "\n",
+    "# Layout of the final LLM prompt: patient description, retrieved trials,\n",
+    "# the question, and the running chat history.\n",
+    "template = \"\"\"\n",
+    "  {prompt}\n",
+    "  \n",
+    "  Clinical Trials:\n",
+    "  {context}\n",
+    "  \n",
+    "  QUESTION: \n",
+    "  {question}\n",
+    "\n",
+    "  CHAT HISTORY: \n",
+    "  {chat_history}\n",
+    "  \n",
+    "  ANSWER:\n",
+    "  \"\"\"\n",
+    "\n",
+    "# The template has four placeholders. {context}, {question} and {chat_history}\n",
+    "# are filled in at call time, so all three are declared as input variables.\n",
+    "# {prompt} is the fixed patient description defined above; binding it as a\n",
+    "# partial variable means it is always filled and never has to be passed in.\n",
+    "promptHist = PromptTemplate(\n",
+    "    input_variables=[\"context\", \"question\", \"chat_history\"],\n",
+    "    template=template,\n",
+    "    partial_variables={\"prompt\": prompt},\n",
+    ")\n",
+    "\n",
+    "# Conversation buffer: history is stored under \"chat_history\", reading the\n",
+    "# \"question\" input and the \"answer\" output, and returned as message objects.\n",
+    "memory = ConversationBufferMemory(\n",
+    "    return_messages=True,\n",
+    "    memory_key=\"chat_history\",\n",
+    "    input_key=\"question\",\n",
+    "    output_key='answer',\n",
+    ")\n",
+    "\n",
+    "\n",
+    "# Text-generation LLM accessed through HuggingFaceHub, with its generation\n",
+    "# settings passed as model_kwargs.\n",
+    "llm = HuggingFaceHub(\n",
+    "    task=\"text-generation\",\n",
+    "    repo_id=\"HuggingFaceH4/zephyr-7b-beta\",\n",
+    "    model_kwargs=dict(top_k=5, temperature=0.5, repetition_penalty=1.03, max_new_tokens=512),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_community.chat_models.huggingface import ChatHuggingFace\n",
+    "from langchain.chains import RetrievalQA\n",
+    "# Chat wrapper around `llm` (not referenced by any other cell in this notebook)\n",
+    "chat_model = ChatHuggingFace(llm=llm)\n",
+    "\n",
+    "query = \"Find and rank the top 5 clinical trials that this patient could be eligible for.\"\n",
+    "\n",
+    "# Conversational retrieval chain: documents come from `retriever`, are combined\n",
+    "# with the \"stuff\" chain type using `promptHist`, and turns are kept in `memory`.\n",
+    "qa = ConversationalRetrievalChain.from_llm(\n",
+    "    llm=llm,\n",
+    "    retriever=retriever,\n",
+    "    chain_type=\"stuff\",\n",
+    "    memory=memory,\n",
+    "    combine_docs_chain_kwargs={'prompt': promptHist},\n",
+    "    verbose=True,\n",
+    ")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Supply the patient description string held in `prompt` as the value for the\n",
+    "# template's {prompt} slot, not the PromptTemplate object itself.\n",
+    "qa.run(question=question, prompt=prompt)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Scratch cleanup: drop the module-level `prompt`. Guarded so re-running the\n",
+    "# cell does not raise NameError once the name is already gone.\n",
+    "if \"prompt\" in globals():\n",
+    "    del prompt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Show `prompt` if it is still bound. A bare `prompt` here raises NameError\n",
+    "# when the name has been deleted, which aborts Restart & Run All.\n",
+    "globals().get(\"prompt\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}