Switch to side-by-side view

--- a
+++ b/notebooks/final_architecture.ipynb
@@ -0,0 +1,485 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "import os\n",
+    "from langchain.prompts import PromptTemplate\n",
+    "from langchain.chains import LLMChain\n",
+    "from langchain_openai import ChatOpenAI\n",
+    "from langchain.chains import LLMChain, SimpleSequentialChain\n",
+    "\n",
+    "from dotenv import load_dotenv\n",
+    "\n",
+    "# load_dotenv is imported above but was never called, so variables kept in a\n",
+    "# .env file never reached os.getenv; load them before reading the key.\n",
+    "load_dotenv()\n",
+    "\n",
+    "openai_api_key = os.getenv(\"OPENAI_API_KEY\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Root of the project's data folder. Set PATIENT_TRIALS_DATA_DIR to run the notebook\n",
+    "# on another machine; the default keeps the location used so far.\n",
+    "DATA_DIR = os.environ.get(\n",
+    "    'PATIENT_TRIALS_DATA_DIR',\n",
+    "    '/Users/bharathbeeravelly/Desktop/patient-trials-matching/data',\n",
+    ")\n",
+    "single_patient_ehr_path = os.path.join(\n",
+    "    DATA_DIR, 'processed', 'patients_small', '1a654b50-5c1d-ec96-1d56-8d7c12140983_data.json'\n",
+    ")\n",
+    "single_trial_criteria_path = os.path.join(\n",
+    "    DATA_DIR, 'raw', 'scraped_small', 'NCT06576401_criteria.txt'\n",
+    ")\n",
+    "\n",
+    "# Read the single patient EHR and trial criteria.\n",
+    "# The encoding is stated explicitly so the result does not depend on the platform default.\n",
+    "with open(single_patient_ehr_path, encoding='utf-8') as f:\n",
+    "    patient_ehr = json.load(f)\n",
+    "\n",
+    "with open(single_trial_criteria_path, encoding='utf-8') as f:\n",
+    "    trial_criteria = f.read()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 1st LLM: Identify keywords for each criterion\n",
+    "def identify_criteria_keywords(trial_criteria):\n",
+    "    \"\"\"Ask the LLM which patient attribute each trial criterion is about.\n",
+    "\n",
+    "    Args:\n",
+    "        trial_criteria: Text of the trial's eligibility criteria.\n",
+    "\n",
+    "    Returns:\n",
+    "        The response object returned by ``llm.invoke``; for the chat model used\n",
+    "        here the generated text is exposed as its ``content`` attribute.\n",
+    "    \"\"\"\n",
+    "    # Define the system message for the LLM to identify relevant keywords\n",
+    "    system_message = \"\"\"\n",
+    "    You are a clinical trial assistant.\n",
+    "    Your task is to read the inclusion, exclusion, and other criteria of a clinical trial, and identify relevant keywords from each criterion.\n",
+    "    \n",
+    "    Common keywords may include: \"Gender\", \"Age\", \"Race\", \"Ethnic Group\", \"Language\", \"BMI\", \"BPM\", \"Height\", \"Weight\", etc.\n",
+    "\n",
+    "    For each criterion, respond with the most relevant keyword or attribute it is concerned with.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    # Initialize the OpenAI LLM model (temperature 0 for repeatable answers)\n",
+    "    llm = ChatOpenAI(temperature=0, model='gpt-4o-mini', openai_api_key=openai_api_key)\n",
+    "\n",
+    "    # The f-string fills in the system message now; the doubled braces leave\n",
+    "    # {criteria} as a placeholder for PromptTemplate to fill in later.\n",
+    "    prompt_template = PromptTemplate(\n",
+    "        input_variables=[\"criteria\"],\n",
+    "        template=f\"\"\"\n",
+    "        {system_message}\n",
+    "\n",
+    "        Trial Criteria: {{criteria}}\n",
+    "\n",
+    "        For each criterion, identify the relevant keyword or patient attribute.\n",
+    "        \"\"\"\n",
+    "    )\n",
+    "    \n",
+    "    # Format the prompt with the actual trial criteria\n",
+    "    prompt = prompt_template.format(criteria=trial_criteria)\n",
+    "    \n",
+    "    # invoke() replaces the deprecated BaseChatModel.__call__ form, llm(prompt)\n",
+    "    response = llm.invoke(prompt)\n",
+    "    \n",
+    "    # Return the keywords identified by the LLM\n",
+    "    return response"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Function to extract only the relevant information from Patient_EHR for LLM processing\n",
+    "def extract_relevant_patient_data(patient_ehr):\n",
+    "    \"\"\"Return the subset of EHR fields that is forwarded to the eligibility LLM.\n",
+    "\n",
+    "    Every listed field is looked up with ``.get``, so a field missing from\n",
+    "    ``patient_ehr`` still appears in the result, with the value ``None``.\n",
+    "    \"\"\"\n",
+    "    wanted_fields = (\n",
+    "        \"Gender\",\n",
+    "        \"Age\",\n",
+    "        \"Race\",\n",
+    "        \"Ethnic Group\",\n",
+    "        \"Language\",\n",
+    "        \"Vital Signs\",\n",
+    "        \"Medications\",\n",
+    "        \"Problems\",\n",
+    "        \"Surgeries\",\n",
+    "        \"Immunizations\",\n",
+    "    )\n",
+    "    return {field: patient_ehr.get(field) for field in wanted_fields}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 2nd LLM: Evaluate patient eligibility based on keywords\n",
+    "def evaluate_criteria_by_keywords(criteria_keywords, patient_ehr):\n",
+    "    \"\"\"Ask the LLM for a Yes/No verdict per criterion keyword for one patient.\n",
+    "\n",
+    "    Args:\n",
+    "        criteria_keywords: Keywords from the first LLM step, either as plain text\n",
+    "            or as a response object whose ``content`` attribute holds the text.\n",
+    "        patient_ehr: Patient record (dict); only the fields selected by\n",
+    "            extract_relevant_patient_data are sent to the model.\n",
+    "\n",
+    "    Returns:\n",
+    "        The response object returned by ``llm.invoke``.\n",
+    "    \"\"\"\n",
+    "    # NOTE(review): the prompt pins today's date to 07 Oct 2024; anything the model\n",
+    "    # derives from it goes stale as time passes - confirm this is intended.\n",
+    "    system_message = \"\"\"\n",
+    "    You are a clinical trial assistant. Today's date is 07th October 2024. \n",
+    "    Your task is to compare the patient's information (Gender, Age, Race, Ethnic Group, Language, Vital Signs) \n",
+    "    with the clinical trial's inclusion and exclusion criteria using the identified keywords. \n",
+    "    If any medication is used by the patient, then check the last usage of the medication.\n",
+    "    \n",
+    "    \n",
+    "    For each inclusion criterion, respond with one of the following:\n",
+    "    - \"Yes\" if the patient meets the criterion\n",
+    "    - \"No\" if there is evidence that the criterion is not met\n",
+    "    - \"Yes\" if there is no information available to determine eligibility.\n",
+    "    \n",
+    "    For each exclusion criterion, respond with one of the following:\n",
+    "    - \"Yes\" if the patient does not meet the criterion\n",
+    "    - \"No\" if there is evidence that the criterion is met\n",
+    "    - \"Yes\" if there is no information available to determine eligibility.\n",
+    "    \n",
+    "    \"\"\"\n",
+    "\n",
+    "    # Initialize the OpenAI LLM model (temperature 0 for repeatable answers)\n",
+    "    llm = ChatOpenAI(temperature=0, model='gpt-4o-mini', openai_api_key=openai_api_key)\n",
+    "\n",
+    "    # Extract relevant patient data from EHR\n",
+    "    relevant_patient_data = extract_relevant_patient_data(patient_ehr)\n",
+    "\n",
+    "    # The first step may hand over its whole response object; use only the generated\n",
+    "    # text so the prompt does not contain the string form of the entire object.\n",
+    "    keywords_text = getattr(criteria_keywords, \"content\", criteria_keywords)\n",
+    "\n",
+    "    # NOTE(review): the template asks for a reason on \"No\" and a few lines later\n",
+    "    # forbids any reasoning; the parsing cells expect a bare Yes/No per line.\n",
+    "    prompt_template = PromptTemplate(\n",
+    "        input_variables=[\"criteria_keywords\", \"patient_data\"],\n",
+    "        template=f\"\"\"\n",
+    "        {system_message}\n",
+    "\n",
+    "        Criteria Keywords: {{criteria_keywords}}\n",
+    "\n",
+    "        Patient Information: {{patient_data}}\n",
+    "\n",
+    "        For each criterion keyword, respond with:\n",
+    "        - \"Yes\" if the patient meets the criterion\n",
+    "        - \"No\" if the patient does not meet the criterion and reason\n",
+    "        \n",
+    "        While evaluating one criteria, consider only the respective criteria but not any other criteria.\n",
+    "        While rating the criteria, with 'Yes' or 'No', do not give any reasoning\n",
+    "        \n",
+    "        The format of response should be as below:\n",
+    "        \n",
+    "        Inclusion Criteria:\n",
+    "        - Keyword Placeholder 1: Yes \n",
+    "        - Keyword Placeholder 2: No \n",
+    "        - Keyword Placeholder 3: No \n",
+    "        - Keyword Placeholder 4: Yes \n",
+    "        .\n",
+    "        .\n",
+    "        .\n",
+    "        - Keyword Placeholder N: Yes\n",
+    "        \n",
+    "        Exclusion Criteria:\n",
+    "        - Keyword Placeholder 1: No\n",
+    "        - Keyword Placeholder 2: Yes\n",
+    "        - Keyword Placeholder 3: No\n",
+    "        - Keyword Placeholder 4: Yes\n",
+    "        .\n",
+    "        .\n",
+    "        .\n",
+    "        - Keyword Placeholder N: No\n",
+    "        \n",
+    "        While giving the response, do not output the whole criteria mentioned in the txt file. Instead, just give the keyword and the response.\n",
+    "      \n",
+    "        \n",
+    "        \"\"\"\n",
+    "    )\n",
+    "\n",
+    "    # Format the prompt with the criteria keywords and patient data\n",
+    "    prompt = prompt_template.format(\n",
+    "        criteria_keywords=keywords_text,\n",
+    "        patient_data=relevant_patient_data\n",
+    "    )\n",
+    "    \n",
+    "    # invoke() replaces the deprecated BaseChatModel.__call__ form, llm(prompt)\n",
+    "    response = llm.invoke(prompt)\n",
+    "    \n",
+    "    return response"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def process_patient_eligibility(trial_criteria, patient_ehr):\n",
+    "    \"\"\"Run both LLM steps for one patient/trial pair and return the verdict text.\n",
+    "\n",
+    "    Args:\n",
+    "        trial_criteria: Text of the trial's eligibility criteria.\n",
+    "        patient_ehr: Patient record (dict).\n",
+    "\n",
+    "    Returns:\n",
+    "        The text (``content``) of the second step's response.\n",
+    "    \"\"\"\n",
+    "    # Step 1: Identify keywords from trial criteria\n",
+    "    keywords_response = identify_criteria_keywords(trial_criteria)\n",
+    "    # Forward only the generated text; passing the response object itself would put\n",
+    "    # the string form of the whole object into the second prompt.\n",
+    "    criteria_keywords = getattr(keywords_response, \"content\", keywords_response)\n",
+    "    \n",
+    "    # Step 2: Evaluate patient eligibility based on identified keywords\n",
+    "    eligibility_results = evaluate_criteria_by_keywords(criteria_keywords, patient_ehr)\n",
+    "    \n",
+    "    return eligibility_results.content\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/var/folders/vh/y8k1dgkj76s01krzn8qtn3nw0000gn/T/ipykernel_81061/552846339.py:32: LangChainDeprecationWarning: The method `BaseChatModel.__call__` was deprecated in langchain-core 0.1.7 and will be removed in 1.0. Use :meth:`~invoke` instead.\n",
+      "  response = llm(prompt)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "'Inclusion Criteria:\\n- Gender: No \\n- Age: No \\n- BMI: No \\n\\nExclusion Criteria:\\n- Health Status: Yes\\n- Medical History: Yes\\n- Medication Use: No \\n- Substance Use: Yes\\n- Infectious Disease Status: Yes\\n- Environmental Exposure: Yes'"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "process_patient_eligibility(trial_criteria, patient_ehr)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Inclusion Criteria:\n",
+      "- Gender: No \n",
+      "- Age: No \n",
+      "- BMI: No \n",
+      "\n",
+      "Exclusion Criteria:\n",
+      "- Health Status: Yes\n",
+      "- Medical History: Yes\n",
+      "- Medication Use: No \n",
+      "- Substance Use: Yes\n",
+      "- Infectious Disease Status: Yes\n",
+      "- Environmental Exposure: Yes\n"
+     ]
+    }
+   ],
+   "source": [
+    "print('Inclusion Criteria:\\n- Gender: No \\n- Age: No \\n- BMI: No \\n\\nExclusion Criteria:\\n- Health Status: Yes\\n- Medical History: Yes\\n- Medication Use: No \\n- Substance Use: Yes\\n- Infectious Disease Status: Yes\\n- Environmental Exposure: Yes'\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Patient Eligibility Details:\n",
+      "Inclusion Criteria:\n",
+      "- Gender: No \n",
+      "- Age: No \n",
+      "- BMI: No \n",
+      "- Weight: Yes \n",
+      "\n",
+      "Exclusion Criteria:\n",
+      "- Health Status: Yes\n",
+      "- Medical History: Yes\n",
+      "- Medication Use: No\n",
+      "- Substance Use: Yes\n",
+      "- Infectious Disease Status: Yes\n",
+      "- Environmental Exposure: Yes\n",
+      "\n",
+      "Overall Patient Eligibility: No\n"
+     ]
+    }
+   ],
+   "source": [
+    "def determine_overall_eligibility(eligibility_results):\n",
+    "    \"\"\"Collapse the per-criterion verdicts into a single \"Yes\" or \"No\".\n",
+    "\n",
+    "    Accepts either the LLM text (bullet lines such as \"- Keyword: Yes\") or a dict\n",
+    "    mapping keyword to verdict. Section headers, blank lines and other non-bullet\n",
+    "    lines are ignored. As in the evaluation prompt, \"Yes\" is the passing verdict\n",
+    "    for inclusion and exclusion criteria alike.\n",
+    "\n",
+    "    Returns:\n",
+    "        \"Yes\" if at least one verdict was found and every verdict is \"Yes\",\n",
+    "        otherwise \"No\".\n",
+    "    \"\"\"\n",
+    "    if isinstance(eligibility_results, dict):\n",
+    "        verdicts = [str(value).strip() for value in eligibility_results.values()]\n",
+    "    else:\n",
+    "        verdicts = []\n",
+    "        for line in eligibility_results.splitlines():\n",
+    "            line = line.strip()\n",
+    "            # Only bullet lines carry a verdict. Testing the \"Inclusion Criteria:\"\n",
+    "            # header as well (as before) made the overall result always \"No\".\n",
+    "            if line.startswith('-') and ':' in line:\n",
+    "                verdicts.append(line.rsplit(':', 1)[1].strip())\n",
+    "    \n",
+    "    # No parsed verdicts means the response was unusable; do not report eligibility.\n",
+    "    overall_eligible = bool(verdicts) and all(verdict == 'Yes' for verdict in verdicts)\n",
+    "    \n",
+    "    return \"Yes\" if overall_eligible else \"No\"\n",
+    "\n",
+    "# Run the two-step LLM pipeline once, then reduce its answer to one verdict\n",
+    "eligibility_results = process_patient_eligibility(trial_criteria, patient_ehr)\n",
+    "final_eligibility = determine_overall_eligibility(eligibility_results)\n",
+    "\n",
+    "# Both report parts are emitted by one print call, separated by a newline\n",
+    "print(\n",
+    "    f\"Patient Eligibility Details:\\n{eligibility_results}\",\n",
+    "    f\"\\nOverall Patient Eligibility: {final_eligibility}\",\n",
+    "    sep=\"\\n\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Patient Eligibility Details:\n",
+      "Gender: No\n",
+      "Age: No\n",
+      "BMI: No\n",
+      "Weight: Yes\n",
+      "Health Status: Yes\n",
+      "Medical History: Yes\n",
+      "Medication Use: No\n",
+      "Substance Use: Yes\n",
+      "Infectious Disease Status: Yes\n",
+      "Environmental Exposure: Yes\n",
+      "\n",
+      "Overall Patient Eligibility: No\n"
+     ]
+    }
+   ],
+   "source": [
+    "def parse_eligibility_results(eligibility_results):\n",
+    "    \"\"\"Turn the LLM's bullet list into a ``{keyword: verdict}`` dict.\n",
+    "\n",
+    "    Only lines that start with '-' and contain a ':' are read; the verdict is the\n",
+    "    text after the last ':' on the line. Text before the first\n",
+    "    \"Exclusion Criteria:\" header counts as the inclusion section, the rest as the\n",
+    "    exclusion section. A missing header therefore no longer raises.\n",
+    "\n",
+    "    A keyword that is already in the dict is stored under \"<keyword> (<section>)\"\n",
+    "    so that a later verdict does not silently replace an earlier one.\n",
+    "\n",
+    "    Args:\n",
+    "        eligibility_results: Text returned by the eligibility LLM step.\n",
+    "\n",
+    "    Returns:\n",
+    "        Dict mapping each keyword to its verdict string, in order of appearance.\n",
+    "    \"\"\"\n",
+    "    inclusion_text, _, exclusion_text = eligibility_results.partition('Exclusion Criteria:')\n",
+    "    \n",
+    "    eligibility_dict = {}\n",
+    "    \n",
+    "    for section, section_text in (('Inclusion', inclusion_text), ('Exclusion', exclusion_text)):\n",
+    "        for line in section_text.splitlines():\n",
+    "            line = line.strip()\n",
+    "            if not line.startswith('-') or ':' not in line:\n",
+    "                continue\n",
+    "            # rsplit keeps keywords that themselves contain a colon intact\n",
+    "            key, value = line.lstrip('- ').rsplit(':', 1)\n",
+    "            key, value = key.strip(), value.strip()\n",
+    "            if key in eligibility_dict:\n",
+    "                key = f\"{key} ({section})\"\n",
+    "            eligibility_dict[key] = value\n",
+    "    \n",
+    "    return eligibility_dict\n",
+    "\n",
+    "def determine_overall_eligibility(eligibility_dict):\n",
+    "    \"\"\"Return \"Yes\" only when every parsed criterion verdict is \"Yes\".\n",
+    "\n",
+    "    NOTE: this redefines the text-based determine_overall_eligibility from the\n",
+    "    earlier cell; after this cell runs, the name expects a dict.\n",
+    "\n",
+    "    Args:\n",
+    "        eligibility_dict: Mapping of criterion keyword to verdict string.\n",
+    "\n",
+    "    Returns:\n",
+    "        \"Yes\" if the mapping is non-empty and all verdicts equal \"Yes\", otherwise\n",
+    "        \"No\". An empty mapping (nothing was parsed) is reported as \"No\" rather\n",
+    "        than as vacuously eligible.\n",
+    "    \"\"\"\n",
+    "    if not eligibility_dict:\n",
+    "        return \"No\"\n",
+    "    return \"Yes\" if all(str(value).strip() == 'Yes' for value in eligibility_dict.values()) else \"No\"\n",
+    "\n",
+    "# Parse the LLM text captured earlier and derive the overall verdict from it\n",
+    "eligibility_dict = parse_eligibility_results(eligibility_results)\n",
+    "final_eligibility = determine_overall_eligibility(eligibility_dict)\n",
+    "\n",
+    "# One line per criterion, then the overall verdict\n",
+    "print(\"Patient Eligibility Details:\")\n",
+    "for criterion, status in eligibility_dict.items():\n",
+    "    print(criterion, status, sep=\": \")\n",
+    "\n",
+    "print(f\"\\nOverall Patient Eligibility: {final_eligibility}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Eligibility results saved to 1a654b50-5c1d-ec96-1d56-8d7c12140983_eligibility.json\n",
+      "\n",
+      "Overall Patient Eligibility: No\n"
+     ]
+    }
+   ],
+   "source": [
+    "def extract_ids(patient_ehr_path, trial_criteria_path):\n",
+    "    \"\"\"Derive the patient and trial identifiers from the two file names.\n",
+    "\n",
+    "    Each identifier is the part of the file's base name that precedes the first\n",
+    "    underscore (the whole base name when it contains no underscore).\n",
+    "\n",
+    "    Returns:\n",
+    "        Tuple ``(patient_id, trial_id)``.\n",
+    "    \"\"\"\n",
+    "    patient_file = os.path.basename(patient_ehr_path)\n",
+    "    trial_file = os.path.basename(trial_criteria_path)\n",
+    "    return patient_file.partition('_')[0], trial_file.partition('_')[0]\n",
+    "\n",
+    "def extract_study_title(trial_criteria_path):\n",
+    "    \"\"\"Read the study title from the first line of a trial criteria file.\n",
+    "\n",
+    "    The first line is expected to look like \"Study Title: {title}\".\n",
+    "\n",
+    "    Returns:\n",
+    "        The title with surrounding whitespace removed, or None when the first\n",
+    "        line does not start with \"Study Title:\".\n",
+    "    \"\"\"\n",
+    "    prefix = \"Study Title:\"\n",
+    "    with open(trial_criteria_path, 'r', encoding='utf-8') as f:\n",
+    "        first_line = f.readline().strip()\n",
+    "    if first_line.startswith(prefix):\n",
+    "        # Drop only the leading marker; str.replace would also delete any later\n",
+    "        # occurrence of the marker inside the title itself.\n",
+    "        return first_line[len(prefix):].strip()\n",
+    "    return None\n",
+    "\n",
+    "def create_eligibility_json(patient_id, trial_id, study_title, eligibility_dict):\n",
+    "    \"\"\"Assemble the result document for one patient and one trial.\n",
+    "\n",
+    "    The trial entry lists under \"eligibilityCriteriaMet\" every criterion whose\n",
+    "    status is exactly \"Yes\", in the order they appear in ``eligibility_dict``.\n",
+    "    \"\"\"\n",
+    "    criteria_met = []\n",
+    "    for criterion, status in eligibility_dict.items():\n",
+    "        if status == \"Yes\":\n",
+    "            criteria_met.append(criterion)\n",
+    "\n",
+    "    trial_entry = {\n",
+    "        \"trialId\": trial_id,\n",
+    "        \"studyTitle\": study_title,\n",
+    "        \"eligibilityCriteriaMet\": criteria_met,\n",
+    "    }\n",
+    "    return {\"patientId\": patient_id, \"eligibleTrials\": [trial_entry]}\n",
+    "\n",
+    "# Identify the patient and trial from the input file names and read the trial title\n",
+    "patient_id, trial_id = extract_ids(single_patient_ehr_path, single_trial_criteria_path)\n",
+    "study_title = extract_study_title(single_trial_criteria_path)\n",
+    "\n",
+    "# Reuse the LLM text obtained in an earlier cell (eligibility_results); no new API call\n",
+    "eligibility_dict = parse_eligibility_results(eligibility_results)\n",
+    "final_eligibility = determine_overall_eligibility(eligibility_dict)\n",
+    "\n",
+    "# Build the result document and write it to the current working directory\n",
+    "eligibility_json = create_eligibility_json(patient_id, trial_id, study_title, eligibility_dict)\n",
+    "output_filename = f\"{patient_id}_eligibility.json\"\n",
+    "with open(output_filename, 'w') as f:\n",
+    "    f.write(json.dumps(eligibility_json, indent=2))\n",
+    "\n",
+    "print(f\"Eligibility results saved to {output_filename}\")\n",
+    "print(f\"\\nOverall Patient Eligibility: {final_eligibility}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Scraping Environment",
+   "language": "python",
+   "name": "scraping_env"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.19"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}