[8c54ae]: / notebooks / final_architecture.ipynb

Download this file

486 lines (485 with data), 18.0 kB

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "from langchain.prompts import PromptTemplate\n",
    "from langchain.chains import LLMChain\n",
    "from langchain_openai import ChatOpenAI\n",
    "from langchain.chains import LLMChain, SimpleSequentialChain\n",
    "\n",
    "from dotenv import load_dotenv\n",
    "\n",
    "# Load variables from a local .env file (if one exists) before reading the key;\n",
    "# load_dotenv was imported above but never called, so .env entries were ignored\n",
    "load_dotenv()\n",
    "openai_api_key = os.getenv(\"OPENAI_API_KEY\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Project root; set PATIENT_TRIALS_ROOT to run the notebook from a different checkout location.\n",
    "# The default reproduces the original hard-coded paths exactly.\n",
    "project_root = os.environ.get('PATIENT_TRIALS_ROOT', '/Users/bharathbeeravelly/Desktop/patient-trials-matching')\n",
    "\n",
    "single_patient_ehr_path = os.path.join(project_root, 'data', 'processed', 'patients_small', '1a654b50-5c1d-ec96-1d56-8d7c12140983_data.json')\n",
    "single_trial_criteria_path = os.path.join(project_root, 'data', 'raw', 'scraped_small', 'NCT06576401_criteria.txt')\n",
    "\n",
    "# Read the single patient EHR and trial criteria\n",
    "# (explicit UTF-8 so decoding does not depend on the platform default encoding)\n",
    "with open(single_patient_ehr_path, encoding='utf-8') as f:\n",
    "    patient_ehr = json.load(f)\n",
    "\n",
    "with open(single_trial_criteria_path, encoding='utf-8') as f:\n",
    "    trial_criteria = f.read()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1st LLM: Identify keywords for each criterion\n",
    "def identify_criteria_keywords(trial_criteria):\n",
    "    \"\"\"Ask the LLM to name the keyword/attribute behind each trial criterion.\n",
    "\n",
    "    Args:\n",
    "        trial_criteria: Text of the trial's criteria; it is substituted into the\n",
    "            prompt unchanged.\n",
    "\n",
    "    Returns:\n",
    "        The chat model's response object exactly as returned by ``llm.invoke``;\n",
    "        this function does not extract its text.\n",
    "    \"\"\"\n",
    "    # Define the system message for the LLM to identify relevant keywords\n",
    "    system_message = \"\"\"\n",
    "    You are a clinical trial assistant.\n",
    "    Your task is to read the inclusion, exclusion, and other criteria of a clinical trial, and identify relevant keywords from each criterion.\n",
    "    \n",
    "    Common keywords may include: \"Gender\", \"Age\", \"Race\", \"Ethnic Group\", \"Language\", \"BMI\", \"BPM\", \"Height\", \"Weight\", etc.\n",
    "\n",
    "    For each criterion, respond with the most relevant keyword or attribute it is concerned with.\n",
    "    \"\"\"\n",
    "\n",
    "    # Initialize the OpenAI LLM model\n",
    "    llm = ChatOpenAI(temperature=0, model = 'gpt-4o-mini', openai_api_key=openai_api_key)\n",
    "\n",
    "    # Create the prompt for keyword identification.\n",
    "    # The f-string inlines system_message now; the doubled braces leave a literal\n",
    "    # {criteria} placeholder for PromptTemplate to fill in afterwards.\n",
    "    prompt_template = PromptTemplate(\n",
    "        input_variables=[\"criteria\"],\n",
    "        template=f\"\"\"\n",
    "        {system_message}\n",
    "\n",
    "        Trial Criteria: {{criteria}}\n",
    "\n",
    "        For each criterion, identify the relevant keyword or patient attribute.\n",
    "        \"\"\"\n",
    "    )\n",
    "    \n",
    "    # Format the prompt with the actual trial criteria\n",
    "    prompt = prompt_template.format(criteria=trial_criteria)\n",
    "    \n",
    "    # Send the prompt to the LLM; invoke() replaces the deprecated llm(prompt) call form\n",
    "    response = llm.invoke(prompt)\n",
    "    \n",
    "    # Return the LLM response that carries the identified keywords\n",
    "    return response"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Function to extract only the relevant information from Patient_EHR for LLM processing\n",
    "def extract_relevant_patient_data(patient_ehr):\n",
    "    \"\"\"Return the subset of EHR fields that is forwarded to the eligibility prompt.\n",
    "\n",
    "    Every listed field is looked up with ``.get``, so a field that is absent from\n",
    "    ``patient_ehr`` still appears in the result, with the value ``None``.\n",
    "    \"\"\"\n",
    "    # Field names, in the order they appear in the returned dict\n",
    "    forwarded_fields = (\n",
    "        \"Gender\",\n",
    "        \"Age\",\n",
    "        \"Race\",\n",
    "        \"Ethnic Group\",\n",
    "        \"Language\",\n",
    "        \"Vital Signs\",\n",
    "        \"Medications\",\n",
    "        \"Problems\",\n",
    "        \"Surgeries\",\n",
    "        \"Immunizations\",\n",
    "    )\n",
    "    return {field: patient_ehr.get(field) for field in forwarded_fields}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2nd LLM: Evaluate patient eligibility based on keywords\n",
    "def evaluate_criteria_by_keywords(criteria_keywords, patient_ehr, todays_date=\"07th October 2024\"):\n",
    "    \"\"\"Ask the LLM for a Yes/No verdict per criterion keyword for one patient.\n",
    "\n",
    "    Args:\n",
    "        criteria_keywords: Keywords from the first LLM step, either as plain text\n",
    "            or as a response object whose ``content`` attribute holds the text.\n",
    "        patient_ehr: Patient record dict; only the fields selected by\n",
    "            ``extract_relevant_patient_data`` are sent to the model.\n",
    "        todays_date: Free-form date text inserted into the system message as the\n",
    "            reference date. Defaults to the value that used to be hard-coded.\n",
    "\n",
    "    Returns:\n",
    "        The chat model's response object exactly as returned by ``llm.invoke``.\n",
    "    \"\"\"\n",
    "    # Define the system message for the LLM to evaluate eligibility\n",
    "    system_message = f\"\"\"\n",
    "    You are a clinical trial assistant. Today's date is {todays_date}. \n",
    "    Your task is to compare the patient's information (Gender, Age, Race, Ethnic Group, Language, Vital Signs) \n",
    "    with the clinical trial's inclusion and exclusion criteria using the identified keywords. \n",
    "    If any medication is used by the patient, then check the last usage of the medication.\n",
    "    \n",
    "    \n",
    "    For each inclusion criterion, respond with one of the following:\n",
    "    - \"Yes\" if the patient meets the criterion\n",
    "    - \"No\" if there is evidence that the criterion is not met\n",
    "    - \"Yes\" if there is no information available to determine eligibility.\n",
    "    \n",
    "    For each exclusion criterion, respond with one of the following:\n",
    "    - \"Yes\" if the patient does not meet the criterion\n",
    "    - \"No\" if there is evidence that the criterion is met\n",
    "    - \"Yes\" if there is no information available to determine eligibility.\n",
    "    \n",
    "    \"\"\"\n",
    "\n",
    "    # Initialize the OpenAI LLM model\n",
    "    llm = ChatOpenAI(temperature=0, model = 'gpt-4o-mini', openai_api_key=openai_api_key)\n",
    "\n",
    "    # Extract relevant patient data from EHR\n",
    "    relevant_patient_data = extract_relevant_patient_data(patient_ehr)\n",
    "\n",
    "    # The pipeline hands over the first step's response object; use its text so the\n",
    "    # prompt receives the keywords rather than the object's string representation.\n",
    "    # Plain strings have no .content and pass through unchanged.\n",
    "    keywords_text = getattr(criteria_keywords, \"content\", criteria_keywords)\n",
    "\n",
    "    # Create the prompt for evaluating eligibility\n",
    "    # NOTE(review): the closing instructions ask for a reason on \"No\" and then forbid\n",
    "    # any reasoning; the wording is left unchanged here - confirm which is intended.\n",
    "    prompt_template = PromptTemplate(\n",
    "        input_variables=[\"criteria_keywords\", \"patient_data\"],\n",
    "        template=f\"\"\"\n",
    "        {system_message}\n",
    "\n",
    "        Criteria Keywords: {{criteria_keywords}}\n",
    "\n",
    "        Patient Information: {{patient_data}}\n",
    "\n",
    "        For each criterion keyword, respond with:\n",
    "        - \"Yes\" if the patient meets the criterion\n",
    "        - \"No\" if the patient does not meet the criterion and reason\n",
    "        \n",
    "        While evaluating one criteria, consider only the respective criteria but not any other criteria.\n",
    "        While rating the criteria, with 'Yes' or 'No', do not give any reasoning\n",
    "        \n",
    "        The format of response should be as below:\n",
    "        \n",
    "        Inclusion Criteria:\n",
    "        - Keyword Placeholder 1: Yes \n",
    "        - Keyword Placeholder 2: No \n",
    "        - Keyword Placeholder 3: No \n",
    "        - Keyword Placeholder 4: Yes \n",
    "        .\n",
    "        .\n",
    "        .\n",
    "        - Keyword Placeholder N: Yes\n",
    "        \n",
    "        Exclusion Criteria:\n",
    "        - Keyword Placeholder 1: No\n",
    "        - Keyword Placeholder 2: Yes\n",
    "        - Keyword Placeholder 3: No\n",
    "        - Keyword Placeholder 4: Yes\n",
    "        .\n",
    "        .\n",
    "        .\n",
    "        - Keyword Placeholder N: No\n",
    "        \n",
    "        While giving the response, do not output the whole criteria mentioned in the txt file. Instead, just give the keyword and the response.\n",
    "      \n",
    "        \n",
    "        \"\"\"\n",
    "    )\n",
    "\n",
    "    # Format the prompt with the criteria keywords and patient data\n",
    "    prompt = prompt_template.format(\n",
    "        criteria_keywords=keywords_text,\n",
    "        patient_data=relevant_patient_data\n",
    "    )\n",
    "    \n",
    "    # Send the prompt to the LLM; invoke() replaces the deprecated llm(prompt) call form\n",
    "    response = llm.invoke(prompt)\n",
    "    \n",
    "    return response"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_patient_eligibility(trial_criteria, patient_ehr):\n",
    "    \"\"\"Run the two-step LLM pipeline and return the eligibility text.\n",
    "\n",
    "    The trial criteria are first reduced to keywords; those keywords are then\n",
    "    evaluated against the patient record. The ``content`` attribute of the\n",
    "    second step's response is what gets returned.\n",
    "    \"\"\"\n",
    "    # Step 1: keyword identification\n",
    "    keywords_response = identify_criteria_keywords(trial_criteria)\n",
    "\n",
    "    # Step 2: per-keyword eligibility verdicts for this patient\n",
    "    evaluation = evaluate_criteria_by_keywords(keywords_response, patient_ehr)\n",
    "\n",
    "    return evaluation.content\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/vh/y8k1dgkj76s01krzn8qtn3nw0000gn/T/ipykernel_81061/552846339.py:32: LangChainDeprecationWarning: The method `BaseChatModel.__call__` was deprecated in langchain-core 0.1.7 and will be removed in 1.0. Use :meth:`~invoke` instead.\n",
      "  response = llm(prompt)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'Inclusion Criteria:\\n- Gender: No \\n- Age: No \\n- BMI: No \\n\\nExclusion Criteria:\\n- Health Status: Yes\\n- Medical History: Yes\\n- Medication Use: No \\n- Substance Use: Yes\\n- Infectious Disease Status: Yes\\n- Environmental Exposure: Yes'"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Run the two-step pipeline for the patient and trial loaded earlier in the notebook;\n",
    "# the returned text is displayed as this cell's output (each run calls the LLM twice).\n",
    "process_patient_eligibility(trial_criteria, patient_ehr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Inclusion Criteria:\n",
      "- Gender: No \n",
      "- Age: No \n",
      "- BMI: No \n",
      "\n",
      "Exclusion Criteria:\n",
      "- Health Status: Yes\n",
      "- Medical History: Yes\n",
      "- Medication Use: No \n",
      "- Substance Use: Yes\n",
      "- Infectious Disease Status: Yes\n",
      "- Environmental Exposure: Yes\n"
     ]
    }
   ],
   "source": [
    "# Hard-coded copy of an earlier pipeline response, printed so its \\n escapes render as line breaks.\n",
    "# Nothing here calls the LLM, so this text does not change when the pipeline is re-run.\n",
    "print('Inclusion Criteria:\\n- Gender: No \\n- Age: No \\n- BMI: No \\n\\nExclusion Criteria:\\n- Health Status: Yes\\n- Medical History: Yes\\n- Medication Use: No \\n- Substance Use: Yes\\n- Infectious Disease Status: Yes\\n- Environmental Exposure: Yes'\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Patient Eligibility Details:\n",
      "Inclusion Criteria:\n",
      "- Gender: No \n",
      "- Age: No \n",
      "- BMI: No \n",
      "- Weight: Yes \n",
      "\n",
      "Exclusion Criteria:\n",
      "- Health Status: Yes\n",
      "- Medical History: Yes\n",
      "- Medication Use: No\n",
      "- Substance Use: Yes\n",
      "- Infectious Disease Status: Yes\n",
      "- Environmental Exposure: Yes\n",
      "\n",
      "Overall Patient Eligibility: No\n"
     ]
    }
   ],
   "source": [
    "def determine_overall_eligibility(eligibility_results):\n",
    "    \"\"\"Collapse the LLM's per-criterion verdicts into one overall \"Yes\"/\"No\".\n",
    "\n",
    "    The evaluation prompt defines \"Yes\" as the passing answer in both sections\n",
    "    (for an exclusion criterion, \"Yes\" means the patient does not meet it), so\n",
    "    the patient is eligible only when every bullet line ends in \"Yes\".\n",
    "\n",
    "    Args:\n",
    "        eligibility_results: Response text made of ``- Keyword: Yes|No`` bullet\n",
    "            lines under ``Inclusion Criteria:`` / ``Exclusion Criteria:`` headers.\n",
    "\n",
    "    Returns:\n",
    "        \"Yes\" if at least one verdict was found and all verdicts are \"Yes\",\n",
    "        otherwise \"No\".\n",
    "    \"\"\"\n",
    "    # Only bullet lines carry a verdict. Judging the section headers as well (as the\n",
    "    # previous version did) made a \"Yes\" result impossible.\n",
    "    verdicts = [\n",
    "        line.rpartition(':')[2].strip()\n",
    "        for line in eligibility_results.splitlines()\n",
    "        if line.strip().startswith('-')\n",
    "    ]\n",
    "    \n",
    "    # No verdicts at all means nothing was evaluated, which is not evidence of eligibility\n",
    "    overall_eligible = bool(verdicts) and all(verdict == 'Yes' for verdict in verdicts)\n",
    "    \n",
    "    return \"Yes\" if overall_eligible else \"No\"\n",
    "\n",
    "# Run the pipeline (this calls the LLM again) and reduce its text to one overall verdict.\n",
    "# `eligibility_results` is a notebook-level name that later cells read without recomputing it.\n",
    "eligibility_results = process_patient_eligibility(trial_criteria, patient_ehr)\n",
    "final_eligibility = determine_overall_eligibility(eligibility_results)\n",
    "\n",
    "print(f\"Patient Eligibility Details:\\n{eligibility_results}\")\n",
    "print(f\"\\nOverall Patient Eligibility: {final_eligibility}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Patient Eligibility Details:\n",
      "Gender: No\n",
      "Age: No\n",
      "BMI: No\n",
      "Weight: Yes\n",
      "Health Status: Yes\n",
      "Medical History: Yes\n",
      "Medication Use: No\n",
      "Substance Use: Yes\n",
      "Infectious Disease Status: Yes\n",
      "Environmental Exposure: Yes\n",
      "\n",
      "Overall Patient Eligibility: No\n"
     ]
    }
   ],
   "source": [
    "def parse_eligibility_results(eligibility_results):\n",
    "    \"\"\"Turn the LLM's bullet-list verdicts into a ``{keyword: verdict}`` dict.\n",
    "\n",
    "    Only lines that start with ``-`` (after stripping) are read. Each one is split\n",
    "    at its last colon, so a keyword that itself contains a colon stays in one\n",
    "    piece; bullet lines without any colon are skipped instead of raising.\n",
    "\n",
    "    Args:\n",
    "        eligibility_results: Response text with an inclusion section followed by an\n",
    "            ``Exclusion Criteria:`` section. Neither header is required for parsing.\n",
    "\n",
    "    Returns:\n",
    "        Dict mapping each stripped keyword to its stripped verdict, in order of\n",
    "        first appearance. Inclusion and exclusion entries share one namespace, so\n",
    "        a keyword present in both sections ends up with its exclusion verdict.\n",
    "    \"\"\"\n",
    "    eligibility_dict = {}\n",
    "    \n",
    "    # Both sections use the same bullet format, so a single pass covers them;\n",
    "    # the section headers do not start with '-' and are ignored.\n",
    "    for raw_line in eligibility_results.splitlines():\n",
    "        line = raw_line.strip()\n",
    "        if not line.startswith('-'):\n",
    "            continue\n",
    "        keyword, separator, verdict = line.strip('- ').rpartition(':')\n",
    "        if not separator:\n",
    "            # Bullet without a colon: there is no verdict to record\n",
    "            continue\n",
    "        eligibility_dict[keyword.strip()] = verdict.strip()\n",
    "    \n",
    "    return eligibility_dict\n",
    "\n",
    "def determine_overall_eligibility(eligibility_dict):\n",
    "    \"\"\"Return \"Yes\" only when at least one criterion exists and every status is 'Yes'.\n",
    "\n",
    "    Args:\n",
    "        eligibility_dict: Mapping of criterion keyword to its verdict string.\n",
    "\n",
    "    Returns:\n",
    "        \"Yes\" or \"No\".\n",
    "    \"\"\"\n",
    "    if not eligibility_dict:\n",
    "        # An empty dict means no verdict was parsed; that must not be reported as eligible\n",
    "        return \"No\"\n",
    "    return \"Yes\" if all(value == 'Yes' for value in eligibility_dict.values()) else \"No\"\n",
    "\n",
    "# Parse the response text kept in `eligibility_results` by an earlier cell (no new LLM call here)\n",
    "# and compute the overall verdict with the dict-based determine_overall_eligibility defined in this cell.\n",
    "eligibility_dict = parse_eligibility_results(eligibility_results)\n",
    "final_eligibility = determine_overall_eligibility(eligibility_dict)\n",
    "\n",
    "# One \"keyword: verdict\" line per parsed criterion, inclusion and exclusion entries together\n",
    "print(\"Patient Eligibility Details:\")\n",
    "for criterion, status in eligibility_dict.items():\n",
    "    print(f\"{criterion}: {status}\")\n",
    "\n",
    "print(f\"\\nOverall Patient Eligibility: {final_eligibility}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Eligibility results saved to 1a654b50-5c1d-ec96-1d56-8d7c12140983_eligibility.json\n",
      "\n",
      "Overall Patient Eligibility: No\n"
     ]
    }
   ],
   "source": [
    "def extract_ids(patient_ehr_path, trial_criteria_path):\n",
    "    \"\"\"Derive the patient and trial identifiers from the two file names.\n",
    "\n",
    "    Each identifier is the part of the file's base name that precedes the first\n",
    "    underscore (the whole base name when it contains no underscore).\n",
    "\n",
    "    Returns:\n",
    "        A ``(patient_id, trial_id)`` tuple of strings.\n",
    "    \"\"\"\n",
    "    def _leading_token(path):\n",
    "        # Base-name text up to, but not including, the first '_'\n",
    "        return os.path.basename(path).partition('_')[0]\n",
    "\n",
    "    return _leading_token(patient_ehr_path), _leading_token(trial_criteria_path)\n",
    "\n",
    "def extract_study_title(trial_criteria_path):\n",
    "    \"\"\"Return the study title found on the first line of a trial criteria file.\n",
    "\n",
    "    The first line is expected to look like ``Study Title: <title>``.\n",
    "\n",
    "    Args:\n",
    "        trial_criteria_path: Path of the criteria text file (read as UTF-8).\n",
    "\n",
    "    Returns:\n",
    "        The title with surrounding whitespace removed, or ``None`` when the first\n",
    "        line does not start with ``Study Title:``.\n",
    "    \"\"\"\n",
    "    prefix = \"Study Title:\"\n",
    "    with open(trial_criteria_path, 'r', encoding='utf-8') as f:\n",
    "        first_line = f.readline().strip()\n",
    "    if first_line.startswith(prefix):\n",
    "        # Drop only the leading label; str.replace would also delete the same text\n",
    "        # if it happened to occur again inside the title itself\n",
    "        return first_line[len(prefix):].strip()\n",
    "    return None\n",
    "\n",
    "def create_eligibility_json(patient_id, trial_id, study_title, eligibility_dict):\n",
    "    \"\"\"Assemble the JSON-serialisable eligibility record for one patient/trial pair.\n",
    "\n",
    "    ``eligibilityCriteriaMet`` lists, in dictionary order, every criterion whose\n",
    "    status is exactly ``\"Yes\"``. The trial entry is always included, whatever the\n",
    "    statuses are.\n",
    "    \"\"\"\n",
    "    # Collect the criteria that were answered \"Yes\"\n",
    "    criteria_met = []\n",
    "    for criterion, status in eligibility_dict.items():\n",
    "        if status == \"Yes\":\n",
    "            criteria_met.append(criterion)\n",
    "\n",
    "    trial_entry = {\n",
    "        \"trialId\": trial_id,\n",
    "        \"studyTitle\": study_title,\n",
    "        \"eligibilityCriteriaMet\": criteria_met,\n",
    "    }\n",
    "    return {\"patientId\": patient_id, \"eligibleTrials\": [trial_entry]}\n",
    "\n",
    "# Main process: derive the IDs and study title from the two input files\n",
    "patient_id, trial_id = extract_ids(single_patient_ehr_path, single_trial_criteria_path)\n",
    "study_title = extract_study_title(single_trial_criteria_path)\n",
    "\n",
    "# Process eligibility\n",
    "# The pipeline call below is disabled, so this cell reuses the `eligibility_results`\n",
    "# text produced by an earlier cell; that cell has to be run first.\n",
    "# eligibility_results = process_patient_eligibility(trial_criteria, patient_ehr)\n",
    "eligibility_dict = parse_eligibility_results(eligibility_results)\n",
    "\n",
    "# Create the JSON structure\n",
    "# NOTE(review): the trial is placed under \"eligibleTrials\" without checking the overall\n",
    "# verdict computed further down - confirm that this is intended.\n",
    "eligibility_json = create_eligibility_json(patient_id, trial_id, study_title, eligibility_dict)\n",
    "\n",
    "# Save the JSON file (relative path, so it is written to the current working directory)\n",
    "output_filename = f\"{patient_id}_eligibility.json\"\n",
    "with open(output_filename, 'w') as f:\n",
    "    json.dump(eligibility_json, f, indent=2)\n",
    "\n",
    "print(f\"Eligibility results saved to {output_filename}\")\n",
    "\n",
    "# Print overall eligibility (computed after the file is written; it is not stored in the JSON)\n",
    "final_eligibility = determine_overall_eligibility(eligibility_dict)\n",
    "print(f\"\\nOverall Patient Eligibility: {final_eligibility}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Scraping Environment",
   "language": "python",
   "name": "scraping_env"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}