# 1st LLM: Identify keywords for each criterion
def identify_criteria_keywords(trial_criteria):
    """Ask the LLM which patient attribute each trial criterion is about.

    Args:
        trial_criteria: Raw text of the trial's inclusion/exclusion criteria.

    Returns:
        The chat-model response object produced by ``ChatOpenAI.invoke``.
        The generated keyword list is the response's ``content``.

    Raises:
        ValueError: If ``trial_criteria`` is empty or only whitespace.
    """
    # Fail before spending an API call: with no criteria text the model can
    # only invent keywords.
    if not trial_criteria or not str(trial_criteria).strip():
        raise ValueError("trial_criteria is empty; nothing to extract keywords from")

    # Instructions describing the keyword-extraction task
    system_message = """
    You are a clinical trial assistant.
    Your task is to read the inclusion, exclusion, and other criteria of a clinical trial, and identify relevant keywords from each criterion.

    Common keywords may include: "Gender", "Age", "Race", "Ethnic Group", "Language", "BMI", "BPM", "Height", "Weight", etc.

    For each criterion, respond with the most relevant keyword or attribute it is concerned with.
    """

    # temperature=0 is used for the model call
    llm = ChatOpenAI(temperature=0, model='gpt-4o-mini', openai_api_key=openai_api_key)

    # The f-string inlines the instructions now; the doubled braces leave a
    # literal {criteria} placeholder for PromptTemplate to fill in afterwards.
    prompt_template = PromptTemplate(
        input_variables=["criteria"],
        template=f"""
        {system_message}

        Trial Criteria: {{criteria}}

        For each criterion, identify the relevant keyword or patient attribute.
        """
    )

    # Substitute the actual trial criteria into the prompt
    prompt = prompt_template.format(criteria=trial_criteria)

    # Calling the model object directly (llm(prompt)) is deprecated in
    # langchain-core; invoke() is the supported entry point.
    response = llm.invoke(prompt)

    return response
# 2nd LLM: Evaluate patient eligibility based on keywords
def evaluate_criteria_by_keywords(criteria_keywords, patient_ehr, today=None):
    """Ask the LLM to rate the patient against each criterion keyword.

    Args:
        criteria_keywords: Keywords produced by the first LLM step. May be a
            plain string or a chat-message object exposing ``content``.
        patient_ehr: Patient record dict; only the fields selected by
            ``extract_relevant_patient_data`` are sent to the model.
        today: Optional ``datetime.date`` to present to the model as the
            current date. Defaults to the real current date.

    Returns:
        The chat-model response object produced by ``ChatOpenAI.invoke``.
        The Yes/No listing is the response's ``content``.

    Raises:
        ValueError: If ``criteria_keywords`` carries no text.
    """
    from datetime import date  # local import: the notebook's import cell does not provide it

    # The first step returns a message object. Formatting that object directly
    # would put its repr (metadata included) into the prompt, so use its text.
    criteria_text = getattr(criteria_keywords, "content", criteria_keywords)
    if not criteria_text or not str(criteria_text).strip():
        raise ValueError("criteria_keywords is empty; nothing to evaluate")

    # A hard-coded "today" goes stale on any later run; derive it per call.
    today_text = (today or date.today()).strftime("%d %B %Y")

    # Instructions describing how each criterion must be rated
    system_message = f"""
    You are a clinical trial assistant. Today's date is {today_text}.
    Your task is to compare the patient's information (Gender, Age, Race, Ethnic Group, Language, Vital Signs)
    with the clinical trial's inclusion and exclusion criteria using the identified keywords.
    If any medication is used by the patient, then check the last usage of the medication.


    For each inclusion criterion, respond with one of the following:
    - "Yes" if the patient meets the criterion
    - "No" if there is evidence that the criterion is not met
    - "Yes" if there is no information available to determine eligibility.

    For each exclusion criterion, respond with one of the following:
    - "Yes" if the patient does not meet the criterion
    - "No" if there is evidence that the criterion is met
    - "Yes" if there is no information available to determine eligibility.

    """

    # temperature=0 is used for the model call
    llm = ChatOpenAI(temperature=0, model='gpt-4o-mini', openai_api_key=openai_api_key)

    # Send only the EHR fields the evaluation needs
    relevant_patient_data = extract_relevant_patient_data(patient_ehr)

    # The f-string inlines the instructions; doubled braces leave literal
    # placeholders for PromptTemplate. The closing instructions defer to the
    # rules above instead of restating them, because the earlier restatement
    # contradicted both the exclusion rules and the no-reasoning rule.
    prompt_template = PromptTemplate(
        input_variables=["criteria_keywords", "patient_data"],
        template=f"""
        {system_message}

        Criteria Keywords: {{criteria_keywords}}

        Patient Information: {{patient_data}}

        For each criterion keyword, respond with exactly "Yes" or "No", applying the
        inclusion and exclusion rules given above.

        While evaluating one criteria, consider only the respective criteria but not any other criteria.
        While rating the criteria, with 'Yes' or 'No', do not give any reasoning

        The format of response should be as below:

        Inclusion Criteria:
        - Keyword Placeholder 1: Yes
        - Keyword Placeholder 2: No
        - Keyword Placeholder 3: No
        - Keyword Placeholder 4: Yes
        .
        .
        .
        - Keyword Placeholder N: Yes

        Exclusion Criteria:
        - Keyword Placeholder 1: No
        - Keyword Placeholder 2: Yes
        - Keyword Placeholder 3: No
        - Keyword Placeholder 4: Yes
        .
        .
        .
        - Keyword Placeholder N: No

        While giving the response, do not output the whole criteria mentioned in the txt file. Instead, just give the keyword and the response.


        """
    )

    # Fill in the keyword text and the trimmed patient record
    prompt = prompt_template.format(
        criteria_keywords=criteria_text,
        patient_data=relevant_patient_data
    )

    # Calling the model object directly (llm(prompt)) is deprecated in
    # langchain-core; invoke() is the supported entry point.
    response = llm.invoke(prompt)

    return response
"/var/folders/vh/y8k1dgkj76s01krzn8qtn3nw0000gn/T/ipykernel_81061/552846339.py:32: LangChainDeprecationWarning: The method `BaseChatModel.__call__` was deprecated in langchain-core 0.1.7 and will be removed in 1.0. Use :meth:`~invoke` instead.\n", + " response = llm(prompt)\n" + ] + }, + { + "data": { + "text/plain": [ + "'Inclusion Criteria:\\n- Gender: No \\n- Age: No \\n- BMI: No \\n\\nExclusion Criteria:\\n- Health Status: Yes\\n- Medical History: Yes\\n- Medication Use: No \\n- Substance Use: Yes\\n- Infectious Disease Status: Yes\\n- Environmental Exposure: Yes'" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "process_patient_eligibility(trial_criteria, patient_ehr)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Inclusion Criteria:\n", + "- Gender: No \n", + "- Age: No \n", + "- BMI: No \n", + "\n", + "Exclusion Criteria:\n", + "- Health Status: Yes\n", + "- Medical History: Yes\n", + "- Medication Use: No \n", + "- Substance Use: Yes\n", + "- Infectious Disease Status: Yes\n", + "- Environmental Exposure: Yes\n" + ] + } + ], + "source": [ + "print('Inclusion Criteria:\\n- Gender: No \\n- Age: No \\n- BMI: No \\n\\nExclusion Criteria:\\n- Health Status: Yes\\n- Medical History: Yes\\n- Medication Use: No \\n- Substance Use: Yes\\n- Infectious Disease Status: Yes\\n- Environmental Exposure: Yes'\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Patient Eligibility Details:\n", + "Inclusion Criteria:\n", + "- Gender: No \n", + "- Age: No \n", + "- BMI: No \n", + "- Weight: Yes \n", + "\n", + "Exclusion Criteria:\n", + "- Health Status: Yes\n", + "- Medical History: Yes\n", + "- Medication Use: No\n", + "- Substance Use: Yes\n", + "- Infectious Disease Status: Yes\n", + 
"- Environmental Exposure: Yes\n", + "\n", + "Overall Patient Eligibility: No\n" + ] + } + ], + "source": [ + "def determine_overall_eligibility(eligibility_results):\n", + " # Split the results into inclusion and exclusion criteria\n", + " inclusion_criteria, exclusion_criteria = eligibility_results.split('Exclusion Criteria:')\n", + " \n", + " # Check inclusion criteria\n", + " inclusion_eligible = all(result.strip().endswith(': Yes') for result in inclusion_criteria.split('\\n') if result.strip())\n", + " \n", + " # Check exclusion criteria\n", + " exclusion_eligible = all(result.strip().endswith(': No') for result in exclusion_criteria.split('\\n') if result.strip())\n", + " \n", + " # Determine overall eligibility\n", + " overall_eligible = inclusion_eligible and exclusion_eligible\n", + " \n", + " return \"Yes\" if overall_eligible else \"No\"\n", + "\n", + "# Use the function\n", + "eligibility_results = process_patient_eligibility(trial_criteria, patient_ehr)\n", + "final_eligibility = determine_overall_eligibility(eligibility_results)\n", + "\n", + "print(f\"Patient Eligibility Details:\\n{eligibility_results}\")\n", + "print(f\"\\nOverall Patient Eligibility: {final_eligibility}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Patient Eligibility Details:\n", + "Gender: No\n", + "Age: No\n", + "BMI: No\n", + "Weight: Yes\n", + "Health Status: Yes\n", + "Medical History: Yes\n", + "Medication Use: No\n", + "Substance Use: Yes\n", + "Infectious Disease Status: Yes\n", + "Environmental Exposure: Yes\n", + "\n", + "Overall Patient Eligibility: No\n" + ] + } + ], + "source": [ + "def parse_eligibility_results(eligibility_results):\n", + " # Split the results into inclusion and exclusion criteria\n", + " inclusion_criteria, exclusion_criteria = eligibility_results.split('Exclusion Criteria:')\n", + " \n", + " eligibility_dict = {}\n", + " \n", + " 
# Process inclusion criteria\n", + " for line in inclusion_criteria.split('\\n'):\n", + " if line.strip().startswith('-'):\n", + " key, value = line.strip('- ').split(': ')\n", + " eligibility_dict[key.strip()] = value.strip()\n", + " \n", + " # Process exclusion criteria\n", + " for line in exclusion_criteria.split('\\n'):\n", + " if line.strip().startswith('-'):\n", + " key, value = line.strip('- ').split(': ')\n", + " eligibility_dict[key.strip()] = value.strip()\n", + " \n", + " return eligibility_dict\n", + "\n", + "def determine_overall_eligibility(eligibility_dict):\n", + " # Check if all values are 'Yes'\n", + " return \"Yes\" if all(value == 'Yes' for value in eligibility_dict.values()) else \"No\"\n", + "\n", + "# Use the functions\n", + "eligibility_dict = parse_eligibility_results(eligibility_results)\n", + "final_eligibility = determine_overall_eligibility(eligibility_dict)\n", + "\n", + "print(\"Patient Eligibility Details:\")\n", + "for criterion, status in eligibility_dict.items():\n", + " print(f\"{criterion}: {status}\")\n", + "\n", + "print(f\"\\nOverall Patient Eligibility: {final_eligibility}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Eligibility results saved to 1a654b50-5c1d-ec96-1d56-8d7c12140983_eligibility.json\n", + "\n", + "Overall Patient Eligibility: No\n" + ] + } + ], + "source": [ + "def extract_ids(patient_ehr_path, trial_criteria_path):\n", + " # Extract patient ID from the file name\n", + " patient_id = os.path.basename(patient_ehr_path).split('_')[0]\n", + " \n", + " # Extract trial ID from the file name\n", + " trial_id = os.path.basename(trial_criteria_path).split('_')[0]\n", + " \n", + " return patient_id, trial_id\n", + "\n", + "def extract_study_title(trial_criteria_path):\n", + " # Read the first line of the trial criteria file to get the study title\n", + " with open(trial_criteria_path, 'r') as f:\n", + 
" first_line = f.readline().strip()\n", + " # Assuming the format is \"Study Title: {title}\"\n", + " if first_line.startswith(\"Study Title:\"):\n", + " return first_line.replace(\"Study Title:\", \"\").strip()\n", + " return None\n", + "\n", + "def create_eligibility_json(patient_id, trial_id, study_title, eligibility_dict):\n", + " # Create the JSON structure\n", + " eligibility_json = {\n", + " \"patientId\": patient_id,\n", + " \"eligibleTrials\": [\n", + " {\n", + " \"trialId\": trial_id,\n", + " \"studyTitle\": study_title,\n", + " \"eligibilityCriteriaMet\": [\n", + " criterion for criterion, status in eligibility_dict.items() if status == \"Yes\"\n", + " ]\n", + " }\n", + " ]\n", + " }\n", + " return eligibility_json\n", + "\n", + "# Main process\n", + "patient_id, trial_id = extract_ids(single_patient_ehr_path, single_trial_criteria_path)\n", + "study_title = extract_study_title(single_trial_criteria_path)\n", + "\n", + "# Process eligibility\n", + "# eligibility_results = process_patient_eligibility(trial_criteria, patient_ehr)\n", + "eligibility_dict = parse_eligibility_results(eligibility_results)\n", + "\n", + "# Create the JSON structure\n", + "eligibility_json = create_eligibility_json(patient_id, trial_id, study_title, eligibility_dict)\n", + "\n", + "# Save the JSON file\n", + "output_filename = f\"{patient_id}_eligibility.json\"\n", + "with open(output_filename, 'w') as f:\n", + " json.dump(eligibility_json, f, indent=2)\n", + "\n", + "print(f\"Eligibility results saved to {output_filename}\")\n", + "\n", + "# Print overall eligibility\n", + "final_eligibility = determine_overall_eligibility(eligibility_dict)\n", + "print(f\"\\nOverall Patient Eligibility: {final_eligibility}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Scraping Environment", + "language": "python", + "name": "scraping_env" + }, + "language_info": 
{ + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.19" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}