486 lines (485 with data), 18.0 kB
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import os\n",
"from datetime import date\n",
"\n",
"from dotenv import load_dotenv\n",
"from langchain.chains import LLMChain\n",
"from langchain.chains import LLMChain, SimpleSequentialChain\n",
"from langchain.prompts import PromptTemplate\n",
"from langchain_openai import ChatOpenAI\n",
"\n",
"# load_dotenv was imported but never called, so a key kept in a local .env file\n",
"# was never loaded. By default it leaves variables that are already set in the\n",
"# process environment untouched.\n",
"load_dotenv()\n",
"openai_api_key = os.getenv(\"OPENAI_API_KEY\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# NOTE(review): both paths are absolute and machine-specific - confirm before running elsewhere\n",
"single_patient_ehr_path = '/Users/bharathbeeravelly/Desktop/patient-trials-matching/data/processed/patients_small/1a654b50-5c1d-ec96-1d56-8d7c12140983_data.json'\n",
"single_trial_criteria_path = '/Users/bharathbeeravelly/Desktop/patient-trials-matching/data/raw/scraped_small/NCT06576401_criteria.txt'\n",
"\n",
"# Read the single patient EHR (JSON) and the trial criteria (plain text).\n",
"# The encoding is pinned so decoding does not depend on the platform's locale default.\n",
"with open(single_patient_ehr_path, encoding=\"utf-8\") as f:\n",
"    patient_ehr = json.load(f)\n",
"\n",
"with open(single_trial_criteria_path, encoding=\"utf-8\") as f:\n",
"    trial_criteria = f.read()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# 1st LLM: Identify keywords for each criterion\n",
"def identify_criteria_keywords(trial_criteria):\n",
"    \"\"\"Ask the LLM which patient attribute each trial criterion is about.\n",
"\n",
"    Args:\n",
"        trial_criteria: Raw text of the trial's inclusion/exclusion criteria.\n",
"\n",
"    Returns:\n",
"        The chat model's response object, returned unchanged.\n",
"    \"\"\"\n",
"    # Instructions that tell the model to name one keyword per criterion\n",
"    system_message = \"\"\"\n",
" You are a clinical trial assistant.\n",
" Your task is to read the inclusion, exclusion, and other criteria of a clinical trial, and identify relevant keywords from each criterion.\n",
" \n",
" Common keywords may include: \"Gender\", \"Age\", \"Race\", \"Ethnic Group\", \"Language\", \"BMI\", \"BPM\", \"Height\", \"Weight\", etc.\n",
"\n",
" For each criterion, respond with the most relevant keyword or attribute it is concerned with.\n",
" \"\"\"\n",
"\n",
"    # Chat model configured with temperature 0\n",
"    llm = ChatOpenAI(temperature=0, model='gpt-4o-mini', openai_api_key=openai_api_key)\n",
"\n",
"    # The f-string splices system_message in immediately; the doubled braces leave a\n",
"    # literal 'criteria' placeholder behind for PromptTemplate to fill in.\n",
"    prompt_template = PromptTemplate(\n",
"        input_variables=[\"criteria\"],\n",
"        template=f\"\"\"\n",
" {system_message}\n",
"\n",
" Trial Criteria: {{criteria}}\n",
"\n",
" For each criterion, identify the relevant keyword or patient attribute.\n",
" \"\"\"\n",
"    )\n",
"\n",
"    # Format the prompt with the actual trial criteria\n",
"    prompt = prompt_template.format(criteria=trial_criteria)\n",
"\n",
"    # invoke() replaces the deprecated BaseChatModel.__call__, i.e. llm(prompt)\n",
"    return llm.invoke(prompt)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Function to extract only the relevant information from Patient_EHR for LLM processing\n",
"def extract_relevant_patient_data(patient_ehr):\n",
"    \"\"\"Return the subset of EHR fields that is handed to the eligibility prompt.\n",
"\n",
"    Every listed field is looked up with ``.get``, so a field that is absent from\n",
"    ``patient_ehr`` still appears in the result, with the value ``None``.\n",
"    \"\"\"\n",
"    wanted_fields = (\n",
"        \"Gender\",\n",
"        \"Age\",\n",
"        \"Race\",\n",
"        \"Ethnic Group\",\n",
"        \"Language\",\n",
"        \"Vital Signs\",\n",
"        \"Medications\",\n",
"        \"Problems\",\n",
"        \"Surgeries\",\n",
"        \"Immunizations\",\n",
"    )\n",
"    return {field: patient_ehr.get(field) for field in wanted_fields}"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# 2nd LLM: Evaluate patient eligibility based on keywords\n",
"def evaluate_criteria_by_keywords(criteria_keywords, patient_ehr, reference_date=None):\n",
"    \"\"\"Ask the LLM for a Yes/No verdict per criterion keyword for one patient.\n",
"\n",
"    Args:\n",
"        criteria_keywords: Keyword text from the first LLM stage, either a plain\n",
"            string or a response object whose ``content`` attribute holds the text.\n",
"        patient_ehr: Patient record; only the fields selected by\n",
"            ``extract_relevant_patient_data`` are sent to the model.\n",
"        reference_date: ``datetime.date`` quoted to the model as today's date.\n",
"            Defaults to the day of the call.\n",
"\n",
"    Returns:\n",
"        The chat model's response object, returned unchanged.\n",
"    \"\"\"\n",
"    # The prompt used to hard-code a day in October 2024 as \"today\", which goes stale\n",
"    # for the medication last-usage check; take the date from the caller or the clock.\n",
"    today_text = (reference_date or date.today()).strftime(\"%d %B %Y\")\n",
"\n",
"    # Define the system message for the LLM to evaluate eligibility\n",
"    system_message = f\"\"\"\n",
" You are a clinical trial assistant. Today's date is {today_text}. \n",
" Your task is to compare the patient's information (Gender, Age, Race, Ethnic Group, Language, Vital Signs) \n",
" with the clinical trial's inclusion and exclusion criteria using the identified keywords. \n",
" If any medication is used by the patient, then check the last usage of the medication.\n",
" \n",
" \n",
" For each inclusion criterion, respond with one of the following:\n",
" - \"Yes\" if the patient meets the criterion\n",
" - \"No\" if there is evidence that the criterion is not met\n",
" - \"Yes\" if there is no information available to determine eligibility.\n",
" \n",
" For each exclusion criterion, respond with one of the following:\n",
" - \"Yes\" if the patient does not meet the criterion\n",
" - \"No\" if there is evidence that the criterion is met\n",
" - \"Yes\" if there is no information available to determine eligibility.\n",
" \n",
" \"\"\"\n",
"\n",
"    # Chat model configured with temperature 0\n",
"    llm = ChatOpenAI(temperature=0, model='gpt-4o-mini', openai_api_key=openai_api_key)\n",
"\n",
"    # Extract relevant patient data from EHR\n",
"    relevant_patient_data = extract_relevant_patient_data(patient_ehr)\n",
"\n",
"    # A response object would otherwise be formatted via str(); for LangChain messages\n",
"    # that presumably includes fields other than the text, so send only the text.\n",
"    keywords_text = getattr(criteria_keywords, \"content\", criteria_keywords)\n",
"\n",
"    # The f-string splices system_message in immediately; the doubled braces leave\n",
"    # literal placeholders behind for PromptTemplate to fill in.\n",
"    prompt_template = PromptTemplate(\n",
"        input_variables=[\"criteria_keywords\", \"patient_data\"],\n",
"        template=f\"\"\"\n",
" {system_message}\n",
"\n",
" Criteria Keywords: {{criteria_keywords}}\n",
"\n",
" Patient Information: {{patient_data}}\n",
"\n",
" For each criterion keyword, respond with:\n",
" - \"Yes\" if the patient meets the criterion\n",
" - \"No\" if the patient does not meet the criterion and reason\n",
" \n",
" While evaluating one criteria, consider only the respective criteria but not any other criteria.\n",
" While rating the criteria, with 'Yes' or 'No', do not give any reasoning\n",
" \n",
" The format of response should be as below:\n",
" \n",
" Inclusion Criteria:\n",
" - Keyword Placeholder 1: Yes \n",
" - Keyword Placeholder 2: No \n",
" - Keyword Placeholder 3: No \n",
" - Keyword Placeholder 4: Yes \n",
" .\n",
" .\n",
" .\n",
" - Keyword Placeholder N: Yes\n",
" \n",
" Exclusion Criteria:\n",
" - Keyword Placeholder 1: No\n",
" - Keyword Placeholder 2: Yes\n",
" - Keyword Placeholder 3: No\n",
" - Keyword Placeholder 4: Yes\n",
" .\n",
" .\n",
" .\n",
" - Keyword Placeholder N: No\n",
" \n",
" While giving the response, do not output the whole criteria mentioned in the txt file. Instead, just give the keyword and the response.\n",
" \n",
" \n",
" \"\"\"\n",
"    )\n",
"\n",
"    # Format the prompt with the criteria keywords and patient data\n",
"    prompt = prompt_template.format(\n",
"        criteria_keywords=keywords_text,\n",
"        patient_data=relevant_patient_data\n",
"    )\n",
"\n",
"    # invoke() replaces the deprecated BaseChatModel.__call__, i.e. llm(prompt)\n",
"    return llm.invoke(prompt)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def process_patient_eligibility(trial_criteria, patient_ehr):\n",
"    \"\"\"Run the two LLM stages back to back and return the verdict text.\n",
"\n",
"    The keyword response for ``trial_criteria`` is handed, together with\n",
"    ``patient_ehr``, to the evaluation stage; the ``content`` attribute of that\n",
"    second response is returned.\n",
"    \"\"\"\n",
"    # Stage 1: keywords per criterion\n",
"    keyword_response = identify_criteria_keywords(trial_criteria)\n",
"\n",
"    # Stage 2: per-keyword verdicts for this patient\n",
"    verdict_response = evaluate_criteria_by_keywords(keyword_response, patient_ehr)\n",
"    return verdict_response.content\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/vh/y8k1dgkj76s01krzn8qtn3nw0000gn/T/ipykernel_81061/552846339.py:32: LangChainDeprecationWarning: The method `BaseChatModel.__call__` was deprecated in langchain-core 0.1.7 and will be removed in 1.0. Use :meth:`~invoke` instead.\n",
" response = llm(prompt)\n"
]
},
{
"data": {
"text/plain": [
"'Inclusion Criteria:\\n- Gender: No \\n- Age: No \\n- BMI: No \\n\\nExclusion Criteria:\\n- Health Status: Yes\\n- Medical History: Yes\\n- Medication Use: No \\n- Substance Use: Yes\\n- Infectious Disease Status: Yes\\n- Environmental Exposure: Yes'"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Display the raw verdict text for the patient/trial pair loaded above\n",
"process_patient_eligibility(trial_criteria, patient_ehr)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Inclusion Criteria:\n",
"- Gender: No \n",
"- Age: No \n",
"- BMI: No \n",
"\n",
"Exclusion Criteria:\n",
"- Health Status: Yes\n",
"- Medical History: Yes\n",
"- Medication Use: No \n",
"- Substance Use: Yes\n",
"- Infectious Disease Status: Yes\n",
"- Environmental Exposure: Yes\n"
]
}
],
"source": [
"# Print a pasted copy of the verdict string so its line breaks are rendered\n",
"print(\n",
"    'Inclusion Criteria:\\n- Gender: No \\n- Age: No \\n- BMI: No \\n\\nExclusion Criteria:\\n- Health Status: Yes\\n- Medical History: Yes\\n- Medication Use: No \\n- Substance Use: Yes\\n- Infectious Disease Status: Yes\\n- Environmental Exposure: Yes'\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Patient Eligibility Details:\n",
"Inclusion Criteria:\n",
"- Gender: No \n",
"- Age: No \n",
"- BMI: No \n",
"- Weight: Yes \n",
"\n",
"Exclusion Criteria:\n",
"- Health Status: Yes\n",
"- Medical History: Yes\n",
"- Medication Use: No\n",
"- Substance Use: Yes\n",
"- Infectious Disease Status: Yes\n",
"- Environmental Exposure: Yes\n",
"\n",
"Overall Patient Eligibility: No\n"
]
}
],
"source": [
"def determine_overall_eligibility(eligibility_results):\n",
"    \"\"\"Reduce the per-criterion verdict text to one overall \"Yes\" or \"No\".\n",
"\n",
"    The evaluation prompt defines \"Yes\" as the passing verdict in both the\n",
"    inclusion and the exclusion section, so the patient is eligible only when\n",
"    every bullet line ends in \"Yes\".\n",
"\n",
"    Args:\n",
"        eligibility_results: Verdict text made of section headers and\n",
"            '- keyword: verdict' bullet lines.\n",
"\n",
"    Returns:\n",
"        \"Yes\" when at least one bullet is present and all bullets say \"Yes\",\n",
"        otherwise \"No\".\n",
"    \"\"\"\n",
"    verdicts = []\n",
"    for line in eligibility_results.splitlines():\n",
"        entry = line.strip()\n",
"        # Only bullet lines carry a verdict. Section headers such as\n",
"        # 'Inclusion Criteria:' used to be tested too, which forced a \"No\".\n",
"        if not entry.startswith('-'):\n",
"            continue\n",
"        verdicts.append(entry.rpartition(':')[2].strip())\n",
"\n",
"    # No verdicts at all means nothing was evaluated; that must not count as eligible.\n",
"    overall_eligible = bool(verdicts) and all(verdict == 'Yes' for verdict in verdicts)\n",
"\n",
"    return \"Yes\" if overall_eligible else \"No\"\n",
"\n",
"# Get the verdict text for the loaded patient/trial pair, then reduce it to one verdict\n",
"eligibility_results = process_patient_eligibility(trial_criteria, patient_ehr)\n",
"final_eligibility = determine_overall_eligibility(eligibility_results)\n",
"\n",
"# Show the per-criterion detail first, the overall verdict last\n",
"print(f\"Patient Eligibility Details:\\n{eligibility_results}\")\n",
"print(f\"\\nOverall Patient Eligibility: {final_eligibility}\")"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Patient Eligibility Details:\n",
"Gender: No\n",
"Age: No\n",
"BMI: No\n",
"Weight: Yes\n",
"Health Status: Yes\n",
"Medical History: Yes\n",
"Medication Use: No\n",
"Substance Use: Yes\n",
"Infectious Disease Status: Yes\n",
"Environmental Exposure: Yes\n",
"\n",
"Overall Patient Eligibility: No\n"
]
}
],
"source": [
"def parse_eligibility_results(eligibility_results):\n",
"    \"\"\"Turn the bullet-list verdict text into a ``{keyword: verdict}`` dict.\n",
"\n",
"    Bullets from the inclusion and the exclusion section share one dict, in\n",
"    the order they appear; a keyword that occurs more than once keeps the\n",
"    verdict of its last occurrence.\n",
"\n",
"    Args:\n",
"        eligibility_results: Verdict text made of section headers and\n",
"            '- keyword: verdict' bullet lines.\n",
"\n",
"    Returns:\n",
"        Dict mapping each keyword to its verdict string. A bullet without a\n",
"        colon is kept with an empty verdict so it cannot pass as a \"Yes\".\n",
"    \"\"\"\n",
"    eligibility_dict = {}\n",
"\n",
"    # One pass over every line; the section header is not needed to find the\n",
"    # bullets, so a missing 'Exclusion Criteria:' header no longer raises.\n",
"    for line in eligibility_results.splitlines():\n",
"        entry = line.strip()\n",
"        if not entry.startswith('-'):\n",
"            continue\n",
"\n",
"        text = entry.strip('- ')\n",
"        if not text:\n",
"            continue\n",
"\n",
"        # Split on the last colon so a keyword that itself contains a colon\n",
"        # still yields the trailing verdict.\n",
"        key, separator, value = text.rpartition(':')\n",
"        if not separator:\n",
"            key, value = text, ''\n",
"        eligibility_dict[key.strip()] = value.strip()\n",
"\n",
"    return eligibility_dict\n",
"\n",
"def determine_overall_eligibility(eligibility_dict):\n",
"    \"\"\"Return \"Yes\" only when every parsed verdict is exactly 'Yes'.\n",
"\n",
"    Args:\n",
"        eligibility_dict: Mapping of criterion keyword to verdict string.\n",
"\n",
"    Returns:\n",
"        \"Yes\" when the dict is non-empty and all values equal 'Yes',\n",
"        otherwise \"No\".\n",
"    \"\"\"\n",
"    verdicts = list(eligibility_dict.values())\n",
"    # all() is true for an empty sequence; an empty parse must not read as eligible.\n",
"    return \"Yes\" if verdicts and all(value == 'Yes' for value in verdicts) else \"No\"\n",
"\n",
"# Parse the verdict text, then derive the overall verdict from the parsed dict\n",
"eligibility_dict = parse_eligibility_results(eligibility_results)\n",
"final_eligibility = determine_overall_eligibility(eligibility_dict)\n",
"\n",
"# One 'keyword: verdict' line per parsed criterion\n",
"print(\"Patient Eligibility Details:\")\n",
"for criterion, status in eligibility_dict.items():\n",
"    print(f\"{criterion}: {status}\")\n",
"\n",
"print(f\"\\nOverall Patient Eligibility: {final_eligibility}\")"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Eligibility results saved to 1a654b50-5c1d-ec96-1d56-8d7c12140983_eligibility.json\n",
"\n",
"Overall Patient Eligibility: No\n"
]
}
],
"source": [
"def extract_ids(patient_ehr_path, trial_criteria_path):\n",
"    \"\"\"Derive the patient ID and the trial ID from the two input file names.\n",
"\n",
"    Each ID is the part of the file's base name that precedes the first\n",
"    underscore (the whole base name when it contains none).\n",
"\n",
"    Returns:\n",
"        Tuple ``(patient_id, trial_id)``.\n",
"    \"\"\"\n",
"    def leading_token(path):\n",
"        # Text before the first '_' of the file name, directories removed\n",
"        return os.path.basename(path).partition('_')[0]\n",
"\n",
"    return leading_token(patient_ehr_path), leading_token(trial_criteria_path)\n",
"\n",
"def extract_study_title(trial_criteria_path):\n",
"    \"\"\"Return the study title found on the first line of a trial criteria file.\n",
"\n",
"    The first line is expected to start with the prefix 'Study Title:'.\n",
"\n",
"    Args:\n",
"        trial_criteria_path: Path of the criteria text file.\n",
"\n",
"    Returns:\n",
"        The text after the prefix, stripped of surrounding whitespace, or\n",
"        ``None`` when the first line does not start with the prefix.\n",
"    \"\"\"\n",
"    prefix = \"Study Title:\"\n",
"\n",
"    # Only the first line is needed; the encoding is pinned so decoding does not\n",
"    # depend on the platform's locale default.\n",
"    with open(trial_criteria_path, 'r', encoding='utf-8') as f:\n",
"        first_line = f.readline().strip()\n",
"\n",
"    if not first_line.startswith(prefix):\n",
"        return None\n",
"\n",
"    # Drop only the leading prefix; str.replace would also delete any later\n",
"    # occurrence of the same text inside the title itself.\n",
"    return first_line[len(prefix):].strip()\n",
"\n",
"def create_eligibility_json(patient_id, trial_id, study_title, eligibility_dict):\n",
"    \"\"\"Assemble the JSON-serialisable record for one patient/trial pair.\n",
"\n",
"    Args:\n",
"        patient_id: Value stored under \"patientId\".\n",
"        trial_id: Value stored under \"trialId\".\n",
"        study_title: Value stored under \"studyTitle\".\n",
"        eligibility_dict: Mapping of criterion keyword to verdict string.\n",
"\n",
"    Returns:\n",
"        Dict with the patient ID and a one-element \"eligibleTrials\" list whose\n",
"        entry names the keywords with the verdict \"Yes\".\n",
"    \"\"\"\n",
"    criteria_met = [name for name, verdict in eligibility_dict.items() if verdict == \"Yes\"]\n",
"\n",
"    # NOTE(review): the trial is listed under \"eligibleTrials\" whatever the\n",
"    # verdicts are - confirm this is intended.\n",
"    trial_entry = {\n",
"        \"trialId\": trial_id,\n",
"        \"studyTitle\": study_title,\n",
"        \"eligibilityCriteriaMet\": criteria_met,\n",
"    }\n",
"    return {\"patientId\": patient_id, \"eligibleTrials\": [trial_entry]}\n",
"\n",
"# Derive the identifiers and the study title from the two input paths\n",
"patient_id, trial_id = extract_ids(single_patient_ehr_path, single_trial_criteria_path)\n",
"study_title = extract_study_title(single_trial_criteria_path)\n",
"\n",
"# Re-parse the verdict text already held in eligibility_results\n",
"# (no new process_patient_eligibility call is made in this cell)\n",
"eligibility_dict = parse_eligibility_results(eligibility_results)\n",
"\n",
"# Build the record to be saved\n",
"eligibility_json = create_eligibility_json(patient_id, trial_id, study_title, eligibility_dict)\n",
"\n",
"# Write it to a file named after the patient ID, relative to the working directory\n",
"output_filename = f\"{patient_id}_eligibility.json\"\n",
"with open(output_filename, 'w') as f:\n",
"    f.write(json.dumps(eligibility_json, indent=2))\n",
"\n",
"print(f\"Eligibility results saved to {output_filename}\")\n",
"\n",
"# Overall verdict computed from the parsed dict\n",
"final_eligibility = determine_overall_eligibility(eligibility_dict)\n",
"print(f\"\\nOverall Patient Eligibility: {final_eligibility}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Scraping Environment",
"language": "python",
"name": "scraping_env"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.19"
}
},
"nbformat": 4,
"nbformat_minor": 2
}