[96a5a0]: / src / services / identifier.py

Download this file

224 lines (185 with data), 8.4 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
# services/identifier.py
"""
Clinical Trial Criteria Identification Service
This module is responsible for extracting atomic criteria from clinical trial eligibility
criteria text. It uses LLM-based extraction to identify individual criteria and their
properties from natural language descriptions.
The identification process follows these steps:
1. Split criteria text into lines
2. For each line, extract atomic criteria using an LLM
3. Verify that extracted criteria match the source text
4. Organize results into structured data models
Functions:
identify_criterions_from_rawTrial: Extract criteria from a raw trial.
process_lines: Process a section of eligibility criteria into structured lines.
extract_atomic_criteria_from_line: Extract atomic criteria from a single line.
verify: Verify that extracted criteria match the source text.
"""
import logging
import re
from typing import List, Tuple
import rich
from src.models.identified_criteria import (
IdentifiedLine,
IdentifiedTrial,
LLMIdentifiedLineResponse,
LLMMultiRequirementCriterion,
RawTrialData,
)
from src.utils.config import TEMPERATURE, TIMEOUT
from src.utils.openai_client import get_openai_client
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
# Shared OpenAI client, created once when this module is imported and used by
# extract_atomic_criteria_from_line for its LLM requests.
client = get_openai_client()
def identify_criterions_from_rawTrial(trial: RawTrialData) -> IdentifiedTrial:
    """
    Extract and structure atomic criteria from a clinical trial's eligibility criteria.

    Each of the trial's three criteria sections (inclusion, exclusion and
    miscellaneous) is passed through ``process_lines``. The identified and
    failed lines of every section are then bundled into one ``IdentifiedTrial``.

    Args:
        trial (RawTrialData): The trial containing raw eligibility criteria.

    Returns:
        IdentifiedTrial: The structured criteria extracted from the trial.

    Raises:
        ValueError: If no atomic criteria could be extracted from the trial,
            i.e. none of the three sections produced an identified line.
            A ValueError raised while processing a section also propagates.
    """
    logger.info(
        "Starting identification of criteria for trial NCT ID: %s", trial.nct_id
    )
    # Ask for the *effective* level. ``logger.level`` is NOTSET (0) unless a
    # level was set on this exact logger, so ``logger.level <= logging.DEBUG``
    # is true by default and would dump the trial on every call.
    if logger.isEnabledFor(logging.DEBUG):
        rich.print(trial)  # Using rich.print for better readability in debug mode

    # Process each section of criteria
    inclusion_criteria_lines, inclusion_failed = process_lines(trial.inclusion_criteria)
    exclusion_criteria_lines, exclusion_failed = process_lines(trial.exclusion_criteria)
    miscellaneous_criteria_lines, miscellaneous_failed = process_lines(
        trial.miscellaneous_criteria
    )

    # Combine the results into the IdentifiedTrial object. It is built before
    # the success check because the failure message below embeds it.
    identified_trial = IdentifiedTrial(
        info=trial,
        inclusion_lines=inclusion_criteria_lines,
        exclusion_lines=exclusion_criteria_lines,
        miscellaneous_lines=miscellaneous_criteria_lines,
        failed_inclusion=inclusion_failed,
        failed_exclusion=exclusion_failed,
        failed_miscellaneous=miscellaneous_failed,
    )

    # One successfully identified line in any section is enough to succeed.
    if (
        inclusion_criteria_lines
        or exclusion_criteria_lines
        or miscellaneous_criteria_lines
    ):
        logger.info(
            "Successfully identified criteria for trial NCT ID: %s", trial.nct_id
        )
        return identified_trial

    logger.warning(
        "No atomic criteria extracted for trial NCT ID: %s", trial.nct_id
    )
    raise ValueError(
        f"No atomic criteria extracted for trial NCT ID: {trial.nct_id}\nThis was the failed result: {identified_trial}"
    )
def process_lines(
    section_text: str,
) -> Tuple[List[IdentifiedLine], List[IdentifiedLine]]:
    """
    Turn one eligibility-criteria section into per-line structured results.

    The section is split on line breaks and blank lines are dropped. Every
    remaining line is sent to ``extract_atomic_criteria_from_line`` and the
    returned criteria are checked against the line with ``verify``.

    Args:
        section_text (str): The text of a criteria section (inclusion, exclusion, or miscellaneous).

    Returns:
        tuple: A pair of lists:
            - lines whose criteria were extracted and passed verification
            - lines that yielded no criteria (stored with an empty criteria
              list) or whose criteria failed verification (stored with the
              rejected criteria)

    Raises:
        ValueError: Propagated from ``extract_atomic_criteria_from_line``;
            the extractor call is not guarded here, so a failure on one line
            aborts the whole section.
    """
    succeeded: List[IdentifiedLine] = []
    failed: List[IdentifiedLine] = []

    # Strip every physical line; filter(None, ...) discards the empty ones so
    # the 1-based numbering counts only lines that are actually processed.
    stripped = (chunk.strip() for chunk in re.split(r"[\n\r]+", section_text))
    for line_number, line in enumerate(filter(None, stripped), start=1):
        logger.debug("Processing line %d: %s", line_number, line)

        # Extract atomic criteria from the line
        criteria = extract_atomic_criteria_from_line(line).atomic_criteria

        if not criteria:
            logger.warning("Failed to extract criteria from line %d.", line_number)
            failed.append(IdentifiedLine(line=line, criterions=[]))
            continue

        try:
            # Verify that criteria match the source text
            verify(line, criteria)
            succeeded.append(IdentifiedLine(line=line, criterions=criteria))
        except ValueError as err:
            logger.error("Error validating line %d: %s", line_number, err)
            failed.append(IdentifiedLine(line=line, criterions=criteria))

    return succeeded, failed
def extract_atomic_criteria_from_line(line: str) -> LLMIdentifiedLineResponse:
    """
    Extract atomic criteria from a single line of eligibility criteria text using an LLM.

    The line is sent as the user message of a ``gpt-4o`` chat completion
    whose response is parsed into ``LLMIdentifiedLineResponse``.

    Args:
        line (str): A single line of eligibility criteria text.

    Returns:
        LLMIdentifiedLineResponse: The structured criteria extracted from the line.

    Raises:
        ValueError: If the LLM request fails (the original exception is
            chained as the cause), or if the completion carries no parsed
            response.
    """
    logger.debug("Extracting atomic criteria from line: %s", line)

    # Define the system prompt for the LLM. Adjacent string literals are
    # concatenated verbatim, so each sentence carries its own trailing space;
    # without it the sentences run together ("criteria.Given the ...").
    prompt = (
        "You are an expert in clinical trial eligibility criteria. "
        "Given the following line from an Oncological Clinical Trial Eligibility Criteria, extract every individual criterion they are testing the patient for. "
        "In other words, what are the specific properties/attributes/conditions that are being tested for in the patient? "
        "For each criterion, provide the exact snippets from the line that you used to identify it. "
        "Should your exact snippets be non-contiguous then provide multiple short exact snippets"
    )

    try:
        # Send the line to the LLM for processing
        completion = client.beta.chat.completions.parse(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": prompt},
                {"role": "user", "content": line},
            ],
            temperature=TEMPERATURE,
            response_format=LLMIdentifiedLineResponse,
            timeout=TIMEOUT,
        )
        response = completion.choices[0].message.parsed
    except Exception as e:
        logger.error("Error during LLM extraction: %s", e)
        raise ValueError(f"Error during LLM extraction: {e}") from e

    # Checked outside the try block so this ValueError is not caught by the
    # handler above, logged a second time and re-wrapped.
    if not response:
        logger.warning("Failed to parse LLM response.")
        raise ValueError(f"Failed to parse LLM response for line: '{line}'")

    logger.debug("Successfully extracted atomic criteria from line: %s", line)
    return response
def verify(line: str, criteria_list: List[LLMMultiRequirementCriterion]) -> None:
    """
    Verify that each criterion's exact snippets are found in the line.

    A criterion's ``exact_snippets`` string is split on ``...`` and each part
    is stripped. Every part must then occur as a substring of the line, with
    backslashes removed from both sides before comparing.

    Args:
        line (str): The original line of eligibility criteria text.
        criteria_list (List[LLMMultiRequirementCriterion]): List of extracted criteria.

    Raises:
        ValueError: If any criterion's snippet is not found in the line.
    """
    logger.info("Verifying criteria snippets.")

    # Backslash-free copy of the line, computed once for all snippets.
    # NOTE(review): presumably this tolerates escape characters that appear on
    # only one side of the comparison - confirm against real trial data.
    normalized_line = line.replace("\\", "")

    for criterion in criteria_list:
        # Split the exact_snippets string by ellipses and verify each part
        for part in criterion.exact_snippets.split("..."):
            snippet = part.strip()
            if snippet.replace("\\", "") in normalized_line:
                continue
            logger.error(
                "Criterion snippet not found in line. Line: '%s', Raw text: '%s'",
                line,
                snippet,
            )
            raise ValueError(
                f"Criterion raw_text not found in line. Line: '{line}', Raw text: '{snippet}'"
            )