|
a |
|
b/src/services/identifier.py |
|
|
1 |
# services/identifier.py |
|
|
2 |
""" |
|
|
3 |
Clinical Trial Criteria Identification Service |
|
|
4 |
|
|
|
5 |
This module is responsible for extracting atomic criteria from clinical trial eligibility |
|
|
6 |
criteria text. It uses LLM-based extraction to identify individual criteria and their |
|
|
7 |
properties from natural language descriptions. |
|
|
8 |
|
|
|
9 |
The identification process follows these steps: |
|
|
10 |
1. Split criteria text into lines |
|
|
11 |
2. For each line, extract atomic criteria using an LLM |
|
|
12 |
3. Verify that extracted criteria match the source text |
|
|
13 |
4. Organize results into structured data models |
|
|
14 |
|
|
|
15 |
Functions: |
|
|
16 |
identify_criterions_from_rawTrial: Extract criteria from a raw trial. |
|
|
17 |
process_lines: Process a section of eligibility criteria into structured lines. |
|
|
18 |
extract_atomic_criteria_from_line: Extract atomic criteria from a single line. |
|
|
19 |
verify: Verify that extracted criteria match the source text. |
|
|
20 |
""" |
|
|
21 |
|
|
|
22 |
import logging |
|
|
23 |
import re |
|
|
24 |
from typing import List, Tuple |
|
|
25 |
|
|
|
26 |
import rich |
|
|
27 |
|
|
|
28 |
from src.models.identified_criteria import ( |
|
|
29 |
IdentifiedLine, |
|
|
30 |
IdentifiedTrial, |
|
|
31 |
LLMIdentifiedLineResponse, |
|
|
32 |
LLMMultiRequirementCriterion, |
|
|
33 |
RawTrialData, |
|
|
34 |
) |
|
|
35 |
from src.utils.config import TEMPERATURE, TIMEOUT |
|
|
36 |
from src.utils.openai_client import get_openai_client |
|
|
37 |
|
|
|
38 |
logger = logging.getLogger(__name__) |
|
|
39 |
|
|
|
40 |
# Initialize OpenAI client |
|
|
41 |
client = get_openai_client() |
|
|
42 |
|
|
|
43 |
|
|
|
44 |
def identify_criterions_from_rawTrial(trial: RawTrialData) -> IdentifiedTrial: |
|
|
45 |
""" |
|
|
46 |
Extract and structure atomic criteria from a clinical trial's eligibility criteria. |
|
|
47 |
|
|
|
48 |
Args: |
|
|
49 |
trial (RawTrialData): The trial containing raw eligibility criteria. |
|
|
50 |
|
|
|
51 |
Returns: |
|
|
52 |
IdentifiedTrial: The structured criteria extracted from the trial. |
|
|
53 |
|
|
|
54 |
Raises: |
|
|
55 |
ValueError: If no atomic criteria could be extracted from the trial. |
|
|
56 |
""" |
|
|
57 |
logger.info( |
|
|
58 |
"Starting identification of criteria for trial NCT ID: %s", trial.nct_id |
|
|
59 |
) |
|
|
60 |
if logger.level <= logging.DEBUG: |
|
|
61 |
rich.print(trial) # Using rich.print for better readability in debug mode |
|
|
62 |
|
|
|
63 |
# Process each section of criteria |
|
|
64 |
inclusion_criteria_lines, inclusion_failed = process_lines(trial.inclusion_criteria) |
|
|
65 |
exclusion_criteria_lines, exclusion_failed = process_lines(trial.exclusion_criteria) |
|
|
66 |
miscellaneous_criteria_lines, miscellaneous_failed = process_lines( |
|
|
67 |
trial.miscellaneous_criteria |
|
|
68 |
) |
|
|
69 |
|
|
|
70 |
# Combine the results into the IdentifiedTrial object |
|
|
71 |
identified_trial = IdentifiedTrial( |
|
|
72 |
info=trial, |
|
|
73 |
inclusion_lines=inclusion_criteria_lines, |
|
|
74 |
exclusion_lines=exclusion_criteria_lines, |
|
|
75 |
miscellaneous_lines=miscellaneous_criteria_lines, |
|
|
76 |
failed_inclusion=inclusion_failed, |
|
|
77 |
failed_exclusion=exclusion_failed, |
|
|
78 |
failed_miscellaneous=miscellaneous_failed, |
|
|
79 |
) |
|
|
80 |
|
|
|
81 |
# Check if any criteria were successfully extracted |
|
|
82 |
if ( |
|
|
83 |
inclusion_criteria_lines |
|
|
84 |
or exclusion_criteria_lines |
|
|
85 |
or miscellaneous_criteria_lines |
|
|
86 |
): |
|
|
87 |
logger.info( |
|
|
88 |
"Successfully identified criteria for trial NCT ID: %s", trial.nct_id |
|
|
89 |
) |
|
|
90 |
return identified_trial |
|
|
91 |
else: |
|
|
92 |
logger.warning( |
|
|
93 |
"No atomic criteria extracted for trial NCT ID: %s", trial.nct_id |
|
|
94 |
) |
|
|
95 |
raise ValueError( |
|
|
96 |
f"No atomic criteria extracted for trial NCT ID: {trial.nct_id}\nThis was the failed result: {identified_trial}" |
|
|
97 |
) |
|
|
98 |
|
|
|
99 |
|
|
|
100 |
def process_lines( |
|
|
101 |
section_text: str, |
|
|
102 |
) -> Tuple[List[IdentifiedLine], List[IdentifiedLine]]: |
|
|
103 |
""" |
|
|
104 |
Process a section of eligibility criteria text into structured lines. |
|
|
105 |
|
|
|
106 |
Args: |
|
|
107 |
section_text (str): The text of a criteria section (inclusion, exclusion, or miscellaneous). |
|
|
108 |
|
|
|
109 |
Returns: |
|
|
110 |
tuple: A tuple containing: |
|
|
111 |
- List of successfully identified lines with structured criteria |
|
|
112 |
- List of lines that failed to be processed correctly |
|
|
113 |
""" |
|
|
114 |
# Split the text into individual lines |
|
|
115 |
lines = [ |
|
|
116 |
line.strip() for line in re.split(r"[\n\r]+", section_text) if line.strip() |
|
|
117 |
] |
|
|
118 |
identified_criteria_lines: List[IdentifiedLine] = [] |
|
|
119 |
failed: List[IdentifiedLine] = [] |
|
|
120 |
|
|
|
121 |
for index, line in enumerate(lines): |
|
|
122 |
logger.debug("Processing line %d: %s", index + 1, line) |
|
|
123 |
|
|
|
124 |
# Extract atomic criteria from the line |
|
|
125 |
extracted_response = extract_atomic_criteria_from_line(line) |
|
|
126 |
extracted_criteria = extracted_response.atomic_criteria |
|
|
127 |
|
|
|
128 |
if extracted_criteria: |
|
|
129 |
try: |
|
|
130 |
# Verify that criteria match the source text |
|
|
131 |
verify(line, extracted_criteria) |
|
|
132 |
identified_criteria_lines.append( |
|
|
133 |
IdentifiedLine(line=line, criterions=extracted_criteria) |
|
|
134 |
) |
|
|
135 |
except ValueError as e: |
|
|
136 |
logger.error("Error validating line %d: %s", index + 1, e) |
|
|
137 |
failed.append(IdentifiedLine(line=line, criterions=extracted_criteria)) |
|
|
138 |
else: |
|
|
139 |
logger.warning("Failed to extract criteria from line %d.", index + 1) |
|
|
140 |
failed.append(IdentifiedLine(line=line, criterions=[])) |
|
|
141 |
|
|
|
142 |
return identified_criteria_lines, failed |
|
|
143 |
|
|
|
144 |
|
|
|
145 |
def extract_atomic_criteria_from_line(line: str) -> LLMIdentifiedLineResponse: |
|
|
146 |
""" |
|
|
147 |
Extract atomic criteria from a single line of eligibility criteria text using an LLM. |
|
|
148 |
|
|
|
149 |
Args: |
|
|
150 |
line (str): A single line of eligibility criteria text. |
|
|
151 |
|
|
|
152 |
Returns: |
|
|
153 |
LLMIdentifiedLineResponse: The structured criteria extracted from the line. |
|
|
154 |
|
|
|
155 |
Raises: |
|
|
156 |
ValueError: If the LLM extraction process fails. |
|
|
157 |
""" |
|
|
158 |
logger.debug("Extracting atomic criteria from line: %s", line) |
|
|
159 |
|
|
|
160 |
# Define the system prompt for the LLM |
|
|
161 |
prompt = ( |
|
|
162 |
"You are an expert in clinical trial eligibility criteria." |
|
|
163 |
"Given the following line from an Oncological Clinical Trial Eligibility Criteria, extract every individual criterion they are testing the patient for." |
|
|
164 |
"In other words, what are the specific properties/attributes/conditions that are being tested for in the patient?" |
|
|
165 |
"For each criterion, provide the exact snippets from the line that you used to identify it." |
|
|
166 |
"Should your exact snippets be non-contiguous then provide multiple short exact snippets" |
|
|
167 |
) |
|
|
168 |
|
|
|
169 |
try: |
|
|
170 |
# Send the line to the LLM for processing |
|
|
171 |
completion = client.beta.chat.completions.parse( |
|
|
172 |
model="gpt-4o", |
|
|
173 |
messages=[ |
|
|
174 |
{"role": "system", "content": prompt}, |
|
|
175 |
{"role": "user", "content": line}, |
|
|
176 |
], |
|
|
177 |
temperature=TEMPERATURE, |
|
|
178 |
response_format=LLMIdentifiedLineResponse, |
|
|
179 |
timeout=TIMEOUT, |
|
|
180 |
) |
|
|
181 |
|
|
|
182 |
if response := completion.choices[0].message.parsed: |
|
|
183 |
logger.debug("Successfully extracted atomic criteria from line: %s", line) |
|
|
184 |
return response |
|
|
185 |
else: |
|
|
186 |
logger.warning("Failed to parse LLM response.") |
|
|
187 |
raise ValueError(f"Failed to parse LLM response for line: '{line}'") |
|
|
188 |
|
|
|
189 |
except Exception as e: |
|
|
190 |
logger.error("Error during LLM extraction: %s", e) |
|
|
191 |
raise ValueError(f"Error during LLM extraction: {e}") from e |
|
|
192 |
|
|
|
193 |
|
|
|
194 |
def verify(line: str, criteria_list: List[LLMMultiRequirementCriterion]) -> None: |
|
|
195 |
""" |
|
|
196 |
Verify that each criterion's exact snippets are found in the line. |
|
|
197 |
|
|
|
198 |
Args: |
|
|
199 |
line (str): The original line of eligibility criteria text. |
|
|
200 |
criteria_list (List[LLMSingleRawCriterion]): List of extracted criteria. |
|
|
201 |
|
|
|
202 |
Raises: |
|
|
203 |
ValueError: If any criterion's snippet is not found in the line. |
|
|
204 |
""" |
|
|
205 |
logger.info("Verifying criteria snippets.") |
|
|
206 |
|
|
|
207 |
for criterion in criteria_list: |
|
|
208 |
# Split the exact_snippets string by ellipses and verify each part |
|
|
209 |
snippets = [ |
|
|
210 |
snippet.strip() for snippet in criterion.exact_snippets.split("...") |
|
|
211 |
] |
|
|
212 |
for snippet in snippets: |
|
|
213 |
# Find the snippet in the line (removing backslashes for comparison) |
|
|
214 |
index = line.replace("\\", "").find(snippet.replace("\\", "")) |
|
|
215 |
if index == -1: |
|
|
216 |
logger.error( |
|
|
217 |
"Criterion snippet not found in line. Line: '%s', Raw text: '%s'", |
|
|
218 |
line, |
|
|
219 |
snippet, |
|
|
220 |
) |
|
|
221 |
raise ValueError( |
|
|
222 |
f"Criterion raw_text not found in line. Line: '{line}', Raw text: '{snippet}'" |
|
|
223 |
) |