a b/src/services/identifier.py
1
# services/identifier.py
2
"""
3
Clinical Trial Criteria Identification Service
4
5
This module is responsible for extracting atomic criteria from clinical trial eligibility
6
criteria text. It uses LLM-based extraction to identify individual criteria and their
7
properties from natural language descriptions.
8
9
The identification process follows these steps:
10
1. Split criteria text into lines
11
2. For each line, extract atomic criteria using an LLM
12
3. Verify that extracted criteria match the source text
13
4. Organize results into structured data models
14
15
Functions:
16
    identify_criterions_from_rawTrial: Extract criteria from a raw trial.
17
    process_lines: Process a section of eligibility criteria into structured lines.
18
    extract_atomic_criteria_from_line: Extract atomic criteria from a single line.
19
    verify: Verify that extracted criteria match the source text.
20
"""
21
22
import logging
23
import re
24
from typing import List, Tuple
25
26
import rich
27
28
from src.models.identified_criteria import (
29
    IdentifiedLine,
30
    IdentifiedTrial,
31
    LLMIdentifiedLineResponse,
32
    LLMMultiRequirementCriterion,
33
    RawTrialData,
34
)
35
from src.utils.config import TEMPERATURE, TIMEOUT
36
from src.utils.openai_client import get_openai_client
37
38
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
39
40
# Shared OpenAI client, created once when this module is imported and reused by
# extract_atomic_criteria_from_line.
# NOTE(review): whatever setup get_openai_client() performs therefore runs at
# import time -- confirm that is intended.
41
client = get_openai_client()
42
43
44
def identify_criterions_from_rawTrial(trial: RawTrialData) -> IdentifiedTrial:
    """
    Extract and structure atomic criteria from a clinical trial's eligibility criteria.

    The inclusion, exclusion and miscellaneous sections of the trial are each
    passed through ``process_lines``. The verified and failed lines of every
    section are collected into a single ``IdentifiedTrial``.

    Args:
        trial (RawTrialData): The trial containing raw eligibility criteria.

    Returns:
        IdentifiedTrial: The structured criteria extracted from the trial.

    Raises:
        ValueError: If none of the three sections produced a verified line.
    """
    logger.info(
        "Starting identification of criteria for trial NCT ID: %s", trial.nct_id
    )
    # Use isEnabledFor rather than comparing logger.level: a logger whose own
    # level was never set reports NOTSET (0), which is always <= DEBUG, so the
    # raw comparison would dump the trial even when debugging is off.
    if logger.isEnabledFor(logging.DEBUG):
        rich.print(trial)  # Using rich.print for better readability in debug mode

    # Process each section of criteria
    inclusion_criteria_lines, inclusion_failed = process_lines(trial.inclusion_criteria)
    exclusion_criteria_lines, exclusion_failed = process_lines(trial.exclusion_criteria)
    miscellaneous_criteria_lines, miscellaneous_failed = process_lines(
        trial.miscellaneous_criteria
    )

    # Combine the results into the IdentifiedTrial object. It is built before
    # the success check so that the error message below can include it.
    identified_trial = IdentifiedTrial(
        info=trial,
        inclusion_lines=inclusion_criteria_lines,
        exclusion_lines=exclusion_criteria_lines,
        miscellaneous_lines=miscellaneous_criteria_lines,
        failed_inclusion=inclusion_failed,
        failed_exclusion=exclusion_failed,
        failed_miscellaneous=miscellaneous_failed,
    )

    # At least one section must have yielded a verified line.
    if not (
        inclusion_criteria_lines
        or exclusion_criteria_lines
        or miscellaneous_criteria_lines
    ):
        logger.warning(
            "No atomic criteria extracted for trial NCT ID: %s", trial.nct_id
        )
        raise ValueError(
            f"No atomic criteria extracted for trial NCT ID: {trial.nct_id}\nThis was the failed result: {identified_trial}"
        )

    logger.info(
        "Successfully identified criteria for trial NCT ID: %s", trial.nct_id
    )
    return identified_trial
98
99
100
def process_lines(
    section_text: str,
) -> Tuple[List[IdentifiedLine], List[IdentifiedLine]]:
    """
    Process a section of eligibility criteria text into structured lines.

    The text is split on newlines / carriage returns and blank lines are
    dropped. Each remaining line is sent to ``extract_atomic_criteria_from_line``
    and the result is checked with ``verify``. A line that cannot be extracted,
    yields no criteria, or fails verification is recorded in the failed list
    instead of aborting the whole section.

    Args:
        section_text (str): The text of a criteria section (inclusion, exclusion, or miscellaneous).
            An empty or missing section yields two empty lists.

    Returns:
        tuple: A tuple containing:
            - List of successfully identified lines with structured criteria
            - List of lines that failed to be processed correctly
    """
    identified_criteria_lines: List[IdentifiedLine] = []
    failed: List[IdentifiedLine] = []

    # Nothing to do for an absent/empty section (also avoids re.split(None)).
    if not section_text:
        return identified_criteria_lines, failed

    # Split the text into individual, non-blank, stripped lines
    stripped_lines = (raw.strip() for raw in re.split(r"[\n\r]+", section_text))
    lines = [line for line in stripped_lines if line]

    for line_number, line in enumerate(lines, start=1):
        logger.debug("Processing line %d: %s", line_number, line)

        # Extract atomic criteria from the line. The extractor signals any LLM
        # failure with ValueError; treat that like any other per-line failure
        # so one bad line does not discard the rest of the section.
        try:
            extracted_criteria = extract_atomic_criteria_from_line(line).atomic_criteria
        except ValueError as e:
            logger.error(
                "Error extracting criteria from line %d: %s", line_number, e
            )
            failed.append(IdentifiedLine(line=line, criterions=[]))
            continue

        if not extracted_criteria:
            logger.warning("Failed to extract criteria from line %d.", line_number)
            failed.append(IdentifiedLine(line=line, criterions=[]))
            continue

        try:
            # Verify that criteria match the source text
            verify(line, extracted_criteria)
        except ValueError as e:
            logger.error("Error validating line %d: %s", line_number, e)
            failed.append(IdentifiedLine(line=line, criterions=extracted_criteria))
        else:
            identified_criteria_lines.append(
                IdentifiedLine(line=line, criterions=extracted_criteria)
            )

    return identified_criteria_lines, failed
143
144
145
def extract_atomic_criteria_from_line(
    line: str, *, model: str = "gpt-4o"
) -> LLMIdentifiedLineResponse:
    """
    Extract atomic criteria from a single line of eligibility criteria text using an LLM.

    The line is sent as the user message alongside a fixed system prompt, and
    the completion is parsed into ``LLMIdentifiedLineResponse``.

    Args:
        line (str): A single line of eligibility criteria text.
        model (str): Name of the chat model to query. Keyword-only; defaults
            to "gpt-4o".

    Returns:
        LLMIdentifiedLineResponse: The structured criteria extracted from the line.

    Raises:
        ValueError: If the request fails, or if the completion carries no
            parsed response.
    """
    logger.debug("Extracting atomic criteria from line: %s", line)

    # Define the system prompt for the LLM. Adjacent string literals are
    # concatenated verbatim, so each sentence carries its own trailing space;
    # without it the sentences run together ("criteria.Given the ...").
    prompt = (
        "You are an expert in clinical trial eligibility criteria. "
        "Given the following line from an Oncological Clinical Trial Eligibility Criteria, extract every individual criterion they are testing the patient for. "
        "In other words, what are the specific properties/attributes/conditions that are being tested for in the patient? "
        "For each criterion, provide the exact snippets from the line that you used to identify it. "
        "Should your exact snippets be non-contiguous then provide multiple short exact snippets"
    )

    # Only the remote call sits inside the try block, so the "failed to parse"
    # error raised further down is not caught, re-logged and re-wrapped here.
    try:
        # Send the line to the LLM for processing
        completion = client.beta.chat.completions.parse(
            model=model,
            messages=[
                {"role": "system", "content": prompt},
                {"role": "user", "content": line},
            ],
            temperature=TEMPERATURE,
            response_format=LLMIdentifiedLineResponse,
            timeout=TIMEOUT,
        )
    except Exception as e:
        logger.error("Error during LLM extraction: %s", e)
        raise ValueError(f"Error during LLM extraction: {e}") from e

    # An empty choices list or a missing parsed payload both mean there is
    # nothing usable to return.
    response = completion.choices[0].message.parsed if completion.choices else None
    if response is None:
        logger.warning("Failed to parse LLM response.")
        raise ValueError(f"Failed to parse LLM response for line: '{line}'")

    logger.debug("Successfully extracted atomic criteria from line: %s", line)
    return response
192
193
194
def verify(line: str, criteria_list: List[LLMMultiRequirementCriterion]) -> None:
    """
    Verify that each criterion's exact snippets are found in the line.

    Each criterion's ``exact_snippets`` string is split on ellipses (either
    three dots or the single "…" character). Every non-empty fragment must
    occur verbatim in ``line``. Backslashes are ignored on both sides of the
    comparison.

    Args:
        line (str): The original line of eligibility criteria text.
        criteria_list (List[LLMMultiRequirementCriterion]): List of extracted criteria.

    Raises:
        ValueError: If any criterion's snippet is not found in the line.
    """
    logger.info("Verifying criteria snippets.")

    # The line is the same for every snippet, so strip its backslashes once.
    comparable_line = line.replace("\\", "")

    for criterion in criteria_list:
        # Treat the single-character ellipsis like "..." so both spellings
        # separate non-contiguous snippets.
        joined_snippets = criterion.exact_snippets.replace("\u2026", "...")

        for fragment in joined_snippets.split("..."):
            snippet = fragment.strip()
            if not snippet:
                # Leading/trailing ellipses leave empty fragments; an empty
                # string is trivially contained in any line, so skip it.
                continue

            # Compare with backslashes removed from the snippet as well
            if snippet.replace("\\", "") not in comparable_line:
                logger.error(
                    "Criterion snippet not found in line. Line: '%s', Raw text: '%s'",
                    line,
                    snippet,
                )
                raise ValueError(
                    f"Criterion raw_text not found in line. Line: '{line}', Raw text: '{snippet}'"
                )