Switch to side-by-side view

--- a
+++ b/src/services/logical_structurizer.py
@@ -0,0 +1,355 @@
+# src/services/logical_structurizer.py
+"""
+Clinical Trial Logical Structurizing Service
+
+This module is responsible for organizing identified criteria from clinical trial eligibility
+text into logical structures. It uses LLM-based processing to determine the relationships
+between atomic criteria (AND, OR, NOT, XOR, conditional statements).
+
+The structurizing process follows these steps:
+1. Process each line in the identified trial data
+2. Transform atomic criteria into logical structures using an LLM
+3. Verify that all identified criteria are present in the logical structure
+4. Organize results into comprehensive logical models
+
+Functions:
+    logically_structurize_line: Transform an identified line into a logical structure.
+    confirm_criteria_presence: Verify completeness of logical structure against identified criteria.
+    extract_criteria_from_logical_structure: Recursively extract all criteria from a logical structure.
+    logically_structurize_trial: Process an entire trial's criteria into logical structures.
+"""
+
+import logging
+from enum import Enum
+from typing import List, Union
+
+from src.models.identified_criteria import IdentifiedLine, IdentifiedTrial, Requirement
+from src.models.logical_criteria import (
+    IdentifiedUnrolledLine,
+    LLMLogicalAnd,
+    LLMLogicalConditional,
+    LLMLogicalNot,
+    LLMLogicalOr,
+    LLMLogicalWrapperResponse,
+    LLMLogicalXor,
+    LogicalLine,
+    LogicalTrial,
+    SingleRequirementCriterion,
+)
+from src.utils.config import TEMPERATURE, TIMEOUT
+from src.utils.openai_client import get_openai_client
+
+logger = logging.getLogger(__name__)
+
+# Module-level OpenAI client, created at import time and used by logically_structurize_line
+client = get_openai_client()
+
+
+class CriteriaType(Enum):
+    """Category of an eligibility criteria line; selects the matching prompt note in type_to_note."""
+
+    INCLUSION = "inclusion"
+    EXCLUSION = "exclusion"
+    MISCELLANEOUS = "miscellaneous"
+
+
+# Note appended to the LLM system prompt, telling it what a line evaluating to true means for each criteria type
+type_to_note = {
+    CriteriaType.INCLUSION: "Note that this is an inclusion criterion, so should this evaluate to true, the patient will qualify for the clinical trial",
+    CriteriaType.EXCLUSION: "Note that this is an exclusion criterion, so should this evaluate to true, the patient will not qualify for the clinical trial",
+    CriteriaType.MISCELLANEOUS: "Treat the line evaluating to true as qualifying for the clinical trial and false as not qualifying for the clinical trial",
+}
+
+
+def UnrollLine(line: IdentifiedLine) -> IdentifiedUnrolledLine:
+    """
+    Flatten a line so that every criterion carries exactly one requirement.
+
+    Args:
+        line (IdentifiedLine): The identified line whose criteria may hold several requirements.
+
+    Returns:
+        IdentifiedUnrolledLine: The same line text, with one criterion per requirement.
+    """
+    # A criterion with N requirements yields N entries sharing its snippets and criterion text
+    unrolled: list[SingleRequirementCriterion] = [
+        SingleRequirementCriterion(
+            exact_snippets=parent.exact_snippets,
+            criterion=parent.criterion,
+            requirement=single,
+        )
+        for parent in line.criterions
+        for single in parent.requirements
+    ]
+    return IdentifiedUnrolledLine(line=line.line, criterions=unrolled)
+
+
+def logically_structurize_line(
+    line: IdentifiedUnrolledLine, criteria_type: CriteriaType
+) -> LogicalLine:
+    """
+    Ask the LLM to arrange a line's atomic criteria into a logical structure.
+
+    Args:
+        line (IdentifiedUnrolledLine): The line containing identified atomic criteria.
+        criteria_type (CriteriaType): The type of criteria (inclusion, exclusion, miscellaneous).
+
+    Returns:
+        LogicalLine: The input line paired with the logical representation returned by the LLM.
+
+    Raises:
+        ValueError: If the LLM request fails or its response cannot be parsed.
+    """
+    logger.debug("Structurizing line: %s", line)
+
+    # NOTE(review): the last sentence below stops mid-instruction ("Instead you are to");
+    # the intended wording cannot be recovered from this file, so the text is left unchanged.
+    prompt = (
+        "You are an expert in clinical trial eligibility criteria.\n"
+        "Given the following line from an Oncological Clinical Trial Eligibility Criteria and its individual atomic criteria, structurize the criteria into logical relationships.\n"
+        "You may only use the atomic criteria provided, do not add any new criteria. Instead you are to \n"
+    )
+
+    # Add context about how this criteria type affects qualification
+    prompt += type_to_note[criteria_type]
+
+    try:
+        # Only the request and response access are guarded, so the parse failure below is not re-wrapped
+        completion = client.beta.chat.completions.parse(
+            model="gpt-4o",
+            messages=[
+                {"role": "system", "content": prompt},
+                {"role": "user", "content": str(line)},
+            ],
+            temperature=TEMPERATURE,
+            response_format=LLMLogicalWrapperResponse,
+            timeout=TIMEOUT,
+        )
+        choice = completion.choices[0]
+        response = choice.message.parsed
+    except Exception as e:
+        logger.exception("Error during LLM structurizing: %s", e)
+        raise ValueError(f"Error during LLM structurizing: {e}") from e
+
+    if not response:
+        logger.warning("Failed to parse LLM response.\n%s", choice)
+        raise ValueError(f"Failed to parse LLM response for line: '{line}'")
+    logger.debug("Successfully structured line logically: %s", line)
+    logger.debug("LLM response: %s", response)
+    return LogicalLine(
+        identified_line=line, logical_structure=response.logicalRepresentation
+    )
+
+
+def confirm_criteria_presence(logical_line: LogicalLine) -> None:
+    """
+    Check that the logical structure uses exactly the criteria identified for the line.
+
+    The comparison is set-based, so ordering and repeated occurrences are ignored.
+
+    Args:
+        logical_line (LogicalLine): The logical line to verify.
+
+    Raises:
+        ValueError: If criteria are missing from, or extra in, the logical structure.
+    """
+    logger.info("Verifying criteria presence for line: %s", logical_line)
+
+    # Criteria from the identified line versus those found inside the logical structure
+    expected: List[SingleRequirementCriterion] = logical_line.identified_line.criterions
+    found: List[SingleRequirementCriterion] = (
+        extract_criteria_from_logical_structure(logical_line.logical_structure)
+    )
+
+    logger.info("Identified criteria: %s", expected)
+    logger.info("Logical criteria: %s", found)
+    logger.info("Checking for missing criteria...")
+
+    expected_set = set(expected)
+    found_set = set(found)
+
+    # Guard clause: identical sets mean nothing is missing and nothing was added
+    if expected_set == found_set:
+        logger.info("Criteria sets match exactly - no missing or extra criteria.")
+        return
+
+    # identifyDifference logs the discrepancy and raises ValueError
+    identifyDifference(expected_set, found_set)
+
+
+# TODO Rename this here and in `confirm_criteria_presence`
+def identifyDifference(identified_set, logical_set):
+    """Log and raise a ValueError describing how the two criteria sets differ."""
+    missing, extra = identified_set - logical_set, logical_set - identified_set
+
+    # A fragment is empty when its difference is empty, so only real discrepancies are reported
+    fragments = [
+        f"Missing criteria in logical structure: {missing}. " if missing else "",
+        f"Extra criteria in logical structure: {extra}. " if extra else "",
+    ]
+    error_msg = "".join(fragments)
+    logger.error(error_msg)
+    raise ValueError(error_msg)
+
+
+def extract_criteria_from_logical_structure(
+    logical_structure: Union[
+        SingleRequirementCriterion,
+        LLMLogicalAnd,
+        LLMLogicalOr,
+        LLMLogicalNot,
+        LLMLogicalXor,
+        LLMLogicalConditional,
+    ],
+) -> List[SingleRequirementCriterion]:
+    """
+    Walk a logical structure recursively and collect every atomic criterion it contains.
+
+    Args:
+        logical_structure: Any node of the structure (an atomic criterion or a logical operator).
+
+    Returns:
+        List[SingleRequirementCriterion]: The atomic criteria in traversal order, repeats included.
+        A node of an unrecognised type yields an empty list.
+    """
+    node = logical_structure
+
+    # An atomic criterion is a leaf of the structure
+    if isinstance(node, SingleRequirementCriterion):
+        return [node]
+
+    # Otherwise determine the operator's direct children
+    if isinstance(node, LLMLogicalAnd):
+        children = list(node.and_criteria)
+    elif isinstance(node, LLMLogicalOr):
+        children = list(node.or_criteria)
+    elif isinstance(node, LLMLogicalNot):
+        children = [node.not_criteria]
+    elif isinstance(node, LLMLogicalXor):
+        children = list(node.xor_criteria)
+    elif isinstance(node, LLMLogicalConditional):
+        # The condition is always visited; each branch only when it is truthy
+        children = [node.condition]
+        children += [
+            branch
+            for branch in (node.then_criteria, node.else_criteria)
+            if branch
+        ]
+    else:
+        # Unrecognised node type: nothing to collect
+        children = []
+
+    # Visit children left to right, appending each subtree's criteria
+    collected: list[SingleRequirementCriterion] = []
+    for child in children:
+        collected.extend(extract_criteria_from_logical_structure(child))
+    return collected
+
+
+def process_criteria_lines(
+    lines: list[IdentifiedLine], criteria_type: CriteriaType
+) -> tuple[list[LogicalLine], list[LogicalLine]]:
+    """
+    Structurize every line and sort the outcomes into successes and failures.
+
+    A line counts as failed when structurizing it raises, or when verifying
+    its criteria raises.
+
+    Args:
+        lines (list[IdentifiedLine]): The list of identified lines to process.
+        criteria_type (CriteriaType): The type of criteria (inclusion, exclusion, miscellaneous).
+
+    Returns:
+        tuple: (successful lines, failed lines). A line that could not be structurized is
+        stored with a placeholder structure whose fields are all "failed".
+    """
+    succeeded = []
+    failed = []
+
+    for line in lines:
+        unrolled_line = UnrollLine(line)
+        try:
+            logical_line = logically_structurize_line(unrolled_line, criteria_type)
+            try:
+                confirm_criteria_presence(logical_line)
+            except ValueError as validation_error:
+                # The structure produced by the LLM is kept as-is in the failed list
+                logger.error(
+                    "Validation failed for %s line: %s", criteria_type.value, line
+                )
+                logger.error("Validation error: %s", validation_error)
+                failed.append(logical_line)
+            else:
+                succeeded.append(logical_line)
+        except Exception as e:
+            logger.error("Failed to structurize %s line: %s", criteria_type.value, line)
+            logger.error("Error: %s", e)
+            # No structure is available here, so a placeholder criterion stands in for it
+            placeholder = SingleRequirementCriterion(
+                exact_snippets="failed",
+                criterion="failed",
+                requirement=Requirement(
+                    requirement_type="failed", expected_value="failed"
+                ),
+            )
+            failed.append(
+                LogicalLine(identified_line=unrolled_line, logical_structure=placeholder)
+            )
+
+    return succeeded, failed
+
+
+def logically_structurize_trial(trial: IdentifiedTrial) -> LogicalTrial:
+    """
+    Build the logical structures for a trial's inclusion, exclusion and miscellaneous lines.
+
+    For each category, the incoming trial's regular lines and its failed_* lines are
+    concatenated and processed together.
+
+    Args:
+        trial (IdentifiedTrial): The identified trial to be structurized.
+
+    Returns:
+        LogicalTrial: The trial info plus successful and failed logical lines per category.
+    """
+    nct_id = trial.info.nct_id
+    logger.info("Starting logical structurizing for trial NCT ID: %s", nct_id)
+
+    # Inclusion criteria
+    inclusion_input = trial.inclusion_lines + trial.failed_inclusion
+    logger.debug(
+        "Processing %d inclusion criteria lines", len(inclusion_input)
+    )
+    inclusion_lines, failed_inclusion = process_criteria_lines(
+        inclusion_input, CriteriaType.INCLUSION
+    )
+
+    # Exclusion criteria
+    exclusion_input = trial.exclusion_lines + trial.failed_exclusion
+    logger.debug(
+        "Processing %d exclusion criteria lines", len(exclusion_input)
+    )
+    exclusion_lines, failed_exclusion = process_criteria_lines(
+        exclusion_input, CriteriaType.EXCLUSION
+    )
+
+    # Miscellaneous criteria
+    miscellaneous_input = trial.miscellaneous_lines + trial.failed_miscellaneous
+    logger.debug(
+        "Processing %d miscellaneous criteria lines", len(miscellaneous_input)
+    )
+    miscellaneous_lines, failed_miscellaneous = process_criteria_lines(
+        miscellaneous_input, CriteriaType.MISCELLANEOUS
+    )
+
+    logger.info("Completed logical structurizing for trial NCT ID: %s", nct_id)
+    # Assemble the result; the trial info is passed through unchanged
+    return LogicalTrial(
+        info=trial.info,
+        inclusion_lines=inclusion_lines,
+        exclusion_lines=exclusion_lines,
+        miscellaneous_lines=miscellaneous_lines,
+        failed_inclusion=failed_inclusion,
+        failed_exclusion=failed_exclusion,
+        failed_miscellaneous=failed_miscellaneous,
+    )