ehrql / Git / [e988c2] /ehrql/codes.py

Models:
philipB/
ehrql
Downloads: 1
[e988c2]: / ehrql / codes.py
History
Download this file
233 lines (187 with data), 7.5 kB

"""
We make each coding system a distinct type. The query model's type checking will then
enforce that queries use the appropriate coding system for a given column.
"""

import csv
import dataclasses
import re
from pathlib import Path


class CodelistError(ValueError):
    """Raised when a codelist CSV is missing or lacks a requested column."""


@dataclasses.dataclass(frozen=True)
class BaseCode:
    """
    Common parent for the single-code types, one subclass per coding system.

    Each subclass is expected to supply a class-level compiled `regex`; the wrapped
    string must match it in full, otherwise construction raises `ValueError`.
    """

    value: str

    def __post_init__(self):
        # `regex` is not defined on this class: it is looked up on the concrete
        # subclass being instantiated
        is_valid = self.regex.fullmatch(self.value) is not None
        if is_valid:
            return
        raise ValueError(f"Invalid {self.__class__.__name__}: {self.value}")

    # Query engines are not told about these types ahead of time; finding this method
    # on a value is what lets them handle it anyway
    def _to_primitive_type(self):
        return self.value

    @classmethod
    def _primitive_type(cls):
        return str


# A base class for fields that are concatenated lists of clinical codes. This occurs
# in the admitted patient care spell (apcs) table of hospital episode statistics for
# all_diagnoses (ICD10 codes), and all_procedures (OPCS4 codes).
#
# This inherits from str because that's what the underlying data is, but is in this
# file as it's sort of a code. In future a better implementation might be to parse the
# field value into a Set of clinical codes.
class BaseMultiCodeString(str):
    """
    A string holding several clinical codes concatenated together.

    Concrete subclasses override `_code_type` to name the class of the individual
    codes; the base implementation only raises.
    """

    @classmethod
    def _primitive_type(cls):
        return str

    @classmethod
    def _code_type(cls):
        message = "BaseMultiCodeString subclasses must implement the _code_type method"
        raise NotImplementedError(message)


class BNFCode(BaseCode):
    "Pseudo BNF"

    # Two alternative shapes are accepted: a 15-character standard code, or an
    # 11-character appliance code beginning 20-23. The `|` splits the whole pattern
    # in two, and the inherited `__post_init__` requires the value to match one of the
    # alternatives in full.
    regex = re.compile(
        r"""
        # Standard BNF code
          # Chapter, Section, Paragraph, Sub-paragraph
          [01][0-9]{6}
          # Chemical
          [0-9A-Z]{2}
          # Product, strength-formulation, generic equivalent
          ([A-Z][0-9A-Z]){3}
        | # OR
        # Appliances
        2[0-3][0-9]{9}
        """,
        re.VERBOSE,
    )


class CTV3Code(BaseCode):
    "CTV3 (Read v3)"

    # Some of the CTV3 codes in the OpenCodelists coding system database (though not any
    # actually used in codelists) violate the below format, either by having a leading
    # dot or by starting with a tilde. However I have confirmed that, aside from a tiny
    # handful of cases, these invalid codes are not used in the database so there should
    # never be a need to create codelists which use them.
    #
    # Every accepted value is exactly five characters long: two to five leading
    # alphanumerics (either case), right-padded to length with dots.
    regex = re.compile(
        r"""
        [0-9A-Za-z]{5}
        | [0-9A-Za-z]{4}\.{1}
        | [0-9A-Za-z]{3}\.{2}
        | [0-9A-Za-z]{2}\.{3}
        """,
        re.VERBOSE,
    )


class ICD10Code(BaseCode):
    "ICD-10"

    # One uppercase letter followed by two or three digits; no dot separator is
    # accepted in the value
    regex = re.compile(r"[A-Z][0-9]{2,3}")


class OPCS4Code(BaseCode):
    "OPCS-4"

    # The documented structure requires three digits, and a dot between the 2nd and 3rd
    # digit, but the codes we have in OpenCodelists omit the dot and sometimes have only
    # two digits.
    # https://en.wikipedia.org/wiki/OPCS-4#Code_structure
    #
    # So the accepted form is: one uppercase letter other than I, then two or three
    # digits, with no dot anywhere.
    regex = re.compile(
        r"""
        # Uppercase letter excluding I
        [ABCDEFGHJKLMNOPQRSTUVWXYZ]
        [0-9]{2,3}
        """,
        re.VERBOSE,
    )


class SNOMEDCTCode(BaseCode):
    "SNOMED-CT"

    # 6-18 digit number with no leading zeros
    # https://confluence.ihtsdotools.org/display/DOCRELFMT/6.1+SCTID+Data+Type
    #
    # Expressed below as one non-zero digit followed by 5 to 17 further digits
    regex = re.compile(r"[1-9][0-9]{5,17}")


# Dictionary of Medicines and Devices
class DMDCode(BaseCode):
    "dm+d"

    # Syntactically equivalent to SNOMED-CT
    #
    # This reuses the very same compiled pattern object as SNOMEDCTCode rather than
    # compiling a copy, so any change to that pattern applies here as well
    regex = SNOMEDCTCode.regex


#
# ICD10 codelist as concatenated string
#
# This is specifically for fields in the admitted patient care (APC) part
# of the hospital episode statistics (HES) data where there are fields
# that are a concatenation of all diagnosis codes for a patient's episode
# or spell.
class ICD10MultiCodeString(BaseMultiCodeString):
    "Multiple ICD-10 codes"

    @classmethod
    def _code_type(cls):
        # The class of the individual codes held in the concatenated string
        return ICD10Code

    # We want to allow prefix searching on this field so users can
    # search this field for a string prefix. This ensures they pass
    # a valid prefix so we can throw an error, rather than silently
    # failing by running but returning 0 records
    #
    # The pattern is an uppercase letter followed by zero to three digits, i.e. any
    # non-empty leading portion of a value accepted by ICD10Code.
    # NOTE(review): the code which applies this regex to a prefix is not in this file
    regex = re.compile(r"[A-Z][0-9]{0,3}")


#
# OPCS4 codelist as concatenated string
#
# This is specifically for fields in the admitted patient care (APC) part
# of the hospital episode statistics (HES) data where there are fields
# that are a concatenation of all procedure codes for a patient's episode
# or spell.
class OPCS4MultiCodeString(BaseMultiCodeString):
    "Multiple OPCS4 codes"

    @classmethod
    def _code_type(cls):
        # The class of the individual codes held in the concatenated string
        return OPCS4Code

    # We want to allow prefix searching on this field so users can
    # search this field for a string prefix. This ensures they pass
    # a valid prefix so we can throw an error, rather than silently
    # failing by running but returning 0 records
    #
    # The pattern is any non-empty leading portion of a value accepted by OPCS4Code:
    # one uppercase letter followed by zero to three digits. The letter I is excluded
    # here just as it is in OPCS4Code, because a prefix starting with I can never match
    # a valid code and would otherwise run successfully while returning 0 records.
    regex = re.compile(
        r"""
        # Uppercase letter excluding I
        [ABCDEFGHJKLMNOPQRSTUVWXYZ]
        [0-9]{0,3}
        """,
        re.VERBOSE,
    )


def codelist_from_csv(filename, *, column, category_column=None):
    """
    Read a codelist from a CSV file as either a list or a dictionary (for categorised
    codelists).

    _filename_<br>
    Path to the file on disk, relative to the root of your repository. (Remember to use
    UNIX/style/forward-slashes not Windows\\style\\backslashes.)

    _column_<br>
    Name of the column in the CSV file which contains the codes.

    _category_column_<br>
    Optional name of a column in the CSV file which contains categories to which each
    code should be mapped. If this argument is passed then the resulting codelist will
    be a dictionary mapping each code to its corresponding category. This can be passed
    to the [`to_category()`](#CodePatientSeries.to_category) method to map a series of
    codes to a series of categories.

    For more detail see the [how-to guide](../how-to/examples.md/#using-codelists-with-category-columns).
    """
    # Raises CodelistError if nothing exists at `filename`; the parsing itself (and the
    # check that the requested columns are present) is done by codelist_from_csv_lines
    filename = Path(filename)
    if not filename.exists():
        hint = ""
        # If the character which comes after the backslash in the string literal happens
        # to form a valid escape sequence then no backslash will appear in the compiled
        # string. Checking the repr for backslashes has false positives (e.g. a tab
        # character will trigger it) but that seems OK in this context.
        if "\\" in repr(filename):
            hint = (
                "\n\n"
                "HINT: Use forward slash (/) instead of backslash (\\) in file paths"
            )
        raise CodelistError(f"No CSV file at {filename}{hint}")
    # The csv module documents that files handed to a reader should be opened with
    # `newline=""` so that it can do its own line-ending handling; without it, newlines
    # embedded inside quoted fields are not interpreted correctly
    with filename.open("r", newline="") as f:
        return codelist_from_csv_lines(
            f, column=column, category_column=category_column
        )


def codelist_from_csv_lines(lines, *, column, category_column=None):
    """
    Build a codelist from an iterable of CSV lines, the first of which is the header.

    If `category_column` is None the result is a list of the distinct values found in
    `column`, in the order they first appear. Otherwise it is a dict mapping each code
    to the value in `category_column` on the same row; where a code appears on several
    rows the last row's category is kept.

    Codes and categories have surrounding whitespace stripped, and rows whose code is
    empty are discarded.

    Raises CodelistError if `column` or `category_column` is not in the header row,
    which includes the case where `lines` is empty and so has no header at all.
    """
    return_list = category_column is None
    if return_list:
        # Reuse the code column as the "category" so that the single dict comprehension
        # below serves both modes; only the keys are returned in this case
        category_column = column
    # Using `restval=""` ensures we never get None instead of string, so we can always
    # call `.strip()` without blowing up
    reader = csv.DictReader(iter(lines), restval="")
    # `fieldnames` is None when there is no header row to read (empty input); treat
    # that as "no columns" so we raise CodelistError below rather than a TypeError
    fieldnames = reader.fieldnames or []
    for required in (column, category_column):
        if required not in fieldnames:
            raise CodelistError(f"No column '{required}' in CSV")
    code_map = {row[column].strip(): row[category_column].strip() for row in reader}
    # Discard any empty codes
    code_map.pop("", None)
    return list(code_map) if return_list else code_map