Diff of /ehrql/codes.py [000000] .. [e988c2]

Switch to side-by-side view

--- a
+++ b/ehrql/codes.py
@@ -0,0 +1,232 @@
+"""
+We make each coding system a distinct type. The query model's type checking will then
+enforce that queries use the appropriate coding system for a given column.
+"""
+
+import csv
+import dataclasses
+import re
+from pathlib import Path
+
+
+class CodelistError(ValueError): ...  # Raised for a missing CSV file or column
+
+
+@dataclasses.dataclass(frozen=True)
+class BaseCode:
+    """An immutable clinical code whose value must fully match the class's `regex`."""
+    value: str
+
+    def __post_init__(self):
+        if self.regex.fullmatch(self.value) is None:
+            raise ValueError(f"Invalid {type(self).__name__}: {self.value}")
+
+    @classmethod
+    def _primitive_type(cls):
+        return str
+
+    # Lets query engines work with these values without being told about the type
+    def _to_primitive_type(self):
+        return self.value
+
+
+class BaseMultiCodeString(str):
+    """
+    Base class for fields holding several clinical codes concatenated into one string.
+    Such fields occur in the admitted patient care spell (apcs) table of hospital
+    episode statistics: all_diagnoses (ICD10 codes) and all_procedures (OPCS4 codes).
+    It inherits from str because that's what the underlying data is; in future a
+    better implementation might be to parse the value into a set of clinical codes.
+    """
+
+    @classmethod
+    def _primitive_type(cls):
+        return str
+
+    @classmethod
+    def _code_type(cls):
+        message = "BaseMultiCodeString subclasses must implement the _code_type method"
+        raise NotImplementedError(message)
+
+
+class BNFCode(BaseCode):
+    "Pseudo BNF"
+    # Accepts either a 15-character standard code or an 11-digit appliance code
+    regex = re.compile(
+        r"""
+        # Standard BNF code
+          # Chapter, Section, Paragraph, Sub-paragraph
+          [01][0-9]{6}
+          # Chemical
+          [0-9A-Z]{2}
+          # Product, strength-formulation, generic equivalent
+          ([A-Z][0-9A-Z]){3}
+        | # OR
+        # Appliances
+        2[0-3][0-9]{9}
+        """,
+        re.VERBOSE,
+    )
+
+
+class CTV3Code(BaseCode):
+    "CTV3 (Read v3)"
+
+    # Five characters: two to five alphanumerics right-padded with dots. Some CTV3
+    # codes in the OpenCodelists coding system database (though none actually used in
+    # codelists) violate this format, by having a leading dot or starting with a tilde.
+    # However I have confirmed that, aside from a tiny handful of cases, these invalid
+    # codes are not used in the database so codelists should never need to use them.
+    regex = re.compile(
+        r"""
+        [0-9A-Za-z]{5}
+        | [0-9A-Za-z]{4}\.{1}
+        | [0-9A-Za-z]{3}\.{2}
+        | [0-9A-Za-z]{2}\.{3}
+        """,
+        re.VERBOSE,
+    )
+
+
+class ICD10Code(BaseCode):
+    "ICD-10"
+    # An uppercase letter followed by two or three digits, with no dot separator
+    regex = re.compile(r"[A-Z][0-9]{2,3}")
+
+
+class OPCS4Code(BaseCode):
+    "OPCS-4"
+
+    # Accepts a letter then two or three digits. The documented structure requires
+    # three digits with a dot between the 2nd and 3rd, but the codes we have in
+    # OpenCodelists omit the dot and sometimes have only two digits.
+    # https://en.wikipedia.org/wiki/OPCS-4#Code_structure
+    regex = re.compile(
+        r"""
+        # Uppercase letter excluding I
+        [ABCDEFGHJKLMNOPQRSTUVWXYZ]
+        [0-9]{2,3}
+        """,
+        re.VERBOSE,
+    )
+
+
+class SNOMEDCTCode(BaseCode):
+    "SNOMED-CT"
+
+    # 6-18 digit number with no leading zeros (this pattern is also used by DMDCode)
+    # https://confluence.ihtsdotools.org/display/DOCRELFMT/6.1+SCTID+Data+Type
+    regex = re.compile(r"[1-9][0-9]{5,17}")
+
+
+# Dictionary of Medicines and Devices
+class DMDCode(BaseCode):
+    "dm+d"
+
+    # Syntactically equivalent to SNOMED-CT, so the compiled pattern object is shared
+    regex = SNOMEDCTCode.regex
+
+
+#
+# ICD10 codes as a single concatenated string
+#
+# Intended for the fields in the admitted patient care (APC) part of the hospital
+# episode statistics (HES) data which concatenate all of the diagnosis codes for a
+# patient's episode or spell.
+class ICD10MultiCodeString(BaseMultiCodeString):
+    "Multiple ICD-10 codes"
+
+    # Pattern for a search prefix: one uppercase letter and at most three digits.
+    # We want to let users search this field by string prefix, and validating the
+    # prefix means we can throw an error for a bad one rather than silently failing
+    # by running but returning 0 records. The check itself is not made in this class.
+    regex = re.compile(r"[A-Z][0-9]{0,3}")
+
+    # The class representing a single code of the kind concatenated in this field
+    @classmethod
+    def _code_type(cls):
+        return ICD10Code
+
+
+#
+# OPCS4 codes as a single concatenated string
+#
+# Intended for the fields in the admitted patient care (APC) part of the hospital
+# episode statistics (HES) data which concatenate all of the procedure codes for a
+# patient's episode or spell.
+class OPCS4MultiCodeString(BaseMultiCodeString):
+    "Multiple OPCS4 codes"
+
+    # Pattern for a search prefix: one uppercase letter and at most three digits.
+    # We want to let users search this field by string prefix, and validating the
+    # prefix means we can throw an error for a bad one rather than silently failing
+    # by running but returning 0 records. The check itself is not made in this class.
+    # NOTE(review): unlike OPCS4Code this also accepts the letter I - confirm intended
+    regex = re.compile(r"[A-Z][0-9]{0,3}")
+
+    @classmethod
+    def _code_type(cls):
+        return OPCS4Code
+
+
+def codelist_from_csv(filename, *, column, category_column=None):
+    """
+    Read a codelist from a CSV file as either a list or a dictionary (for categorised
+    codelists).
+
+    _filename_<br>
+    Path to the file on disk, relative to the root of your repository. (Remember to use
+    UNIX/style/forward-slashes not Windows\\style\\backslashes.)
+
+    _column_<br>
+    Name of the column in the CSV file which contains the codes.
+
+    _category_column_<br>
+    Optional name of a column in the CSV file which contains categories to which each
+    code should be mapped. If this argument is passed then the resulting codelist will
+    be a dictionary mapping each code to its corresponding category. This can be passed
+    to the [`to_category()`](#CodePatientSeries.to_category) method to map a series of
+    codes to a series of categories.
+
+    For more detail see the [how-to guide](../how-to/examples.md/#using-codelists-with-category-columns).
+
+    Raises a `CodelistError` if the file does not exist or lacks a named column.
+    """
+    filename = Path(filename)
+    if not filename.exists():
+        # If the character which comes after the backslash in the string literal happens
+        # to form a valid escape sequence then no backslash will appear in the compiled
+        # string. Checking the repr for backslashes has false positives (e.g. a tab
+        # character will trigger it) but that seems OK in this context.
+        backslash_hint = (
+            "\n\nHINT: Use forward slash (/) instead of backslash (\\) in file paths"
+        )
+        hint = backslash_hint if "\\" in repr(filename) else ""
+        raise CodelistError(f"No CSV file at {filename}{hint}")
+    # The csv docs say to open files with `newline=""` so the parser sees raw newlines
+    with filename.open("r", newline="") as f:
+        return codelist_from_csv_lines(
+            f, column=column, category_column=category_column
+        )
+
+
+def codelist_from_csv_lines(lines, *, column, category_column=None):
+    """
+    Build a codelist from an iterable of CSV lines, the first being the header row.
+
+    Returns a list of the codes in `column` or, if `category_column` is given, a
+    dictionary mapping each code to its category. Values are stripped of surrounding
+    whitespace and empty codes discarded. Raises CodelistError for a missing column.
+    """
+    value_column = column if category_column is None else category_column
+    # `restval=""` fills short rows with "" rather than None so `.strip()` is safe
+    reader = csv.DictReader(iter(lines), restval="")
+    # `fieldnames` is None when there are no lines, which would break `in` below
+    fieldnames = reader.fieldnames or ()
+    for name in (column, value_column):
+        if name not in fieldnames:
+            raise CodelistError(f"No column '{name}' in CSV")
+    code_map = {row[column].strip(): row[value_column].strip() for row in reader}
+    # Discard any empty codes
+    code_map.pop("", None)
+    return list(code_map) if category_column is None else code_map