--- a +++ b/ehrql/codes.py @@ -0,0 +1,232 @@ +""" +We make each coding system a distinct type. The query model's type checking will then +enforce that queries use the appropriate coding system for a given column. +""" + +import csv +import dataclasses +import re +from pathlib import Path + + +class CodelistError(ValueError): ... + + +@dataclasses.dataclass(frozen=True) +class BaseCode: + value: str + + def __post_init__(self): + if not self.regex.fullmatch(self.value): + raise ValueError(f"Invalid {self.__class__.__name__}: {self.value}") + + @classmethod + def _primitive_type(cls): + return str + + # The presence of this method allows query engines to work with values of this type, + # despite not being explicitly told about them beforehand + def _to_primitive_type(self): + return self.value + + +# A base class for fields that are concatenated lists of clinical codes. This occurs +# in the admitted patient care spell (apcs) table of hospital episode statistics for +# all_diagnoses (ICD10 codes), and all_procedures (OPCS4 codes). +# +# This inherits from str because that's what the underlying data is, but is in this +# file as it's sort of a code. In future a better implementation might be to parse the +# field value into a Set of clinical codes. +class BaseMultiCodeString(str): + @classmethod + def _code_type(cls): + raise NotImplementedError( + "BaseMultiCodeString subclasses must implement the _code_type method" + ) + + @classmethod + def _primitive_type(cls): + return str + + +class BNFCode(BaseCode): + "Pseudo BNF" + + regex = re.compile( + r""" + # Standard BNF code + # Chapter, Section, Paragraph, Sub-paragraph + [01][0-9]{6} + # Chemical + [0-9A-Z]{2} + # Product, strength-formulation, generic equivalent + ([A-Z][0-9A-Z]){3} + | # OR + # Appliances + 2[0-3][0-9]{9} + """, + re.VERBOSE, + ) + + +class CTV3Code(BaseCode): + "CTV3 (Read v3)" + + # Some of the CTV3 codes in the OpenCodelists coding system database (though not any + # actually used in codelists) violate the below format, either by having a leading + # dot or by starting with a tilde. However I have confirmed that, aside from a tiny + # handful of cases, these invalid codes are not used in the database so there should + # never be a need to create codelists which use them. + regex = re.compile( + r""" + [0-9A-Za-z]{5} + | [0-9A-Za-z]{4}\.{1} + | [0-9A-Za-z]{3}\.{2} + | [0-9A-Za-z]{2}\.{3} + """, + re.VERBOSE, + ) + + +class ICD10Code(BaseCode): + "ICD-10" + + regex = re.compile(r"[A-Z][0-9]{2,3}") + + +class OPCS4Code(BaseCode): + "OPCS-4" + + # The documented structure requires three digits, and a dot between the 2nd and 3rd + # digit, but the codes we have in OpenCodelists omit the dot and sometimes have only + # two digits. + # https://en.wikipedia.org/wiki/OPCS-4#Code_structure + regex = re.compile( + r""" + # Uppercase letter excluding I + [ABCDEFGHJKLMNOPQRSTUVWXYZ] + [0-9]{2,3} + """, + re.VERBOSE, + ) + + +class SNOMEDCTCode(BaseCode): + "SNOMED-CT" + + # 6-18 digit number with no leading zeros + # https://confluence.ihtsdotools.org/display/DOCRELFMT/6.1+SCTID+Data+Type + regex = re.compile(r"[1-9][0-9]{5,17}") + + +# Dictionary of Medicines and Devices +class DMDCode(BaseCode): + "dm+d" + + # Syntactically equivalent to SNOMED-CT + regex = SNOMEDCTCode.regex + + +# +# ICD10 codelist as concatenated string +# +# This is specifically for fields in the admitted patient care (APC) part +# of the hospital episode statistics (HES) data where there are fields +# that are a concatenation of all diagnosis codes for a patient's episode +# or spell. +class ICD10MultiCodeString(BaseMultiCodeString): + "Multiple ICD-10 codes" + + @classmethod + def _code_type(cls): + return ICD10Code + + # We want to allow prefix searching on this field so users can + # search this field for a string prefix. This ensures they pass + # a valid prefix so we can throw an error, rather than silently + # failing by running but returning 0 records + regex = re.compile(r"[A-Z][0-9]{0,3}") + + +# +# OPCS4 codelist as concatenated string +# +# This is specifically for fields in the admitted patient care (APC) part +# of the hospital episode statistics (HES) data where there are fields +# that are a concatenation of all procedure codes for a patient's episode +# or spell. +class OPCS4MultiCodeString(BaseMultiCodeString): + "Multiple OPCS4 codes" + + @classmethod + def _code_type(cls): + return OPCS4Code + + # We want to allow prefix searching on this field so users can + # search this field for a string prefix. This ensures they pass + # a valid prefix so we can throw an error, rather than silently + # failing by running but returning 0 records + regex = re.compile(r"[A-Z][0-9]{0,3}") + + +def codelist_from_csv(filename, *, column, category_column=None): + """ + Read a codelist from a CSV file as either a list or a dictionary (for categorised + codelists). + + _filename_<br> + Path to the file on disk, relative to the root of your repository. (Remember to use + UNIX/style/forward-slashes not Windows\\style\\backslashes.) + + _column_<br> + Name of the column in the CSV file which contains the codes. + + _category_column_<br> + Optional name of a column in the CSV file which contains categories to which each + code should be mapped. If this argument is passed then the resulting codelist will + be a dictionary mapping each code to its corresponding category. This can be passed + to the [`to_category()`](#CodePatientSeries.to_category) method to map a series of + codes to a series of categories. + + For more detail see the [how-to guide](../how-to/examples.md/#using-codelists-with-category-columns). + """ + filename = Path(filename) + if not filename.exists(): + # If the character which comes after the backslash in the string literal happens + # to form a valid escape sequence then no backslash will appear in the compiled + # string. Checking the repr for backslashes has false positives (e.g. a tab + # character will trigger it) but that seems OK in this context. + if "\\" in repr(filename): + hint = ( + "\n\n" + "HINT: Use forward slash (/) instead of backslash (\\) in file paths" + ) + else: + hint = "" + raise CodelistError(f"No CSV file at {filename}{hint}") + with filename.open("r") as f: + return codelist_from_csv_lines( + f, column=column, category_column=category_column + ) + + +def codelist_from_csv_lines(lines, *, column, category_column=None): + if category_column is None: + category_column = column + return_list = True + else: + return_list = False + # Using `restval=""` ensures we never get None instead of string, so we can always + # call `.strip()` without blowing up + reader = csv.DictReader(iter(lines), restval="") + if column not in reader.fieldnames: + raise CodelistError(f"No column '{column}' in CSV") + if category_column not in reader.fieldnames: + raise CodelistError(f"No column '{category_column}' in CSV") + code_map = {row[column].strip(): row[category_column].strip() for row in reader} + # Discard any empty codes + code_map.pop("", None) + if return_list: + return list(code_map) + else: + return code_map