Diff of /ehrql/codes.py [000000] .. [e988c2]

Switch to unified view

a b/ehrql/codes.py
1
"""
We make each coding system a distinct type. The query model's type checking will then
enforce that queries use the appropriate coding system for a given column.
"""
5
6
import csv
7
import dataclasses
8
import re
9
from pathlib import Path
10
11
12
class CodelistError(ValueError):
    """Error raised for problems with a codelist CSV (missing file or column)."""
13
14
15
@dataclasses.dataclass(frozen=True)
class BaseCode:
    # Common behaviour for a single code in one coding system. This class does not
    # define `regex` itself: each concrete subclass is expected to provide a compiled
    # pattern as a class attribute, and `value` must match that pattern in full.
    value: str

    def __post_init__(self):
        # Validate eagerly so that a malformed code fails at construction time.
        # Raises ValueError naming the concrete subclass and the offending value.
        if not self.regex.fullmatch(self.value):
            raise ValueError(f"Invalid {self.__class__.__name__}: {self.value}")

    @classmethod
    def _primitive_type(cls):
        return str

    # The presence of this method allows query engines to work with values of this type,
    # despite not being explicitly told about them beforehand
    def _to_primitive_type(self):
        return self.value
31
32
33
# A base class for fields that are concatenated lists of clinical codes. This occurs
# in the admitted patient care spell (apcs) table of hospital episode statistics for
# all_diagnoses (ICD10 codes), and all_procedures (OPCS4 codes).
#
# This inherits from str because that's what the underlying data is, but is in this
# file as it's sort of a code. In future a better implementation might be to parse the
# field value into a Set of clinical codes.
class BaseMultiCodeString(str):
    @classmethod
    def _code_type(cls):
        # Subclasses override this to return the code class held in the string
        raise NotImplementedError(
            "BaseMultiCodeString subclasses must implement the _code_type method"
        )

    @classmethod
    def _primitive_type(cls):
        return str
50
51
52
class BNFCode(BaseCode):
    "Pseudo BNF"

    # Compiled with re.VERBOSE, so whitespace and `#` comments inside the pattern are
    # ignored and serve only as documentation of each component.
    regex = re.compile(
        r"""
        # Standard BNF code
          # Chapter, Section, Paragraph, Sub-paragraph
          [01][0-9]{6}
          # Chemical
          [0-9A-Z]{2}
          # Product, strength-formulation, generic equivalent
          ([A-Z][0-9A-Z]){3}
        | # OR
        # Appliances
        2[0-3][0-9]{9}
        """,
        re.VERBOSE,
    )
70
71
72
class CTV3Code(BaseCode):
    "CTV3 (Read v3)"

    # Some of the CTV3 codes in the OpenCodelists coding system database (though not any
    # actually used in codelists) violate the below format, either by having a leading
    # dot or by starting with a tilde. However I have confirmed that, aside from a tiny
    # handful of cases, these invalid codes are not used in the database so there should
    # never be a need to create codelists which use them.
    #
    # Accepted shape: five characters in total, made of 2-5 alphanumerics followed by
    # dots filling the remaining positions.
    regex = re.compile(
        r"""
        [0-9A-Za-z]{5}
        | [0-9A-Za-z]{4}\.{1}
        | [0-9A-Za-z]{3}\.{2}
        | [0-9A-Za-z]{2}\.{3}
        """,
        re.VERBOSE,
    )
89
90
91
class ICD10Code(BaseCode):
    "ICD-10"

    # One uppercase letter followed by two or three digits
    regex = re.compile(r"[A-Z][0-9]{2,3}")
95
96
97
class OPCS4Code(BaseCode):
    "OPCS-4"

    # The documented structure requires three digits, and a dot between the 2nd and 3rd
    # digit, but the codes we have in OpenCodelists omit the dot and sometimes have only
    # two digits.
    # https://en.wikipedia.org/wiki/OPCS-4#Code_structure
    regex = re.compile(
        r"""
        # Uppercase letter excluding I
        [ABCDEFGHJKLMNOPQRSTUVWXYZ]
        [0-9]{2,3}
        """,
        re.VERBOSE,
    )
112
113
114
class SNOMEDCTCode(BaseCode):
    "SNOMED-CT"

    # 6-18 digit number with no leading zeros
    # https://confluence.ihtsdotools.org/display/DOCRELFMT/6.1+SCTID+Data+Type
    regex = re.compile(r"[1-9][0-9]{5,17}")
120
121
122
# Dictionary of Medicines and Devices
class DMDCode(BaseCode):
    "dm+d"

    # Syntactically equivalent to SNOMED-CT, so the same compiled pattern is reused
    regex = SNOMEDCTCode.regex
128
129
130
#
# ICD10 codelist as concatenated string
#
# This is specifically for fields in the admitted patient care (APC) part
# of the hospital episode statistics (HES) data where there are fields
# that are a concatenation of all diagnosis codes for a patient's episode
# or spell.
class ICD10MultiCodeString(BaseMultiCodeString):
    "Multiple ICD-10 codes"

    @classmethod
    def _code_type(cls):
        return ICD10Code

    # We want to allow prefix searching on this field so users can
    # search this field for a string prefix. This ensures they pass
    # a valid prefix so we can throw an error, rather than silently
    # failing by running but returning 0 records
    regex = re.compile(r"[A-Z][0-9]{0,3}")
149
150
151
#
# OPCS4 codelist as concatenated string
#
# This is specifically for fields in the admitted patient care (APC) part
# of the hospital episode statistics (HES) data where there are fields
# that are a concatenation of all procedure codes for a patient's episode
# or spell.
class OPCS4MultiCodeString(BaseMultiCodeString):
    "Multiple OPCS4 codes"

    @classmethod
    def _code_type(cls):
        return OPCS4Code

    # We want to allow prefix searching on this field so users can
    # search this field for a string prefix. This ensures they pass
    # a valid prefix so we can throw an error, rather than silently
    # failing by running but returning 0 records
    #
    # NOTE(review): this prefix pattern accepts the letter I, whereas OPCS4Code.regex
    # excludes it -- confirm whether prefixes starting with I should be rejected too.
    regex = re.compile(r"[A-Z][0-9]{0,3}")
170
171
172
def codelist_from_csv(filename, *, column, category_column=None):
    """
    Read a codelist from a CSV file as either a list or a dictionary (for categorised
    codelists).

    _filename_<br>
    Path to the file on disk, relative to the root of your repository. (Remember to use
    UNIX/style/forward-slashes not Windows\\style\\backslashes.)

    _column_<br>
    Name of the column in the CSV file which contains the codes.

    _category_column_<br>
    Optional name of a column in the CSV file which contains categories to which each
    code should be mapped. If this argument is passed then the resulting codelist will
    be a dictionary mapping each code to its corresponding category. This can be passed
    to the [`to_category()`](#CodePatientSeries.to_category) method to map a series of
    codes to a series of categories.

    For more detail see the [how-to guide](../how-to/examples.md/#using-codelists-with-category-columns).
    """
    filename = Path(filename)
    if not filename.exists():
        # If the character which comes after the backslash in the string literal happens
        # to form a valid escape sequence then no backslash will appear in the compiled
        # string. Checking the repr for backslashes has false positives (e.g. a tab
        # character will trigger it) but that seems OK in this context.
        if "\\" in repr(filename):
            hint = (
                "\n\n"
                "HINT: Use forward slash (/) instead of backslash (\\) in file paths"
            )
        else:
            hint = ""
        raise CodelistError(f"No CSV file at {filename}{hint}")
    # The csv module expects file objects to be opened with newline="" so that it can
    # handle line endings itself (including newlines embedded in quoted fields)
    with filename.open("r", newline="") as f:
        return codelist_from_csv_lines(
            f, column=column, category_column=category_column
        )
211
212
213
def codelist_from_csv_lines(lines, *, column, category_column=None):
    """
    Build a codelist from an iterable of CSV lines whose first row is the header.

    Returns a list of the distinct, non-empty, whitespace-stripped values found in
    `column` (in first-seen order) when `category_column` is None; otherwise a dict
    mapping each such code to the stripped value of `category_column` on the last row
    where that code appears.

    Raises CodelistError if `column` or `category_column` is not among the header
    names, including when the input has no header row at all.
    """
    return_list = category_column is None
    if return_list:
        # Reuse the code column so that the same dict-building logic serves both modes
        category_column = column
    # Using `restval=""` ensures we never get None instead of string, so we can always
    # call `.strip()` without blowing up
    reader = csv.DictReader(iter(lines), restval="")
    # `fieldnames` is None when there is no header row to read (e.g. an empty file);
    # treat that as "no columns" so we report a CodelistError rather than a TypeError
    fieldnames = reader.fieldnames or ()
    for required in (column, category_column):
        if required not in fieldnames:
            raise CodelistError(f"No column '{required}' in CSV")
    code_map = {row[column].strip(): row[category_column].strip() for row in reader}
    # Discard any empty codes
    code_map.pop("", None)
    return list(code_map) if return_list else code_map