[e988c2]: / ehrql / codes.py

Download this file

233 lines (187 with data), 7.5 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
"""
We make each coding system a distinct type. The query model's type checking will then
enforce that queries use the appropriate coding system for a given column.
"""
import csv
import dataclasses
import re
from pathlib import Path
class CodelistError(ValueError):
    """Raised when a codelist CSV cannot be found or lacks a requested column."""
@dataclasses.dataclass(frozen=True)
class BaseCode:
    # Immutable wrapper around a single code string. Validation relies on a compiled
    # `regex` class attribute, which this base class does not define itself; the
    # concrete code types supply it.
    value: str

    def __post_init__(self):
        # `fullmatch` requires the pattern to account for the whole value, not merely
        # a prefix of it
        if self.regex.fullmatch(self.value) is None:
            raise ValueError(f"Invalid {type(self).__name__}: {self.value}")

    @classmethod
    def _primitive_type(cls):
        return str

    # Query engines are not explicitly told about these types beforehand; the presence
    # of this method is what allows them to work with such values anyway
    def _to_primitive_type(self):
        return self.value
# A base class for fields that are concatenated lists of clinical codes. This occurs
# in the admitted patient care spell (apcs) table of hospital episode statistics for
# all_diagnoses (ICD10 codes), and all_procedures (OPCS4 codes).
#
# This inherits from str because that's what the underlying data is, but is in this
# file as it's sort of a code. In future a better implementation might be to parse the
# field value into a Set of clinical codes.
class BaseMultiCodeString(str):
    @classmethod
    def _primitive_type(cls):
        return str

    @classmethod
    def _code_type(cls):
        # Each concrete subclass overrides this to return the code class for the
        # individual codes held in the string; the base class has no such type
        message = "BaseMultiCodeString subclasses must implement the _code_type method"
        raise NotImplementedError(message)
class BNFCode(BaseCode):
    "Pseudo BNF"

    # Two alternative shapes are accepted: a standard BNF code or an appliance code.
    # Verbose mode lets the pattern carry its own inline commentary.
    regex = re.compile(
        r"""
        # Standard BNF code
        # Chapter, Section, Paragraph, Sub-paragraph
        [01][0-9]{6}
        # Chemical
        [0-9A-Z]{2}
        # Product, strength-formulation, generic equivalent
        ([A-Z][0-9A-Z]){3}
        | # OR
        # Appliances
        2[0-3][0-9]{9}
        """,
        flags=re.VERBOSE,
    )
class CTV3Code(BaseCode):
    "CTV3 (Read v3)"

    # A handful of CTV3 codes in the OpenCodelists coding system database break the
    # format below (a leading dot, or a leading tilde), though none of them appear in
    # any actual codelist. Aside from a tiny number of cases these invalid codes are
    # confirmed to be unused in the database, so there should never be a need to create
    # codelists which use them.
    #
    # Accepted shapes: five alphanumerics, or four/three/two alphanumerics padded out to
    # five characters with trailing dots.
    regex = re.compile(
        r"""
        [0-9A-Za-z]{5}
        | [0-9A-Za-z]{4}\.{1}
        | [0-9A-Za-z]{3}\.{2}
        | [0-9A-Za-z]{2}\.{3}
        """,
        flags=re.VERBOSE,
    )
class ICD10Code(BaseCode):
    "ICD-10"

    # One uppercase letter followed by two or three digits
    regex = re.compile(r"[A-Z][0-9]{2,3}")
class OPCS4Code(BaseCode):
    "OPCS-4"

    # According to the documented structure there should be three digits with a dot
    # between the 2nd and 3rd, but the codes held in OpenCodelists leave the dot out and
    # sometimes have just two digits, so that is the shape accepted here.
    # https://en.wikipedia.org/wiki/OPCS-4#Code_structure
    regex = re.compile(
        r"""
        # Uppercase letter excluding I
        [ABCDEFGHJKLMNOPQRSTUVWXYZ]
        [0-9]{2,3}
        """,
        flags=re.VERBOSE,
    )
class SNOMEDCTCode(BaseCode):
    "SNOMED-CT"

    # A number of between 6 and 18 digits whose first digit is never zero
    # https://confluence.ihtsdotools.org/display/DOCRELFMT/6.1+SCTID+Data+Type
    regex = re.compile(r"[1-9][0-9]{5,17}")
# Dictionary of Medicines and Devices
class DMDCode(BaseCode):
    "dm+d"

    # dm+d codes share SNOMED-CT's syntax, so the very same compiled pattern is reused
    regex = SNOMEDCTCode.regex
#
# ICD10 codelist as concatenated string
#
# This is specifically for fields in the admitted patient care (APC) part
# of the hospital episode statistics (HES) data where there are fields
# that are a concatenation of all diagnosis codes for a patient's episode
# or spell.
class ICD10MultiCodeString(BaseMultiCodeString):
    "Multiple ICD-10 codes"

    # Users may search this field by string prefix. Validating the prefix against this
    # pattern (an uppercase letter followed by up to three digits) means an invalid one
    # can be reported as an error, instead of the query silently running and returning
    # 0 records.
    regex = re.compile(r"[A-Z][0-9]{0,3}")

    @classmethod
    def _code_type(cls):
        return ICD10Code
#
# OPCS4 codelist as concatenated string
#
# This is specifically for fields in the admitted patient care (APC) part
# of the hospital episode statistics (HES) data where there are fields
# that are a concatenation of all procedure codes for a patient's episode
# or spell.
class OPCS4MultiCodeString(BaseMultiCodeString):
    "Multiple OPCS4 codes"

    @classmethod
    def _code_type(cls):
        return OPCS4Code

    # We want to allow prefix searching on this field so users can search this field
    # for a string prefix. This ensures they pass a valid prefix so we can throw an
    # error, rather than silently failing by running but returning 0 records.
    #
    # The leading letter excludes "I", in line with the pattern used for individual
    # OPCS-4 codes: a prefix starting with "I" cannot match any valid code, so accepting
    # it would produce exactly the silent zero-record result this check is meant to
    # prevent.
    regex = re.compile(r"[A-HJ-Z][0-9]{0,3}")
def codelist_from_csv(filename, *, column, category_column=None):
    """
    Read a codelist from a CSV file as either a list or a dictionary (for categorised
    codelists).

    _filename_<br>
    Path to the file on disk, relative to the root of your repository. (Remember to use
    UNIX/style/forward-slashes not Windows\\style\\backslashes.)

    _column_<br>
    Name of the column in the CSV file which contains the codes.

    _category_column_<br>
    Optional name of a column in the CSV file which contains categories to which each
    code should be mapped. If this argument is passed then the resulting codelist will
    be a dictionary mapping each code to its corresponding category. This can be passed
    to the [`to_category()`](#CodePatientSeries.to_category) method to map a series of
    codes to a series of categories.

    For more detail see the [how-to guide](../how-to/examples.md/#using-codelists-with-category-columns).

    A `CodelistError` is raised if there is no file at the given path.
    """
    filename = Path(filename)
    if not filename.exists():
        # If the character which comes after the backslash in the string literal happens
        # to form a valid escape sequence then no backslash will appear in the compiled
        # string. Checking the repr for backslashes has false positives (e.g. a tab
        # character will trigger it) but that seems OK in this context.
        if "\\" in repr(filename):
            hint = (
                "\n\n"
                "HINT: Use forward slash (/) instead of backslash (\\) in file paths"
            )
        else:
            hint = ""
        raise CodelistError(f"No CSV file at {filename}{hint}")
    # The csv module documentation says files handed to a reader should be opened with
    # `newline=""` so that the reader does its own newline handling; without it, line
    # breaks embedded inside quoted fields may not be interpreted correctly
    with filename.open("r", newline="") as f:
        return codelist_from_csv_lines(
            f, column=column, category_column=category_column
        )
def codelist_from_csv_lines(lines, *, column, category_column=None):
    """
    Build a codelist from an iterable of CSV lines, the first of which is the header.

    Returns a list of codes when `category_column` is None, otherwise a dictionary
    mapping each code to its category. Codes and categories have surrounding whitespace
    stripped and rows with an empty code are discarded. A code which appears on several
    rows is included once, with the category taken from the last of those rows.

    Raises `CodelistError` if `column` or `category_column` is not one of the column
    names in the header, including when `lines` is empty and so has no header at all.
    """
    return_list = category_column is None
    if return_list:
        # Reusing the code column as the category column lets both the list and the
        # dictionary case share the same mapping logic below
        category_column = column

    # Using `restval=""` ensures we never get None instead of string, so we can always
    # call `.strip()` without blowing up
    reader = csv.DictReader(iter(lines), restval="")
    # `fieldnames` is None, not an empty list, when there is no header row to read (e.g.
    # an empty file). Treat that as "no columns" so the caller gets the usual
    # CodelistError rather than a TypeError from a membership test against None.
    fieldnames = reader.fieldnames or ()
    for required in (column, category_column):
        if required not in fieldnames:
            raise CodelistError(f"No column '{required}' in CSV")

    code_map = {row[column].strip(): row[category_column].strip() for row in reader}
    # Discard any empty codes
    code_map.pop("", None)
    return list(code_map) if return_list else code_map