|
a |
|
b/ehrql/codes.py |
|
|
1 |
""" |
|
|
2 |
We make each coding system a distinct type. The query model's type checking will then |
|
|
3 |
enforce that queries use the appropriate coding system for a given column. |
|
|
4 |
""" |
|
|
5 |
|
|
|
6 |
import csv |
|
|
7 |
import dataclasses |
|
|
8 |
import re |
|
|
9 |
from pathlib import Path |
|
|
10 |
|
|
|
11 |
|
|
|
12 |
class CodelistError(ValueError):
    """Raised when a codelist CSV cannot be found or lacks a requested column."""
|
|
13 |
|
|
|
14 |
|
|
|
15 |
@dataclasses.dataclass(frozen=True)
class BaseCode:
    # Immutable wrapper around a single clinical code string. Concrete coding systems
    # subclass this and supply a compiled `regex` class attribute; the whole of `value`
    # must match that pattern or construction fails.
    value: str

    # The presence of this method allows query engines to work with values of this type,
    # despite not being explicitly told about them beforehand
    def _to_primitive_type(self):
        return self.value

    @classmethod
    def _primitive_type(cls):
        # Codes are represented as plain strings once unwrapped
        return str

    def __post_init__(self):
        # `regex` is looked up on the instance, so each subclass validates against its
        # own pattern
        if self.regex.fullmatch(self.value) is None:
            system_name = type(self).__name__
            raise ValueError(f"Invalid {system_name}: {self.value}")
|
|
31 |
|
|
|
32 |
|
|
|
33 |
# A base class for fields that are concatenated lists of clinical codes. This occurs |
|
|
34 |
# in the admitted patient care spell (apcs) table of hospital episode statistics for |
|
|
35 |
# all_diagnoses (ICD10 codes), and all_procedures (OPCS4 codes). |
|
|
36 |
# |
|
|
37 |
# This inherits from str because that's what the underlying data is, but is in this |
|
|
38 |
# file as it's sort of a code. In future a better implementation might be to parse the |
|
|
39 |
# field value into a Set of clinical codes. |
|
|
40 |
class BaseMultiCodeString(str):
    # Instances are ordinary strings; the classmethods below only describe the type.

    @classmethod
    def _primitive_type(cls):
        # The underlying data is a plain string
        return str

    @classmethod
    def _code_type(cls):
        # Subclasses override this to return the code class held in the concatenated
        # string; the base class has no answer of its own.
        message = "BaseMultiCodeString subclasses must implement the _code_type method"
        raise NotImplementedError(message)
|
|
50 |
|
|
|
51 |
|
|
|
52 |
class BNFCode(BaseCode):
    "Pseudo BNF"

    # Pattern that the whole of `value` must match (BaseCode.__post_init__ applies it
    # with `fullmatch`). Two alternatives are accepted: a 15 character standard code
    # (7 digits, 2 alphanumerics, then three letter+alphanumeric pairs) or an 11 digit
    # appliance code starting 20-23. `re.VERBOSE` is what allows the inline comments
    # and layout whitespace inside the pattern.
    regex = re.compile(
        r"""
        # Standard BNF code
          # Chapter, Section, Paragraph, Sub-paragraph
          [01][0-9]{6}
          # Chemical
          [0-9A-Z]{2}
          # Product, strength-formulation, generic equivalent
          ([A-Z][0-9A-Z]){3}
        | # OR
        # Appliances
          2[0-3][0-9]{9}
        """,
        re.VERBOSE,
    )
|
|
70 |
|
|
|
71 |
|
|
|
72 |
class CTV3Code(BaseCode):
    "CTV3 (Read v3)"

    # Some of the CTV3 codes in the OpenCodelists coding system database (though not any
    # actually used in codelists) violate the below format, either by having a leading
    # dot or by starting with a tilde. However I have confirmed that, aside from a tiny
    # handful of cases, these invalid codes are not used in the database so there should
    # never be a need to create codelists which use them.
    #
    # Accepted shape: always five characters, made up of two to five alphanumerics
    # right-padded with dots (e.g. `XX...`, `XXX..`, `XXXX.`, `XXXXX`).
    regex = re.compile(
        r"""
        [0-9A-Za-z]{5}
        | [0-9A-Za-z]{4}\.{1}
        | [0-9A-Za-z]{3}\.{2}
        | [0-9A-Za-z]{2}\.{3}
        """,
        re.VERBOSE,
    )
|
|
89 |
|
|
|
90 |
|
|
|
91 |
class ICD10Code(BaseCode):
    "ICD-10"

    # One uppercase letter followed by two or three digits; the pattern has no place
    # for a dot separator, so dotted forms are rejected.
    regex = re.compile(r"[A-Z][0-9]{2,3}")
|
|
95 |
|
|
|
96 |
|
|
|
97 |
class OPCS4Code(BaseCode):
    "OPCS-4"

    # The documented structure requires three digits, and a dot between the 2nd and 3rd
    # digit, but the codes we have in OpenCodelists omit the dot and sometimes have only
    # two digits.
    # https://en.wikipedia.org/wiki/OPCS-4#Code_structure
    #
    # So the accepted shape is: one uppercase letter other than I, then two or three
    # digits with no dot.
    regex = re.compile(
        r"""
        # Uppercase letter excluding I
        [ABCDEFGHJKLMNOPQRSTUVWXYZ]
        [0-9]{2,3}
        """,
        re.VERBOSE,
    )
|
|
112 |
|
|
|
113 |
|
|
|
114 |
class SNOMEDCTCode(BaseCode):
    "SNOMED-CT"

    # 6-18 digit number with no leading zeros
    # https://confluence.ihtsdotools.org/display/DOCRELFMT/6.1+SCTID+Data+Type
    #
    # Only the digit count and the leading digit are checked by this pattern; nothing
    # else about the identifier is validated here.
    regex = re.compile(r"[1-9][0-9]{5,17}")
|
|
120 |
|
|
|
121 |
|
|
|
122 |
# Dictionary of Medicines and Devices |
|
|
123 |
class DMDCode(BaseCode):
    "dm+d"

    # Syntactically equivalent to SNOMED-CT
    #
    # The compiled pattern object is shared with SNOMEDCTCode rather than copied, so
    # the two classes always accept exactly the same strings while remaining distinct
    # types.
    regex = SNOMEDCTCode.regex
|
|
128 |
|
|
|
129 |
|
|
|
130 |
# |
|
|
131 |
# ICD10 codelist as concatenated string |
|
|
132 |
# |
|
|
133 |
# This is specifically for fields in the admitted patient care (APC) part |
|
|
134 |
# of the hospital episode statistics (HES) data where there are fields |
|
|
135 |
# that are a concatenation of all diagnosis codes for a patient's episode |
|
|
136 |
# or spell. |
|
|
137 |
class ICD10MultiCodeString(BaseMultiCodeString):
    "Multiple ICD-10 codes"

    @classmethod
    def _code_type(cls):
        # The individual codes held in the concatenated string are ICD-10 codes
        return ICD10Code

    # We want to allow prefix searching on this field so users can
    # search this field for a string prefix. This ensures they pass
    # a valid prefix so we can throw an error, rather than silently
    # failing by running but returning 0 records
    #
    # Accepts one uppercase letter followed by zero to three digits, i.e. any leading
    # portion of an ICD10Code as well as a complete code.
    # NOTE(review): nothing in this file applies this pattern; presumably the
    # validation happens where prefix arguments are handled -- confirm against callers.
    regex = re.compile(r"[A-Z][0-9]{0,3}")
|
|
149 |
|
|
|
150 |
|
|
|
151 |
# |
|
|
152 |
# OPCS4 codelist as concatenated string |
|
|
153 |
# |
|
|
154 |
# This is specifically for fields in the admitted patient care (APC) part |
|
|
155 |
# of the hospital episode statistics (HES) data where there are fields |
|
|
156 |
# that are a concatenation of all procedure codes for a patient's episode |
|
|
157 |
# or spell. |
|
|
158 |
class OPCS4MultiCodeString(BaseMultiCodeString):
    "Multiple OPCS4 codes"

    @classmethod
    def _code_type(cls):
        # The individual codes held in the concatenated string are OPCS-4 codes
        return OPCS4Code

    # We want to allow prefix searching on this field so users can
    # search this field for a string prefix. This ensures they pass
    # a valid prefix so we can throw an error, rather than silently
    # failing by running but returning 0 records
    #
    # Accepts one uppercase letter followed by zero to three digits.
    # NOTE(review): this admits the letter I, which OPCS4Code.regex deliberately
    # excludes, so a prefix such as "I1" passes here although no valid OPCS4Code can
    # start with it -- confirm whether that is intentional.
    regex = re.compile(r"[A-Z][0-9]{0,3}")
|
|
170 |
|
|
|
171 |
|
|
|
172 |
def codelist_from_csv(filename, *, column, category_column=None):
    """
    Read a codelist from a CSV file as either a list or a dictionary (for categorised
    codelists).

    _filename_<br>
    Path to the file on disk, relative to the root of your repository. (Remember to use
    UNIX/style/forward-slashes not Windows\\style\\backslashes.)

    _column_<br>
    Name of the column in the CSV file which contains the codes.

    _category_column_<br>
    Optional name of a column in the CSV file which contains categories to which each
    code should be mapped. If this argument is passed then the resulting codelist will
    be a dictionary mapping each code to its corresponding category. This can be passed
    to the [`to_category()`](#CodePatientSeries.to_category) method to map a series of
    codes to a series of categories.

    For more detail see the [how-to guide](../how-to/examples.md/#using-codelists-with-category-columns).
    """
    filename = Path(filename)
    if not filename.exists():
        # If the character which comes after the backslash in the string literal happens
        # to form a valid escape sequence then no backslash will appear in the compiled
        # string. Checking the repr for backslashes has false positives (e.g. a tab
        # character will trigger it) but that seems OK in this context.
        if "\\" in repr(filename):
            hint = (
                "\n\n"
                "HINT: Use forward slash (/) instead of backslash (\\) in file paths"
            )
        else:
            hint = ""
        raise CodelistError(f"No CSV file at {filename}{hint}")
    # The csv module documents that files handed to it should be opened with
    # `newline=""`: otherwise line endings are translated before the parser sees them
    # and newlines embedded inside quoted fields are not interpreted correctly.
    with filename.open("r", newline="") as f:
        return codelist_from_csv_lines(
            f, column=column, category_column=category_column
        )
|
|
211 |
|
|
|
212 |
|
|
|
213 |
def codelist_from_csv_lines(lines, *, column, category_column=None):
    """
    Build a codelist from an iterable of CSV lines, the first of which is the header.

    Returns a list of the distinct, non-empty codes found in `column` (in order of first
    appearance) when `category_column` is None; otherwise returns a dictionary mapping
    each code to the value in `category_column`. Codes and categories have surrounding
    whitespace stripped, and if a code appears more than once the last row wins.

    Raises CodelistError if `column` or `category_column` is not among the header names,
    which includes the case where `lines` is empty and so has no header at all.
    """
    return_list = category_column is None
    if return_list:
        # Treat the code column as its own category so that one code path builds the
        # mapping in both modes; only the keys are returned in this case.
        category_column = column
    # Using `restval=""` ensures we never get None instead of string, so we can always
    # call `.strip()` without blowing up
    reader = csv.DictReader(iter(lines), restval="")
    # `fieldnames` is None (not an empty list) when there is no header row to read, and
    # a membership test against None would raise a bare TypeError rather than the
    # CodelistError callers expect.
    fieldnames = reader.fieldnames or ()
    for required in (column, category_column):
        if required not in fieldnames:
            raise CodelistError(f"No column '{required}' in CSV")
    code_map = {row[column].strip(): row[category_column].strip() for row in reader}
    # Discard any empty codes
    code_map.pop("", None)
    return list(code_map) if return_list else code_map