|
a |
|
b/tests/test_specific_cases.py |
|
|
1 |
import pytest |
|
|
2 |
|
|
|
3 |
import selfies as sf |
|
|
4 |
|
|
|
5 |
|
|
|
6 |
def decode_eq(selfies, smiles):
    """Return True when decoding ``selfies`` yields exactly ``smiles``.

    The check is a plain string comparison against the output of
    ``sf.decoder``.
    """
    return sf.decoder(selfies) == smiles
|
|
9 |
|
|
|
10 |
|
|
|
11 |
def roundtrip_eq(smiles_in, smiles_out):
    """Return True when ``smiles_in``, encoded with ``sf.encoder`` and then
    decoded with ``sf.decoder``, comes back as exactly ``smiles_out``.
    """
    return sf.decoder(sf.encoder(smiles_in)) == smiles_out
|
|
15 |
|
|
|
16 |
|
|
|
17 |
def test_branch_and_ring_at_state_X0():
    """Branch and ring symbols at state X0 (the very beginning of a
    SELFIES) should be skipped by the decoder.
    """

    cases = (
        ("[Branch3][C][S][C][O]", "CSCO"),
        ("[Ring3][C][S][C][O]", "CSCO"),
        ("[Branch1][Ring1][Ring3][C][S][C][O]", "CSCO"),
    )
    for selfies, smiles in cases:
        assert decode_eq(selfies, smiles)
|
|
25 |
|
|
|
26 |
|
|
|
27 |
def test_branch_at_state_X1():
    """Branch symbols at state X1 (i.e. at an atom that can only make one
    bond) should be skipped by the decoder.
    """

    cases = (
        ("[C][C][O][Branch1][C][I]", "CCOCI"),
        ("[C][C][C][O][#Branch3][C][I]", "CCCOCI"),
    )
    for selfies, smiles in cases:
        assert decode_eq(selfies, smiles)
|
|
34 |
|
|
|
35 |
|
|
|
36 |
def test_branch_and_ring_decrement_state():
    """Branch and ring symbols should properly decrement the derivation
    state.
    """

    ring_cases = (
        ("[C][C][C][Ring1][Ring1][#C]", "C1CC1=C"),
        ("[C][=C][C][C][#Ring1][Ring1][#C]", "C=C1CC1"),
        ("[C][O][C][C][=Ring1][Ring1][#C]", "COCCC"),
    )
    branch_cases = (
        ("[C][=C][Branch1][C][=C][#C]", "C=C(C)C"),
    )
    for selfies, smiles in ring_cases + branch_cases:
        assert decode_eq(selfies, smiles)
|
|
46 |
|
|
|
47 |
|
|
|
48 |
def test_branch_at_end_of_selfies():
    """Decode SELFIES whose very last symbol is a branch symbol.
    """

    expected = "CCCC"
    for selfies in ("[C][C][C][C][Branch1]", "[C][C][C][C][#Branch3]"):
        assert decode_eq(selfies, expected)
|
|
54 |
|
|
|
55 |
|
|
|
56 |
def test_ring_at_end_of_selfies():
    """Decode SELFIES whose very last symbol is a ring symbol.
    """

    expected = "CCCC=C"
    for selfies in ("[C][C][C][C][C][Ring1]", "[C][C][C][C][C][Ring3]"):
        assert decode_eq(selfies, expected)
|
|
62 |
|
|
|
63 |
|
|
|
64 |
def test_branch_with_no_atoms():
    """Decode SELFIES containing a branch that holds no atoms.

    Such branches should not appear in the outputted SMILES.
    """

    cases = (
        ("[C][Branch1][Ring2][Branch1][Branch1][Branch1][F]", "CF"),
        ("[C][Branch1][Ring2][Ring1][Ring1][Branch1][F]", "CF"),
        ("[C][=Branch1][Ring2][Branch1][C][Cl][F]", "C(Cl)F"),
        # special case: #Branch3 takes Q_1, Q_2 = [O] and Q_3 = ''. However,
        # there are no more symbols in the branch.
        ("[C][C][C][C][#Branch3][O][O]", "CCCC"),
    )
    for selfies, smiles in cases:
        assert decode_eq(selfies, smiles)
|
|
81 |
|
|
|
82 |
|
|
|
83 |
def test_oversized_branch():
    """Decode SELFIES with a branch whose Q is larger than the length of
    the SELFIES.
    """

    cases = (
        ("[C][Branch2][O][O][C][C][S][F][C]", "CCCSF"),
        ("[C][#Branch2][O][O][#C][C][S][F]", "C#CCSF"),
    )
    for selfies, smiles in cases:
        assert decode_eq(selfies, smiles)
|
|
90 |
|
|
|
91 |
|
|
|
92 |
def test_oversized_ring():
    """Decode SELFIES with a ring whose Q is so large that the (Q + 1)-th
    previously derived atom does not exist.
    """

    cases = (
        ("[C][C][C][C][Ring1][O]", "C1CCC1"),
        ("[C][C][C][C][Ring2][O][C]", "C1CCC1"),
        # special case: Ring2 takes Q_1 = [O] and Q_2 = '', leading to
        # Q = 9 * 16 + 0 (i.e. an oversized ring)
        ("[C][C][C][C][Ring2][O]", "C1CCC1"),
        # special case: ring between 1st atom and 1st atom should not
        # be formed
        ("[C][Ring1][O]", "C"),
    )
    for selfies, smiles in cases:
        assert decode_eq(selfies, smiles)
|
|
106 |
|
|
|
107 |
|
|
|
108 |
def test_branch_at_beginning_of_branch():
    """Decode SELFIES in which a branch starts immediately at the start of
    another branch.

    The comment above each case is the nested form noted in the original
    test.
    """

    cases = (
        # [C@]((Br)Cl)F
        (
            "[C@][=Branch1][Branch1][Branch1][C][Br][Cl][F]",
            "[C@](Br)(Cl)F",
        ),
        # [C@](((Br)Cl)I)F
        (
            "[C@][#Branch1][Branch2][=Branch1][Branch1][Branch1][C][Br][Cl][I][F]",
            "[C@](Br)(Cl)(I)F",
        ),
        # [C@]((Br)(Cl)I)F
        (
            "[C@][#Branch1][Branch2][Branch1][C][Br][Branch1][C][Cl][I][F]",
            "[C@](Br)(Cl)(I)F",
        ),
    )
    for selfies, smiles in cases:
        assert decode_eq(selfies, smiles)
|
|
123 |
|
|
|
124 |
|
|
|
125 |
def test_ring_at_beginning_of_branch():
    """Decode SELFIES in which a ring symbol sits immediately at the start
    of a branch.
    """

    cases = (
        # CC1CCC(1CCl)F
        (
            "[C][C][C][C][C][=Branch1][Branch1][Ring1][Ring2][C][Cl][F]",
            "CC1CCC1(CCl)F",
        ),
        # CC1CCS(Br)(1CCl)F
        (
            "[C][C][C][C][S][Branch1][C][Br]"
            "[=Branch1][Branch1][Ring1][Ring2][C][Cl][F]",
            "CC1CCS1(Br)(CCl)F",
        ),
    )
    for selfies, smiles in cases:
        assert decode_eq(selfies, smiles)
|
|
137 |
|
|
|
138 |
|
|
|
139 |
def test_branch_and_ring_at_beginning_of_branch():
    """Decode SELFIES in which both a branch and a ring sit immediately at
    the start of a branch.
    """

    expected = "CC1CCCS1(Br)(Cl)F"
    variants = (
        # CC1CCCS((Br)1Cl)F
        "[C][C][C][C][C][S][#Branch1][#Branch1][Branch1][C][Br]"
        "[Ring1][Branch1][Cl][F]",
        # CC1CCCS(1(Br)Cl)F
        "[C][C][C][C][C][S][#Branch1][#Branch1][Ring1][Branch1]"
        "[Branch1][C][Br][Cl][F]",
    )
    for selfies in variants:
        assert decode_eq(selfies, expected)
|
|
153 |
|
|
|
154 |
|
|
|
155 |
def test_ring_immediately_following_branch():
    """Decode SELFIES in which a ring symbol comes immediately after a
    branch.
    """

    cases = (
        # CCC1CCCC(OCO)1
        (
            "[C][C][C][C][C][C][C][Branch1][Ring2][O][C][O][Ring1][Branch1]",
            "CCC1CCCC1OCO",
        ),
        # CCC1CCCC(OCO)(F)1
        (
            "[C][C][C][C][C][C][C][Branch1][Ring2][O][C][O]"
            "[Branch1][C][F][Ring1][Branch1]",
            "CCC1CCCC1(OCO)F",
        ),
    )
    for selfies, smiles in cases:
        assert decode_eq(selfies, smiles)
|
|
167 |
|
|
|
168 |
|
|
|
169 |
def test_ring_after_branch():
    """Decode SELFIES in which a ring follows a branch, but not
    immediately after it.
    """

    cases = (
        # CCCCCCC1(OCO)1
        (
            "[C][C][C][C][C][C][C][Branch1][Ring2][O][C][O][C][Ring1][Branch1]",
            "CCCCCCC(OCO)=C",
        ),
        (
            "[C][C][C][C][C][C][C][Branch1][Ring2][O][C][O]"
            "[Branch1][C][F][C][C][Ring1][=Branch2]",
            "CCCCC1CC(OCO)(F)CC1",
        ),
    )
    for selfies, smiles in cases:
        assert decode_eq(selfies, smiles)
|
|
181 |
|
|
|
182 |
|
|
|
183 |
def test_ring_on_top_of_existing_bond():
    """Decode SELFIES with rings between two atoms that are already bonded
    in the main scaffold.
    """

    # C1C1, C1C=1, C1C#1, ...
    cases = (
        ("[C][C][Ring1][C]", "C=C"),
        ("[C][/C][Ring1][C]", "C=C"),
        ("[C][C][=Ring1][C]", "C#C"),
        ("[C][C][#Ring1][C]", "C#C"),
    )
    for selfies, smiles in cases:
        assert decode_eq(selfies, smiles)
|
|
193 |
|
|
|
194 |
|
|
|
195 |
def test_consecutive_rings():
    """Decode SELFIES that contain multiple consecutive ring symbols.

    The trailing comment on each case is the bond-order sum noted in the
    original test.
    """

    cases = (
        ("[C][C][C][C][Ring1][Ring2][Ring1][Ring2]", "C=1CCC=1"),  # 1 + 1
        (
            "[C][C][C][C][Ring1][Ring2][Ring1][Ring2][Ring1][Ring2]",
            "C#1CCC#1",
        ),  # 1 + 1 + 1
        ("[C][C][C][C][=Ring1][Ring2][Ring1][Ring2]", "C#1CCC#1"),  # 2 + 1
        ("[C][C][C][C][Ring1][Ring2][=Ring1][Ring2]", "C#1CCC#1"),  # 1 + 2
        # consecutive rings that exceed bond constraints
        ("[C][C][C][C][#Ring1][Ring2][=Ring1][Ring2]", "C#1CCC#1"),  # 3 + 2
        ("[C][C][C][C][=Ring1][Ring2][#Ring1][Ring2]", "C#1CCC#1"),  # 2 + 3
        ("[C][C][C][C][=Ring1][Ring2][=Ring1][Ring2]", "C#1CCC#1"),  # 2 + 2
        # consecutive rings with stereochemical single bond
        ("[C][C][C][C][\\/Ring1][Ring2]", "C\\1CCC/1"),
        ("[C][C][C][C][\\/Ring1][Ring2][Ring1][Ring2]", "C=1CCC=1"),
    )
    for selfies, smiles in cases:
        assert decode_eq(selfies, smiles)
|
|
227 |
|
|
|
228 |
|
|
|
229 |
def test_unconstrained_symbols():
    """Tests SELFIES with symbols that are not semantically constrained.

    The second half of the test installs modified semantic constraints.
    The argument-less ``sf.set_semantic_constraints()`` call that ends the
    test runs in a ``finally`` block, so a failing assertion cannot leave
    the modified constraints active for tests that run afterwards.
    """

    f_branch = "[Branch1][C][F]"
    s = "[Xe-2]" + (f_branch * 8)
    assert decode_eq(s, "[Xe-2](F)(F)(F)(F)(F)(F)(F)CF")

    # change default semantic constraints
    constraints = sf.get_semantic_constraints()
    constraints["?"] = 2
    sf.set_semantic_constraints(constraints)

    try:
        assert decode_eq(s, "[Xe-2](F)CF")
    finally:
        # always undo the module-level change, even when the assert fails
        sf.set_semantic_constraints()
|
|
245 |
|
|
|
246 |
|
|
|
247 |
def test_isotope_symbols():
    """SELFIES symbols carrying an isotope specification should be
    constrained properly.
    """

    cases = (
        (
            "[13C][Branch1][C][Cl][Branch1][C][F][Branch1][C][Br][Branch1][C][I]",
            "[13C](Cl)(F)(Br)CI",
        ),
        ("[C][36Cl][C]", "C[36Cl]"),
    )
    for selfies, smiles in cases:
        assert decode_eq(selfies, smiles)
|
|
256 |
|
|
|
257 |
|
|
|
258 |
def test_chiral_symbols():
    """SELFIES symbols carrying a chirality specification should be
    constrained properly.
    """

    cases = (
        (
            "[C@@][Branch1][C][Cl][Branch1][C][F][Branch1][C][Br][Branch1][C][I]",
            "[C@@](Cl)(F)(Br)CI",
        ),
        (
            "[C@H1][Branch1][C][Cl][Branch1][C][F][Branch1][C][Br]",
            "[C@H1](Cl)(F)CBr",
        ),
    )
    for selfies, smiles in cases:
        assert decode_eq(selfies, smiles)
|
|
268 |
|
|
|
269 |
|
|
|
270 |
def test_explicit_hydrogen_symbols():
    """SELFIES symbols carrying an explicit hydrogen specification should
    be constrained properly.

    The last two inputs are expected to make ``sf.decoder`` raise
    ``sf.DecoderError``.
    """

    decodable = (
        ("[CH1][Branch1][C][Cl][#C]", "[CH1](Cl)=C"),
        ("[CH3][=C]", "[CH3]C"),
        ("[CH4][C][C]", "[CH4]"),
        ("[C][C][C][CH4]", "CCC"),
        ("[C][Branch1][Ring2][C][=CH4][C][=C]", "C(C)=C"),
    )
    for selfies, smiles in decodable:
        assert decode_eq(selfies, smiles)

    rejected = ("[C][C][CH5]", "[C][C][C][OH9]")
    for selfies in rejected:
        with pytest.raises(sf.DecoderError):
            sf.decoder(selfies)
|
|
286 |
|
|
|
287 |
|
|
|
288 |
def test_charged_symbols():
    """Tests that SELFIES symbols with charges are constrained properly.

    Custom constraints for ``Sn+4`` and ``O-2`` are installed for the
    duration of the test. The argument-less
    ``sf.set_semantic_constraints()`` call that ends the test runs in a
    ``finally`` block, so a failing assertion cannot leave the custom
    constraints active for tests that run afterwards.
    """

    constraints = sf.get_semantic_constraints()
    constraints["Sn+4"] = 1
    constraints["O-2"] = 2
    sf.set_semantic_constraints(constraints)

    try:
        # the following molecules don't make sense, but we use them to test
        # selfies. Hence, we can't verify them with RDKit
        assert decode_eq("[Sn+4][=C]", "[Sn+4]C")
        assert decode_eq("[O-2][#C]", "[O-2]=C")

        # mixing many symbol types
        assert decode_eq("[17O@@H1-2][#C]", "[17O@@H1-2]C")
    finally:
        # always undo the module-level change, even when an assert fails
        sf.set_semantic_constraints()
|
|
306 |
|
|
|
307 |
|
|
|
308 |
def test_standardized_alphabet():
    """Equivalent SMILES atom symbols should be translated into the same
    SELFIES atom symbol by ``sf.encoder``.
    """

    cases = (
        ("[C][O][N][P][F]", "[CH0][OH0][NH0][PH0][FH0]"),
        ("[Fe][Si]", "[Fe][Si]"),
        ("[Fe++][Fe+2]", "[Fe+2][Fe+2]"),
        ("[CH][CH1]", "[CH1][CH1]"),
    )
    for smiles, selfies in cases:
        assert sf.encoder(smiles) == selfies
|
|
317 |
|
|
|
318 |
|
|
|
319 |
def test_old_symbols():
    """Tests backward compatibility of SELFIES with old (<v2) symbols.

    The first two cases pin the exact SMILES produced by
    ``sf.decoder(..., compatible=True)``. The last case only requires that
    a long old-style SELFIES decodes without raising.
    """

    s = "[C@@Hexpl][Branch1_2][Branch1_1][Branch1_1][C][C][Cl][F]"
    assert sf.decoder(s, compatible=True) == "[C@@H1](C)(Cl)F"

    s = "[C][C][C][C][Expl=Ring1][Ring2][Expl#Ring1][Ring2]"
    assert sf.decoder(s, compatible=True) == "C#1CCC#1"

    long_s = (
        "[C@@Hexpl][=C][C@@Hexpl][N+expl][=C][C+expl][N+expl][O+expl]"
        "[Fe++expl][C@@Hexpl][C][N+expl][Branch1_2][Fe++expl][S+expl]"
        "[=C][Expl=Ring1][Fe++expl][S+expl][Expl=Ring1][O+expl]"
        "[C@@Hexpl][Expl=Ring1][C@@Hexpl][C@@Hexpl][N+expl][Expl=Ring1]"
        "[Expl=Ring1][S+expl][=C]"
    )
    # Any exception propagates and fails the test with its own traceback,
    # instead of being replaced by a bare ``assert False``.
    sf.decoder(long_s, compatible=True)
|
|
338 |
|
|
|
339 |
|
|
|
340 |
def test_large_selfies_decoding():
    """Decode an extremely large SELFIES string (this used to cause a
    RecursionError).
    """

    atom_count = 1024
    assert decode_eq("[C]" * atom_count, "C" * atom_count)
|
|
348 |
|
|
|
349 |
|
|
|
350 |
def test_radical_kekulization():
    """Round-trip aromatic systems with radicals and charges and check the
    kekulized SMILES that comes back.
    """

    cases = (
        ("c1ccc[c]c1", "C1=CC=C[CH0]=C1"),
        ("c1[c]n1(C)", "C1=[CH0]N1C"),
        ("c1[C][n+]1(C)", "C=1[CH0][N+1]=1C"),
        ("c1nnn[n-]1", "C1=NN=N[N-1]1"),
        ("c1ccn[c-](C)[n+]1=O", "C1=CC=N[C-1](C)[N+1]1=O"),
        ("c1ccs[n+]1c2ccccc2", "C=1C=CS[N+1]=1C2=CC=CC=C2"),
        ("c1ccs[nH+]1", "C=1C=CS[NH1+1]=1"),
    )
    for smiles_in, smiles_out in cases:
        assert roundtrip_eq(smiles_in, smiles_out)
|
|
361 |
|
|
|
362 |
|
|
|
363 |
def test_novel_charged_symbols():
    """Decode SELFIES under the updated constraints for charged atoms
    (update in 2.2.0).
    """
    cases = (
        ("[N][#C+1][#NH1][#C@H1]", "N#[C+1]"),
        ("[O+1][=P+1][#P-1][#C@@]", "[O+1]=[P+1]=[P-1]#[C@@]"),
        ("[=C-1][#S+1][#B]", "[C-1]#[S+1]=B"),
    )
    for selfies, smiles in cases:
        assert decode_eq(selfies, smiles)
|
|
368 |
|