Switch to unified view

a b/tests/test_fingerprints.py
1
import os
2
import sys
3
from itertools import combinations
4
5
import numpy as np
6
from scipy.sparse import vstack as sparse_vstack
7
from numpy.testing import (assert_array_equal,
8
                           assert_array_almost_equal,
9
                           assert_almost_equal)
10
11
import pytest
12
13
import oddt
14
from oddt.fingerprints import (InteractionFingerprint,
15
                               SimpleInteractionFingerprint,
16
                               ECFP,
17
                               _ECFP_atom_repr,
18
                               SPLIF,
19
                               similarity_SPLIF,
20
                               PLEC,
21
                               fold,
22
                               MIN_HASH_VALUE,
23
                               MAX_HASH_VALUE,
24
                               sparse_to_dense,
25
                               sparse_to_csr_matrix,
26
                               csr_matrix_to_sparse,
27
                               dense_to_sparse,
28
                               get_molecular_shingles,
29
                               hash_fnv1a_python,
30
                               dice,
31
                               tanimoto)
32
from .utils import shuffle_mol
33
34
35
# Directory containing this test module; fixture paths below are joined to it.
test_data_dir = os.path.dirname(os.path.abspath(__file__))

# Shared 10gs fixtures used by the interaction-fingerprint tests: the first
# record of each file is taken and polar hydrogens are added.
_pocket_path = os.path.join(test_data_dir,
                            'data/pdbbind/10gs/10gs_pocket.pdb')
protein = next(oddt.toolkit.readfile('pdb', _pocket_path))
protein.protein = True
protein.addh(only_polar=True)

_ligand_path = os.path.join(test_data_dir,
                            'data/pdbbind/10gs/10gs_ligand.sdf')
ligand = next(oddt.toolkit.readfile('sdf', _ligand_path))
ligand.addh(only_polar=True)
45
46
47
def test_folding():
    """Check fold() at both ends of the hash range and across the range."""
    half_range = MAX_HASH_VALUE / 2

    # Upper bound: the largest hash is expected in the last bin (size - 1)
    upper_cases = [(1024, 1023),
                   (1234567890, 1234567889),
                   (half_range, half_range - 1),
                   (MAX_HASH_VALUE - 1, MAX_HASH_VALUE - 2)]
    for size, last_bin in upper_cases:
        assert_array_equal(fold([MAX_HASH_VALUE], size), [last_bin])

    # Lower bound: the smallest hash is expected in bin 0 for every size
    for size in (1024, 1234567890, half_range, MAX_HASH_VALUE - 1):
        assert_array_equal(fold([MIN_HASH_VALUE], size), [0])

    # Range check: folding to the full range must shift each value down by one
    hashes = np.arange(1, MAX_HASH_VALUE, 1e6, dtype=int)
    assert_array_equal(fold(hashes, MAX_HASH_VALUE), hashes - 1)
65
66
# NOTE: sys.version_info is a 5-element tuple, e.g. (3, 7, 4, 'final', 0), and a
# longer tuple with an equal prefix compares greater, so `> (3, 7)` is true for
# every 3.7.x release and would skip the test there too. Compare against
# (3, 8) so that all of 3.7 is still exercised, as the docstring promises.
@pytest.mark.skipif(sys.version_info >= (3, 8), reason="Only testable with old Python Hash implementation")
def test_hashing_function():
    """Verify the implementation of Python 2.4-3.7 hash function in Python

    Builds 5-element tuples from the integers -10..9 plus two nested tuples
    and checks that ``hash_fnv1a_python`` returns the same value as the
    built-in ``hash`` for each of them.
    """
    sample_list = list(range(-10, 10))
    # add nested structure (the second append nests the first nested tuple)
    sample_list.append(tuple(sample_list))
    sample_list.append(tuple(sample_list))
    for sample_tuple in combinations(sample_list, r=5):
        python_hash = hash(sample_tuple)
        custom_hash = hash_fnv1a_python(sample_tuple)
        assert python_hash == custom_hash
77
78
79
def test_sparse_densify():
    """Round-trip fingerprints between sparse, dense and CSR representations.

    Covers count vectors (duplicates kept), boolean vectors (duplicates
    collapsed), stacking of several fingerprints, and the ``ValueError``
    expected when ``csr_matrix_to_sparse`` is given a plain ndarray.
    """
    sparse_fp = [0, 33, 49, 53, 107, 156, 161, 203, 215, 230, 251, 269, 299,
                 323, 331, 376, 389, 410, 427, 430, 450, 484, 538, 592, 593,
                 636, 646, 658, 698, 699, 702, 741, 753, 807, 850, 861, 882,
                 915, 915, 915, 969, 969, 1023]

    # count vectors
    dense = sparse_to_dense(sparse_fp, size=1024, count_bits=True)
    csr = sparse_to_csr_matrix(sparse_fp, size=1024, count_bits=True)
    assert_array_equal(dense.reshape(1, -1), csr.toarray())
    resparsed = dense_to_sparse(dense)
    resparsed_csr = csr_matrix_to_sparse(csr)
    assert_array_equal(sparse_fp, resparsed)
    assert_array_equal(sparse_fp, resparsed_csr)

    # bool vectors
    dense = sparse_to_dense(sparse_fp, size=1024, count_bits=False)
    csr = sparse_to_csr_matrix(sparse_fp, size=1024, count_bits=False)
    assert_array_equal(dense.reshape(1, -1), csr.toarray())
    resparsed = dense_to_sparse(dense)
    resparsed_csr = csr_matrix_to_sparse(csr)
    assert_array_equal(np.unique(sparse_fp), resparsed)
    assert_array_equal(np.unique(sparse_fp), resparsed_csr)

    # test stacking
    np.random.seed(0)
    sparse_fps = np.random.randint(0, 1024, size=(20, 100))
    dense = np.vstack([sparse_to_dense(fp, size=1024) for fp in sparse_fps])
    # scipy.sparse.vstack is documented to take a *sequence* of blocks; a bare
    # generator is not a sequence and is not accepted by every SciPy release,
    # so the blocks are materialised into a list first.
    csr = sparse_vstack([sparse_to_csr_matrix(fp, size=1024)
                         for fp in sparse_fps])
    assert_array_equal(dense, csr.toarray())

    # test exceptions
    with pytest.raises(ValueError):
        csr_matrix_to_sparse(np.array([1, 2, 3]))
114
115
116
def test_InteractionFingerprint():
    """Compare InteractionFingerprint of the 10gs complex with stored values.

    The expected vectors for the 'ob' backend and for any other backend share
    the same leading part and differ only in the trailing part, so the common
    head is written once and the backend-specific tail is appended to it.
    """
    # Leading part of the expected vector (identical for both branches)
    common_head = [
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    ]

    if oddt.toolkit.backend == 'ob':
        backend_tail = [
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        ]
    else:
        backend_tail = [
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        ]

    expected = common_head + backend_tail
    assert_array_equal(expected, InteractionFingerprint(ligand, protein))
169
170
171
def test_SimpleInteractionFingerprint():
    """Compare SimpleInteractionFingerprint of the 10gs complex with stored values."""
    computed = SimpleInteractionFingerprint(ligand, protein)
    expected = [
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0,
    ]
    assert_array_equal(expected, computed)
182
183
184
def test_IFP_SIFP_Folding_cum_sum():
    """Check that both interaction fingerprints sum (along axis 0) to the same value."""
    full_fp = InteractionFingerprint(ligand, protein)
    full_total = np.sum(full_fp, axis=0)

    simple_fp = SimpleInteractionFingerprint(ligand, protein)
    simple_total = np.sum(simple_fp, axis=0)

    assert_array_equal(full_total, simple_total)
189
190
191
def test_similarity():
    """Dice and Tanimoto similarity of SimpleInteractionFingerprint vectors.

    The first pose titled '312335' is the reference; every other pose with
    that title is compared against it and checked against stored values.
    """
    poses_path = os.path.join(test_data_dir,
                              'data/dude/xiap/actives_docked.sdf')
    all_poses = list(oddt.toolkit.readfile('sdf', poses_path))
    mols = [pose for pose in all_poses if pose.title == '312335']
    for pose in mols:
        pose.addh(only_polar=True)

    receptor_path = os.path.join(test_data_dir,
                                 'data/dude/xiap/receptor_rdkit.pdb')
    receptor = next(oddt.toolkit.readfile('pdb', receptor_path))
    receptor.protein = True
    receptor.addh(only_polar=True)

    ref = SimpleInteractionFingerprint(mols[0], receptor)

    dice_target = np.array([0.742857, 0.645161, 0.727273, 0.571429,
                            0.727273, 0.588235, 0.75, 0.551724,
                            0.551724, 0.6875, 0.514286, 0.6875,
                            0.592593, 0.647059, 0.736842, 0.62069,
                            0.545455, 0.533333, 0.606061])
    tanimoto_target = np.array([0.636364, 0.5, 0.666667, 0.384615, 0.666667,
                                0.545455, 0.666667, 0.5, 0.363636, 0.666667,
                                0.555556, 0.555556, 0.625, 0.6, 0.727273,
                                0.555556, 0.5, 0.4, 0.363636])

    # Same procedure for both metrics: score every non-reference pose
    for metric, target_outcome in ((dice, dice_target),
                                   (tanimoto, tanimoto_target)):
        outcome = []
        for mol in mols[1:]:
            candidate = SimpleInteractionFingerprint(mol, receptor)
            outcome.append(metric(ref, candidate))
        assert_array_almost_equal(outcome, target_outcome)
219
220
221
def test_sparse_similarity():
    """Sparse and dense ECFP similarities must agree; empty inputs score 0."""
    smiles_pair = ("CC1=C(C(=CC=C1)C)NC(=O)CN2CCN(CC2)CC(=O)N3CCC4=C(C3)C=CS4",
                   "CC1=C(C(=CC=C1)O)NC(=O)CN2CCN(CC2)CC(=O)N3CCC4=C(C3)C=CS4")
    mol1, mol2 = [oddt.toolkit.readstring("smi", smi) for smi in smiles_pair]

    ecfp_kwargs = {'depth': 8, 'size': 4096}
    mol1_fp_dense = ECFP(mol1, sparse=False, **ecfp_kwargs)
    mol2_fp_dense = ECFP(mol2, sparse=False, **ecfp_kwargs)

    mol1_fp_sparse = ECFP(mol1, sparse=True, **ecfp_kwargs)
    mol2_fp_sparse = ECFP(mol2, sparse=True, **ecfp_kwargs)

    for metric in (dice, tanimoto):
        sparse_score = metric(mol1_fp_sparse, mol2_fp_sparse, sparse=True)
        dense_score = metric(mol1_fp_dense, mol2_fp_dense)
        assert_almost_equal(sparse_score, dense_score)
        # Degenerate inputs: nothing set in either fingerprint
        assert metric([], [], sparse=True) == 0.
        assert metric(np.zeros(10), np.zeros(10), sparse=False) == 0.
240
241
242
def test_ecfp_repr():
    """Pin the exact per-atom ECFP representation so changes are noticed."""
    mol = oddt.toolkit.readstring("smi", "CC1=C(C(=CC=C1)C)NC(=O)CN2CCN(CC2)CC(=O)N3CCC4=C(C3)C=CS4")

    # One 7-element tuple per atom, in atom order
    expected = [
        (6, 0, 1, 3, 0, 0, 0), (6, 0, 3, 0, 0, 1, 1), (6, 0, 3, 0, 0, 1, 1),
        (6, 0, 3, 0, 0, 1, 1), (6, 0, 2, 1, 0, 1, 1), (6, 0, 2, 1, 0, 1, 1),
        (6, 0, 2, 1, 0, 1, 1), (6, 0, 1, 3, 0, 0, 0), (7, 0, 2, 1, 0, 0, 0),
        (6, 0, 3, 0, 0, 0, 0), (8, 0, 1, 0, 0, 0, 0), (6, 0, 2, 2, 0, 0, 0),
        (7, 0, 3, 0, 0, 1, 0), (6, 0, 2, 2, 0, 1, 0), (6, 0, 2, 2, 0, 1, 0),
        (7, 0, 3, 0, 0, 1, 0), (6, 0, 2, 2, 0, 1, 0), (6, 0, 2, 2, 0, 1, 0),
        (6, 0, 2, 2, 0, 0, 0), (6, 0, 3, 0, 0, 0, 0), (8, 0, 1, 0, 0, 0, 0),
        (7, 0, 3, 0, 0, 1, 0), (6, 0, 2, 2, 0, 1, 0), (6, 0, 2, 2, 0, 1, 0),
        (6, 0, 3, 0, 0, 1, 1), (6, 0, 3, 0, 0, 1, 1), (6, 0, 2, 2, 0, 1, 0),
        (6, 0, 2, 1, 0, 1, 1), (6, 0, 2, 1, 0, 1, 1), (16, 0, 2, 0, 0, 1, 1),
    ]

    atom_count = len(mol.atoms)
    computed = [_ECFP_atom_repr(mol, idx) for idx in range(atom_count)]
    assert_array_equal(computed, expected)
256
257
258
def test_ecfp():
    """Dense ECFP bits and similarities, with and without explicit hydrogens."""
    mol1 = oddt.toolkit.readstring("smi", "CC1=C(C(=CC=C1)C)NC(=O)CN2CCN(CC2)CC(=O)N3CCC4=C(C3)C=CS4")
    mol2 = oddt.toolkit.readstring("smi", "CC1=C(C(=CC=C1)O)NC(=O)CN2CCN(CC2)CC(=O)N3CCC4=C(C3)C=CS4")

    # Expected indices of the set bits for mol1 and mol2 respectively
    ref1 = [2, 100, 176, 185, 200, 203, 359, 382, 447, 509, 518, 550, 572, 583,
            598, 606, 607, 684, 818, 821, 832, 861, 960, 992, 1006, 1019, 1042,
            1050, 1059, 1103, 1175, 1281, 1315, 1377, 1431, 1470, 1479, 1512,
            1577, 1588, 1598, 1620, 1633, 1647, 1663, 1723, 1749, 1751, 1775,
            1781, 1821, 1837, 1899, 1963, 1969, 1986, 2013, 2253, 2343, 2355,
            2368, 2435, 2547, 2654, 2657, 2702, 2722, 2725, 2803, 2816, 2853,
            2870, 2920, 2992, 3028, 3056, 3074, 3103, 3190, 3203, 3277, 3321,
            3362, 3377, 3383, 3401, 3512, 3546, 3552, 3585, 3593, 3617, 3674,
            3759, 3784, 3790, 3832, 3895, 3937, 3956, 3974, 4007, 4033]

    ref2 = [43, 100, 176, 200, 203, 231, 382, 396, 447, 490, 518, 583, 606,
            607, 650, 818, 821, 832, 840, 861, 907, 950, 960, 992, 1006, 1013,
            1019, 1042, 1050, 1059, 1103, 1104, 1112, 1175, 1281, 1293, 1315,
            1377, 1431, 1470, 1512, 1543, 1577, 1588, 1598, 1633, 1647, 1663,
            1723, 1749, 1751, 1757, 1759, 1775, 1781, 1821, 1837, 1880, 1963,
            1969, 1986, 2253, 2355, 2368, 2435, 2544, 2547, 2654, 2702, 2722,
            2725, 2726, 2799, 2816, 2853, 2870, 2920, 2992, 3028, 3074, 3190,
            3203, 3277, 3290, 3333, 3362, 3383, 3401, 3512, 3546, 3552, 3585,
            3593, 3617, 3640, 3660, 3674, 3759, 3784, 3790, 3805, 3832, 3856,
            3895, 3924, 3956, 3974, 3992, 4007, 4033]

    def check_fingerprints():
        # Recompute both fingerprints from the current molecule state and
        # compare the set bits and both similarity scores with the references.
        mol1_fp = ECFP(mol1, depth=8, size=4096, sparse=False)
        mol2_fp = ECFP(mol2, depth=8, size=4096, sparse=False)

        assert_array_equal(ref1, np.where(mol1_fp)[0])
        assert_array_equal(ref2, np.where(mol2_fp)[0])

        assert_almost_equal(dice(mol1_fp, mol2_fp), 0.69999999)
        assert_almost_equal(tanimoto(mol1_fp, mol2_fp), 0.63846153)

    check_fingerprints()

    # adding Hs should not change anything
    mol1.addh()
    mol2.addh()
    check_fingerprints()

    # removing Hs should not change anything
    mol1.removeh()
    mol2.removeh()
    check_fingerprints()
318
319
320
def test_fcfp():
    """FCFP fingerprints

    Computes dense ECFP fingerprints with ``use_pharm_features=True`` for two
    molecules, compares the set bits and the Dice/Tanimoto scores with stored
    references, then repeats the comparison after adding hydrogens.
    """
    mol1 = oddt.toolkit.readstring("smi", "CC1=C(C(=CC=C1)C)NC(=O)CN2CCN(CC2)CC(=O)N3CCC4=C(C3)C=CS4")
    mol2 = oddt.toolkit.readstring("smi", "CC1=C(C(=CC=C1)O)NC(=O)CN2CCN(CC2)CC(=O)N3CCC4=C(C3)C=CS4")

    mol1_fp = ECFP(mol1, depth=8, size=4096,
                   sparse=False, use_pharm_features=True)
    mol2_fp = ECFP(mol2, depth=8, size=4096,
                   sparse=False, use_pharm_features=True)

    ref1 = [46, 111, 305, 310, 362, 384, 409, 451, 467, 548, 572, 595, 607,
            608, 620, 659, 691, 699, 724, 743, 752, 842, 926, 935, 974, 1037,
            1072, 1094, 1135, 1143, 1161, 1172, 1313, 1325, 1368, 1399, 1461,
            1486, 1488, 1492, 1603, 1619, 1648, 1665, 1666, 1838, 1887, 1900,
            1948, 1961, 1972, 1975, 1996, 2000, 2052, 2085, 2094, 2174, 2232,
            2236, 2368, 2382, 2383, 2402, 2483, 2492, 2527, 2593, 2616, 2706,
            2789, 2899, 2922, 2945, 2966, 3102, 3117, 3176, 3189, 3215, 3225,
            3297, 3326, 3349, 3373, 3513, 3525, 3535, 3601, 3619, 3780, 3820,
            3897, 3919, 3976, 3981, 4050, 4079, 4091]

    ref2 = [46, 111, 143, 172, 259, 305, 362, 409, 451, 467, 507, 518, 548,
            583, 595, 607, 608, 620, 639, 691, 693, 724, 752, 784, 825, 842,
            926, 1037, 1087, 1094, 1098, 1135, 1143, 1161, 1172, 1286, 1325,
            1368, 1371, 1395, 1399, 1461, 1486, 1488, 1492, 1565, 1619, 1648,
            1655, 1665, 1887, 1890, 1900, 1948, 1961, 1968, 1972, 1975, 1976,
            1996, 2000, 2007, 2094, 2125, 2174, 2232, 2236, 2368, 2382, 2383,
            2483, 2492, 2571, 2593, 2606, 2638, 2706, 2789, 2922, 2945, 2966,
            2986, 3030, 3100, 3102, 3117, 3227, 3326, 3350, 3373, 3406, 3419,
            3535, 3577, 3619, 3697, 3742, 3820, 3839, 3919, 3981, 4043, 4050,
            4079, 4091]

    assert_array_equal(ref1, np.where(mol1_fp)[0])
    assert_array_equal(ref2, np.where(mol2_fp)[0])

    assert_almost_equal(dice(mol1_fp, mol2_fp), 0.64074074)
    assert_almost_equal(tanimoto(mol1_fp, mol2_fp), 0.5)

    # adding Hs should not change anything
    mol1.addh()
    mol2.addh()

    # The fingerprints must be recomputed from the hydrogenated molecules;
    # re-checking the arrays computed before addh() would pass trivially and
    # would not exercise the invariance this section is meant to verify.
    mol1_fp = ECFP(mol1, depth=8, size=4096,
                   sparse=False, use_pharm_features=True)
    mol2_fp = ECFP(mol2, depth=8, size=4096,
                   sparse=False, use_pharm_features=True)

    assert_array_equal(ref1, np.where(mol1_fp)[0])
    assert_array_equal(ref2, np.where(mol2_fp)[0])

    assert_almost_equal(dice(mol1_fp, mol2_fp), 0.64074074)
    assert_almost_equal(tanimoto(mol1_fp, mol2_fp), 0.5)
366
367
368
def test_ecfp_invaraiants():
    """ECFP must stay the same after the molecule is passed through shuffle_mol."""
    sildenafil = oddt.toolkit.readstring("smi", "CCCc1nn(C)c2c(=O)[nH]c(-c3cc(S(=O)(=O)N4CCN(C)CC4)ccc3OCC)nc12")

    baseline = ECFP(sildenafil, depth=4, size=4096, sparse=True)

    # Each round shuffles the previous round's molecule again
    for _ in range(10):
        sildenafil = shuffle_mol(sildenafil)
        reshuffled = ECFP(sildenafil, depth=4, size=4096, sparse=True)
        assert_array_equal(baseline, reshuffled)
378
379
380
def test_splif():
    """SPLIF hashes and coordinate array shapes for the first '312335' pose."""
    poses_path = os.path.join(test_data_dir,
                              'data/dude/xiap/actives_docked.sdf')
    all_poses = list(oddt.toolkit.readfile('sdf', poses_path))
    mols = [pose for pose in all_poses if pose.title == '312335']
    for pose in mols:
        pose.addh(only_polar=True)

    receptor_path = os.path.join(test_data_dir,
                                 'data/dude/xiap/receptor_rdkit.pdb')
    receptor = next(oddt.toolkit.readfile('pdb', receptor_path))
    receptor.protein = True
    receptor.addh(only_polar=True)

    splif = SPLIF(mols[0], receptor)

    expected_hashes = [
        6, 38, 49, 53, 53, 53, 70, 70, 81, 81, 81, 81, 165, 216, 219,
        249, 330, 330, 333, 377, 380, 396, 396, 396, 423, 423, 479,
        479, 498, 498, 498, 570, 592, 625, 638, 768, 768, 817, 818,
        818, 818, 818, 858, 884, 888, 907, 930, 934, 935, 971, 1023,
        1041, 1115, 1142, 1184, 1184, 1252, 1263, 1269, 1275, 1275,
        1275, 1315, 1315, 1315, 1337, 1337, 1344, 1351, 1396, 1435,
        1465, 1502, 1502, 1502, 1502, 1569, 1569, 1569, 1569, 1569,
        1569, 1569, 1569, 1640, 1645, 1660, 1660, 1697, 1697, 1716,
        1746, 1756, 1778, 1901, 1937, 1997, 2000, 2000, 2000, 2007,
        2007, 2020, 2070, 2195, 2274, 2294, 2319, 2415, 2417, 2509,
        2528, 2578, 2578, 2584, 2590, 2590, 2624, 2636, 2678, 2678,
        2678, 2678, 2678, 2776, 2776, 2789, 2862, 2862, 2894, 2894,
        2894, 2923, 2923, 3058, 3073, 3073, 3073, 3073, 3137, 3159,
        3159, 3159, 3186, 3218, 3218, 3279, 3279, 3281, 3338, 3358,
        3360, 3368, 3387, 3609, 3636, 3636, 3713, 3713, 3716, 3716,
        3748, 3767, 3769, 3854, 3871, 3912, 3968, 3986, 3994, 3994,
        4069,
    ]

    assert splif['hash'].shape == (172,)
    for coords_key in ('ligand_coords', 'protein_coords'):
        assert_array_equal(splif[coords_key].shape, (172, 7, 3))
    assert_array_equal(expected_hashes, splif['hash'])
413
414
415
def test_splif_similarity():
    """SPLIF similarity against stored values, plus a symmetry check."""
    poses_path = os.path.join(test_data_dir,
                              'data/dude/xiap/actives_docked.sdf')
    all_poses = list(oddt.toolkit.readfile('sdf', poses_path))
    mols = [pose for pose in all_poses if pose.title == '312335']
    for pose in mols:
        pose.addh(only_polar=True)

    receptor_path = os.path.join(test_data_dir,
                                 'data/dude/xiap/receptor_rdkit.pdb')
    receptor = next(oddt.toolkit.readfile('pdb', receptor_path))
    receptor.protein = True
    receptor.addh(only_polar=True)

    ref = SPLIF(mols[0], receptor)
    splif_fps = [SPLIF(mol, receptor) for mol in mols]

    # The reference pose is part of splif_fps, hence the leading 1.000
    outcome = []
    for fp in splif_fps:
        outcome.append(similarity_SPLIF(ref, fp))
    target_outcome = np.array([1.000, 0.779, 0.660, 0.805, 0.630,
                               0.802, 0.366, 0.817, 0.378, 0.553,
                               0.732, 0.705, 0.856, 0.797, 0.502,
                               0.418, 0.653, 0.436, 0.708, 0.688])

    assert_array_almost_equal(outcome, target_outcome, decimal=3)

    # check if similarity is symmetric
    for first, second in combinations(splif_fps, 2):
        forward = similarity_SPLIF(first, second)
        backward = similarity_SPLIF(second, first)
        assert forward == backward
438
439
440
def test_plec():
    """PLEC of the first '312335' pose, without and with hydrogens."""
    poses_path = os.path.join(test_data_dir,
                              'data/dude/xiap/actives_docked.sdf')
    all_poses = list(oddt.toolkit.readfile('sdf', poses_path))
    mols = [pose for pose in all_poses if pose.title == '312335']
    for pose in mols:
        pose.removeh()

    receptor_path = os.path.join(test_data_dir,
                                 'data/dude/xiap/receptor_rdkit.pdb')
    receptor = next(oddt.toolkit.readfile('pdb', receptor_path))
    receptor.protein = True
    receptor.removeh()

    plec = PLEC(mols[0], receptor)

    expected_plec = [
        80, 119, 120, 120, 120, 120, 137, 138, 155, 155, 155, 155,
        155, 155, 155, 161, 199, 214, 214, 214, 226, 226, 233, 266,
        282, 283, 283, 313, 313, 386, 386, 430, 431, 431, 432, 448,
        581, 581, 643, 662, 684, 690, 729, 737, 741, 778, 778, 795,
        799, 799, 812, 812, 876, 877, 894, 907, 924, 924, 925, 925,
        935, 935, 935, 935, 935, 964, 964, 964, 993, 993, 996, 996,
        1002, 1002, 1042, 1042, 1066, 1066, 1077, 1113, 1119, 1224,
        1266, 1266, 1290, 1322, 1322, 1334, 1334, 1403, 1411, 1411,
        1461, 1475, 1480, 1497, 1521, 1584, 1584, 1614, 1618, 1618,
        1618, 1618, 1691, 1694, 1694, 1755, 1755, 1755, 1755, 1786,
        1835, 1835, 1867, 1953, 1953, 1953, 1953, 1963, 1970, 1970,
        1990, 1992, 1992, 1992, 2024, 2024, 2060, 2252, 2373, 2383,
        2383, 2390, 2390, 2451, 2537, 2538, 2552, 2555, 2558, 2640,
        2720, 2752, 2791, 2821, 2821, 2931, 2950, 2957, 2957, 2959,
        2961, 2961, 2961, 2963, 2970, 2970, 2982, 3034, 3049, 3066,
        3084, 3084, 3084, 3104, 3126, 3227, 3248, 3293, 3293, 3293,
        3420, 3439, 3517, 3539, 3546, 3546, 3546, 3546, 3553, 3559,
        3596, 3630, 3643, 3643, 3674, 3707, 3708, 3716, 3738, 3742,
        3828, 3846, 3859, 3876, 3887, 3904, 3904, 3904, 3916, 3916,
        3939, 3941, 3981, 3981, 3991, 3993, 4010, 4097, 4127, 4127,
        4127, 4127, 4165, 4181, 4192, 4316, 4330, 4372, 4391, 4461,
        4462, 4463, 4542, 4542, 4542, 4549, 4549, 4549, 4549, 4614,
        4615, 4657, 4668, 4670, 4686, 4686, 4686, 4688, 4688, 4688,
        4688, 4695, 4729, 4740, 4741, 4744, 4744, 4744, 4744, 4756,
        4814, 4828, 4828, 4861, 4861, 4861, 4861, 4861, 4861, 4861,
        4861, 4861, 4861, 4861, 4861, 4861, 4861, 4861, 4861, 4916,
        4945, 4945, 5011, 5037, 5042, 5044, 5046, 5055, 5078, 5080,
        5101, 5101, 5126, 5139, 5146, 5189, 5193, 5232, 5271, 5314,
        5321, 5350, 5379, 5439, 5439, 5439, 5439, 5481, 5482, 5535,
        5563, 5565, 5565, 5585, 5601, 5601, 5626, 5626, 5631, 5631,
        5631, 5631, 5631, 5631, 5639, 5670, 5688, 5690, 5742, 5804,
        5804, 5864, 5871, 5885, 5983, 5992, 6010, 6010, 6010, 6059,
        6059, 6096, 6164, 6183, 6183, 6197, 6234, 6256, 6261, 6261,
        6277, 6277, 6277, 6277, 6299, 6333, 6333, 6388, 6388, 6404,
        6428, 6428, 6428, 6428, 6431, 6431, 6445, 6449, 6450, 6480,
        6496, 6519, 6519, 6540, 6582, 6642, 6654, 6654, 6671, 6717,
        6722, 6735, 6735, 6735, 6764, 6764, 6781, 6781, 6781, 6781,
        6788, 6788, 6803, 6808, 6833, 6838, 6838, 6950, 6979, 6979,
        6997, 7069, 7115, 7194, 7250, 7254, 7277, 7288, 7352, 7464,
        7493, 7506, 7506, 7520, 7530, 7530, 7530, 7542, 7546, 7561,
        7608, 7678, 7678, 7685, 7701, 7701, 7701, 7752, 7752, 7752,
        7790, 7847, 7957, 7957, 7957, 7959, 8003, 8003, 8003, 8010,
        8083, 8086, 8086, 8086, 8086, 8113, 8116, 8160, 8190, 8230,
        8230, 8262, 8262, 8282, 8284, 8284, 8292, 8297, 8327, 8327,
        8383, 8383, 8383, 8418, 8418, 8426, 8457, 8484, 8484, 8543,
        8543, 8580, 8629, 8651, 8655, 8697, 8726, 8781, 8784, 8796,
        8837, 8850, 8923, 9034, 9040, 9077, 9077, 9099, 9134, 9180,
        9206, 9257, 9281, 9304, 9304, 9333, 9341, 9358, 9393, 9394,
        9432, 9450, 9450, 9455, 9455, 9481, 9493, 9493, 9505, 9537,
        9547, 9572, 9585, 9610, 9610, 9661, 9689, 9690, 9690, 9700,
        9700, 9733, 9736, 9736, 9736, 9736, 9765, 9784, 9885, 9885,
        9885, 9934, 9938, 9968, 9968, 10037, 10080, 10080, 10103,
        10113, 10113, 10114, 10115, 10115, 10115, 10139, 10139, 10139,
        10139, 10139, 10181, 10181, 10181, 10181, 10185, 10286, 10295,
        10317, 10317, 10340, 10340, 10340, 10340, 10352, 10353, 10364,
        10364, 10385, 10490, 10490, 10504, 10535, 10539, 10539, 10589,
        10589, 10591, 10599, 10648, 10648, 10650, 10650, 10681, 10703,
        10714, 10714, 10714, 10739, 10739, 10793, 10806, 10806, 10806,
        10837, 10865, 10865, 10871, 10903, 10978, 10978, 11056, 11056,
        11141, 11159, 11207, 11213, 11257, 11272, 11360, 11362, 11377,
        11454, 11454, 11458, 11458, 11458, 11539, 11563, 11580, 11580,
        11580, 11605, 11605, 11610, 11610, 11613, 11624, 11664, 11664,
        11683, 11683, 11697, 11698, 11701, 11707, 11753, 11835, 11846,
        11852, 11858, 11876, 11879, 11890, 11957, 11957, 12009, 12115,
        12130, 12151, 12222, 12268, 12290, 12290, 12295, 12295, 12320,
        12431, 12448, 12475, 12475, 12475, 12481, 12485, 12487, 12587,
        12632, 12632, 12634, 12641, 12641, 12641, 12664, 12761, 12761,
        12778, 12832, 12878, 12878, 12884, 12958, 12982, 12982, 12982,
        12982, 12992, 13057, 13079, 13121, 13129, 13200, 13200, 13277,
        13277, 13317, 13317, 13320, 13320, 13336, 13388, 13434, 13443,
        13475, 13495, 13517, 13517, 13553, 13602, 13637, 13655, 13658,
        13658, 13688, 13688, 13774, 13774, 13784, 13784, 13784, 13786,
        13791, 13791, 13809, 13839, 13839, 13839, 13839, 13839, 13876,
        13905, 13906, 13906, 13906, 13906, 13920, 13920, 13920, 13920,
        13920, 13949, 13949, 14058, 14122, 14122, 14133, 14133, 14198,
        14259, 14259, 14317, 14332, 14368, 14386, 14423, 14423, 14423,
        14423, 14423, 14423, 14423, 14439, 14440, 14447, 14464, 14464,
        14469, 14505, 14510, 14510, 14513, 14516, 14516, 14529, 14529,
        14529, 14549, 14563, 14563, 14570, 14570, 14570, 14582, 14605,
        14605, 14611, 14748, 14748, 14750, 14757, 14772, 14798, 14802,
        14810, 14854, 14857, 14857, 14878, 14878, 14903, 14903, 14993,
        14993, 14996, 15008, 15012, 15018, 15044, 15044, 15074, 15092,
        15092, 15146, 15146, 15191, 15251, 15251, 15253, 15258, 15311,
        15311, 15317, 15429, 15429, 15441, 15444, 15498, 15518, 15520,
        15622, 15622, 15622, 15651, 15672, 15712, 15715, 15798, 15798,
        15811, 15950, 15982, 15982, 15987, 16023, 16023, 16042, 16049,
        16054, 16080, 16099, 16119, 16119, 16119, 16174, 16174, 16213,
        16225, 16229, 16234, 16234, 16234, 16252, 16252, 16252, 16252,
        16252, 16320, 16328, 16362, 16362,
    ]

    assert_array_equal(expected_plec, plec)
    assert_array_equal(plec.shape, (860,))

    # Hydrogens should not impact the PLEC fingerprint
    for pose in mols:
        pose.addh(only_polar=True)
    receptor.addh(only_polar=True)
    plec = PLEC(mols[0], receptor)
    assert_array_equal(expected_plec, plec, "Polar Hs break PLEC")

    for pose in mols:
        pose.addh()
    receptor.addh()
    plec = PLEC(mols[0], receptor)
    assert_array_equal(expected_plec, plec, "Non-polar Hs break PLEC")
554
555
556
def test_plec_binded_hoh():
    """PLEC length for 3kwa with and without ignoring water (HOH)."""
    # if water coordinates metal in PDB and ligand is in contact with it, HOH
    # will pop up in metals environment, thus we cannot ignore HOHs in repr_dict

    backend = oddt.toolkit.backend
    toolkit_supported = (backend == 'ob' or
                         (backend == 'rdk' and
                          oddt.toolkit.__version__ >= '2017.03'))
    if not toolkit_supported:
        # Nothing is asserted for other toolkits / older versions
        return

    pdb_dir = os.path.join(test_data_dir, 'data', 'pdb')
    kwa_ligand = next(oddt.toolkit.readfile(
        'sdf', os.path.join(pdb_dir, '3kwa_ligand.sdf')))
    kwa_pocket = next(oddt.toolkit.readfile(
        'pdb', os.path.join(pdb_dir, '3kwa_5Apocket.pdb')))
    kwa_pocket.protein = True

    without_water = PLEC(kwa_ligand, kwa_pocket, ignore_hoh=True)
    assert len(without_water) == 465
    with_water = PLEC(kwa_ligand, kwa_pocket, ignore_hoh=False)
    assert len(with_water) == 560
571
572
573
def test_plec_similarity():
    """Dice similarity of PLEC fingerprints, sparse and dense, vs stored values."""
    poses_path = os.path.join(test_data_dir,
                              'data/dude/xiap/actives_docked.sdf')
    all_poses = list(oddt.toolkit.readfile('sdf', poses_path))
    mols = [pose for pose in all_poses if pose.title == '312335']
    for pose in mols:
        pose.addh(only_polar=True)

    receptor_path = os.path.join(test_data_dir,
                                 'data/dude/xiap/receptor_rdkit.pdb')
    receptor = next(oddt.toolkit.readfile('pdb', receptor_path))
    receptor.protein = True
    receptor.addh(only_polar=True)

    # Sparse fingerprints: first pose is the reference, the rest are scored
    reference_sparse = PLEC(mols[0], receptor)
    outcome_sparse = []
    for mol in mols[1:]:
        candidate = PLEC(mol, receptor)
        outcome_sparse.append(dice(reference_sparse, candidate, sparse=True))

    target_outcome = np.array([0.833,  0.729,  0.849,  0.785,  0.821,
                               0.604,  0.868,  0.656, 0.712,  0.652,
                               0.699,  0.785,  0.736,  0.745,  0.661,
                               0.667, 0.555,  0.616,  0.714])

    # Dense fingerprints: same comparison with sparse=False throughout
    reference_dense = PLEC(mols[0], receptor, sparse=False)
    outcome_dense = []
    for mol in mols[1:]:
        candidate = PLEC(mol, receptor, sparse=False)
        outcome_dense.append(dice(reference_dense, candidate, sparse=False))

    assert_array_almost_equal(outcome_sparse, target_outcome, decimal=2)
    assert_array_almost_equal(outcome_dense, target_outcome, decimal=2)
595
596
597
def test_molecular_shingles():
    """Sorted molecular shingles of sildenafil must survive shuffle_mol."""
    sildenafil = oddt.toolkit.readstring("smi", "CCCc1nn(C)c2c(=O)[nH]c(-c3cc(S(=O)(=O)N4CCN(C)CC4)ccc3OCC)nc12")

    # Expected shingles are stored separately for the 'ob' backend and for
    # any other backend.
    shingles_ob = [
        'CCC', 'CCCc', 'CCCc(c)n', 'CCN(C)CC', 'CCN(CC)S(=O)(=O)c', 'CCO', 'CCOc', 'CCOc(c)c', 'CCc1nncc1n', 'CN(C)C',
        'CN(C)S(=O)(=O)c(c)c', 'CN(S)CCN', 'CN(S)CCN', 'COc(cc)c(c)c', 'Cc1ccn(n1)C', 'Cn(c)n', 'Cn1ncc(c1c(=O)[nH])n',
        'Cn1nccc1c', 'NCCN(C)C', 'NCCN(C)C', 'cS(=O)(=O)N', 'cS(=O)(=O)N', 'c[nH]c(=O)c(c)n', 'cc(=O)[nH]',
        'cc([nH])nc(c)c', 'cc(c)cc(S)c', 'cc(n)[nH]c(=O)c', 'ccc(c(O)c)c(n)[nH]', 'ccc(cc)S(=O)(=O)N', 'cccc(O)c',
        'cccc(S)c', 'cnc([nH]c)c(c)c', 'cnc1c(C)nnc1c']
    shingles_other = [
        'CCC', 'CCN(C)CC', 'CCO', 'CCc1nncc1n', 'CN(C)C', 'CN(C)CCN', 'CN(C)CCN', 'CN(S)CCN', 'CN(S)CCN', 'Cc1ccn(C)n1',
        'Cn1ncc(n)c1c([nH])=O', 'c-c([nH])nc(c)c', 'c-c(c)c(cc)OC', 'c-c(c)cc(c)S', 'c-c(n)[nH]c(c)=O', 'cCCC', 'cOCC',
        'cS(=O)(=O)N(CC)CC', 'cS(N)(=O)=O', 'cS(N)(=O)=O', 'c[nH]c(=O)c(c)n', 'cc([nH])=O', 'cc(c)OCC',
        'cc(c)S(=O)(=O)N(C)C', 'cc(n)CCC', 'cc1ccnn1C', 'ccc(-c(n)[nH])c(c)O', 'ccc(cc)S(N)(=O)=O', 'cccc(c)O',
        'cccc(c)S', 'cn(C)n', 'cnc([nH]c)-c(c)c', 'cnc1c(C)nnc1c']

    target_shingles = (shingles_ob if oddt.toolkit.backend == 'ob'
                       else shingles_other)

    # Each round shuffles the previous round's molecule again
    for _ in range(10):
        sildenafil = shuffle_mol(sildenafil)
        found = sorted(get_molecular_shingles(sildenafil))
        assert_array_equal(found, target_shingles)