"""Regression tests for ``oddt.fingerprints``.

The reference values below were generated with known-good versions of the
toolkits and are compared verbatim, so any change in hashing, folding or
interaction detection is caught here.
"""
import os
import sys
from itertools import combinations

import numpy as np
from scipy.sparse import vstack as sparse_vstack
from numpy.testing import (assert_array_equal,
                           assert_array_almost_equal,
                           assert_almost_equal)

import pytest

import oddt
from oddt.fingerprints import (InteractionFingerprint,
                               SimpleInteractionFingerprint,
                               ECFP,
                               _ECFP_atom_repr,
                               SPLIF,
                               similarity_SPLIF,
                               PLEC,
                               fold,
                               MIN_HASH_VALUE,
                               MAX_HASH_VALUE,
                               sparse_to_dense,
                               sparse_to_csr_matrix,
                               csr_matrix_to_sparse,
                               dense_to_sparse,
                               get_molecular_shingles,
                               hash_fnv1a_python,
                               dice,
                               tanimoto)
from .utils import shuffle_mol


test_data_dir = os.path.dirname(os.path.abspath(__file__))

# Shared 10gs complex used by the interaction-fingerprint tests.
protein = next(oddt.toolkit.readfile('pdb', os.path.join(
    test_data_dir, 'data/pdbbind/10gs/10gs_pocket.pdb')))
protein.protein = True
protein.addh(only_polar=True)

ligand = next(oddt.toolkit.readfile('sdf', os.path.join(
    test_data_dir, 'data/pdbbind/10gs/10gs_ligand.sdf')))
ligand.addh(only_polar=True)

# SMILES shared by several ECFP/FCFP tests (the two differ by one C -> O).
_MOL1_SMILES = "CC1=C(C(=CC=C1)C)NC(=O)CN2CCN(CC2)CC(=O)N3CCC4=C(C3)C=CS4"
_MOL2_SMILES = "CC1=C(C(=CC=C1)O)NC(=O)CN2CCN(CC2)CC(=O)N3CCC4=C(C3)C=CS4"
_SILDENAFIL_SMILES = ("CCCc1nn(C)c2c(=O)[nH]c"
                      "(-c3cc(S(=O)(=O)N4CCN(C)CC4)ccc3OCC)nc12")


def _load_xiap_complex():
    """Read the XIAP docked poses titled '312335' and the receptor.

    Hydrogens are left untouched; each test adds or removes them as needed.

    Returns
    -------
    (mols, receptor) : tuple
        List of ligand poses and the receptor flagged as protein.
    """
    all_mols = list(oddt.toolkit.readfile('sdf', os.path.join(
        test_data_dir, 'data/dude/xiap/actives_docked.sdf')))
    mols = [mol for mol in all_mols if mol.title == '312335']
    receptor = next(oddt.toolkit.readfile('pdb', os.path.join(
        test_data_dir, 'data/dude/xiap/receptor_rdkit.pdb')))
    receptor.protein = True
    return mols, receptor


def _check_ecfp_pair(mol1, mol2, ref1, ref2, expected_dice, expected_tanimoto,
                     **ecfp_kwargs):
    """Compute dense depth-8/4096-bit ECFPs for both molecules and compare
    the on-bits and both similarity scores with the reference values."""
    mol1_fp = ECFP(mol1, depth=8, size=4096, sparse=False, **ecfp_kwargs)
    mol2_fp = ECFP(mol2, depth=8, size=4096, sparse=False, **ecfp_kwargs)

    assert_array_equal(ref1, np.where(mol1_fp)[0])
    assert_array_equal(ref2, np.where(mol2_fp)[0])

    assert_almost_equal(dice(mol1_fp, mol2_fp), expected_dice)
    assert_almost_equal(tanimoto(mol1_fp, mol2_fp), expected_tanimoto)


def test_folding():
    """FP Folding"""
    # Upper bound
    assert_array_equal(fold([MAX_HASH_VALUE], 1024), [1023])
    assert_array_equal(fold([MAX_HASH_VALUE], 1234567890), [1234567889])
    assert_array_equal(fold([MAX_HASH_VALUE], MAX_HASH_VALUE / 2),
                       [MAX_HASH_VALUE / 2 - 1])
    assert_array_equal(fold([MAX_HASH_VALUE], MAX_HASH_VALUE - 1),
                       [MAX_HASH_VALUE - 2])
    # Lower bound
    assert_array_equal(fold([MIN_HASH_VALUE], 1024), [0])
    assert_array_equal(fold([MIN_HASH_VALUE], 1234567890), [0])
    assert_array_equal(fold([MIN_HASH_VALUE], MAX_HASH_VALUE / 2), [0])
    assert_array_equal(fold([MIN_HASH_VALUE], MAX_HASH_VALUE - 1), [0])

    # Range check
    fp = np.arange(1, MAX_HASH_VALUE, 1e6, dtype=int)
    assert_array_equal(fold(fp, MAX_HASH_VALUE), fp - 1)


# NOTE: ``sys.version_info`` is a 5-tuple, so ``> (3, 7)`` is already true on
# every 3.7.x release; compare against (3, 8) to keep 3.7 covered.
@pytest.mark.skipif(sys.version_info >= (3, 8),
                    reason="Only testable with old Python Hash implementation")
def test_hashing_function():
    """Verify the implementation of Python 2.4-3.7 hash function in Python"""
    sample_list = list(range(-10, 10))
    # add nested structure
    sample_list.append(tuple(sample_list))
    sample_list.append(tuple(sample_list))
    for sample_tuple in combinations(sample_list, r=5):
        python_hash = hash(sample_tuple)
        custom_hash = hash_fnv1a_python(sample_tuple)
        assert python_hash == custom_hash


def test_sparse_densify():
    """FP densify"""
    sparse_fp = [0, 33, 49, 53, 107, 156, 161, 203, 215, 230, 251, 269, 299,
                 323, 331, 376, 389, 410, 427, 430, 450, 484, 538, 592, 593,
                 636, 646, 658, 698, 699, 702, 741, 753, 807, 850, 861, 882,
                 915, 915, 915, 969, 969, 1023]

    # count vectors
    dense = sparse_to_dense(sparse_fp, size=1024, count_bits=True)
    csr = sparse_to_csr_matrix(sparse_fp, size=1024, count_bits=True)
    assert_array_equal(dense.reshape(1, -1), csr.toarray())
    resparsed = dense_to_sparse(dense)
    resparsed_csr = csr_matrix_to_sparse(csr)
    assert_array_equal(sparse_fp, resparsed)
    assert_array_equal(sparse_fp, resparsed_csr)

    # bool vectors
    dense = sparse_to_dense(sparse_fp, size=1024, count_bits=False)
    csr = sparse_to_csr_matrix(sparse_fp, size=1024, count_bits=False)
    assert_array_equal(dense.reshape(1, -1), csr.toarray())
    resparsed = dense_to_sparse(dense)
    resparsed_csr = csr_matrix_to_sparse(csr)
    assert_array_equal(np.unique(sparse_fp), resparsed)
    assert_array_equal(np.unique(sparse_fp), resparsed_csr)

    # test stacking
    np.random.seed(0)
    sparse_fps = np.random.randint(0, 1024, size=(20, 100))
    dense = np.vstack([sparse_to_dense(fp, size=1024) for fp in sparse_fps])
    # scipy's vstack is documented to take a sequence of matrices; pass a
    # list rather than a generator.
    csr = sparse_vstack([sparse_to_csr_matrix(fp, size=1024)
                         for fp in sparse_fps])
    assert_array_equal(dense, csr.toarray())

    # test exceptions
    with pytest.raises(ValueError):
        csr_matrix_to_sparse(np.array([1, 2, 3]))


def test_InteractionFingerprint():
    """Interaction Fingerprint test"""
    # The two backends place the last interacting residue in different slots,
    # hence the separate reference vectors.
    if oddt.toolkit.backend == 'ob':
        IFP = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               1, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    else:
        IFP = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               1, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    assert_array_equal(IFP, InteractionFingerprint(ligand, protein))


def test_SimpleInteractionFingerprint():
    """Simple Interaction Fingerprint test """
    SIFP = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,
            1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0]
    assert_array_equal(SIFP, SimpleInteractionFingerprint(ligand, protein))


def test_IFP_SIFP_Folding_cum_sum():
    """Checks, whether InteractionFingerprint and SimpleInteractionFingerprint outcomes matches"""
    IFP = np.sum(InteractionFingerprint(ligand, protein), axis=0)
    SIFP = np.sum(SimpleInteractionFingerprint(ligand, protein), axis=0)
    assert_array_equal(IFP, SIFP)


def test_similarity():
    """FP similarity"""
    mols, receptor = _load_xiap_complex()
    for mol in mols:
        mol.addh(only_polar=True)
    receptor.addh(only_polar=True)
    ref = SimpleInteractionFingerprint(mols[0], receptor)

    outcome = [dice(ref, SimpleInteractionFingerprint(
        mol, receptor)) for mol in mols[1:]]
    target_outcome = np.array([0.742857, 0.645161, 0.727273, 0.571429,
                               0.727273, 0.588235, 0.75, 0.551724,
                               0.551724, 0.6875, 0.514286, 0.6875,
                               0.592593, 0.647059, 0.736842, 0.62069,
                               0.545455, 0.533333, 0.606061])
    assert_array_almost_equal(outcome, target_outcome)

    outcome = [tanimoto(ref, SimpleInteractionFingerprint(
        mol, receptor)) for mol in mols[1:]]
    target_outcome = np.array([0.636364, 0.5, 0.666667, 0.384615, 0.666667,
                               0.545455, 0.666667, 0.5, 0.363636, 0.666667,
                               0.555556, 0.555556, 0.625, 0.6, 0.727273,
                               0.555556, 0.5, 0.4, 0.363636])
    assert_array_almost_equal(outcome, target_outcome)


def test_sparse_similarity():
    """Sparse similarity"""
    mol1 = oddt.toolkit.readstring("smi", _MOL1_SMILES)
    mol2 = oddt.toolkit.readstring("smi", _MOL2_SMILES)

    mol1_fp_dense = ECFP(mol1, depth=8, size=4096, sparse=False)
    mol2_fp_dense = ECFP(mol2, depth=8, size=4096, sparse=False)

    mol1_fp_sparse = ECFP(mol1, depth=8, size=4096, sparse=True)
    mol2_fp_sparse = ECFP(mol2, depth=8, size=4096, sparse=True)

    assert_almost_equal(dice(mol1_fp_sparse, mol2_fp_sparse, sparse=True),
                        dice(mol1_fp_dense, mol2_fp_dense))
    # Empty fingerprints must give 0 similarity rather than divide by zero.
    assert dice([], [], sparse=True) == 0.
    assert dice(np.zeros(10), np.zeros(10), sparse=False) == 0.
    assert_almost_equal(tanimoto(mol1_fp_sparse, mol2_fp_sparse, sparse=True),
                        tanimoto(mol1_fp_dense, mol2_fp_dense))
    assert tanimoto([], [], sparse=True) == 0.
    assert tanimoto(np.zeros(10), np.zeros(10), sparse=False) == 0.


def test_ecfp_repr():
    """Test exact ECFP representation to track down the changes"""
    mol = oddt.toolkit.readstring("smi", _MOL1_SMILES)

    res = [(6, 0, 1, 3, 0, 0, 0), (6, 0, 3, 0, 0, 1, 1), (6, 0, 3, 0, 0, 1, 1), (6, 0, 3, 0, 0, 1, 1),
           (6, 0, 2, 1, 0, 1, 1), (6, 0, 2, 1, 0, 1, 1), (6, 0, 2, 1, 0, 1, 1), (6, 0, 1, 3, 0, 0, 0),
           (7, 0, 2, 1, 0, 0, 0), (6, 0, 3, 0, 0, 0, 0), (8, 0, 1, 0, 0, 0, 0), (6, 0, 2, 2, 0, 0, 0),
           (7, 0, 3, 0, 0, 1, 0), (6, 0, 2, 2, 0, 1, 0), (6, 0, 2, 2, 0, 1, 0), (7, 0, 3, 0, 0, 1, 0),
           (6, 0, 2, 2, 0, 1, 0), (6, 0, 2, 2, 0, 1, 0), (6, 0, 2, 2, 0, 0, 0), (6, 0, 3, 0, 0, 0, 0),
           (8, 0, 1, 0, 0, 0, 0), (7, 0, 3, 0, 0, 1, 0), (6, 0, 2, 2, 0, 1, 0), (6, 0, 2, 2, 0, 1, 0),
           (6, 0, 3, 0, 0, 1, 1), (6, 0, 3, 0, 0, 1, 1), (6, 0, 2, 2, 0, 1, 0), (6, 0, 2, 1, 0, 1, 1),
           (6, 0, 2, 1, 0, 1, 1), (16, 0, 2, 0, 0, 1, 1)]

    assert_array_equal([_ECFP_atom_repr(mol, i) for i in range(len(mol.atoms))], res)


def test_ecfp():
    """ECFP fingerprints"""
    mol1 = oddt.toolkit.readstring("smi", _MOL1_SMILES)
    mol2 = oddt.toolkit.readstring("smi", _MOL2_SMILES)

    ref1 = [2, 100, 176, 185, 200, 203, 359, 382, 447, 509, 518, 550, 572, 583,
            598, 606, 607, 684, 818, 821, 832, 861, 960, 992, 1006, 1019, 1042,
            1050, 1059, 1103, 1175, 1281, 1315, 1377, 1431, 1470, 1479, 1512,
            1577, 1588, 1598, 1620, 1633, 1647, 1663, 1723, 1749, 1751, 1775,
            1781, 1821, 1837, 1899, 1963, 1969, 1986, 2013, 2253, 2343, 2355,
            2368, 2435, 2547, 2654, 2657, 2702, 2722, 2725, 2803, 2816, 2853,
            2870, 2920, 2992, 3028, 3056, 3074, 3103, 3190, 3203, 3277, 3321,
            3362, 3377, 3383, 3401, 3512, 3546, 3552, 3585, 3593, 3617, 3674,
            3759, 3784, 3790, 3832, 3895, 3937, 3956, 3974, 4007, 4033]

    ref2 = [43, 100, 176, 200, 203, 231, 382, 396, 447, 490, 518, 583, 606,
            607, 650, 818, 821, 832, 840, 861, 907, 950, 960, 992, 1006, 1013,
            1019, 1042, 1050, 1059, 1103, 1104, 1112, 1175, 1281, 1293, 1315,
            1377, 1431, 1470, 1512, 1543, 1577, 1588, 1598, 1633, 1647, 1663,
            1723, 1749, 1751, 1757, 1759, 1775, 1781, 1821, 1837, 1880, 1963,
            1969, 1986, 2253, 2355, 2368, 2435, 2544, 2547, 2654, 2702, 2722,
            2725, 2726, 2799, 2816, 2853, 2870, 2920, 2992, 3028, 3074, 3190,
            3203, 3277, 3290, 3333, 3362, 3383, 3401, 3512, 3546, 3552, 3585,
            3593, 3617, 3640, 3660, 3674, 3759, 3784, 3790, 3805, 3832, 3856,
            3895, 3924, 3956, 3974, 3992, 4007, 4033]

    _check_ecfp_pair(mol1, mol2, ref1, ref2, 0.69999999, 0.63846153)

    # adding Hs should not change anything
    mol1.addh()
    mol2.addh()
    _check_ecfp_pair(mol1, mol2, ref1, ref2, 0.69999999, 0.63846153)

    # removing Hs should not change anything
    mol1.removeh()
    mol2.removeh()
    _check_ecfp_pair(mol1, mol2, ref1, ref2, 0.69999999, 0.63846153)


def test_fcfp():
    """FCFP fingerprints"""
    mol1 = oddt.toolkit.readstring("smi", _MOL1_SMILES)
    mol2 = oddt.toolkit.readstring("smi", _MOL2_SMILES)

    ref1 = [46, 111, 305, 310, 362, 384, 409, 451, 467, 548, 572, 595, 607,
            608, 620, 659, 691, 699, 724, 743, 752, 842, 926, 935, 974, 1037,
            1072, 1094, 1135, 1143, 1161, 1172, 1313, 1325, 1368, 1399, 1461,
            1486, 1488, 1492, 1603, 1619, 1648, 1665, 1666, 1838, 1887, 1900,
            1948, 1961, 1972, 1975, 1996, 2000, 2052, 2085, 2094, 2174, 2232,
            2236, 2368, 2382, 2383, 2402, 2483, 2492, 2527, 2593, 2616, 2706,
            2789, 2899, 2922, 2945, 2966, 3102, 3117, 3176, 3189, 3215, 3225,
            3297, 3326, 3349, 3373, 3513, 3525, 3535, 3601, 3619, 3780, 3820,
            3897, 3919, 3976, 3981, 4050, 4079, 4091]

    ref2 = [46, 111, 143, 172, 259, 305, 362, 409, 451, 467, 507, 518, 548,
            583, 595, 607, 608, 620, 639, 691, 693, 724, 752, 784, 825, 842,
            926, 1037, 1087, 1094, 1098, 1135, 1143, 1161, 1172, 1286, 1325,
            1368, 1371, 1395, 1399, 1461, 1486, 1488, 1492, 1565, 1619, 1648,
            1655, 1665, 1887, 1890, 1900, 1948, 1961, 1968, 1972, 1975, 1976,
            1996, 2000, 2007, 2094, 2125, 2174, 2232, 2236, 2368, 2382, 2383,
            2483, 2492, 2571, 2593, 2606, 2638, 2706, 2789, 2922, 2945, 2966,
            2986, 3030, 3100, 3102, 3117, 3227, 3326, 3350, 3373, 3406, 3419,
            3535, 3577, 3619, 3697, 3742, 3820, 3839, 3919, 3981, 4043, 4050,
            4079, 4091]

    _check_ecfp_pair(mol1, mol2, ref1, ref2, 0.64074074, 0.5,
                     use_pharm_features=True)

    # adding Hs should not change anything; the fingerprints have to be
    # recomputed here, otherwise the pre-addh values are merely re-checked
    mol1.addh()
    mol2.addh()
    _check_ecfp_pair(mol1, mol2, ref1, ref2, 0.64074074, 0.5,
                     use_pharm_features=True)


def test_ecfp_invaraiants():
    """ECFP: test random reordering"""
    sildenafil = oddt.toolkit.readstring("smi", _SILDENAFIL_SMILES)

    params = {'depth': 4, 'size': 4096, 'sparse': True}
    fp = ECFP(sildenafil, **params)

    # The fingerprint must not depend on the atom ordering.
    for _ in range(10):
        sildenafil = shuffle_mol(sildenafil)
        assert_array_equal(fp, ECFP(sildenafil, **params))


def test_splif():
    """SPLIF fingerprints"""
    mols, receptor = _load_xiap_complex()
    for mol in mols:
        mol.addh(only_polar=True)
    receptor.addh(only_polar=True)
    splif = SPLIF(mols[0], receptor)
    reference = [6, 38, 49, 53, 53, 53, 70, 70, 81, 81, 81, 81, 165, 216, 219,
                 249, 330, 330, 333, 377, 380, 396, 396, 396, 423, 423, 479,
                 479, 498, 498, 498, 570, 592, 625, 638, 768, 768, 817, 818,
                 818, 818, 818, 858, 884, 888, 907, 930, 934, 935, 971, 1023,
                 1041, 1115, 1142, 1184, 1184, 1252, 1263, 1269, 1275, 1275,
                 1275, 1315, 1315, 1315, 1337, 1337, 1344, 1351, 1396, 1435,
                 1465, 1502, 1502, 1502, 1502, 1569, 1569, 1569, 1569, 1569,
                 1569, 1569, 1569, 1640, 1645, 1660, 1660, 1697, 1697, 1716,
                 1746, 1756, 1778, 1901, 1937, 1997, 2000, 2000, 2000, 2007,
                 2007, 2020, 2070, 2195, 2274, 2294, 2319, 2415, 2417, 2509,
                 2528, 2578, 2578, 2584, 2590, 2590, 2624, 2636, 2678, 2678,
                 2678, 2678, 2678, 2776, 2776, 2789, 2862, 2862, 2894, 2894,
                 2894, 2923, 2923, 3058, 3073, 3073, 3073, 3073, 3137, 3159,
                 3159, 3159, 3186, 3218, 3218, 3279, 3279, 3281, 3338, 3358,
                 3360, 3368, 3387, 3609, 3636, 3636, 3713, 3713, 3716, 3716,
                 3748, 3767, 3769, 3854, 3871, 3912, 3968, 3986, 3994, 3994,
                 4069]

    assert splif['hash'].shape == (172,)
    assert_array_equal(splif['ligand_coords'].shape, (172, 7, 3))
    assert_array_equal(splif['protein_coords'].shape, (172, 7, 3))
    assert_array_equal(reference, splif['hash'])


def test_splif_similarity():
    """SPLIF similarity"""
    mols, receptor = _load_xiap_complex()
    for mol in mols:
        mol.addh(only_polar=True)
    receptor.addh(only_polar=True)
    ref = SPLIF(mols[0], receptor)
    splif_fps = [SPLIF(mol, receptor) for mol in mols]
    outcome = [similarity_SPLIF(ref, fp) for fp in splif_fps]
    target_outcome = np.array([1.000, 0.779, 0.660, 0.805, 0.630,
                               0.802, 0.366, 0.817, 0.378, 0.553,
                               0.732, 0.705, 0.856, 0.797, 0.502,
                               0.418, 0.653, 0.436, 0.708, 0.688])

    assert_array_almost_equal(outcome, target_outcome, decimal=3)

    # check if similarity is symmetric
    for fp1, fp2 in combinations(splif_fps, 2):
        assert similarity_SPLIF(fp1, fp2) == similarity_SPLIF(fp2, fp1)


def test_plec():
    """PLEC fingerprints"""
    mols, receptor = _load_xiap_complex()
    for mol in mols:
        mol.removeh()
    receptor.removeh()
    plec = PLEC(mols[0], receptor)
    reference = [80, 119, 120, 120, 120, 120, 137, 138, 155, 155, 155, 155,
                 155, 155, 155, 161, 199, 214, 214, 214, 226, 226, 233, 266,
                 282, 283, 283, 313, 313, 386, 386, 430, 431, 431, 432, 448,
                 581, 581, 643, 662, 684, 690, 729, 737, 741, 778, 778, 795,
                 799, 799, 812, 812, 876, 877, 894, 907, 924, 924, 925, 925,
                 935, 935, 935, 935, 935, 964, 964, 964, 993, 993, 996, 996,
                 1002, 1002, 1042, 1042, 1066, 1066, 1077, 1113, 1119, 1224,
                 1266, 1266, 1290, 1322, 1322, 1334, 1334, 1403, 1411, 1411,
                 1461, 1475, 1480, 1497, 1521, 1584, 1584, 1614, 1618, 1618,
                 1618, 1618, 1691, 1694, 1694, 1755, 1755, 1755, 1755, 1786,
                 1835, 1835, 1867, 1953, 1953, 1953, 1953, 1963, 1970, 1970,
                 1990, 1992, 1992, 1992, 2024, 2024, 2060, 2252, 2373, 2383,
                 2383, 2390, 2390, 2451, 2537, 2538, 2552, 2555, 2558, 2640,
                 2720, 2752, 2791, 2821, 2821, 2931, 2950, 2957, 2957, 2959,
                 2961, 2961, 2961, 2963, 2970, 2970, 2982, 3034, 3049, 3066,
                 3084, 3084, 3084, 3104, 3126, 3227, 3248, 3293, 3293, 3293,
                 3420, 3439, 3517, 3539, 3546, 3546, 3546, 3546, 3553, 3559,
                 3596, 3630, 3643, 3643, 3674, 3707, 3708, 3716, 3738, 3742,
                 3828, 3846, 3859, 3876, 3887, 3904, 3904, 3904, 3916, 3916,
                 3939, 3941, 3981, 3981, 3991, 3993, 4010, 4097, 4127, 4127,
                 4127, 4127, 4165, 4181, 4192, 4316, 4330, 4372, 4391, 4461,
                 4462, 4463, 4542, 4542, 4542, 4549, 4549, 4549, 4549, 4614,
                 4615, 4657, 4668, 4670, 4686, 4686, 4686, 4688, 4688, 4688,
                 4688, 4695, 4729, 4740, 4741, 4744, 4744, 4744, 4744, 4756,
                 4814, 4828, 4828, 4861, 4861, 4861, 4861, 4861, 4861, 4861,
                 4861, 4861, 4861, 4861, 4861, 4861, 4861, 4861, 4861, 4916,
                 4945, 4945, 5011, 5037, 5042, 5044, 5046, 5055, 5078, 5080,
                 5101, 5101, 5126, 5139, 5146, 5189, 5193, 5232, 5271, 5314,
                 5321, 5350, 5379, 5439, 5439, 5439, 5439, 5481, 5482, 5535,
                 5563, 5565, 5565, 5585, 5601, 5601, 5626, 5626, 5631, 5631,
                 5631, 5631, 5631, 5631, 5639, 5670, 5688, 5690, 5742, 5804,
                 5804, 5864, 5871, 5885, 5983, 5992, 6010, 6010, 6010, 6059,
                 6059, 6096, 6164, 6183, 6183, 6197, 6234, 6256, 6261, 6261,
                 6277, 6277, 6277, 6277, 6299, 6333, 6333, 6388, 6388, 6404,
                 6428, 6428, 6428, 6428, 6431, 6431, 6445, 6449, 6450, 6480,
                 6496, 6519, 6519, 6540, 6582, 6642, 6654, 6654, 6671, 6717,
                 6722, 6735, 6735, 6735, 6764, 6764, 6781, 6781, 6781, 6781,
                 6788, 6788, 6803, 6808, 6833, 6838, 6838, 6950, 6979, 6979,
                 6997, 7069, 7115, 7194, 7250, 7254, 7277, 7288, 7352, 7464,
                 7493, 7506, 7506, 7520, 7530, 7530, 7530, 7542, 7546, 7561,
                 7608, 7678, 7678, 7685, 7701, 7701, 7701, 7752, 7752, 7752,
                 7790, 7847, 7957, 7957, 7957, 7959, 8003, 8003, 8003, 8010,
                 8083, 8086, 8086, 8086, 8086, 8113, 8116, 8160, 8190, 8230,
                 8230, 8262, 8262, 8282, 8284, 8284, 8292, 8297, 8327, 8327,
                 8383, 8383, 8383, 8418, 8418, 8426, 8457, 8484, 8484, 8543,
                 8543, 8580, 8629, 8651, 8655, 8697, 8726, 8781, 8784, 8796,
                 8837, 8850, 8923, 9034, 9040, 9077, 9077, 9099, 9134, 9180,
                 9206, 9257, 9281, 9304, 9304, 9333, 9341, 9358, 9393, 9394,
                 9432, 9450, 9450, 9455, 9455, 9481, 9493, 9493, 9505, 9537,
                 9547, 9572, 9585, 9610, 9610, 9661, 9689, 9690, 9690, 9700,
                 9700, 9733, 9736, 9736, 9736, 9736, 9765, 9784, 9885, 9885,
                 9885, 9934, 9938, 9968, 9968, 10037, 10080, 10080, 10103,
                 10113, 10113, 10114, 10115, 10115, 10115, 10139, 10139, 10139,
                 10139, 10139, 10181, 10181, 10181, 10181, 10185, 10286, 10295,
                 10317, 10317, 10340, 10340, 10340, 10340, 10352, 10353, 10364,
                 10364, 10385, 10490, 10490, 10504, 10535, 10539, 10539, 10589,
                 10589, 10591, 10599, 10648, 10648, 10650, 10650, 10681, 10703,
                 10714, 10714, 10714, 10739, 10739, 10793, 10806, 10806, 10806,
                 10837, 10865, 10865, 10871, 10903, 10978, 10978, 11056, 11056,
                 11141, 11159, 11207, 11213, 11257, 11272, 11360, 11362, 11377,
                 11454, 11454, 11458, 11458, 11458, 11539, 11563, 11580, 11580,
                 11580, 11605, 11605, 11610, 11610, 11613, 11624, 11664, 11664,
                 11683, 11683, 11697, 11698, 11701, 11707, 11753, 11835, 11846,
                 11852, 11858, 11876, 11879, 11890, 11957, 11957, 12009, 12115,
                 12130, 12151, 12222, 12268, 12290, 12290, 12295, 12295, 12320,
                 12431, 12448, 12475, 12475, 12475, 12481, 12485, 12487, 12587,
                 12632, 12632, 12634, 12641, 12641, 12641, 12664, 12761, 12761,
                 12778, 12832, 12878, 12878, 12884, 12958, 12982, 12982, 12982,
                 12982, 12992, 13057, 13079, 13121, 13129, 13200, 13200, 13277,
                 13277, 13317, 13317, 13320, 13320, 13336, 13388, 13434, 13443,
                 13475, 13495, 13517, 13517, 13553, 13602, 13637, 13655, 13658,
                 13658, 13688, 13688, 13774, 13774, 13784, 13784, 13784, 13786,
                 13791, 13791, 13809, 13839, 13839, 13839, 13839, 13839, 13876,
                 13905, 13906, 13906, 13906, 13906, 13920, 13920, 13920, 13920,
                 13920, 13949, 13949, 14058, 14122, 14122, 14133, 14133, 14198,
                 14259, 14259, 14317, 14332, 14368, 14386, 14423, 14423, 14423,
                 14423, 14423, 14423, 14423, 14439, 14440, 14447, 14464, 14464,
                 14469, 14505, 14510, 14510, 14513, 14516, 14516, 14529, 14529,
                 14529, 14549, 14563, 14563, 14570, 14570, 14570, 14582, 14605,
                 14605, 14611, 14748, 14748, 14750, 14757, 14772, 14798, 14802,
                 14810, 14854, 14857, 14857, 14878, 14878, 14903, 14903, 14993,
                 14993, 14996, 15008, 15012, 15018, 15044, 15044, 15074, 15092,
                 15092, 15146, 15146, 15191, 15251, 15251, 15253, 15258, 15311,
                 15311, 15317, 15429, 15429, 15441, 15444, 15498, 15518, 15520,
                 15622, 15622, 15622, 15651, 15672, 15712, 15715, 15798, 15798,
                 15811, 15950, 15982, 15982, 15987, 16023, 16023, 16042, 16049,
                 16054, 16080, 16099, 16119, 16119, 16119, 16174, 16174, 16213,
                 16225, 16229, 16234, 16234, 16234, 16252, 16252, 16252, 16252,
                 16252, 16320, 16328, 16362, 16362]

    assert_array_equal(reference, plec)
    assert_array_equal(plec.shape, (860,))

    # Hydrogens should not impact the PLEC fingerprint
    for mol in mols:
        mol.addh(only_polar=True)
    receptor.addh(only_polar=True)
    plec = PLEC(mols[0], receptor)
    assert_array_equal(reference, plec, "Polar Hs break PLEC")

    for mol in mols:
        mol.addh()
    receptor.addh()
    plec = PLEC(mols[0], receptor)
    assert_array_equal(reference, plec, "Non-polar Hs break PLEC")


def test_plec_binded_hoh():
    """PLEC with waters coordinating a metal"""
    # if water coordinates metal in PDB and ligand is in contact with it, HOH
    # will pop up in metals environment, thus we cannot ignore HOHs in repr_dict

    # NOTE(review): the version check is a plain string comparison; it works
    # for RDKit's zero-padded 'YYYY.MM' scheme - confirm if that ever changes.
    supported = (oddt.toolkit.backend == 'ob' or
                 (oddt.toolkit.backend == 'rdk' and
                  oddt.toolkit.__version__ >= '2017.03'))
    if not supported:
        # Report a skip instead of a pass that asserted nothing.
        pytest.skip("Toolkit version does not support this test")

    hoh_ligand = next(oddt.toolkit.readfile('sdf', os.path.join(
        test_data_dir, 'data', 'pdb', '3kwa_ligand.sdf')))
    hoh_protein = next(oddt.toolkit.readfile('pdb', os.path.join(
        test_data_dir, 'data', 'pdb', '3kwa_5Apocket.pdb')))
    hoh_protein.protein = True

    assert len(PLEC(hoh_ligand, hoh_protein, ignore_hoh=True)) == 465
    assert len(PLEC(hoh_ligand, hoh_protein, ignore_hoh=False)) == 560


def test_plec_similarity():
    """PLEC similarity"""
    mols, receptor = _load_xiap_complex()
    for mol in mols:
        mol.addh(only_polar=True)
    receptor.addh(only_polar=True)
    reference_sparse = PLEC(mols[0], receptor)
    outcome_sparse = [dice(reference_sparse, PLEC(mol, receptor),
                           sparse=True) for mol in mols[1:]]
    target_outcome = np.array([0.833, 0.729, 0.849, 0.785, 0.821,
                               0.604, 0.868, 0.656, 0.712, 0.652,
                               0.699, 0.785, 0.736, 0.745, 0.661,
                               0.667, 0.555, 0.616, 0.714])
    reference_dense = PLEC(mols[0], receptor, sparse=False)
    outcome_dense = [dice(reference_dense, PLEC(mol, receptor, sparse=False),
                          sparse=False) for mol in mols[1:]]
    assert_array_almost_equal(outcome_sparse, target_outcome, decimal=2)
    assert_array_almost_equal(outcome_dense, target_outcome, decimal=2)


def test_molecular_shingles():
    """Molecular shingles must not depend on the atom ordering"""
    sildenafil = oddt.toolkit.readstring("smi", _SILDENAFIL_SMILES)
    # The toolkits write the fragment SMILES differently, so each backend has
    # its own reference list.
    if oddt.toolkit.backend == 'ob':
        target_shingles = [
            'CCC', 'CCCc', 'CCCc(c)n', 'CCN(C)CC', 'CCN(CC)S(=O)(=O)c', 'CCO', 'CCOc', 'CCOc(c)c', 'CCc1nncc1n', 'CN(C)C',
            'CN(C)S(=O)(=O)c(c)c', 'CN(S)CCN', 'CN(S)CCN', 'COc(cc)c(c)c', 'Cc1ccn(n1)C', 'Cn(c)n', 'Cn1ncc(c1c(=O)[nH])n',
            'Cn1nccc1c', 'NCCN(C)C', 'NCCN(C)C', 'cS(=O)(=O)N', 'cS(=O)(=O)N', 'c[nH]c(=O)c(c)n', 'cc(=O)[nH]',
            'cc([nH])nc(c)c', 'cc(c)cc(S)c', 'cc(n)[nH]c(=O)c', 'ccc(c(O)c)c(n)[nH]', 'ccc(cc)S(=O)(=O)N', 'cccc(O)c',
            'cccc(S)c', 'cnc([nH]c)c(c)c', 'cnc1c(C)nnc1c']
    else:
        target_shingles = [
            'CCC', 'CCN(C)CC', 'CCO', 'CCc1nncc1n', 'CN(C)C', 'CN(C)CCN', 'CN(C)CCN', 'CN(S)CCN', 'CN(S)CCN', 'Cc1ccn(C)n1',
            'Cn1ncc(n)c1c([nH])=O', 'c-c([nH])nc(c)c', 'c-c(c)c(cc)OC', 'c-c(c)cc(c)S', 'c-c(n)[nH]c(c)=O', 'cCCC', 'cOCC',
            'cS(=O)(=O)N(CC)CC', 'cS(N)(=O)=O', 'cS(N)(=O)=O', 'c[nH]c(=O)c(c)n', 'cc([nH])=O', 'cc(c)OCC',
            'cc(c)S(=O)(=O)N(C)C', 'cc(n)CCC', 'cc1ccnn1C', 'ccc(-c(n)[nH])c(c)O', 'ccc(cc)S(N)(=O)=O', 'cccc(c)O',
            'cccc(c)S', 'cn(C)n', 'cnc([nH]c)-c(c)c', 'cnc1c(C)nnc1c']

    for _ in range(10):
        sildenafil = shuffle_mol(sildenafil)
        shingles = sorted(get_molecular_shingles(sildenafil))
        assert_array_equal(shingles, target_shingles)