--- a +++ b/tests/test_toolkit.py @@ -0,0 +1,537 @@ +import os +from collections import OrderedDict, deque + +from six.moves.cPickle import loads, dumps +import numpy as np +import pandas as pd + +import pytest +from numpy.testing import assert_array_equal, assert_array_almost_equal + +import oddt +from oddt.spatial import rmsd +from oddt.toolkits.common import canonize_ring_path + +test_data_dir = os.path.dirname(os.path.abspath(__file__)) +xiap_receptor = os.path.join(test_data_dir, 'data', 'dude', 'xiap', + 'receptor_rdkit.pdb') +xiap_actives = os.path.join(test_data_dir, 'data', 'dude', 'xiap', + 'actives_docked.sdf') + + +def test_mol(): + """Test common molecule operations""" + # Hydrogen manipulation in small molecules + mol = oddt.toolkit.readstring('smi', 'c1ccccc1O') + assert len(mol.atoms) == 7 + mol.addh() + assert len(mol.atoms) == 13 + mol.removeh() + mol.addh(only_polar=True) + assert len(mol.atoms) == 8 + mol.removeh() + assert len(mol.atoms) == 7 + + # Hydrogen manipulation in proteins + protein = next(oddt.toolkit.readfile('pdb', xiap_receptor)) + protein.protein = True + + res_atoms_n = [6, 10, 8, 8, 7, 11, 8, 7, 6, 8, 5, 8, 12, 9, 5, 11, 8, + 11, 7, 11, 4, 7, 14, 8, 12, 6, 7, 8, 9, 9, 9, 8, 5, 11, + 5, 4, 11, 12, 5, 8, 4, 9, 4, 8, 9, 7, 9, 6, 11, 10, 6, + 4, 4, 4, 8, 7, 8, 14, 9, 7, 6, 9, 8, 7, 14, 9, 9, 10, 5, + 9, 14, 12, 7, 4, 6, 9, 12, 8, 8, 9, 9, 9, 4, 9, 9, 12, + 8, 8, 8, 8, 10, 8, 7, 10, 11, 12, 6, 7, 8, 11, 8, 9, 4, + 8, 9, 7, 9, 6, 6, 4, 4, 4, 8, 7, 8, 14, 9, 7, 6, 9, 8, + 7, 14, 9, 9, 10, 5, 9, 14, 12, 7, 4, 8, 10, 8, 7, 1, 1] + res_atoms_n_addh = [12, 17, 17, 19, 14, 23, 14, 14, 11, 17, 10, 13, 21, + 16, 10, 23, 19, 20, 14, 20, 7, 14, 24, 19, 21, 11, + 16, 14, 21, 16, 17, 19, 10, 23, 10, 7, 20, 21, 10, + 19, 7, 16, 7, 13, 21, 16, 21, 10, 20, 17, 10, 7, 7, + 7, 19, 14, 13, 24, 21, 14, 11, 16, 13, 14, 24, 16, + 17, 16, 10, 21, 24, 21, 14, 7, 10, 21, 21, 19, 19, + 16, 17, 21, 7, 17, 16, 21, 19, 14, 14, 19, 17, 19, + 14, 18, 25, 22, 11, 17, 21, 22, 21, 17, 7, 13, 21, + 16, 21, 11, 11, 7, 7, 7, 19, 14, 13, 24, 21, 14, + 11, 16, 13, 14, 24, 16, 17, 16, 10, 21, 24, 21, 14, + 8, 20, 17, 19, 15, 1, 1] + res_atoms_n_polarh = [9, 12, 9, 9, 7, 16, 11, 7, 8, 9, 6, 10, 14, 11, + 6, 16, 9, 12, 9, 12, 5, 9, 16, 9, 14, 8, 8, 11, + 12, 11, 12, 9, 6, 16, 6, 5, 12, 14, 6, 9, 5, 11, + 5, 10, 12, 8, 12, 7, 12, 12, 7, 5, 5, 5, 9, 9, + 10, 16, 12, 7, 8, 11, 10, 7, 16, 11, 12, 11, 6, + 12, 16, 14, 7, 5, 7, 12, 14, 9, 9, 11, 12, 12, 5, + 12, 11, 14, 9, 11, 11, 9, 12, 9, 9, 12, 17, 15, + 8, 8, 10, 13, 10, 12, 5, 10, 12, 8, 12, 7, 8, 5, + 5, 5, 9, 9, 10, 16, 12, 7, 8, 11, 10, 7, 16, 11, + 12, 11, 6, 12, 16, 14, 7, 5, 10, 12, 9, 9, 1, 1] + assert len(protein.atoms) == 1114 + assert len(protein.residues) == 138 + assert_array_equal([len(res.atoms) for res in protein.residues], + res_atoms_n) + + protein.addh() + assert len(protein.atoms) == 2170 + assert len(protein.residues) == 138 + assert_array_equal([len(res.atoms) for res in protein.residues], + res_atoms_n_addh) + + protein.removeh() + protein.addh(only_polar=True) + assert len(protein.atoms) == 1356 + assert len(protein.residues) == 138 + assert_array_equal([len(res.atoms) for res in protein.residues], + res_atoms_n_polarh) + + protein.removeh() + assert len(protein.atoms) == 1114 + assert len(protein.residues) == 138 + assert_array_equal([len(res.atoms) for res in protein.residues], + res_atoms_n) + + +def test_mol_calccharges(): + mol = oddt.toolkit.readstring('smi', 'c1ccccc1O') + mol.addh() + + with pytest.raises(ValueError): + mol.calccharges('mmff94aaaaaa') + + for m in ['gasteiger', 'mmff94']: + mol.calccharges(m) + assert (np.array(mol.charges) != 0.).any() + + protein = next(oddt.toolkit.readfile('pdb', xiap_receptor)) + protein.protein = True + + # for that protein mmff94 charges could not be generated + with pytest.raises(Exception): + protein.calccharges('mmff94') + + +def test_toolkit_hoh(): + """HOH residues splitting""" + pdb_block = """ATOM 1 C1 GLY 1 0.000 0.000 0.000 1.00 0.00 C +ATOM 2 C2 GLY 1 0.000 0.000 0.000 1.00 0.00 C +ATOM 3 O1 GLY 1 0.000 0.000 0.000 1.00 0.00 O +ATOM 4 O2 GLY 1 0.000 0.000 0.000 1.00 0.00 O +ATOM 5 N1 GLY 1 0.000 0.000 0.000 1.00 0.00 N +ATOM 6 O3 HOH 2 0.000 0.000 0.000 1.00 0.00 O +ATOM 7 O4 HOH 3 0.000 0.000 0.000 1.00 0.00 O +ATOM 8 O5 HOH 4 0.000 0.000 0.000 1.00 0.00 O +""" + protein = oddt.toolkit.readstring('pdb', pdb_block) + protein.protein = True + assert len(protein.residues) == 4 + + protein.addh(only_polar=True) + assert len(protein.residues) == 4 + + protein.addh() + assert len(protein.residues) == 4 + + +def test_pickle(): + """Pickle molecules""" + mols = list(oddt.toolkit.readfile('sdf', xiap_actives)) + pickled_mols = list(map(lambda x: loads(dumps(x)), mols)) + + assert_array_equal(list(map(lambda x: x.title, mols)), + list(map(lambda x: x.title, pickled_mols))) + + assert_array_equal(list(map(lambda x: x.smiles, mols)), + list(map(lambda x: x.smiles, pickled_mols))) + + for mol, pickled_mol in zip(mols, pickled_mols): + assert dict(mol.data) == dict(pickled_mol.data) + + # Test pickling of atom_dicts + assert_array_equal(list(map(lambda x: x._atom_dict is None, mols)), + [True] * len(mols)) + mols_atom_dict = np.hstack(list(map(lambda x: x.atom_dict, mols))) + assert_array_equal(list(map(lambda x: x._atom_dict is not None, mols)), + [True] * len(mols)) + pickled_mols = list(map(lambda x: loads(dumps(x)), mols)) + assert_array_equal(list(map(lambda x: x._atom_dict is not None, pickled_mols)), + [True] * len(mols)) + pickled_mols_atom_dict = np.hstack(list(map(lambda x: x._atom_dict, pickled_mols))) + for name in mols[0].atom_dict.dtype.names: + if issubclass(np.dtype(mols_atom_dict[name].dtype).type, np.number): + assert_array_almost_equal(mols_atom_dict[name], + pickled_mols_atom_dict[name]) + else: + assert_array_equal(mols_atom_dict[name], + pickled_mols_atom_dict[name]) + + # Lazy Mols + mols = list(oddt.toolkit.readfile('sdf', xiap_actives, lazy=True)) + pickled_mols = list(map(lambda x: loads(dumps(x)), mols)) + + assert_array_equal(list(map(lambda x: x._source is not None, pickled_mols)), + [True] * len(mols)) + + assert_array_equal(list(map(lambda x: x.title, mols)), + list(map(lambda x: x.title, pickled_mols))) + + assert_array_equal(list(map(lambda x: x.smiles, mols)), + list(map(lambda x: x.smiles, pickled_mols))) + + for mol, pickled_mol in zip(mols, pickled_mols): + assert dict(mol.data) == dict(pickled_mol.data) + + +def test_diverse_conformers(): + # FIXME: make toolkit a module so we can import from it + diverse_conformers_generator = oddt.toolkit.diverse_conformers_generator + + mol = oddt.toolkit.readstring( + 'smi', + 'CN1CCN(S(=O)(C2=CC=C(OCC)C(C3=NC4=C(N(C)N=C4CCC)C(N3)=O)=C2)=O)CC1' + ) + mol.make3D() + + res = [] + for conf in diverse_conformers_generator(mol, seed=123456): + res.append(rmsd(mol, conf)) + + assert len(res) == 10 + if oddt.toolkit.backend == 'ob': + if oddt.toolkit.__version__ < '0.3': + assert_array_almost_equal(res, [0., 3.043712, 3.897143, 3.289482, + 3.066374, 2.909683, 2.913927, + 3.488244, 3.70603, 3.597467]) + else: + assert_array_almost_equal(res, [0.0, 1.372770, 2.489789, 2.759941, + 2.968366, 3.228773, 3.392191, + 3.921166, 3.185065, 3.283915]) + # else: + # if oddt.toolkit.__version__ > '2016.03.9': + # assert_array_almost_equal(res, [1.237538, 2.346984, 0.900624, + # 3.469511, 1.886213, 2.128909, + # 2.852608, 1.312513, 1.291595, + # 1.326843]) + # else: + # assert_array_almost_equal(res, [3.08995, 2.846358, 3.021795, + # 1.720319, 2.741972, 2.965332, + # 2.925344, 2.930157, 2.934049, + # 3.009545]) + + # check all implemented methods + if oddt.toolkit.backend == 'ob': + methods = ['ga', 'confab'] + else: + methods = ['dg', 'etkdg', 'kdg', 'etdg'] + for method in methods: + assert len(diverse_conformers_generator(mol, + seed=123456, + n_conf=5, + method=method)) == 5 + assert len(diverse_conformers_generator(mol, + seed=123456, + n_conf=10, + method=method)) == 10 + assert len(diverse_conformers_generator(mol, + seed=123456, + n_conf=20, + method=method)) == 20 + + +def test_indices(): + """Test 0 and 1 based atom indices""" + mol = oddt.toolkit.readstring('smi', 'CCc1cc(C)c(C)cc1-c1ccc(-c2cccc(C)c2)cc1') + atom = mol.atoms[0] + + assert atom.idx0 == 0 + assert atom.idx1 == 1 + + # the unmarked index is deprecated in ODDT + with pytest.warns((DeprecationWarning, FutureWarning)): + assert atom.idx == 1 + + +def test_pickle_protein(): + """Pickle proteins""" + # Proteins + rec = next(oddt.toolkit.readfile('pdb', xiap_receptor)) + # generate atom_dict + assert rec.atom_dict is not None + + assert rec._atom_dict is not None + pickled_rec = loads(dumps(rec)) + assert pickled_rec.protein is False + assert pickled_rec._atom_dict is not None + + rec.protein = True + # setting protein property should clean atom_dict cache + assert rec._atom_dict is None + # generate atom_dict + assert rec.atom_dict is not None + + pickled_rec = loads(dumps(rec)) + assert pickled_rec.protein is True + assert pickled_rec._atom_dict is not None + + +if oddt.toolkit.backend == 'rdk': + def test_badmol(): + """Propagate None's for bad molecules""" + mol = oddt.toolkit.readstring('smi', 'c1cc2') + assert mol is None + + +def test_dicts(): + """Test ODDT numpy structures, aka. dicts""" + mols = list(oddt.toolkit.readfile('sdf', xiap_actives)) + list(map(lambda x: x.addh(only_polar=True), mols)) + + skip_cols = ['radius', 'charge', 'id', + # following fields need to be standarized + 'hybridization', + 'numhs', + 'formalcharge', + ] + all_cols = [name for name in mols[0].atom_dict.dtype.names + if name not in ['coords', 'neighbors', 'neighbors_id']] + common_cols = [name for name in all_cols if name not in skip_cols] + + # Small molecules + all_dicts = np.hstack([mol.atom_dict for mol in mols]) + all_dicts = all_dicts[all_dicts['atomicnum'] != 1] + + data = pd.DataFrame({name: all_dicts[name] for name in all_cols}) + data['mol_idx'] = [i + for i, mol in enumerate(mols) + for atom in mol + if atom.atomicnum != 1] + + # Save correct results + # data[common_cols].to_csv( + # os.path.join(test_data_dir, 'data/results/xiap/mols_atom_dict.csv'), + # index=False) + + corr_data = pd.read_csv(os.path.join(test_data_dir, 'data', 'results', + 'xiap', 'mols_atom_dict.csv') + ).fillna('') + + for name in common_cols: + if issubclass(np.dtype(data[name].dtype).type, np.number): + mask = data[name] - corr_data[name] > 1e-6 + for i in np.argwhere(mask.values): + print(i, data[name][i].values, corr_data[name][i].values, + mols[data['mol_idx'][int(i)]].write('smi')) + assert_array_almost_equal( + data[name], + corr_data[name], + err_msg='Mols atom_dict\'s collumn: "%s" is not equal' % name) + else: + mask = data[name] != corr_data[name] + for i in np.argwhere(mask.values): + print(i, data[name][i].values, corr_data[name][i].values, + mols[data['mol_idx'][int(i)]].write('smi')) + assert_array_equal( + data[name], + corr_data[name], + err_msg='Mols atom_dict\'s collumn: "%s" is not equal' % name) + + # Protein + rec = next(oddt.toolkit.readfile('pdb', xiap_receptor)) + rec.protein = True + rec.addh(only_polar=True) + + skip_cols = ['radius', 'charge', 'resid', 'id', + # following fields need to be standarized + 'hybridization', + 'numhs', + 'formalcharge', + ] + common_cols = [name for name in all_cols if name not in skip_cols] + + all_dicts = rec.atom_dict[rec.atom_dict['atomicnum'] != 1] + + data = pd.DataFrame({name: all_dicts[name] for name in all_cols}) + + # Save correct results + # data[common_cols].to_csv( + # os.path.join(test_data_dir, 'data/results/xiap/prot_atom_dict.csv'), + # index=False) + + corr_data = pd.read_csv(os.path.join(test_data_dir, 'data', 'results', + 'xiap', 'prot_atom_dict.csv') + ).fillna('') + + for name in common_cols: + if issubclass(np.dtype(data[name].dtype).type, np.number): + mask = data[name] - corr_data[name] > 1e-6 + for i in np.argwhere(mask.values): + print(i, + data['atomtype'][i].values, + data['resname'][i].values, + data[name][i].values, + corr_data[name][i].values) + assert_array_almost_equal( + data[name], + corr_data[name], + err_msg='Protein atom_dict\'s collumn: "%s" is not equal' % name) + else: + mask = data[name] != corr_data[name] + for i in np.argwhere(mask.values): + print(i, + data['atomtype'][i].values, + data['resname'][i].values, + data[name][i].values, + corr_data[name][i].values) + assert_array_equal( + data[name], + corr_data[name], + err_msg='Protein atom_dict\'s collumn: "%s" is not equal' % name) + + +def test_ss(): + """Secondary structure assignment""" + # Alpha Helix + prot_file = os.path.join(test_data_dir, 'data', 'pdb', '1cos_helix.pdb') + protein = next(oddt.toolkit.readfile('pdb', prot_file)) + protein.protein = True + + # print(protein.res_dict['resname']) + # print(protein.res_dict['isalpha']) + # print(protein.res_dict['isbeta']) + + isalpha = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, 26] + + assert len(protein.res_dict) == 29 + assert_array_equal(np.where(protein.res_dict['isalpha'])[0], isalpha) + assert protein.res_dict['isalpha'].sum() == 27 + assert protein.res_dict['isbeta'].sum() == 0 + + # Beta Sheet + prot_file = os.path.join(test_data_dir, 'data', 'pdb', '1icl_sheet.pdb') + protein = next(oddt.toolkit.readfile('pdb', prot_file)) + protein.protein = True + + # print(protein.res_dict['resname']) + # print(protein.res_dict['isalpha']) + # print(protein.res_dict['isbeta']) + # print(protein.res_dict['isbeta']) + # for mask_group in np.split(np.argwhere(protein.res_dict['isbeta']).flatten(), + # np.argwhere(np.diff(np.argwhere(protein.res_dict['isbeta']).flatten()) != 1).flatten() + 1): + # print(mask_group + 1, protein.res_dict[mask_group]['resname']) + + isbeta = [2, 3, 4, 5, 10, 11, 12, 13] + + assert len(protein.res_dict) == 29 + assert_array_equal(np.where(protein.res_dict['isbeta'])[0], isbeta) + assert protein.res_dict['isbeta'].sum() == 8 + assert protein.res_dict['isalpha'].sum() == 0 + + # Protein test + protein = next(oddt.toolkit.readfile('pdb', xiap_receptor)) + protein.protein = True + + # print(protein.res_dict['resname']) + # print(protein.res_dict['isalpha']) + # for mask_group in np.split(np.argwhere(protein.res_dict['isalpha']).flatten(), + # np.argwhere(np.diff(np.argwhere(protein.res_dict['isalpha']).flatten()) != 1).flatten() + 1): + # print(mask_group + 1, protein.res_dict[mask_group]['resname']) + + # print(protein.res_dict['isbeta']) + # for mask_group in np.split(np.argwhere(protein.res_dict['isbeta']).flatten(), + # np.argwhere(np.diff(np.argwhere(protein.res_dict['isbeta']).flatten()) != 1).flatten() + 1): + # print(mask_group + 1, protein.res_dict[mask_group]['resname']) + + isalpha = [15, 16, 17, 18, 19, 20, 28, 29, 30, 31, 32, 33, 63, 64, 65, 66, + 67, 68, 69, 70, 75, 76, 77, 78, 79, 80, 83, 84, 85, 86, 87, 88, + 89, 90, 91, 121, 122, 123, 124, 125, 126, 127, 128] + isbeta = [36, 37, 38, 45, 46, 47, 52, 53, 54] + + assert_array_equal(np.where(protein.res_dict['isalpha'])[0], isalpha) + assert_array_equal(np.where(protein.res_dict['isbeta'])[0], isbeta) + assert len(protein.res_dict) == 136 + assert protein.res_dict['isalpha'].sum() == 43 + assert protein.res_dict['isbeta'].sum() == 9 + assert (protein.res_dict['isalpha'] & + protein.res_dict['isbeta']).sum() == 0 # Must be zero! + assert (~protein.res_dict['isalpha'] & + ~protein.res_dict['isbeta']).sum() == 84 + + +def test_pdbqt(): + """RDKit PDBQT writer and reader""" + mol = next(oddt.toolkit.readfile('sdf', xiap_actives)) + mol2 = oddt.toolkit.readstring('pdbqt', mol.write('pdbqt')) + assert mol.title == mol2.title + + # test loop breaks in DFS algorithm + mol = oddt.toolkit.readstring('smi', 'CCc1cc(C)c(C)cc1-c1ccc(-c2cccc(C)c2)cc1') + mol.make3D() + + # roundtrip molecule with template + mol2 = oddt.toolkit.readstring('pdbqt', mol.write('pdbqt')) + mol.removeh() + + assert len(mol.atoms) == len(mol2.atoms) + + def nodes_size(block): + out = OrderedDict() + current_key = None + for line in block.split('\n'): + if line[:4] == 'ROOT' or line[:6] == 'BRANCH': + current_key = line.strip() + out[current_key] = 0 + elif line[:4] == 'ATOM': + out[current_key] += 1 + return list(out.values()) + + # check the branch order and size + if oddt.toolkit.backend == 'ob': + assert_array_equal(nodes_size(mol.write('pdbqt')), + [6, 8, 2, 7]) + else: + assert_array_equal(nodes_size(mol.write('pdbqt')), + [8, 6, 7, 2]) + ligand_file = os.path.join(test_data_dir, 'data', 'dude', 'xiap', + 'crystal_ligand.sdf') + mol = next(oddt.toolkit.readfile('sdf', ligand_file)) + assert_array_equal(nodes_size(mol.write('pdbqt')), + [8, 3, 6, 6, 1, 6, 3, 2, 2]) + + # roundtrip a disconnected fragments + mol = oddt.toolkit.readstring('smi', 'c1ccccc1.c1ccccc1C') + if oddt.toolkit.backend == 'ob': + kwargs = {'opt': {'r': None}} + else: + kwargs = {'flexible': False} + + mol2 = oddt.toolkit.readstring('pdbqt', mol.write('pdbqt', **kwargs)) + assert len(mol.atoms) == len(mol2.atoms) + + mol2 = oddt.toolkit.readstring('pdbqt', mol.write('pdbqt')) + assert len(mol.atoms) == len(mol2.atoms) + + +def test_residue_info(): + """Residue properties""" + mol_file = os.path.join(test_data_dir, 'data', 'pdb', '3kwa_5Apocket.pdb') + mol = next(oddt.toolkit.readfile('pdb', mol_file)) + assert len(mol.residues) == 19 + + res = mol.residues[0] + assert res.idx0 == 0 + assert res.number == 92 + assert res.chain == 'A' + assert res.name == 'GLN' + + +def test_canonize_ring_path(): + """Test canonic paths""" + path0 = list(range(6)) + path = deque(path0) + path.rotate(3) + + assert canonize_ring_path(path) == path0 + path.reverse() + assert canonize_ring_path(path) == path0 + + with pytest.raises(ValueError): + canonize_ring_path(tuple(range(6)))