oddt / Git / [3b722e] /tests/test

Models:
Amanda-D/
oddt
Downloads: 1
[3b722e]: / tests / test_toolkit.py
History
Download this file
538 lines (442 with data), 21.6 kB

import os
from collections import OrderedDict, deque

from six.moves.cPickle import loads, dumps
import numpy as np
import pandas as pd

import pytest
from numpy.testing import assert_array_equal, assert_array_almost_equal

import oddt
from oddt.spatial import rmsd
from oddt.toolkits.common import canonize_ring_path

test_data_dir = os.path.dirname(os.path.abspath(__file__))
xiap_receptor = os.path.join(test_data_dir, 'data', 'dude', 'xiap',
                             'receptor_rdkit.pdb')
xiap_actives = os.path.join(test_data_dir, 'data', 'dude', 'xiap',
                            'actives_docked.sdf')


def test_mol():
    """Test common molecule operations"""
    # Hydrogen manipulation in small molecules
    mol = oddt.toolkit.readstring('smi', 'c1ccccc1O')
    assert len(mol.atoms) == 7
    mol.addh()
    assert len(mol.atoms) == 13
    mol.removeh()
    mol.addh(only_polar=True)
    assert len(mol.atoms) == 8
    mol.removeh()
    assert len(mol.atoms) == 7

    # Hydrogen manipulation in proteins
    protein = next(oddt.toolkit.readfile('pdb', xiap_receptor))
    protein.protein = True

    res_atoms_n = [6, 10, 8, 8, 7, 11, 8, 7, 6, 8, 5, 8, 12, 9, 5, 11, 8,
                   11, 7, 11, 4, 7, 14, 8, 12, 6, 7, 8, 9, 9, 9, 8, 5, 11,
                   5, 4, 11, 12, 5, 8, 4, 9, 4, 8, 9, 7, 9, 6, 11, 10, 6,
                   4, 4, 4, 8, 7, 8, 14, 9, 7, 6, 9, 8, 7, 14, 9, 9, 10, 5,
                   9, 14, 12, 7, 4, 6, 9, 12, 8, 8, 9, 9, 9, 4, 9, 9, 12,
                   8, 8, 8, 8, 10, 8, 7, 10, 11, 12, 6, 7, 8, 11, 8, 9, 4,
                   8, 9, 7, 9, 6, 6, 4, 4, 4, 8, 7, 8, 14, 9, 7, 6, 9, 8,
                   7, 14, 9, 9, 10, 5, 9, 14, 12, 7, 4, 8, 10, 8, 7, 1, 1]
    res_atoms_n_addh = [12, 17, 17, 19, 14, 23, 14, 14, 11, 17, 10, 13, 21,
                        16, 10, 23, 19, 20, 14, 20, 7, 14, 24, 19, 21, 11,
                        16, 14, 21, 16, 17, 19, 10, 23, 10, 7, 20, 21, 10,
                        19, 7, 16, 7, 13, 21, 16, 21, 10, 20, 17, 10, 7, 7,
                        7, 19, 14, 13, 24, 21, 14, 11, 16, 13, 14, 24, 16,
                        17, 16, 10, 21, 24, 21, 14, 7, 10, 21, 21, 19, 19,
                        16, 17, 21, 7, 17, 16, 21, 19, 14, 14, 19, 17, 19,
                        14, 18, 25, 22, 11, 17, 21, 22, 21, 17, 7, 13, 21,
                        16, 21, 11, 11, 7, 7, 7, 19, 14, 13, 24, 21, 14,
                        11, 16, 13, 14, 24, 16, 17, 16, 10, 21, 24, 21, 14,
                        8, 20, 17, 19, 15, 1, 1]
    res_atoms_n_polarh = [9, 12, 9, 9, 7, 16, 11, 7, 8, 9, 6, 10, 14, 11,
                          6, 16, 9, 12, 9, 12, 5, 9, 16, 9, 14, 8, 8, 11,
                          12, 11, 12, 9, 6, 16, 6, 5, 12, 14, 6, 9, 5, 11,
                          5, 10, 12, 8, 12, 7, 12, 12, 7, 5, 5, 5, 9, 9,
                          10, 16, 12, 7, 8, 11, 10, 7, 16, 11, 12, 11, 6,
                          12, 16, 14, 7, 5, 7, 12, 14, 9, 9, 11, 12, 12, 5,
                          12, 11, 14, 9, 11, 11, 9, 12, 9, 9, 12, 17, 15,
                          8, 8, 10, 13, 10, 12, 5, 10, 12, 8, 12, 7, 8, 5,
                          5, 5, 9, 9, 10, 16, 12, 7, 8, 11, 10, 7, 16, 11,
                          12, 11, 6, 12, 16, 14, 7, 5, 10, 12, 9, 9, 1, 1]
    assert len(protein.atoms) == 1114
    assert len(protein.residues) == 138
    assert_array_equal([len(res.atoms) for res in protein.residues],
                       res_atoms_n)

    protein.addh()
    assert len(protein.atoms) == 2170
    assert len(protein.residues) == 138
    assert_array_equal([len(res.atoms) for res in protein.residues],
                       res_atoms_n_addh)

    protein.removeh()
    protein.addh(only_polar=True)
    assert len(protein.atoms) == 1356
    assert len(protein.residues) == 138
    assert_array_equal([len(res.atoms) for res in protein.residues],
                       res_atoms_n_polarh)

    protein.removeh()
    assert len(protein.atoms) == 1114
    assert len(protein.residues) == 138
    assert_array_equal([len(res.atoms) for res in protein.residues],
                       res_atoms_n)


def test_mol_calccharges():
    mol = oddt.toolkit.readstring('smi', 'c1ccccc1O')
    mol.addh()

    with pytest.raises(ValueError):
        mol.calccharges('mmff94aaaaaa')

    for m in ['gasteiger', 'mmff94']:
        mol.calccharges(m)
        assert (np.array(mol.charges) != 0.).any()

    protein = next(oddt.toolkit.readfile('pdb', xiap_receptor))
    protein.protein = True

    # for that protein mmff94 charges could not be generated
    with pytest.raises(Exception):
        protein.calccharges('mmff94')


def test_toolkit_hoh():
    """HOH residues splitting"""
    pdb_block = """ATOM      1  C1  GLY     1       0.000   0.000   0.000  1.00  0.00           C
ATOM      2  C2  GLY     1       0.000   0.000   0.000  1.00  0.00           C
ATOM      3  O1  GLY     1       0.000   0.000   0.000  1.00  0.00           O
ATOM      4  O2  GLY     1       0.000   0.000   0.000  1.00  0.00           O
ATOM      5  N1  GLY     1       0.000   0.000   0.000  1.00  0.00           N
ATOM      6  O3  HOH     2       0.000   0.000   0.000  1.00  0.00           O
ATOM      7  O4  HOH     3       0.000   0.000   0.000  1.00  0.00           O
ATOM      8  O5  HOH     4       0.000   0.000   0.000  1.00  0.00           O
"""
    protein = oddt.toolkit.readstring('pdb', pdb_block)
    protein.protein = True
    assert len(protein.residues) == 4

    protein.addh(only_polar=True)
    assert len(protein.residues) == 4

    protein.addh()
    assert len(protein.residues) == 4


def test_pickle():
    """Pickle molecules"""
    mols = list(oddt.toolkit.readfile('sdf', xiap_actives))
    pickled_mols = list(map(lambda x: loads(dumps(x)), mols))

    assert_array_equal(list(map(lambda x: x.title, mols)),
                       list(map(lambda x: x.title, pickled_mols)))

    assert_array_equal(list(map(lambda x: x.smiles, mols)),
                       list(map(lambda x: x.smiles, pickled_mols)))

    for mol, pickled_mol in zip(mols, pickled_mols):
        assert dict(mol.data) == dict(pickled_mol.data)

    # Test pickling of atom_dicts
    assert_array_equal(list(map(lambda x: x._atom_dict is None, mols)),
                       [True] * len(mols))
    mols_atom_dict = np.hstack(list(map(lambda x: x.atom_dict, mols)))
    assert_array_equal(list(map(lambda x: x._atom_dict is not None, mols)),
                       [True] * len(mols))
    pickled_mols = list(map(lambda x: loads(dumps(x)), mols))
    assert_array_equal(list(map(lambda x: x._atom_dict is not None, pickled_mols)),
                       [True] * len(mols))
    pickled_mols_atom_dict = np.hstack(list(map(lambda x: x._atom_dict, pickled_mols)))
    for name in mols[0].atom_dict.dtype.names:
        if issubclass(np.dtype(mols_atom_dict[name].dtype).type, np.number):
            assert_array_almost_equal(mols_atom_dict[name],
                                      pickled_mols_atom_dict[name])
        else:
            assert_array_equal(mols_atom_dict[name],
                               pickled_mols_atom_dict[name])

    # Lazy Mols
    mols = list(oddt.toolkit.readfile('sdf', xiap_actives, lazy=True))
    pickled_mols = list(map(lambda x: loads(dumps(x)), mols))

    assert_array_equal(list(map(lambda x: x._source is not None, pickled_mols)),
                       [True] * len(mols))

    assert_array_equal(list(map(lambda x: x.title, mols)),
                       list(map(lambda x: x.title, pickled_mols)))

    assert_array_equal(list(map(lambda x: x.smiles, mols)),
                       list(map(lambda x: x.smiles, pickled_mols)))

    for mol, pickled_mol in zip(mols, pickled_mols):
        assert dict(mol.data) == dict(pickled_mol.data)


def test_diverse_conformers():
    # FIXME: make toolkit a module so we can import from it
    diverse_conformers_generator = oddt.toolkit.diverse_conformers_generator

    mol = oddt.toolkit.readstring(
        'smi',
        'CN1CCN(S(=O)(C2=CC=C(OCC)C(C3=NC4=C(N(C)N=C4CCC)C(N3)=O)=C2)=O)CC1'
    )
    mol.make3D()

    res = []
    for conf in diverse_conformers_generator(mol, seed=123456):
        res.append(rmsd(mol, conf))

    assert len(res) == 10
    if oddt.toolkit.backend == 'ob':
        if oddt.toolkit.__version__ < '0.3':
            assert_array_almost_equal(res, [0., 3.043712, 3.897143, 3.289482,
                                            3.066374, 2.909683, 2.913927,
                                            3.488244, 3.70603, 3.597467])
        else:
            assert_array_almost_equal(res, [0.0, 1.372770, 2.489789, 2.759941,
                                            2.968366, 3.228773, 3.392191,
                                            3.921166, 3.185065, 3.283915])
    # else:
    #     if oddt.toolkit.__version__ > '2016.03.9':
    #         assert_array_almost_equal(res, [1.237538, 2.346984, 0.900624,
    #                                         3.469511, 1.886213, 2.128909,
    #                                         2.852608, 1.312513, 1.291595,
    #                                         1.326843])
    #     else:
    #         assert_array_almost_equal(res, [3.08995, 2.846358, 3.021795,
    #                                         1.720319, 2.741972, 2.965332,
    #                                         2.925344, 2.930157, 2.934049,
    #                                         3.009545])

    # check all implemented methods
    if oddt.toolkit.backend == 'ob':
        methods = ['ga', 'confab']
    else:
        methods = ['dg', 'etkdg', 'kdg', 'etdg']
    for method in methods:
        assert len(diverse_conformers_generator(mol,
                                                seed=123456,
                                                n_conf=5,
                                                method=method)) == 5
        assert len(diverse_conformers_generator(mol,
                                                seed=123456,
                                                n_conf=10,
                                                method=method)) == 10
        assert len(diverse_conformers_generator(mol,
                                                seed=123456,
                                                n_conf=20,
                                                method=method)) == 20


def test_indices():
    """Test 0 and 1 based atom indices"""
    mol = oddt.toolkit.readstring('smi', 'CCc1cc(C)c(C)cc1-c1ccc(-c2cccc(C)c2)cc1')
    atom = mol.atoms[0]

    assert atom.idx0 == 0
    assert atom.idx1 == 1

    # the unmarked index is deprecated in ODDT
    with pytest.warns((DeprecationWarning, FutureWarning)):
        assert atom.idx == 1


def test_pickle_protein():
    """Pickle proteins"""
    # Proteins
    rec = next(oddt.toolkit.readfile('pdb', xiap_receptor))
    # generate atom_dict
    assert rec.atom_dict is not None

    assert rec._atom_dict is not None
    pickled_rec = loads(dumps(rec))
    assert pickled_rec.protein is False
    assert pickled_rec._atom_dict is not None

    rec.protein = True
    # setting protein property should clean atom_dict cache
    assert rec._atom_dict is None
    # generate atom_dict
    assert rec.atom_dict is not None

    pickled_rec = loads(dumps(rec))
    assert pickled_rec.protein is True
    assert pickled_rec._atom_dict is not None


if oddt.toolkit.backend == 'rdk':
    def test_badmol():
        """Propagate None's for bad molecules"""
        mol = oddt.toolkit.readstring('smi', 'c1cc2')
        assert mol is None


def test_dicts():
    """Test ODDT numpy structures, aka. dicts"""
    mols = list(oddt.toolkit.readfile('sdf', xiap_actives))
    list(map(lambda x: x.addh(only_polar=True), mols))

    skip_cols = ['radius', 'charge', 'id',
                 # following fields need to be standarized
                 'hybridization',
                 'numhs',
                 'formalcharge',
                 ]
    all_cols = [name for name in mols[0].atom_dict.dtype.names
                if name not in ['coords', 'neighbors', 'neighbors_id']]
    common_cols = [name for name in all_cols if name not in skip_cols]

    # Small molecules
    all_dicts = np.hstack([mol.atom_dict for mol in mols])
    all_dicts = all_dicts[all_dicts['atomicnum'] != 1]

    data = pd.DataFrame({name: all_dicts[name] for name in all_cols})
    data['mol_idx'] = [i
                       for i, mol in enumerate(mols)
                       for atom in mol
                       if atom.atomicnum != 1]

    # Save correct results
    # data[common_cols].to_csv(
    #     os.path.join(test_data_dir, 'data/results/xiap/mols_atom_dict.csv'),
    #     index=False)

    corr_data = pd.read_csv(os.path.join(test_data_dir, 'data', 'results',
                                         'xiap', 'mols_atom_dict.csv')
                            ).fillna('')

    for name in common_cols:
        if issubclass(np.dtype(data[name].dtype).type, np.number):
            mask = data[name] - corr_data[name] > 1e-6
            for i in np.argwhere(mask.values):
                print(i, data[name][i].values, corr_data[name][i].values,
                      mols[data['mol_idx'][int(i)]].write('smi'))
            assert_array_almost_equal(
                data[name],
                corr_data[name],
                err_msg='Mols atom_dict\'s collumn: "%s" is not equal' % name)
        else:
            mask = data[name] != corr_data[name]
            for i in np.argwhere(mask.values):
                print(i, data[name][i].values, corr_data[name][i].values,
                      mols[data['mol_idx'][int(i)]].write('smi'))
            assert_array_equal(
                data[name],
                corr_data[name],
                err_msg='Mols atom_dict\'s collumn: "%s" is not equal' % name)

    # Protein
    rec = next(oddt.toolkit.readfile('pdb', xiap_receptor))
    rec.protein = True
    rec.addh(only_polar=True)

    skip_cols = ['radius', 'charge', 'resid', 'id',
                 # following fields need to be standarized
                 'hybridization',
                 'numhs',
                 'formalcharge',
                 ]
    common_cols = [name for name in all_cols if name not in skip_cols]

    all_dicts = rec.atom_dict[rec.atom_dict['atomicnum'] != 1]

    data = pd.DataFrame({name: all_dicts[name] for name in all_cols})

    # Save correct results
    # data[common_cols].to_csv(
    #     os.path.join(test_data_dir, 'data/results/xiap/prot_atom_dict.csv'),
    #     index=False)

    corr_data = pd.read_csv(os.path.join(test_data_dir, 'data', 'results',
                                         'xiap', 'prot_atom_dict.csv')
                            ).fillna('')

    for name in common_cols:
        if issubclass(np.dtype(data[name].dtype).type, np.number):
            mask = data[name] - corr_data[name] > 1e-6
            for i in np.argwhere(mask.values):
                print(i,
                      data['atomtype'][i].values,
                      data['resname'][i].values,
                      data[name][i].values,
                      corr_data[name][i].values)
            assert_array_almost_equal(
                data[name],
                corr_data[name],
                err_msg='Protein atom_dict\'s collumn: "%s" is not equal' % name)
        else:
            mask = data[name] != corr_data[name]
            for i in np.argwhere(mask.values):
                print(i,
                      data['atomtype'][i].values,
                      data['resname'][i].values,
                      data[name][i].values,
                      corr_data[name][i].values)
            assert_array_equal(
                data[name],
                corr_data[name],
                err_msg='Protein atom_dict\'s collumn: "%s" is not equal' % name)


def test_ss():
    """Secondary structure assignment"""
    # Alpha Helix
    prot_file = os.path.join(test_data_dir, 'data', 'pdb', '1cos_helix.pdb')
    protein = next(oddt.toolkit.readfile('pdb', prot_file))
    protein.protein = True

    # print(protein.res_dict['resname'])
    # print(protein.res_dict['isalpha'])
    # print(protein.res_dict['isbeta'])

    isalpha = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
               18, 19, 20, 21, 22, 23, 24, 25, 26]

    assert len(protein.res_dict) == 29
    assert_array_equal(np.where(protein.res_dict['isalpha'])[0], isalpha)
    assert protein.res_dict['isalpha'].sum() == 27
    assert protein.res_dict['isbeta'].sum() == 0

    # Beta Sheet
    prot_file = os.path.join(test_data_dir, 'data', 'pdb', '1icl_sheet.pdb')
    protein = next(oddt.toolkit.readfile('pdb', prot_file))
    protein.protein = True

    # print(protein.res_dict['resname'])
    # print(protein.res_dict['isalpha'])
    # print(protein.res_dict['isbeta'])
    # print(protein.res_dict['isbeta'])
    # for mask_group in np.split(np.argwhere(protein.res_dict['isbeta']).flatten(),
    #                            np.argwhere(np.diff(np.argwhere(protein.res_dict['isbeta']).flatten()) != 1).flatten() + 1):
    #         print(mask_group + 1, protein.res_dict[mask_group]['resname'])

    isbeta = [2, 3, 4, 5, 10, 11, 12, 13]

    assert len(protein.res_dict) == 29
    assert_array_equal(np.where(protein.res_dict['isbeta'])[0], isbeta)
    assert protein.res_dict['isbeta'].sum() == 8
    assert protein.res_dict['isalpha'].sum() == 0

    # Protein test
    protein = next(oddt.toolkit.readfile('pdb', xiap_receptor))
    protein.protein = True

    # print(protein.res_dict['resname'])
    # print(protein.res_dict['isalpha'])
    # for mask_group in np.split(np.argwhere(protein.res_dict['isalpha']).flatten(),
    #                            np.argwhere(np.diff(np.argwhere(protein.res_dict['isalpha']).flatten()) != 1).flatten() + 1):
    #         print(mask_group + 1, protein.res_dict[mask_group]['resname'])

    # print(protein.res_dict['isbeta'])
    # for mask_group in np.split(np.argwhere(protein.res_dict['isbeta']).flatten(),
    #                            np.argwhere(np.diff(np.argwhere(protein.res_dict['isbeta']).flatten()) != 1).flatten() + 1):
    #         print(mask_group + 1, protein.res_dict[mask_group]['resname'])

    isalpha = [15, 16, 17, 18, 19, 20, 28, 29, 30, 31, 32, 33, 63, 64, 65, 66,
               67, 68, 69, 70, 75, 76, 77, 78, 79, 80, 83, 84, 85, 86, 87, 88,
               89, 90, 91, 121, 122, 123, 124, 125, 126, 127, 128]
    isbeta = [36, 37, 38, 45, 46, 47, 52, 53, 54]

    assert_array_equal(np.where(protein.res_dict['isalpha'])[0], isalpha)
    assert_array_equal(np.where(protein.res_dict['isbeta'])[0], isbeta)
    assert len(protein.res_dict) == 136
    assert protein.res_dict['isalpha'].sum() == 43
    assert protein.res_dict['isbeta'].sum() == 9
    assert (protein.res_dict['isalpha'] &
            protein.res_dict['isbeta']).sum() == 0  # Must be zero!
    assert (~protein.res_dict['isalpha'] &
            ~protein.res_dict['isbeta']).sum() == 84


def test_pdbqt():
    """RDKit PDBQT writer and reader"""
    mol = next(oddt.toolkit.readfile('sdf', xiap_actives))
    mol2 = oddt.toolkit.readstring('pdbqt', mol.write('pdbqt'))
    assert mol.title == mol2.title

    # test loop breaks in DFS algorithm
    mol = oddt.toolkit.readstring('smi', 'CCc1cc(C)c(C)cc1-c1ccc(-c2cccc(C)c2)cc1')
    mol.make3D()

    # roundtrip molecule with template
    mol2 = oddt.toolkit.readstring('pdbqt', mol.write('pdbqt'))
    mol.removeh()

    assert len(mol.atoms) == len(mol2.atoms)

    def nodes_size(block):
        out = OrderedDict()
        current_key = None
        for line in block.split('\n'):
            if line[:4] == 'ROOT' or line[:6] == 'BRANCH':
                current_key = line.strip()
                out[current_key] = 0
            elif line[:4] == 'ATOM':
                out[current_key] += 1
        return list(out.values())

    # check the branch order and size
    if oddt.toolkit.backend == 'ob':
        assert_array_equal(nodes_size(mol.write('pdbqt')),
                           [6, 8, 2, 7])
    else:
        assert_array_equal(nodes_size(mol.write('pdbqt')),
                           [8, 6, 7, 2])
    ligand_file = os.path.join(test_data_dir, 'data', 'dude', 'xiap',
                               'crystal_ligand.sdf')
    mol = next(oddt.toolkit.readfile('sdf', ligand_file))
    assert_array_equal(nodes_size(mol.write('pdbqt')),
                       [8, 3, 6, 6, 1, 6, 3, 2, 2])

    # roundtrip a disconnected fragments
    mol = oddt.toolkit.readstring('smi', 'c1ccccc1.c1ccccc1C')
    if oddt.toolkit.backend == 'ob':
        kwargs = {'opt': {'r': None}}
    else:
        kwargs = {'flexible': False}

    mol2 = oddt.toolkit.readstring('pdbqt', mol.write('pdbqt', **kwargs))
    assert len(mol.atoms) == len(mol2.atoms)

    mol2 = oddt.toolkit.readstring('pdbqt', mol.write('pdbqt'))
    assert len(mol.atoms) == len(mol2.atoms)


def test_residue_info():
    """Residue properties"""
    mol_file = os.path.join(test_data_dir, 'data', 'pdb', '3kwa_5Apocket.pdb')
    mol = next(oddt.toolkit.readfile('pdb', mol_file))
    assert len(mol.residues) == 19

    res = mol.residues[0]
    assert res.idx0 == 0
    assert res.number == 92
    assert res.chain == 'A'
    assert res.name == 'GLN'


def test_canonize_ring_path():
    """Test canonic paths"""
    path0 = list(range(6))
    path = deque(path0)
    path.rotate(3)

    assert canonize_ring_path(path) == path0
    path.reverse()
    assert canonize_ring_path(path) == path0

    with pytest.raises(ValueError):
        canonize_ring_path(tuple(range(6)))