oddt / Git / [d6730e] /tests/test

Models:
Amanda-D/
oddt
Downloads: 1
[d6730e]: / tests / test_pandas.py
History
Download this file
199 lines (160 with data), 6.9 kB

import os
from tempfile import NamedTemporaryFile

from numpy.testing import assert_array_equal
import pandas as pd

import oddt
import oddt.pandas as opd

# Absolute directory of this test module; fixture paths are resolved against it.
test_data_dir = os.path.split(os.path.abspath(__file__))[0]
# Input SDF file shared by the tests in this module.
input_fname = os.path.join(
    test_data_dir,
    'data/dude/xiap/actives_docked.sdf'
)


def test_classes():
    """Test oddt.pandas classes behavior.

    Reads the input SDF file and checks that the frame and its 'mol' column
    are instances of both the oddt.pandas and the plain pandas classes, that
    the ``_molecule_column`` attribute is present and unchanged after
    selecting columns, and that ``head`` keeps the oddt.pandas classes.
    """
    df = opd.read_sdf(input_fname)

    # Check classes inheritance
    assert isinstance(df, opd.ChemDataFrame)
    assert isinstance(df, pd.DataFrame)
    assert isinstance(df['mol'], opd.ChemSeries)
    assert isinstance(df['mol'], pd.Series)

    # Check custom metadata
    assert hasattr(df, '_molecule_column')
    assert hasattr(df[['mol']], '_molecule_column')
    assert df._molecule_column == df[['mol']]._molecule_column

    # Check if slicing preserves classes
    assert isinstance(df.head(1), opd.ChemDataFrame)
    assert isinstance(df['mol'].head(1), opd.ChemSeries)


def test_reading():
    """Test reading molecule files to ChemDataFrame.

    Covers the default read, a custom SMILES column, disabling the molecule
    and molecule-name columns, restricting the data columns with ``usecols``
    and reading the file in chunks.
    """
    full = opd.read_sdf(input_fname)

    # Expected dimensions of the complete frame
    assert len(full) == 100
    assert len(full.columns) == 15

    with_smiles = opd.read_sdf(input_fname, smiles_column='smi_col')
    assert 'smi_col' in with_smiles.columns

    # No molecule and no molecule name column, a single data column
    names_only = opd.read_sdf(input_fname,
                              molecule_column=None,
                              molecule_name_column=None,
                              usecols=['name'])
    assert 'mol' not in names_only.columns
    assert 'mol_name' not in names_only.columns
    assert len(names_only.columns) == 1

    selected = ['name', 'uniprot_id', 'act']
    subset = opd.read_sdf(input_fname, usecols=selected)
    assert len(subset.columns) == 5  # 3 from use_cols + 1 'mol' + 1 'mol_name'
    assert 'uniprot_id' in subset.columns
    assert 'smi_col' not in subset.columns

    # Chunk reading: ten chunks of ten rows each
    chunks = list(opd.read_sdf(input_fname, chunksize=10))
    assert all(len(chunk) == 10 for chunk in chunks)
    assert len(chunks) == 10

    # The concatenated chunks have the dimensions of the full read
    merged = pd.concat(chunks)
    assert len(merged) == 100


def test_substruct_sim_search():
    """Test comparison and fingerprint operators on a molecule column.

    The ``>=`` / ``<=`` comparisons between the 'mol' column and a query
    molecule are checked against fixed boolean answers, in both operand
    orders, and the ``|`` operator between fingerprints is checked to give
    float64 values.
    """
    smiles = 'C(=O)(N1C[C@H](C[C@H]1C(=O)N[C@@H]1CCCc2c1cccc2)Oc1ccccc1)[C@@H](NC(=O)[C@H](C)NC)C1CCCCC1'
    query = oddt.toolkit.readstring('smi', smiles)
    mols = opd.read_sdf(input_fname).head(10).mol

    # column >= query, and the mirrored form query <= column
    expected_ge = [True, True, True, False, True,
                   False, False, False, False, False]
    assert (mols >= query).tolist() == expected_ge
    assert (query <= mols).tolist() == expected_ge

    # column <= query, and the mirrored form query >= column
    expected_le = [True, True, True, True, True,
                   True, False, False, False, True]
    assert (mols <= query).tolist() == expected_le
    assert (query >= mols).tolist() == expected_le

    similarity = mols.calcfp() | query.calcfp()
    assert similarity.dtype == 'float64'


def test_mol2():
    """Writing and reading of mol2 files to/from ChemDataFrame.

    The checks only run when the toolkit backend is 'ob'; with any other
    backend the test returns without asserting anything.
    """
    if oddt.toolkit.backend != 'ob':
        return

    source = opd.read_sdf(input_fname)

    # Whole frame: read back in one go and in chunks of ten
    with NamedTemporaryFile(suffix='.mol2') as tmp:
        source.to_mol2(tmp.name)
        whole = opd.read_mol2(tmp.name)
        assert source.shape == whole.shape

        pieces = list(opd.read_mol2(tmp.name, chunksize=10))
        assert all(len(piece) == 10 for piece in pieces)
        rejoined = pd.concat(pieces)
        assert source.shape == rejoined.shape

    # Selected data columns only
    with NamedTemporaryFile(suffix='.mol2') as tmp:
        source.to_mol2(tmp.name, columns=['name', 'uniprot_id', 'act'])
        partial = opd.read_mol2(tmp.name)
        assert len(partial.columns) == 5


def test_sdf():
    """Writing ChemDataFrame to SDF molecular files.

    Writes the frame to a temporary SDF file, reads it back and compares the
    sorted column labels; then repeats the write with three selected columns
    and checks the number of columns read back.
    """
    original = opd.read_sdf(input_fname)

    # All columns
    with NamedTemporaryFile(suffix='.sdf') as tmp:
        original.to_sdf(tmp.name)
        reloaded = opd.read_sdf(tmp.name)
    expected_columns = original.columns.sort_values()
    assert_array_equal(expected_columns, reloaded.columns.sort_values())

    # Selected columns only
    kept = ['name', 'uniprot_id', 'act']
    with NamedTemporaryFile(suffix='.sdf') as tmp:
        original.to_sdf(tmp.name, columns=kept)
        reloaded = opd.read_sdf(tmp.name)
    assert len(reloaded.columns) == 5


def test_csv():
    """Writing ChemDataFrame to CSV files and reading it back.

    Each write is done twice, once to the open temporary file handle and
    once to its file name. The file is then read by name and compared with
    the source data on shape, column labels and dtypes.
    """
    wanted = ['mol', 'name', 'chembl_id', 'dude_smiles', 'act']
    df = opd.read_sdf(input_fname, columns=wanted)
    df['act'] = df['act'].astype(float)
    df['name'] = df['name'].astype(int)

    # Full frame, read back through opd.read_csv with 'mol' as molecule column
    with NamedTemporaryFile(suffix='.csv', mode='w+') as tmp:
        for target in (tmp, tmp.name):
            df.to_csv(target, index=False)
            tmp.seek(0)
            restored = opd.read_csv(tmp.name, smiles_to_molecule='mol',
                                    molecule_column='mol')
            assert df.shape == restored.shape
            assert df.columns.tolist() == restored.columns.tolist()
            assert df.dtypes.tolist() == restored.dtypes.tolist()

    # Two plain columns, read back with pandas itself
    plain_cols = ['name', 'act']
    expected = df[plain_cols]
    with NamedTemporaryFile(suffix='.csv', mode='w+') as tmp:
        for target in (tmp, tmp.name):
            df.to_csv(target, index=False, columns=['name', 'act'])
            tmp.seek(0)
            restored = pd.read_csv(tmp.name)
            assert expected.shape == restored.shape
            assert expected.columns.tolist() == restored.columns.tolist()
            assert expected.dtypes.tolist() == restored.dtypes.tolist()


def test_excel():
    """Writing ChemDataFrame to Excel files.

    This is a smoke test: it only checks that ``to_excel`` does not raise,
    both when given a file name and when given an explicit
    ``pd.ExcelWriter`` using the 'xlsxwriter' engine.
    """
    # just check if it doesn't fail
    df = opd.read_sdf(input_fname,
                      columns=['mol', 'name', 'chembl_id', 'dude_smiles', 'act'])
    df = df.head(10)    # it's slow so use first 10 mols
    df['act'] = df['act'].astype(float)
    df['name'] = df['name'].astype(int)
    with NamedTemporaryFile(suffix='.xls', mode='w') as f:
        df.to_excel(f.name, index=False)
        # Use the writer as a context manager so it is always closed, and
        # closed while the temporary file still exists.
        with pd.ExcelWriter(f.name, engine='xlsxwriter') as writer:
            df.to_excel(writer, index=False)


def test_chemseries_writers():
    """Writing a ChemSeries of molecules to SMILES, SDF and mol2 files.

    Each writer is given an open temporary file handle; the file is then
    read back by name with the toolkit and every molecule read is checked
    to be a toolkit ``Molecule``. The mol2 part only runs with the 'ob'
    backend.
    """
    df = opd.read_sdf(input_fname,
                      columns=['mol', 'name', 'chembl_id', 'dude_smiles', 'act'])

    mols = df['mol']

    # SMILES
    with NamedTemporaryFile(suffix='.ism', mode='w') as f:
        mols.to_smiles(f)
        # The file is re-opened by name below, so push buffered data to disk
        f.flush()
        for mol in oddt.toolkit.readfile('smi', f.name):
            assert isinstance(mol, oddt.toolkit.Molecule)

    # SDF
    with NamedTemporaryFile(suffix='.sdf', mode='w') as f:
        mols.to_sdf(f)
        f.flush()  # make the written data visible to the reader below
        for mol in oddt.toolkit.readfile('sdf', f.name):
            assert isinstance(mol, oddt.toolkit.Molecule)

    # mol2
    if oddt.toolkit.backend == 'ob':
        with NamedTemporaryFile(suffix='.mol2', mode='w') as f:
            mols.to_mol2(f)
            f.flush()  # make the written data visible to the reader below
            for mol in oddt.toolkit.readfile('mol2', f.name):
                assert isinstance(mol, oddt.toolkit.Molecule)


def test_ipython():
    """iPython Notebook molecule rendering in PNG and SVG.

    Switches the toolkit into notebook mode, renders the first row to HTML
    with each image backend and checks for the matching markup. The toolkit
    settings changed here are put back afterwards, even when an assertion
    fails, so the mock does not leak into other tests.
    """
    df = opd.read_sdf(input_fname)
    # Remember the current settings; the fallbacks cover a toolkit that does
    # not define them yet (False matches what this test used to reset to).
    previous_notebook = getattr(oddt.toolkit, 'ipython_notebook', False)
    previous_backend = getattr(oddt.toolkit, 'image_backend', None)
    try:
        # mock ipython
        oddt.toolkit.ipython_notebook = True
        # png
        oddt.toolkit.image_backend = 'png'
        html = df.head(1).to_html()
        assert '<img src="data:image/png;base64,' in html
        # svg
        oddt.toolkit.image_backend = 'svg'
        html = df.head(1).to_html()
        assert '<svg' in html
    finally:
        oddt.toolkit.ipython_notebook = previous_notebook
        if previous_backend is not None:
            oddt.toolkit.image_backend = previous_backend