"""Tests for ``oddt.pandas``: ChemDataFrame/ChemSeries readers and writers."""
import os
from tempfile import NamedTemporaryFile

from numpy.testing import assert_array_equal
import pandas as pd

import oddt
import oddt.pandas as opd

test_data_dir = os.path.dirname(os.path.abspath(__file__))
input_fname = os.path.join(test_data_dir, 'data/dude/xiap/actives_docked.sdf')


def test_classes():
    """ Test oddt.pandas classes behavior """
    df = opd.read_sdf(input_fname)

    # Check classes inheritance
    assert isinstance(df, opd.ChemDataFrame)
    assert isinstance(df, pd.DataFrame)
    assert isinstance(df['mol'], opd.ChemSeries)
    assert isinstance(df['mol'], pd.Series)

    # Check custom metadata
    assert hasattr(df, '_molecule_column')
    assert hasattr(df[['mol']], '_molecule_column')
    assert df._molecule_column == df[['mol']]._molecule_column

    # Check if slicing preserves classes
    assert isinstance(df.head(1), opd.ChemDataFrame)
    assert isinstance(df['mol'].head(1), opd.ChemSeries)


def test_reading():
    """ Test reading molecule files to ChemDataFrame """
    df = opd.read_sdf(input_fname)

    # Check dimensions
    assert len(df) == 100
    assert len(df.columns) == 15

    df = opd.read_sdf(input_fname, smiles_column='smi_col')
    assert 'smi_col' in df.columns

    df = opd.read_sdf(input_fname,
                      molecule_column=None,
                      molecule_name_column=None,
                      usecols=['name'])
    assert 'mol' not in df.columns
    assert 'mol_name' not in df.columns
    assert len(df.columns) == 1

    df = opd.read_sdf(input_fname,
                      usecols=['name', 'uniprot_id', 'act'])
    assert len(df.columns) == 5  # 3 from usecols + 1 'mol' + 1 'mol_name'
    assert 'uniprot_id' in df.columns
    assert 'smi_col' not in df.columns

    # Chunk reading
    chunks = []
    for chunk in opd.read_sdf(input_fname, chunksize=10):
        assert len(chunk) == 10
        chunks.append(chunk)
    assert len(chunks) == 10
    df = pd.concat(chunks)

    # Check dimensions
    assert len(df) == 100


def test_substruct_sim_search():
    """Substructure comparison operators and fingerprint similarity"""
    df = opd.read_sdf(input_fname).head(10)
    query = oddt.toolkit.readstring(
        'smi',
        'C(=O)(N1C[C@H](C[C@H]1C(=O)N[C@@H]1CCCc2c1cccc2)Oc1ccccc1)'
        '[C@@H](NC(=O)[C@H](C)NC)C1CCCCC1')

    # `>=` and the reflected `<=` must give the same per-molecule answer
    ge_answer = [True, True, True, False, True,
                 False, False, False, False, False]
    assert (df.mol >= query).tolist() == ge_answer
    assert (query <= df.mol).tolist() == ge_answer

    # `<=` and the reflected `>=` must give the same per-molecule answer
    le_answer = [True, True, True, True, True,
                 True, False, False, False, True]
    assert (df.mol <= query).tolist() == le_answer
    assert (query >= df.mol).tolist() == le_answer

    # `|` between fingerprints yields a float similarity series
    sim = df.mol.calcfp() | query.calcfp()
    assert sim.dtype == 'float64'


def test_mol2():
    """Writing and reading of mol2 files to/from ChemDataFrame"""
    # The mol2 round trip is only exercised on the 'ob' backend
    if oddt.toolkit.backend != 'ob':
        return

    df = opd.read_sdf(input_fname)
    with NamedTemporaryFile(suffix='.mol2') as f:
        df.to_mol2(f.name)
        df2 = opd.read_mol2(f.name)
        assert df.shape == df2.shape

        # Chunked reading must reassemble to the same shape
        chunks = []
        for chunk in opd.read_mol2(f.name, chunksize=10):
            assert len(chunk) == 10
            chunks.append(chunk)
        df3 = pd.concat(chunks)
        assert df.shape == df3.shape

    with NamedTemporaryFile(suffix='.mol2') as f:
        df.to_mol2(f.name, columns=['name', 'uniprot_id', 'act'])
        df2 = opd.read_mol2(f.name)
        assert len(df2.columns) == 5


def test_sdf():
    """Writing ChemDataFrame to SDF molecular files"""
    df = opd.read_sdf(input_fname)
    with NamedTemporaryFile(suffix='.sdf') as f:
        df.to_sdf(f.name)
        df2 = opd.read_sdf(f.name)
    assert_array_equal(df.columns.sort_values(), df2.columns.sort_values())

    with NamedTemporaryFile(suffix='.sdf') as f:
        df.to_sdf(f.name, columns=['name', 'uniprot_id', 'act'])
        df2 = opd.read_sdf(f.name)
    assert len(df2.columns) == 5


def test_csv():
    """Writing ChemDataFrame to CSV (buffer and path) and reading it back"""
    df = opd.read_sdf(input_fname,
                      columns=['mol', 'name', 'chembl_id', 'dude_smiles',
                               'act'])
    df['act'] = df['act'].astype(float)
    df['name'] = df['name'].astype(int)

    with NamedTemporaryFile(suffix='.csv', mode='w+') as f:
        # Exercise both an open file object and a plain path as the target
        for str_buff in (f, f.name):
            df.to_csv(str_buff, index=False)
            f.seek(0)
            df2 = opd.read_csv(f.name, smiles_to_molecule='mol',
                               molecule_column='mol')
            assert df.shape == df2.shape
            assert df.columns.tolist() == df2.columns.tolist()
            assert df.dtypes.tolist() == df2.dtypes.tolist()

    with NamedTemporaryFile(suffix='.csv', mode='w+') as f:
        for str_buff in (f, f.name):
            df.to_csv(str_buff, index=False, columns=['name', 'act'])
            f.seek(0)
            df2 = pd.read_csv(f.name)
            subset = df[['name', 'act']]
            assert subset.shape == df2.shape
            assert subset.columns.tolist() == df2.columns.tolist()
            assert subset.dtypes.tolist() == df2.dtypes.tolist()


def test_excel():
    """Writing ChemDataFrame to Excel; only checks that it does not fail"""
    df = opd.read_sdf(input_fname,
                      columns=['mol', 'name', 'chembl_id', 'dude_smiles',
                               'act'])
    df = df.head(10)  # it's slow so use first 10 mols
    df['act'] = df['act'].astype(float)
    df['name'] = df['name'].astype(int)
    with NamedTemporaryFile(suffix='.xls', mode='w') as f:
        df.to_excel(f.name, index=False)
        # Use the writer as a context manager so the workbook is actually
        # written and the underlying handle is released before the
        # temporary file is removed.
        with pd.ExcelWriter(f.name, engine='xlsxwriter') as writer:
            df.to_excel(writer, index=False)


def test_chemseries_writers():
    """Writing ChemSeries to SMILES, SDF and (on 'ob' backend) mol2 files"""
    df = opd.read_sdf(input_fname,
                      columns=['mol', 'name', 'chembl_id', 'dude_smiles',
                               'act'])

    mols = df['mol']

    # (file suffix, toolkit format name, ChemSeries writer method)
    writers = [
        ('.ism', 'smi', mols.to_smiles),
        ('.sdf', 'sdf', mols.to_sdf),
    ]
    if oddt.toolkit.backend == 'ob':
        writers.append(('.mol2', 'mol2', mols.to_mol2))

    for suffix, fmt, write in writers:
        with NamedTemporaryFile(suffix=suffix, mode='w') as f:
            write(f)
            # The file is re-read by name below, so push buffered output to
            # disk first; otherwise the reader may see a truncated file.
            f.flush()
            for mol in oddt.toolkit.readfile(fmt, f.name):
                assert isinstance(mol, oddt.toolkit.Molecule)


def test_ipython():
    """iPython Notebook molecule rendering in PNG and SVG"""
    df = opd.read_sdf(input_fname)
    # Remember the image backend (if the toolkit exposes one) so that this
    # test does not leak its setting into other tests.
    previous_backend = getattr(oddt.toolkit, 'image_backend', None)
    # mock ipython
    oddt.toolkit.ipython_notebook = True
    try:
        # png
        oddt.toolkit.image_backend = 'png'
        html = df.head(1).to_html()
        assert '<img src="data:image/png;base64,' in html
        # svg
        oddt.toolkit.image_backend = 'svg'
        html = df.head(1).to_html()
        assert '<svg' in html
    finally:
        # Always undo the global toolkit changes, even on assertion failure
        oddt.toolkit.ipython_notebook = False
        if previous_backend is not None:
            oddt.toolkit.image_backend = previous_backend