Diff of /tests/test_pandas.py [000000] .. [3b722e]

Switch to unified view

a b/tests/test_pandas.py
1
import os
2
from tempfile import NamedTemporaryFile
3
4
from numpy.testing import assert_array_equal
5
import pandas as pd
6
7
import oddt
8
import oddt.pandas as opd
9
10
test_data_dir = os.path.dirname(os.path.abspath(__file__))
11
input_fname = os.path.join(test_data_dir, 'data/dude/xiap/actives_docked.sdf')
12
13
14
def test_classes():
15
    """ Test oddt.pandas classes behavior """
16
    df = opd.read_sdf(input_fname)
17
18
    # Check classes inheritance
19
    assert isinstance(df, opd.ChemDataFrame)
20
    assert isinstance(df, pd.DataFrame)
21
    assert isinstance(df['mol'], opd.ChemSeries)
22
    assert isinstance(df['mol'], pd.Series)
23
    assert isinstance(df, pd.DataFrame)
24
25
    # Check custom metadata
26
    assert hasattr(df, '_molecule_column')
27
    assert hasattr(df[['mol']], '_molecule_column')
28
    assert df._molecule_column == df[['mol']]._molecule_column
29
30
    # Check if slicing perserve classes
31
    assert isinstance(df.head(1), opd.ChemDataFrame)
32
    assert isinstance(df['mol'].head(1), opd.ChemSeries)
33
34
35
def test_reading():
36
    """ Test reading molecule files to ChemDataFrame """
37
    df = opd.read_sdf(input_fname)
38
39
    # Check dimensions
40
    assert len(df) == 100
41
    assert len(df.columns) == 15
42
43
    df = opd.read_sdf(input_fname, smiles_column='smi_col')
44
    assert 'smi_col' in df.columns
45
46
    df = opd.read_sdf(input_fname,
47
                      molecule_column=None,
48
                      molecule_name_column=None,
49
                      usecols=['name'])
50
    assert 'mol' not in df.columns
51
    assert 'mol_name' not in df.columns
52
    assert len(df.columns) == 1
53
54
    df = opd.read_sdf(input_fname,
55
                      usecols=['name', 'uniprot_id', 'act'])
56
    assert len(df.columns) == 5  # 3 from use_cols + 1 'mol' + 1 'mol_name'
57
    assert 'uniprot_id' in df.columns
58
    assert 'smi_col' not in df.columns
59
60
    # Chunk reading
61
    chunks = []
62
    for chunk in opd.read_sdf(input_fname, chunksize=10):
63
        assert len(chunk) == 10
64
        chunks.append(chunk)
65
    assert len(chunks) == 10
66
    df = pd.concat(chunks)
67
68
    # Check dimensions
69
    assert len(df) == 100
70
71
72
def test_substruct_sim_search():
73
    df = opd.read_sdf(input_fname).head(10)
74
    query = oddt.toolkit.readstring('smi', 'C(=O)(N1C[C@H](C[C@H]1C(=O)N[C@@H]1CCCc2c1cccc2)Oc1ccccc1)[C@@H](NC(=O)[C@H](C)NC)C1CCCCC1')
75
76
    ge_answear = [True, True, True, False, True, False, False, False, False, False]
77
    assert (df.mol >= query).tolist() == ge_answear
78
    assert (query <= df.mol).tolist() == ge_answear
79
80
    le_answear = [True, True, True, True, True, True, False, False, False, True]
81
    assert (df.mol <= query).tolist() == le_answear
82
    assert (query >= df.mol).tolist() == le_answear
83
84
    sim = df.mol.calcfp() | query.calcfp()
85
    assert sim.dtype == 'float64'
86
87
88
def test_mol2():
89
    """Writing and reading of mol2 fils to/from ChemDataFrame"""
90
    if oddt.toolkit.backend == 'ob':
91
        df = opd.read_sdf(input_fname)
92
        with NamedTemporaryFile(suffix='.mol2') as f:
93
            df.to_mol2(f.name)
94
            df2 = opd.read_mol2(f.name)
95
            assert df.shape == df2.shape
96
            chunks = []
97
            for chunk in opd.read_mol2(f.name, chunksize=10):
98
                assert len(chunk) == 10
99
                chunks.append(chunk)
100
            df3 = pd.concat(chunks)
101
            assert df.shape == df3.shape
102
        with NamedTemporaryFile(suffix='.mol2') as f:
103
            df.to_mol2(f.name, columns=['name', 'uniprot_id', 'act'])
104
            df2 = opd.read_mol2(f.name)
105
            assert len(df2.columns) == 5
106
107
108
def test_sdf():
109
    """Writing ChemDataFrame to SDF molecular files"""
110
    df = opd.read_sdf(input_fname)
111
    with NamedTemporaryFile(suffix='.sdf') as f:
112
        df.to_sdf(f.name)
113
        df2 = opd.read_sdf(f.name)
114
    assert_array_equal(df.columns.sort_values(), df2.columns.sort_values())
115
    with NamedTemporaryFile(suffix='.sdf') as f:
116
        df.to_sdf(f.name, columns=['name', 'uniprot_id', 'act'])
117
        df2 = opd.read_sdf(f.name)
118
    assert len(df2.columns) == 5
119
120
121
def test_csv():
122
    df = opd.read_sdf(input_fname,
123
                      columns=['mol', 'name', 'chembl_id', 'dude_smiles', 'act'])
124
    df['act'] = df['act'].astype(float)
125
    df['name'] = df['name'].astype(int)
126
    with NamedTemporaryFile(suffix='.csv', mode='w+') as f:
127
        for str_buff in (f, f.name):
128
            df.to_csv(str_buff, index=False)
129
            f.seek(0)
130
            df2 = opd.read_csv(f.name, smiles_to_molecule='mol',
131
                               molecule_column='mol')
132
            assert df.shape == df2.shape
133
            assert df.columns.tolist() == df2.columns.tolist()
134
            assert df.dtypes.tolist() == df2.dtypes.tolist()
135
136
    with NamedTemporaryFile(suffix='.csv', mode='w+') as f:
137
        for str_buff in (f, f.name):
138
            df.to_csv(str_buff, index=False, columns=['name', 'act'])
139
            f.seek(0)
140
            df2 = pd.read_csv(f.name)
141
            assert df[['name', 'act']].shape == df2.shape
142
            assert df[['name', 'act']].columns.tolist() == df2.columns.tolist()
143
            assert df[['name', 'act']].dtypes.tolist() == df2.dtypes.tolist()
144
145
146
def test_excel():
147
    # just check if it doesn't fail
148
    df = opd.read_sdf(input_fname,
149
                      columns=['mol', 'name', 'chembl_id', 'dude_smiles', 'act'])
150
    df = df.head(10)    # it's slow so use first 10 mols
151
    df['act'] = df['act'].astype(float)
152
    df['name'] = df['name'].astype(int)
153
    with NamedTemporaryFile(suffix='.xls', mode='w') as f:
154
        df.to_excel(f.name, index=False)
155
        writer = pd.ExcelWriter(f.name, engine='xlsxwriter')
156
        df.to_excel(writer, index=False)
157
158
159
def test_chemseries_writers():
160
    df = opd.read_sdf(input_fname,
161
                      columns=['mol', 'name', 'chembl_id', 'dude_smiles', 'act'])
162
163
    mols = df['mol']
164
165
    # SMILES
166
    with NamedTemporaryFile(suffix='.ism', mode='w') as f:
167
        mols.to_smiles(f)
168
        for mol in oddt.toolkit.readfile('smi', f.name):
169
            assert isinstance(mol, oddt.toolkit.Molecule)
170
171
    # SDF
172
    with NamedTemporaryFile(suffix='.sdf', mode='w') as f:
173
        mols.to_sdf(f)
174
        for mol in oddt.toolkit.readfile('sdf', f.name):
175
            assert isinstance(mol, oddt.toolkit.Molecule)
176
177
    # mol2
178
    if oddt.toolkit.backend == 'ob':
179
        with NamedTemporaryFile(suffix='.mol2', mode='w') as f:
180
            mols.to_mol2(f)
181
            for mol in oddt.toolkit.readfile('mol2', f.name):
182
                assert isinstance(mol, oddt.toolkit.Molecule)
183
184
185
def test_ipython():
186
    """iPython Notebook molecule rendering in SVG"""
187
    df = opd.read_sdf(input_fname)
188
    # mock ipython
189
    oddt.toolkit.ipython_notebook = True
190
    # png
191
    oddt.toolkit.image_backend = 'png'
192
    html = df.head(1).to_html()
193
    assert '<img src="data:image/png;base64,' in html
194
    # svg
195
    oddt.toolkit.image_backend = 'svg'
196
    html = df.head(1).to_html()
197
    assert '<svg' in html
198
    oddt.toolkit.ipython_notebook = False