|
a |
|
b/tests/test_pandas.py |
|
|
1 |
import os |
|
|
2 |
from tempfile import NamedTemporaryFile |
|
|
3 |
|
|
|
4 |
from numpy.testing import assert_array_equal |
|
|
5 |
import pandas as pd |
|
|
6 |
|
|
|
7 |
import oddt |
|
|
8 |
import oddt.pandas as opd |
|
|
9 |
|
|
|
10 |
# Fixture paths are resolved relative to the directory holding this module.
test_data_dir = os.path.dirname(os.path.abspath(__file__))
# SDF fixture read by the tests in this module.
input_fname = os.path.join(
    test_data_dir,
    'data/dude/xiap/actives_docked.sdf',
)
|
|
12 |
|
|
|
13 |
|
|
|
14 |
def test_classes():
    """Test oddt.pandas classes behavior.

    Reads the SDF fixture and checks that the resulting frame and its
    molecule column are both oddt and plain pandas types, that the
    ``_molecule_column`` metadata is present on the frame and on a column
    subset, and that ``head`` keeps the oddt classes.
    """
    df = opd.read_sdf(input_fname)

    # Check classes inheritance
    assert isinstance(df, opd.ChemDataFrame)
    assert isinstance(df, pd.DataFrame)
    assert isinstance(df['mol'], opd.ChemSeries)
    assert isinstance(df['mol'], pd.Series)

    # Check custom metadata
    assert hasattr(df, '_molecule_column')
    assert hasattr(df[['mol']], '_molecule_column')
    assert df._molecule_column == df[['mol']]._molecule_column

    # Check if slicing preserves classes
    assert isinstance(df.head(1), opd.ChemDataFrame)
    assert isinstance(df['mol'].head(1), opd.ChemSeries)
|
|
33 |
|
|
|
34 |
|
|
|
35 |
def test_reading():
    """Test reading molecule files to ChemDataFrame.

    Exercises ``opd.read_sdf`` with its default arguments, with a custom
    SMILES column, with column selection and in chunked mode.
    """
    whole = opd.read_sdf(input_fname)

    # Dimensions of the fixture when read with default arguments
    assert len(whole) == 100
    assert len(whole.columns) == 15

    # The requested SMILES column name shows up among the columns
    with_smiles = opd.read_sdf(input_fname, smiles_column='smi_col')
    assert 'smi_col' in with_smiles.columns

    # Molecule and molecule-name columns disabled: only 'name' remains
    name_only = opd.read_sdf(
        input_fname,
        molecule_column=None,
        molecule_name_column=None,
        usecols=['name'],
    )
    assert 'mol' not in name_only.columns
    assert 'mol_name' not in name_only.columns
    assert len(name_only.columns) == 1

    selected = opd.read_sdf(input_fname,
                            usecols=['name', 'uniprot_id', 'act'])
    # 3 from use_cols + 1 'mol' + 1 'mol_name'
    assert len(selected.columns) == 5
    assert 'uniprot_id' in selected.columns
    assert 'smi_col' not in selected.columns

    # Chunk reading: every chunk holds exactly 10 rows
    pieces = []
    for piece in opd.read_sdf(input_fname, chunksize=10):
        assert len(piece) == 10
        pieces.append(piece)
    assert len(pieces) == 10
    combined = pd.concat(pieces)

    # The concatenated chunks have as many rows as a full read
    assert len(combined) == 100
|
|
70 |
|
|
|
71 |
|
|
|
72 |
def test_substruct_sim_search():
    """Compare a molecule column with a query molecule via `>=`, `<=`, `|`.

    NOTE(review): judging by the test name, the comparison operators
    presumably perform substructure matching and `|` a fingerprint
    similarity -- confirm against the oddt.pandas implementation.
    """
    first_ten = opd.read_sdf(input_fname).head(10)
    query_smiles = 'C(=O)(N1C[C@H](C[C@H]1C(=O)N[C@@H]1CCCc2c1cccc2)Oc1ccccc1)[C@@H](NC(=O)[C@H](C)NC)C1CCCCC1'
    query = oddt.toolkit.readstring('smi', query_smiles)

    # `series >= query` must agree with the mirrored `query <= series`
    expected_ge = [True, True, True, False, True,
                   False, False, False, False, False]
    assert (first_ten.mol >= query).tolist() == expected_ge
    assert (query <= first_ten.mol).tolist() == expected_ge

    # `series <= query` must agree with the mirrored `query >= series`
    expected_le = [True, True, True, True, True,
                   True, False, False, False, True]
    assert (first_ten.mol <= query).tolist() == expected_le
    assert (query >= first_ten.mol).tolist() == expected_le

    # Combining series fingerprints with the query fingerprint gives floats
    series_fps = first_ten.mol.calcfp()
    scores = series_fps | query.calcfp()
    assert scores.dtype == 'float64'
|
|
86 |
|
|
|
87 |
|
|
|
88 |
def test_mol2():
    """Writing and reading of mol2 files to/from ChemDataFrame"""
    # The whole test only runs with the 'ob' toolkit backend
    if oddt.toolkit.backend != 'ob':
        return

    source = opd.read_sdf(input_fname)

    # Full round trip, read back both at once and in chunks of 10
    with NamedTemporaryFile(suffix='.mol2') as tmp:
        source.to_mol2(tmp.name)
        reloaded = opd.read_mol2(tmp.name)
        assert source.shape == reloaded.shape

        pieces = []
        for piece in opd.read_mol2(tmp.name, chunksize=10):
            assert len(piece) == 10
            pieces.append(piece)
        rejoined = pd.concat(pieces)
        assert source.shape == rejoined.shape

    # Round trip restricted to three data columns; five columns come back
    with NamedTemporaryFile(suffix='.mol2') as tmp:
        source.to_mol2(tmp.name, columns=['name', 'uniprot_id', 'act'])
        reloaded = opd.read_mol2(tmp.name)
        assert len(reloaded.columns) == 5
|
|
106 |
|
|
|
107 |
|
|
|
108 |
def test_sdf():
    """Writing ChemDataFrame to SDF molecular files"""
    source = opd.read_sdf(input_fname)

    # Full round trip: same column labels, compared regardless of order
    with NamedTemporaryFile(suffix='.sdf') as tmp:
        source.to_sdf(tmp.name)
        reloaded = opd.read_sdf(tmp.name)
        expected_cols = source.columns.sort_values()
        actual_cols = reloaded.columns.sort_values()
        assert_array_equal(expected_cols, actual_cols)

    # Writing only three data columns; five columns come back on reading
    with NamedTemporaryFile(suffix='.sdf') as tmp:
        source.to_sdf(tmp.name, columns=['name', 'uniprot_id', 'act'])
        reloaded = opd.read_sdf(tmp.name)
        assert len(reloaded.columns) == 5
|
|
119 |
|
|
|
120 |
|
|
|
121 |
def test_csv():
    """Round-trip a ChemDataFrame through CSV via a file object and a path."""
    frame = opd.read_sdf(
        input_fname,
        columns=['mol', 'name', 'chembl_id', 'dude_smiles', 'act'],
    )
    # Cast to the numeric dtypes that are compared after reading back
    frame['act'] = frame['act'].astype(float)
    frame['name'] = frame['name'].astype(int)

    # All columns, read back with the oddt reader
    with NamedTemporaryFile(suffix='.csv', mode='w+') as tmp:
        for target in (tmp, tmp.name):
            frame.to_csv(target, index=False)
            tmp.seek(0)
            reloaded = opd.read_csv(tmp.name, smiles_to_molecule='mol',
                                    molecule_column='mol')
            assert frame.shape == reloaded.shape
            assert frame.columns.tolist() == reloaded.columns.tolist()
            assert frame.dtypes.tolist() == reloaded.dtypes.tolist()

    # Two plain columns only, read back with the stock pandas reader
    with NamedTemporaryFile(suffix='.csv', mode='w+') as tmp:
        for target in (tmp, tmp.name):
            frame.to_csv(target, index=False, columns=['name', 'act'])
            tmp.seek(0)
            reloaded = pd.read_csv(tmp.name)
            subset = frame[['name', 'act']]
            assert subset.shape == reloaded.shape
            assert subset.columns.tolist() == reloaded.columns.tolist()
            assert subset.dtypes.tolist() == reloaded.dtypes.tolist()
|
|
144 |
|
|
|
145 |
|
|
|
146 |
def test_excel():
    """Smoke test for Excel export: just check that it doesn't fail.

    The frame is exported twice to the same temporary file: once by file
    name and once through an explicit ``pd.ExcelWriter``.
    """
    df = opd.read_sdf(input_fname,
                      columns=['mol', 'name', 'chembl_id', 'dude_smiles', 'act'])
    df = df.head(10)  # it's slow so use first 10 mols
    df['act'] = df['act'].astype(float)
    df['name'] = df['name'].astype(int)
    with NamedTemporaryFile(suffix='.xls', mode='w') as f:
        df.to_excel(f.name, index=False)
        # Use the writer as a context manager so it is always closed;
        # previously the writer object was created and never closed.
        with pd.ExcelWriter(f.name, engine='xlsxwriter') as writer:
            df.to_excel(writer, index=False)
|
|
157 |
|
|
|
158 |
|
|
|
159 |
def test_chemseries_writers():
    """Write a molecule column to SMILES, SDF and mol2 and read it back.

    Each writer receives an open temporary file object; the file is then
    re-read by name with ``oddt.toolkit.readfile`` and every molecule read
    must be an ``oddt.toolkit.Molecule``. The mol2 part only runs with the
    'ob' toolkit backend.
    """
    df = opd.read_sdf(input_fname,
                      columns=['mol', 'name', 'chembl_id', 'dude_smiles', 'act'])

    mols = df['mol']

    # SMILES
    with NamedTemporaryFile(suffix='.ism', mode='w') as f:
        mols.to_smiles(f)
        # Push buffered data to disk: the file is re-opened by name below,
        # and unflushed output would be invisible to that reader.
        f.flush()
        for mol in oddt.toolkit.readfile('smi', f.name):
            assert isinstance(mol, oddt.toolkit.Molecule)

    # SDF
    with NamedTemporaryFile(suffix='.sdf', mode='w') as f:
        mols.to_sdf(f)
        f.flush()  # same reason as above
        for mol in oddt.toolkit.readfile('sdf', f.name):
            assert isinstance(mol, oddt.toolkit.Molecule)

    # mol2
    if oddt.toolkit.backend == 'ob':
        with NamedTemporaryFile(suffix='.mol2', mode='w') as f:
            mols.to_mol2(f)
            f.flush()  # same reason as above
            for mol in oddt.toolkit.readfile('mol2', f.name):
                assert isinstance(mol, oddt.toolkit.Molecule)
|
|
183 |
|
|
|
184 |
|
|
|
185 |
def test_ipython():
    """iPython Notebook molecule rendering in PNG and SVG"""
    df = opd.read_sdf(input_fname)
    # mock ipython
    oddt.toolkit.ipython_notebook = True
    try:
        # png
        oddt.toolkit.image_backend = 'png'
        html = df.head(1).to_html()
        assert '<img src="data:image/png;base64,' in html
        # svg
        oddt.toolkit.image_backend = 'svg'
        html = df.head(1).to_html()
        assert '<svg' in html
    finally:
        # Always undo the mock, even when an assertion above fails, so the
        # flag does not leak into tests that run afterwards.
        oddt.toolkit.ipython_notebook = False