|
a |
|
b/analysis/docking.py |
|
|
1 |
import os |
|
|
2 |
import re |
|
|
3 |
import tempfile |
|
|
4 |
import numpy as np |
|
|
5 |
import torch |
|
|
6 |
from pathlib import Path |
|
|
7 |
import argparse |
|
|
8 |
import pandas as pd |
|
|
9 |
from rdkit import Chem |
|
|
10 |
from tqdm import tqdm |
|
|
11 |
|
|
|
12 |
try: |
|
|
13 |
import utils |
|
|
14 |
except ModuleNotFoundError as e: |
|
|
15 |
print(e) |
|
|
16 |
|
|
|
17 |
|
|
|
18 |
def calculate_smina_score(pdb_file, sdf_file): |
|
|
19 |
# add '-o <name>_smina.sdf' if you want to see the output |
|
|
20 |
out = os.popen(f'smina.static -l {sdf_file} -r {pdb_file} ' |
|
|
21 |
f'--score_only').read() |
|
|
22 |
matches = re.findall( |
|
|
23 |
r"Affinity:[ ]+([+-]?[0-9]*[.]?[0-9]+)[ ]+\(kcal/mol\)", out) |
|
|
24 |
return [float(x) for x in matches] |
|
|
25 |
|
|
|
26 |
|
|
|
27 |
def smina_score(rdmols, receptor_file): |
|
|
28 |
""" |
|
|
29 |
Calculate smina score |
|
|
30 |
:param rdmols: List of RDKit molecules |
|
|
31 |
:param receptor_file: Receptor pdb/pdbqt file or list of receptor files |
|
|
32 |
:return: Smina score for each input molecule (list) |
|
|
33 |
""" |
|
|
34 |
|
|
|
35 |
if isinstance(receptor_file, list): |
|
|
36 |
scores = [] |
|
|
37 |
for mol, rec_file in zip(rdmols, receptor_file): |
|
|
38 |
with tempfile.NamedTemporaryFile(suffix='.sdf') as tmp: |
|
|
39 |
tmp_file = tmp.name |
|
|
40 |
utils.write_sdf_file(tmp_file, [mol]) |
|
|
41 |
scores.extend(calculate_smina_score(rec_file, tmp_file)) |
|
|
42 |
|
|
|
43 |
# Use same receptor file for all molecules |
|
|
44 |
else: |
|
|
45 |
with tempfile.NamedTemporaryFile(suffix='.sdf') as tmp: |
|
|
46 |
tmp_file = tmp.name |
|
|
47 |
utils.write_sdf_file(tmp_file, rdmols) |
|
|
48 |
scores = calculate_smina_score(receptor_file, tmp_file) |
|
|
49 |
|
|
|
50 |
return scores |
|
|
51 |
|
|
|
52 |
|
|
|
53 |
def sdf_to_pdbqt(sdf_file, pdbqt_outfile, mol_id): |
|
|
54 |
os.popen(f'obabel {sdf_file} -O {pdbqt_outfile} ' |
|
|
55 |
f'-f {mol_id + 1} -l {mol_id + 1}').read() |
|
|
56 |
return pdbqt_outfile |
|
|
57 |
|
|
|
58 |
|
|
|
59 |
def calculate_qvina2_score(receptor_file, sdf_file, out_dir, size=20, |
|
|
60 |
exhaustiveness=16, return_rdmol=False): |
|
|
61 |
|
|
|
62 |
receptor_file = Path(receptor_file) |
|
|
63 |
sdf_file = Path(sdf_file) |
|
|
64 |
|
|
|
65 |
if receptor_file.suffix == '.pdb': |
|
|
66 |
# prepare receptor, requires Python 2.7 |
|
|
67 |
receptor_pdbqt_file = Path(out_dir, receptor_file.stem + '.pdbqt') |
|
|
68 |
os.popen(f'prepare_receptor4.py -r {receptor_file} -O {receptor_pdbqt_file}') |
|
|
69 |
else: |
|
|
70 |
receptor_pdbqt_file = receptor_file |
|
|
71 |
|
|
|
72 |
scores = [] |
|
|
73 |
rdmols = [] # for if return rdmols |
|
|
74 |
suppl = Chem.SDMolSupplier(str(sdf_file), sanitize=False) |
|
|
75 |
for i, mol in enumerate(suppl): # sdf file may contain several ligands |
|
|
76 |
ligand_name = f'{sdf_file.stem}_{i}' |
|
|
77 |
# prepare ligand |
|
|
78 |
ligand_pdbqt_file = Path(out_dir, ligand_name + '.pdbqt') |
|
|
79 |
out_sdf_file = Path(out_dir, ligand_name + '_out.sdf') |
|
|
80 |
|
|
|
81 |
if out_sdf_file.exists(): |
|
|
82 |
with open(out_sdf_file, 'r') as f: |
|
|
83 |
scores.append( |
|
|
84 |
min([float(x.split()[2]) for x in f.readlines() |
|
|
85 |
if x.startswith(' VINA RESULT:')]) |
|
|
86 |
) |
|
|
87 |
|
|
|
88 |
else: |
|
|
89 |
sdf_to_pdbqt(sdf_file, ligand_pdbqt_file, i) |
|
|
90 |
|
|
|
91 |
# center box at ligand's center of mass |
|
|
92 |
cx, cy, cz = mol.GetConformer().GetPositions().mean(0) |
|
|
93 |
|
|
|
94 |
# run QuickVina 2 |
|
|
95 |
out = os.popen( |
|
|
96 |
f'qvina2.1 --receptor {receptor_pdbqt_file} ' |
|
|
97 |
f'--ligand {ligand_pdbqt_file} ' |
|
|
98 |
f'--center_x {cx:.4f} --center_y {cy:.4f} --center_z {cz:.4f} ' |
|
|
99 |
f'--size_x {size} --size_y {size} --size_z {size} ' |
|
|
100 |
f'--exhaustiveness {exhaustiveness}' |
|
|
101 |
).read() |
|
|
102 |
|
|
|
103 |
# clean up |
|
|
104 |
ligand_pdbqt_file.unlink() |
|
|
105 |
|
|
|
106 |
if '-----+------------+----------+----------' not in out: |
|
|
107 |
scores.append(np.nan) |
|
|
108 |
continue |
|
|
109 |
|
|
|
110 |
out_split = out.splitlines() |
|
|
111 |
best_idx = out_split.index('-----+------------+----------+----------') + 1 |
|
|
112 |
best_line = out_split[best_idx].split() |
|
|
113 |
assert best_line[0] == '1' |
|
|
114 |
scores.append(float(best_line[1])) |
|
|
115 |
|
|
|
116 |
out_pdbqt_file = Path(out_dir, ligand_name + '_out.pdbqt') |
|
|
117 |
if out_pdbqt_file.exists(): |
|
|
118 |
os.popen(f'obabel {out_pdbqt_file} -O {out_sdf_file}').read() |
|
|
119 |
|
|
|
120 |
# clean up |
|
|
121 |
out_pdbqt_file.unlink() |
|
|
122 |
|
|
|
123 |
if return_rdmol: |
|
|
124 |
rdmol = Chem.SDMolSupplier(str(out_sdf_file))[0] |
|
|
125 |
rdmols.append(rdmol) |
|
|
126 |
|
|
|
127 |
if return_rdmol: |
|
|
128 |
return scores, rdmols |
|
|
129 |
else: |
|
|
130 |
return scores |
|
|
131 |
|
|
|
132 |
|
|
|
133 |
if __name__ == '__main__': |
|
|
134 |
parser = argparse.ArgumentParser('QuickVina evaluation') |
|
|
135 |
parser.add_argument('--pdbqt_dir', type=Path, |
|
|
136 |
help='Receptor files in pdbqt format') |
|
|
137 |
parser.add_argument('--sdf_dir', type=Path, default=None, |
|
|
138 |
help='Ligand files in sdf format') |
|
|
139 |
parser.add_argument('--sdf_files', type=Path, nargs='+', default=None) |
|
|
140 |
parser.add_argument('--out_dir', type=Path) |
|
|
141 |
parser.add_argument('--write_csv', action='store_true') |
|
|
142 |
parser.add_argument('--write_dict', action='store_true') |
|
|
143 |
parser.add_argument('--dataset', type=str, default='moad') |
|
|
144 |
args = parser.parse_args() |
|
|
145 |
|
|
|
146 |
assert (args.sdf_dir is not None) ^ (args.sdf_files is not None) |
|
|
147 |
|
|
|
148 |
args.out_dir.mkdir(exist_ok=True) |
|
|
149 |
|
|
|
150 |
results = {'receptor': [], 'ligand': [], 'scores': []} |
|
|
151 |
results_dict = {} |
|
|
152 |
sdf_files = list(args.sdf_dir.glob('[!.]*.sdf')) \ |
|
|
153 |
if args.sdf_dir is not None else args.sdf_files |
|
|
154 |
pbar = tqdm(sdf_files) |
|
|
155 |
for sdf_file in pbar: |
|
|
156 |
pbar.set_description(f'Processing {sdf_file.name}') |
|
|
157 |
|
|
|
158 |
if args.dataset == 'moad': |
|
|
159 |
""" |
|
|
160 |
Ligand file names should be of the following form: |
|
|
161 |
<receptor-name>_<pocket-id>_<some-suffix>.sdf |
|
|
162 |
where <receptor-name> and <pocket-id> cannot contain any |
|
|
163 |
underscores, e.g.: 1abc-bio1_pocket0_gen.sdf |
|
|
164 |
""" |
|
|
165 |
ligand_name = sdf_file.stem |
|
|
166 |
receptor_name, pocket_id, *suffix = ligand_name.split('_') |
|
|
167 |
suffix = '_'.join(suffix) |
|
|
168 |
receptor_file = Path(args.pdbqt_dir, receptor_name + '.pdbqt') |
|
|
169 |
elif args.dataset == 'crossdocked': |
|
|
170 |
ligand_name = sdf_file.stem |
|
|
171 |
receptor_name = ligand_name[:-4] |
|
|
172 |
receptor_file = Path(args.pdbqt_dir, receptor_name + '.pdbqt') |
|
|
173 |
|
|
|
174 |
# try: |
|
|
175 |
scores, rdmols = calculate_qvina2_score( |
|
|
176 |
receptor_file, sdf_file, args.out_dir, return_rdmol=True) |
|
|
177 |
# except AttributeError as e: |
|
|
178 |
# print(e) |
|
|
179 |
# continue |
|
|
180 |
results['receptor'].append(str(receptor_file)) |
|
|
181 |
results['ligand'].append(str(sdf_file)) |
|
|
182 |
results['scores'].append(scores) |
|
|
183 |
|
|
|
184 |
if args.write_dict: |
|
|
185 |
results_dict[ligand_name] = { |
|
|
186 |
'receptor': str(receptor_file), |
|
|
187 |
'ligand': str(sdf_file), |
|
|
188 |
'scores': scores, |
|
|
189 |
'rmdols': rdmols |
|
|
190 |
} |
|
|
191 |
|
|
|
192 |
if args.write_csv: |
|
|
193 |
df = pd.DataFrame.from_dict(results) |
|
|
194 |
df.to_csv(Path(args.out_dir, 'qvina2_scores.csv')) |
|
|
195 |
|
|
|
196 |
if args.write_dict: |
|
|
197 |
torch.save(results_dict, Path(args.out_dir, 'qvina2_scores.pt')) |