Diff of /analysis/docking.py [000000] .. [607087]

Switch to unified view

a b/analysis/docking.py
1
import os
2
import re
3
import tempfile
4
import numpy as np
5
import torch
6
from pathlib import Path
7
import argparse
8
import pandas as pd
9
from rdkit import Chem
10
from tqdm import tqdm
11
12
try:
13
    import utils
14
except ModuleNotFoundError as e:
15
    print(e)
16
17
18
def calculate_smina_score(pdb_file, sdf_file):
19
    # add '-o <name>_smina.sdf' if you want to see the output
20
    out = os.popen(f'smina.static -l {sdf_file} -r {pdb_file} '
21
                   f'--score_only').read()
22
    matches = re.findall(
23
        r"Affinity:[ ]+([+-]?[0-9]*[.]?[0-9]+)[ ]+\(kcal/mol\)", out)
24
    return [float(x) for x in matches]
25
26
27
def smina_score(rdmols, receptor_file):
28
    """
29
    Calculate smina score
30
    :param rdmols: List of RDKit molecules
31
    :param receptor_file: Receptor pdb/pdbqt file or list of receptor files
32
    :return: Smina score for each input molecule (list)
33
    """
34
35
    if isinstance(receptor_file, list):
36
        scores = []
37
        for mol, rec_file in zip(rdmols, receptor_file):
38
            with tempfile.NamedTemporaryFile(suffix='.sdf') as tmp:
39
                tmp_file = tmp.name
40
                utils.write_sdf_file(tmp_file, [mol])
41
                scores.extend(calculate_smina_score(rec_file, tmp_file))
42
43
    # Use same receptor file for all molecules
44
    else:
45
        with tempfile.NamedTemporaryFile(suffix='.sdf') as tmp:
46
            tmp_file = tmp.name
47
            utils.write_sdf_file(tmp_file, rdmols)
48
            scores = calculate_smina_score(receptor_file, tmp_file)
49
50
    return scores
51
52
53
def sdf_to_pdbqt(sdf_file, pdbqt_outfile, mol_id):
54
    os.popen(f'obabel {sdf_file} -O {pdbqt_outfile} '
55
             f'-f {mol_id + 1} -l {mol_id + 1}').read()
56
    return pdbqt_outfile
57
58
59
def calculate_qvina2_score(receptor_file, sdf_file, out_dir, size=20,
60
                           exhaustiveness=16, return_rdmol=False):
61
62
    receptor_file = Path(receptor_file)
63
    sdf_file = Path(sdf_file)
64
65
    if receptor_file.suffix == '.pdb':
66
        # prepare receptor, requires Python 2.7
67
        receptor_pdbqt_file = Path(out_dir, receptor_file.stem + '.pdbqt')
68
        os.popen(f'prepare_receptor4.py -r {receptor_file} -O {receptor_pdbqt_file}')
69
    else:
70
        receptor_pdbqt_file = receptor_file
71
72
    scores = []
73
    rdmols = []  # for if return rdmols
74
    suppl = Chem.SDMolSupplier(str(sdf_file), sanitize=False)
75
    for i, mol in enumerate(suppl):  # sdf file may contain several ligands
76
        ligand_name = f'{sdf_file.stem}_{i}'
77
        # prepare ligand
78
        ligand_pdbqt_file = Path(out_dir, ligand_name + '.pdbqt')
79
        out_sdf_file = Path(out_dir, ligand_name + '_out.sdf')
80
81
        if out_sdf_file.exists():
82
            with open(out_sdf_file, 'r') as f:
83
                scores.append(
84
                    min([float(x.split()[2]) for x in f.readlines()
85
                         if x.startswith(' VINA RESULT:')])
86
                )
87
88
        else:
89
            sdf_to_pdbqt(sdf_file, ligand_pdbqt_file, i)
90
91
            # center box at ligand's center of mass
92
            cx, cy, cz = mol.GetConformer().GetPositions().mean(0)
93
94
            # run QuickVina 2
95
            out = os.popen(
96
                f'qvina2.1 --receptor {receptor_pdbqt_file} '
97
                f'--ligand {ligand_pdbqt_file} '
98
                f'--center_x {cx:.4f} --center_y {cy:.4f} --center_z {cz:.4f} '
99
                f'--size_x {size} --size_y {size} --size_z {size} '
100
                f'--exhaustiveness {exhaustiveness}'
101
            ).read()
102
103
            # clean up
104
            ligand_pdbqt_file.unlink()
105
106
            if '-----+------------+----------+----------' not in out:
107
                scores.append(np.nan)
108
                continue
109
110
            out_split = out.splitlines()
111
            best_idx = out_split.index('-----+------------+----------+----------') + 1
112
            best_line = out_split[best_idx].split()
113
            assert best_line[0] == '1'
114
            scores.append(float(best_line[1]))
115
116
            out_pdbqt_file = Path(out_dir, ligand_name + '_out.pdbqt')
117
            if out_pdbqt_file.exists():
118
                os.popen(f'obabel {out_pdbqt_file} -O {out_sdf_file}').read()
119
120
                # clean up
121
                out_pdbqt_file.unlink()
122
123
        if return_rdmol:
124
            rdmol = Chem.SDMolSupplier(str(out_sdf_file))[0]
125
            rdmols.append(rdmol)
126
127
    if return_rdmol:
128
        return scores, rdmols
129
    else:
130
        return scores
131
132
133
if __name__ == '__main__':
134
    parser = argparse.ArgumentParser('QuickVina evaluation')
135
    parser.add_argument('--pdbqt_dir', type=Path,
136
                        help='Receptor files in pdbqt format')
137
    parser.add_argument('--sdf_dir', type=Path, default=None,
138
                        help='Ligand files in sdf format')
139
    parser.add_argument('--sdf_files', type=Path, nargs='+', default=None)
140
    parser.add_argument('--out_dir', type=Path)
141
    parser.add_argument('--write_csv', action='store_true')
142
    parser.add_argument('--write_dict', action='store_true')
143
    parser.add_argument('--dataset', type=str, default='moad')
144
    args = parser.parse_args()
145
146
    assert (args.sdf_dir is not None) ^ (args.sdf_files is not None)
147
148
    args.out_dir.mkdir(exist_ok=True)
149
150
    results = {'receptor': [], 'ligand': [], 'scores': []}
151
    results_dict = {}
152
    sdf_files = list(args.sdf_dir.glob('[!.]*.sdf')) \
153
        if args.sdf_dir is not None else args.sdf_files
154
    pbar = tqdm(sdf_files)
155
    for sdf_file in pbar:
156
        pbar.set_description(f'Processing {sdf_file.name}')
157
158
        if args.dataset == 'moad':
159
            """
160
            Ligand file names should be of the following form:
161
            <receptor-name>_<pocket-id>_<some-suffix>.sdf
162
            where <receptor-name> and <pocket-id> cannot contain any 
163
            underscores, e.g.: 1abc-bio1_pocket0_gen.sdf
164
            """
165
            ligand_name = sdf_file.stem
166
            receptor_name, pocket_id, *suffix = ligand_name.split('_')
167
            suffix = '_'.join(suffix)
168
            receptor_file = Path(args.pdbqt_dir, receptor_name + '.pdbqt')
169
        elif args.dataset == 'crossdocked':
170
            ligand_name = sdf_file.stem
171
            receptor_name = ligand_name[:-4]
172
            receptor_file = Path(args.pdbqt_dir, receptor_name + '.pdbqt')
173
174
        # try:
175
        scores, rdmols = calculate_qvina2_score(
176
            receptor_file, sdf_file, args.out_dir, return_rdmol=True)
177
        # except AttributeError as e:
178
        #     print(e)
179
        #     continue
180
        results['receptor'].append(str(receptor_file))
181
        results['ligand'].append(str(sdf_file))
182
        results['scores'].append(scores)
183
184
        if args.write_dict:
185
            results_dict[ligand_name] = {
186
                'receptor': str(receptor_file),
187
                'ligand': str(sdf_file),
188
                'scores': scores,
189
                'rmdols': rdmols
190
            }
191
192
    if args.write_csv:
193
        df = pd.DataFrame.from_dict(results)
194
        df.to_csv(Path(args.out_dir, 'qvina2_scores.csv'))
195
196
    if args.write_dict:
197
        torch.save(results_dict, Path(args.out_dir, 'qvina2_scores.pt'))