Diff of /analysis/docking.py [000000] .. [607087]

Switch to side-by-side view

--- a
+++ b/analysis/docking.py
@@ -0,0 +1,197 @@
+import os
+import re
+import tempfile
+import numpy as np
+import torch
+from pathlib import Path
+import argparse
+import pandas as pd
+from rdkit import Chem
+from tqdm import tqdm
+
+try:
+    import utils
+except ModuleNotFoundError as e:
+    print(e)
+
+
+def calculate_smina_score(pdb_file, sdf_file):
+    # add '-o <name>_smina.sdf' if you want to see the output
+    out = os.popen(f'smina.static -l {sdf_file} -r {pdb_file} '
+                   f'--score_only').read()
+    matches = re.findall(
+        r"Affinity:[ ]+([+-]?[0-9]*[.]?[0-9]+)[ ]+\(kcal/mol\)", out)
+    return [float(x) for x in matches]
+
+
+def smina_score(rdmols, receptor_file):
+    """
+    Calculate smina score
+    :param rdmols: List of RDKit molecules
+    :param receptor_file: Receptor pdb/pdbqt file or list of receptor files
+    :return: Smina score for each input molecule (list)
+    """
+
+    if isinstance(receptor_file, list):
+        scores = []
+        for mol, rec_file in zip(rdmols, receptor_file):
+            with tempfile.NamedTemporaryFile(suffix='.sdf') as tmp:
+                tmp_file = tmp.name
+                utils.write_sdf_file(tmp_file, [mol])
+                scores.extend(calculate_smina_score(rec_file, tmp_file))
+
+    # Use same receptor file for all molecules
+    else:
+        with tempfile.NamedTemporaryFile(suffix='.sdf') as tmp:
+            tmp_file = tmp.name
+            utils.write_sdf_file(tmp_file, rdmols)
+            scores = calculate_smina_score(receptor_file, tmp_file)
+
+    return scores
+
+
+def sdf_to_pdbqt(sdf_file, pdbqt_outfile, mol_id):
+    os.popen(f'obabel {sdf_file} -O {pdbqt_outfile} '
+             f'-f {mol_id + 1} -l {mol_id + 1}').read()
+    return pdbqt_outfile
+
+
+def calculate_qvina2_score(receptor_file, sdf_file, out_dir, size=20,
+                           exhaustiveness=16, return_rdmol=False):
+
+    receptor_file = Path(receptor_file)
+    sdf_file = Path(sdf_file)
+
+    if receptor_file.suffix == '.pdb':
+        # prepare receptor, requires Python 2.7
+        receptor_pdbqt_file = Path(out_dir, receptor_file.stem + '.pdbqt')
+        os.popen(f'prepare_receptor4.py -r {receptor_file} -O {receptor_pdbqt_file}')
+    else:
+        receptor_pdbqt_file = receptor_file
+
+    scores = []
+    rdmols = []  # for if return rdmols
+    suppl = Chem.SDMolSupplier(str(sdf_file), sanitize=False)
+    for i, mol in enumerate(suppl):  # sdf file may contain several ligands
+        ligand_name = f'{sdf_file.stem}_{i}'
+        # prepare ligand
+        ligand_pdbqt_file = Path(out_dir, ligand_name + '.pdbqt')
+        out_sdf_file = Path(out_dir, ligand_name + '_out.sdf')
+
+        if out_sdf_file.exists():
+            with open(out_sdf_file, 'r') as f:
+                scores.append(
+                    min([float(x.split()[2]) for x in f.readlines()
+                         if x.startswith(' VINA RESULT:')])
+                )
+
+        else:
+            sdf_to_pdbqt(sdf_file, ligand_pdbqt_file, i)
+
+            # center box at ligand's center of mass
+            cx, cy, cz = mol.GetConformer().GetPositions().mean(0)
+
+            # run QuickVina 2
+            out = os.popen(
+                f'qvina2.1 --receptor {receptor_pdbqt_file} '
+                f'--ligand {ligand_pdbqt_file} '
+                f'--center_x {cx:.4f} --center_y {cy:.4f} --center_z {cz:.4f} '
+                f'--size_x {size} --size_y {size} --size_z {size} '
+                f'--exhaustiveness {exhaustiveness}'
+            ).read()
+
+            # clean up
+            ligand_pdbqt_file.unlink()
+
+            if '-----+------------+----------+----------' not in out:
+                scores.append(np.nan)
+                continue
+
+            out_split = out.splitlines()
+            best_idx = out_split.index('-----+------------+----------+----------') + 1
+            best_line = out_split[best_idx].split()
+            assert best_line[0] == '1'
+            scores.append(float(best_line[1]))
+
+            out_pdbqt_file = Path(out_dir, ligand_name + '_out.pdbqt')
+            if out_pdbqt_file.exists():
+                os.popen(f'obabel {out_pdbqt_file} -O {out_sdf_file}').read()
+
+                # clean up
+                out_pdbqt_file.unlink()
+
+        if return_rdmol:
+            rdmol = Chem.SDMolSupplier(str(out_sdf_file))[0]
+            rdmols.append(rdmol)
+
+    if return_rdmol:
+        return scores, rdmols
+    else:
+        return scores
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser('QuickVina evaluation')
+    parser.add_argument('--pdbqt_dir', type=Path,
+                        help='Receptor files in pdbqt format')
+    parser.add_argument('--sdf_dir', type=Path, default=None,
+                        help='Ligand files in sdf format')
+    parser.add_argument('--sdf_files', type=Path, nargs='+', default=None)
+    parser.add_argument('--out_dir', type=Path)
+    parser.add_argument('--write_csv', action='store_true')
+    parser.add_argument('--write_dict', action='store_true')
+    parser.add_argument('--dataset', type=str, default='moad')
+    args = parser.parse_args()
+
+    assert (args.sdf_dir is not None) ^ (args.sdf_files is not None)
+
+    args.out_dir.mkdir(exist_ok=True)
+
+    results = {'receptor': [], 'ligand': [], 'scores': []}
+    results_dict = {}
+    sdf_files = list(args.sdf_dir.glob('[!.]*.sdf')) \
+        if args.sdf_dir is not None else args.sdf_files
+    pbar = tqdm(sdf_files)
+    for sdf_file in pbar:
+        pbar.set_description(f'Processing {sdf_file.name}')
+
+        if args.dataset == 'moad':
+            """
+            Ligand file names should be of the following form:
+            <receptor-name>_<pocket-id>_<some-suffix>.sdf
+            where <receptor-name> and <pocket-id> cannot contain any 
+            underscores, e.g.: 1abc-bio1_pocket0_gen.sdf
+            """
+            ligand_name = sdf_file.stem
+            receptor_name, pocket_id, *suffix = ligand_name.split('_')
+            suffix = '_'.join(suffix)
+            receptor_file = Path(args.pdbqt_dir, receptor_name + '.pdbqt')
+        elif args.dataset == 'crossdocked':
+            ligand_name = sdf_file.stem
+            receptor_name = ligand_name[:-4]
+            receptor_file = Path(args.pdbqt_dir, receptor_name + '.pdbqt')
+
+        # try:
+        scores, rdmols = calculate_qvina2_score(
+            receptor_file, sdf_file, args.out_dir, return_rdmol=True)
+        # except AttributeError as e:
+        #     print(e)
+        #     continue
+        results['receptor'].append(str(receptor_file))
+        results['ligand'].append(str(sdf_file))
+        results['scores'].append(scores)
+
+        if args.write_dict:
+            results_dict[ligand_name] = {
+                'receptor': str(receptor_file),
+                'ligand': str(sdf_file),
+                'scores': scores,
+                'rmdols': rdmols
+            }
+
+    if args.write_csv:
+        df = pd.DataFrame.from_dict(results)
+        df.to_csv(Path(args.out_dir, 'qvina2_scores.csv'))
+
+    if args.write_dict:
+        torch.save(results_dict, Path(args.out_dir, 'qvina2_scores.pt'))