--- a +++ b/demo_cRNN.ipynb @@ -0,0 +1,183 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "\n", + "import numpy as np\n", + "import rdkit\n", + "from rdkit import Chem\n", + "\n", + "import h5py, ast, pickle\n", + "\n", + "# Occupy a GPU for the model to be loaded \n", + "%env CUDA_DEVICE_ORDER=PCI_BUS_ID\n", + "# GPU ID, if occupied change to an available GPU ID listed under !nvidia-smi\n", + "%env CUDA_VISIBLE_DEVICES=2 \n", + "\n", + "from ddc_pub import ddc_v3 as ddc" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_descriptors(smiles_list, qsar_model=None, show_actives=False, active_thresh=0.5, qed_thresh=0.5):\n", + " \"\"\"Calculate molecular descriptors of SMILES in a list.\n", + " The descriptors are logp, tpsa, mw, qed, hba, hbd and probability of being active towards DRD2.\n", + " \n", + " Returns:\n", + " A np.ndarray of descriptors.\n", + " \"\"\"\n", + " from tqdm import tqdm_notebook as tqdm\n", + " import rdkit\n", + " from rdkit import Chem, DataStructs\n", + " from rdkit.Chem import Descriptors, rdMolDescriptors, AllChem, QED\n", + " \n", + " descriptors = []\n", + " active_mols = []\n", + " \n", + " for idx, smiles in enumerate(smiles_list):\n", + " # Convert to mol\n", + " mol = Chem.MolFromSmiles(smiles)\n", + " # If valid, calculate its properties\n", + " if mol:\n", + " try:\n", + " logp = Descriptors.MolLogP(mol)\n", + " tpsa = Descriptors.TPSA(mol)\n", + " molwt = Descriptors.ExactMolWt(mol)\n", + " hba = rdMolDescriptors.CalcNumHBA(mol)\n", + " hbd = rdMolDescriptors.CalcNumHBD(mol)\n", + " qed = QED.qed(mol)\n", + " \n", + " # Calculate fingerprints\n", + " fp = AllChem.GetMorganFingerprintAsBitVect(mol,2, nBits=2048)\n", + " ecfp4 = np.zeros((2048,))\n", + " DataStructs.ConvertToNumpyArray(fp, ecfp4) \n", + " # Predict activity and pick only the second component\n", + " active = qsar_model.predict_proba([ecfp4])[0][1]\n", + " descriptors.append([logp, tpsa, molwt, qed, hba, hbd, active]) \n", + " \n", + " if active > active_thresh and qed > qed_thresh:\n", + " if show_actives:\n", + " active_mols.append(mol)\n", + " print(\"active_proba: %.2f, QED: %.2f.\" % (active, qed))\n", + " display(mol)\n", + " pass\n", + " \n", + " except Exception as e:\n", + " # Sanitization error: Explicit valence for atom # 17 N, 4, is greater than permitted\n", + " print(e)\n", + " # Else, return None\n", + " else:\n", + " print(\"Invalid generation.\")\n", + " \n", + " return np.asarray(descriptors)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Load QSAR model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "qsar_model_name = \"models/qsar_model.pickle\"\n", + "with open(qsar_model_name, \"rb\") as file:\n", + " qsar_model = pickle.load(file)[\"classifier_sv\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Load PCB cRNN" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Import existing (trained) model\n", + "# Ignore any warning(s) about training configuration or non-seriazable keyword arguments\n", + "model_name = \"models/pcb_model\"\n", + "model = ddc.DDC(model_name=model_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Select conditions for generated molecules" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Custom conditions\n", + "logp = 3.5\n", + "tpsa = 70.0\n", + "mw = 350.0\n", + "qed = 0.8\n", + "hba = 4.0\n", + "hbd = 1.0\n", + "drd2_active_proba = 0.9\n", + "\n", + "target = np.array([logp, tpsa, mw, qed, hba, hbd, drd2_active_proba])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Convert back to SMILES\n", + "smiles_out, _ = model.predict(latent=target, temp=0) # Change temp to 1 for more funky results\n", + "\n", + "# Calculate the properties of the generated structure and compare\n", + "get_descriptors(smiles_list=[smiles_out], qsar_model=qsar_model, show_actives=True)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "ddc_env (python_3.6.7)", + "language": "python", + "name": "ddc_env" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}