Diff of /demo_cRNN.ipynb [000000] .. [58db57]

Switch to side-by-side view

--- a
+++ b/demo_cRNN.ipynb
@@ -0,0 +1,183 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%load_ext autoreload\n",
+    "%autoreload 2\n",
+    "\n",
+    "\n",
+    "import numpy as np\n",
+    "import rdkit\n",
+    "from rdkit import Chem\n",
+    "\n",
+    "import h5py, ast, pickle\n",
+    "\n",
+    "# Occupy a GPU for the model to be loaded \n",
+    "%env CUDA_DEVICE_ORDER=PCI_BUS_ID\n",
+    "# GPU ID, if occupied change to an available GPU ID listed under !nvidia-smi\n",
+    "%env CUDA_VISIBLE_DEVICES=2 \n",
+    "\n",
+    "from ddc_pub import ddc_v3 as ddc"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_descriptors(smiles_list, qsar_model=None, show_actives=False, active_thresh=0.5, qed_thresh=0.5):\n",
+    "    \"\"\"Calculate molecular descriptors of SMILES in a list.\n",
+    "    The descriptors are logp, tpsa, mw, qed, hba, hbd and probability of being active towards DRD2.\n",
+    "    \n",
+    "    Returns:\n",
+    "        A np.ndarray of descriptors.\n",
+    "    \"\"\"\n",
+    "    from tqdm import tqdm_notebook as tqdm\n",
+    "    import rdkit\n",
+    "    from rdkit import Chem, DataStructs\n",
+    "    from rdkit.Chem import Descriptors, rdMolDescriptors, AllChem, QED\n",
+    "    \n",
+    "    descriptors = []\n",
+    "    active_mols = []\n",
+    "    \n",
+    "    for idx, smiles in enumerate(smiles_list):\n",
+    "        # Convert to mol\n",
+    "        mol = Chem.MolFromSmiles(smiles)\n",
+    "        # If valid, calculate its properties\n",
+    "        if mol:\n",
+    "            try:\n",
+    "                logp  = Descriptors.MolLogP(mol)\n",
+    "                tpsa  = Descriptors.TPSA(mol)\n",
+    "                molwt = Descriptors.ExactMolWt(mol)\n",
+    "                hba   = rdMolDescriptors.CalcNumHBA(mol)\n",
+    "                hbd   = rdMolDescriptors.CalcNumHBD(mol)\n",
+    "                qed   = QED.qed(mol)\n",
+    "                \n",
+    "                # Calculate fingerprints\n",
+    "                fp = AllChem.GetMorganFingerprintAsBitVect(mol,2, nBits=2048)\n",
+    "                ecfp4 = np.zeros((2048,))\n",
+    "                DataStructs.ConvertToNumpyArray(fp, ecfp4) \n",
+    "                # Predict activity and pick only the second component\n",
+    "                active = qsar_model.predict_proba([ecfp4])[0][1]\n",
+    "                descriptors.append([logp, tpsa, molwt, qed, hba, hbd, active]) \n",
+    "                \n",
+    "                if active > active_thresh and qed > qed_thresh:\n",
+    "                    if show_actives:\n",
+    "                        active_mols.append(mol)\n",
+    "                        print(\"active_proba: %.2f, QED: %.2f.\" % (active, qed))\n",
+    "                        display(mol)\n",
+    "                        pass\n",
+    "                \n",
+    "            except Exception as e:\n",
+    "                # Sanitization error: Explicit valence for atom # 17 N, 4, is greater than permitted\n",
+    "                print(e)\n",
+    "        # Else, return None\n",
+    "        else:\n",
+    "            print(\"Invalid generation.\")\n",
+    "            \n",
+    "    return np.asarray(descriptors)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Load QSAR model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "qsar_model_name = \"models/qsar_model.pickle\"\n",
+    "with open(qsar_model_name, \"rb\") as file:\n",
+    "    qsar_model = pickle.load(file)[\"classifier_sv\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Load PCB cRNN"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Import existing (trained) model\n",
+    "# Ignore any warning(s) about training configuration or non-seriazable keyword arguments\n",
+    "model_name = \"models/pcb_model\"\n",
+    "model = ddc.DDC(model_name=model_name)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Select conditions for generated molecules"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Custom conditions\n",
+    "logp              = 3.5\n",
+    "tpsa              = 70.0\n",
+    "mw                = 350.0\n",
+    "qed               = 0.8\n",
+    "hba               = 4.0\n",
+    "hbd               = 1.0\n",
+    "drd2_active_proba = 0.9\n",
+    "\n",
+    "target = np.array([logp, tpsa, mw, qed, hba, hbd, drd2_active_proba])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Convert back to SMILES\n",
+    "smiles_out, _ = model.predict(latent=target, temp=0) # Change temp to 1 for more funky results\n",
+    "\n",
+    "# Calculate the properties of the generated structure and compare\n",
+    "get_descriptors(smiles_list=[smiles_out], qsar_model=qsar_model, show_actives=True)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "ddc_env (python_3.6.7)",
+   "language": "python",
+   "name": "ddc_env"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}