Diff of /ML_6.ipynb [000000] .. [480cb7]

Switch to side-by-side view

--- a
+++ b/ML_6.ipynb
@@ -0,0 +1,435 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": [],
+      "authorship_tag": "ABX9TyMr0hWVdgzD2y+mSWheMNkb",
+      "include_colab_link": true
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "view-in-github",
+        "colab_type": "text"
+      },
+      "source": [
+        "<a href=\"https://colab.research.google.com/github/francescopatane96/Computer_aided_drug_discovery_kit/blob/main/ML_6.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!pip install rdkit"
+      ],
+      "metadata": {
+        "id": "_6naBpRYKt9E"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!pip install git+https://github.com/volkamerlab/teachopencadd"
+      ],
+      "metadata": {
+        "id": "NoozFkbcLIz-"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "nuFhGaTJJwVH"
+      },
+      "outputs": [],
+      "source": [
+        "from collections import defaultdict\n",
+        "from pathlib import Path\n",
+        "from copy import deepcopy\n",
+        "import random\n",
+        "\n",
+        "from ipywidgets import interact, fixed, widgets\n",
+        "import pandas as pd\n",
+        "import matplotlib.pyplot as plt\n",
+        "from rdkit import Chem, Geometry\n",
+        "from rdkit.Chem import AllChem\n",
+        "from rdkit.Chem import Draw\n",
+        "from rdkit.Chem import rdFMCS\n",
+        "from rdkit.Chem import PandasTools\n",
+        "\n",
+        "from teachopencadd.utils import seed_everything\n",
+        "\n",
+        "seed_everything()"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "sdf = str(\"molecule_set_largest_cluster.sdf\")\n",
+        "supplier = Chem.ForwardSDMolSupplier(sdf)\n",
+        "mols = list(supplier)\n",
+        "\n",
+        "print(f\"Set with {len(mols)} molecules loaded.\")\n",
+        "# NBVAL_CHECK_OUTPUT"
+      ],
+      "metadata": {
+        "id": "e-qUzpDaKy7k"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Show only first 10 molecules -- use slicing\n",
+        "num_mols = 10\n",
+        "legends = [mol.GetProp(\"_Name\") for mol in mols]\n",
+        "Draw.MolsToGridImage(mols[:num_mols], legends=legends[:num_mols], molsPerRow=5)"
+      ],
+      "metadata": {
+        "id": "Dit2D2WWMXQn"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "mcs1 = rdFMCS.FindMCS(mols)\n",
+        "print(f\"MCS1 contains {mcs1.numAtoms} atoms and {mcs1.numBonds} bonds.\")\n",
+        "print(\"MCS SMARTS string:\", mcs1.smartsString)\n",
+        "# NBVAL_CHECK_OUTPUT"
+      ],
+      "metadata": {
+        "id": "VGgIkru-Md-E"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Draw substructure from Smarts\n",
+        "m1 = Chem.MolFromSmarts(mcs1.smartsString)\n",
+        "Draw.MolToImage(m1, legend=\"MCS1\")"
+      ],
+      "metadata": {
+        "id": "uL0Wkqq5MhDc"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "def highlight_molecules(molecules, mcs, number, label=True, same_orientation=True, **kwargs):\n",
+        "    \"\"\"Highlight the MCS in our query molecules\"\"\"\n",
+        "    molecules = deepcopy(molecules)\n",
+        "    # convert MCS to molecule\n",
+        "    pattern = Chem.MolFromSmarts(mcs.smartsString)\n",
+        "    # find the matching atoms in each molecule\n",
+        "    matching = [molecule.GetSubstructMatch(pattern) for molecule in molecules[:number]]\n",
+        "\n",
+        "    legends = None\n",
+        "    if label is True:\n",
+        "        legends = [x.GetProp(\"_Name\") for x in molecules]\n",
+        "\n",
+        "    # Align by matched substructure so they are depicted in the same orientation\n",
+        "    # Adapted from: https://gist.github.com/greglandrum/82d9a86acb3b00d3bb1df502779a5810\n",
+        "    if same_orientation:\n",
+        "        mol, match = molecules[0], matching[0]\n",
+        "        AllChem.Compute2DCoords(mol)\n",
+        "        coords = [mol.GetConformer().GetAtomPosition(x) for x in match]\n",
+        "        coords2D = [Geometry.Point2D(pt.x, pt.y) for pt in coords]\n",
+        "        for mol, match in zip(molecules[1:number], matching[1:number]):\n",
+        "            if not match:\n",
+        "                continue\n",
+        "            coord_dict = {match[i]: coord for i, coord in enumerate(coords2D)}\n",
+        "            AllChem.Compute2DCoords(mol, coordMap=coord_dict)\n",
+        "\n",
+        "    return Draw.MolsToGridImage(\n",
+        "        molecules[:number],\n",
+        "        legends=legends,\n",
+        "        molsPerRow=5,\n",
+        "        highlightAtomLists=matching[:number],\n",
+        "        subImgSize=(200, 200),\n",
+        "        **kwargs,\n",
+        "    )"
+      ],
+      "metadata": {
+        "id": "VAO9Iw0PMmvV"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "highlight_molecules(mols, mcs1, 5)"
+      ],
+      "metadata": {
+        "id": "BTjkrtL3Mpgt"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "img = highlight_molecules(mols, mcs1, 3, useSVG=True)\n",
+        "\n",
+        "# Get SVG data\n",
+        "molsvg = img\n",
+        "\n",
+        "# Set background to transparent & Enlarge size of label\n",
+        "molsvg = molsvg.replace(\"opacity:1.0\", \"opacity:0.0\").replace(\"12px\", \"20px\")\n",
+        "\n",
+        "# Save altered SVG data to file\n",
+        "with open(\"mcs_largest_cluster.svg\", \"w\") as f:\n",
+        "    f.write(molsvg)"
+      ],
+      "metadata": {
+        "id": "wYN0esqNMt5G"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "mcs2 = rdFMCS.FindMCS(mols, threshold=0.8)\n",
+        "print(f\"MCS2 contains {mcs2.numAtoms} atoms and {mcs2.numBonds} bonds.\")\n",
+        "print(\"SMARTS string:\", mcs2.smartsString)\n",
+        "# NBVAL_CHECK_OUTPUT"
+      ],
+      "metadata": {
+        "id": "rIv0XWKvM6-F"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Draw substructure\n",
+        "m2 = Chem.MolFromSmarts(mcs2.smartsString)\n",
+        "Draw.MolsToGridImage([m1, m2], legends=[\"MCS1\", \"MCS2: +threshold=0.8\"])"
+      ],
+      "metadata": {
+        "id": "3mOA_rBfM9ME"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "highlight_molecules(mols, mcs2, 5)"
+      ],
+      "metadata": {
+        "id": "s0qi1pmUM_vX"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "mcs3 = rdFMCS.FindMCS(mols, threshold=0.8, ringMatchesRingOnly=True)\n",
+        "print(f\"MCS3 contains {mcs3.numAtoms} atoms and {mcs3.numBonds} bonds.\")\n",
+        "print(\"SMARTS string:\", mcs3.smartsString)\n",
+        "# NBVAL_CHECK_OUTPUT"
+      ],
+      "metadata": {
+        "id": "CJC7xVA7PsxQ"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Draw substructure\n",
+        "m3 = Chem.MolFromSmarts(mcs3.smartsString)\n",
+        "Draw.MolsToGridImage([m1, m2, m3], legends=[\"MCS1\", \"MCS2: +treshold=0.8\", \"mcs3: +ringmatch\"])"
+      ],
+      "metadata": {
+        "id": "yN1nkyZOPvd3"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "highlight_molecules(mols, mcs3, 5)"
+      ],
+      "metadata": {
+        "id": "x7r94cNGP6BG"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Read full EGFR data\n",
+        "mol_df = pd.read_csv(\"TNFB_compounds.csv\", index_col=0)\n",
+        "print(\"Total number of compounds:\", mol_df.shape[0])\n",
+        "\n",
+        "# Only keep molecules with pIC50 > 9 (IC50 > 1nM)\n",
+        "mol_df = mol_df[mol_df.pIC50 > 4]\n",
+        "print(\"Number of compounds with pIC50 > 9:\", mol_df.shape[0])\n",
+        "# NBVAL_CHECK_OUTPUT"
+      ],
+      "metadata": {
+        "id": "-dlKTG31P_sc"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Add molecule column to data frame\n",
+        "PandasTools.AddMoleculeColumnToFrame(mol_df, \"smiles\")\n",
+        "mol_df.head(3)"
+      ],
+      "metadata": {
+        "id": "D6KzVIQ1QOtJ"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "mols_diverse = []\n",
+        "# Note: discarded variables we do not care about are usually referred to with a single underscore\n",
+        "for _, row in mol_df.iterrows():\n",
+        "    m = Chem.MolFromSmiles(row.smiles)\n",
+        "    m.SetProp(\"_Name\", row.molecule_chembl_id)\n",
+        "    mols_diverse.append(m)"
+      ],
+      "metadata": {
+        "id": "UeucS0xBQSAM"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# We have fixed the random seed above (imports) for deterministic results\n",
+        "mols_diverse_sample = random.sample(mols_diverse, 40)"
+      ],
+      "metadata": {
+        "id": "MALRoGkxQUpE"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "threshold_diverse = 0.3\n",
+        "mcs1 = rdFMCS.FindMCS(mols_diverse_sample)\n",
+        "print(\"SMARTS string1:\", mcs1.smartsString)\n",
+        "mcs2 = rdFMCS.FindMCS(mols_diverse_sample, threshold=threshold_diverse)\n",
+        "print(\"SMARTS string2:\", mcs2.smartsString)\n",
+        "mcs3 = rdFMCS.FindMCS(mols_diverse_sample, ringMatchesRingOnly=True, threshold=threshold_diverse)\n",
+        "print(\"SMARTS string3:\", mcs3.smartsString)\n",
+        "# NBVAL_CHECK_OUTPUT"
+      ],
+      "metadata": {
+        "id": "WfqFVH3eQYjU"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Draw substructures\n",
+        "m1 = Chem.MolFromSmarts(mcs1.smartsString)\n",
+        "m2 = Chem.MolFromSmarts(mcs2.smartsString)\n",
+        "m3 = Chem.MolFromSmarts(mcs3.smartsString)\n",
+        "\n",
+        "Draw.MolsToGridImage(\n",
+        "    [m1, m2, m3],\n",
+        "    legends=[\n",
+        "        \"MCS1\",\n",
+        "        f\"MCS2: +threshold={threshold_diverse}\",\n",
+        "        \"MCS3: +ringmatch\",\n",
+        "    ],\n",
+        ")"
+      ],
+      "metadata": {
+        "id": "XWBTeg56Qm2Y"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "highlight_molecules(mols_diverse_sample, mcs3, 10)"
+      ],
+      "metadata": {
+        "id": "uTLyILInQqpC"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "def render_mcs(molecules, percentage):\n",
+        "    \"\"\"Interactive widget helper. `molecules` must be wrapped in `ipywidgets.fixed()`,\n",
+        "    while `percentage` will be determined by an IntSlider widget.\"\"\"\n",
+        "    tmcs = rdFMCS.FindMCS(molecules, threshold=percentage / 100.0)\n",
+        "    if tmcs is None:\n",
+        "        print(\"No MCS found\")\n",
+        "        return None\n",
+        "\n",
+        "    m = Chem.MolFromSmarts(tmcs.smartsString)\n",
+        "    print(tmcs.smartsString)\n",
+        "    return m"
+      ],
+      "metadata": {
+        "id": "cXBgRsBrQx93"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Note that the slider may take a few seconds to react\n",
+        "interact(\n",
+        "    render_mcs,\n",
+        "    molecules=fixed(mols_diverse_sample),\n",
+        "    percentage=widgets.IntSlider(min=0, max=100, step=10, value=70),\n",
+        ");"
+      ],
+      "metadata": {
+        "id": "uUarjGwyQ1O1"
+      },
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}
\ No newline at end of file