--- a +++ b/ML_6.ipynb @@ -0,0 +1,435 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "authorship_tag": "ABX9TyMr0hWVdgzD2y+mSWheMNkb", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "<a href=\"https://colab.research.google.com/github/francescopatane96/Computer_aided_drug_discovery_kit/blob/main/ML_6.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" + ] + }, + { + "cell_type": "code", + "source": [ + "!pip install rdkit" + ], + "metadata": { + "id": "_6naBpRYKt9E" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!pip install git+https://github.com/volkamerlab/teachopencadd" + ], + "metadata": { + "id": "NoozFkbcLIz-" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "nuFhGaTJJwVH" + }, + "outputs": [], + "source": [ + "from collections import defaultdict\n", + "from pathlib import Path\n", + "from copy import deepcopy\n", + "import random\n", + "\n", + "from ipywidgets import interact, fixed, widgets\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "from rdkit import Chem, Geometry\n", + "from rdkit.Chem import AllChem\n", + "from rdkit.Chem import Draw\n", + "from rdkit.Chem import rdFMCS\n", + "from rdkit.Chem import PandasTools\n", + "\n", + "from teachopencadd.utils import seed_everything\n", + "\n", + "seed_everything()" + ] + }, + { + "cell_type": "code", + "source": [ + "sdf = str(\"molecule_set_largest_cluster.sdf\")\n", + "supplier = Chem.ForwardSDMolSupplier(sdf)\n", + "mols = list(supplier)\n", + "\n", + "print(f\"Set with {len(mols)} molecules loaded.\")\n", + "# NBVAL_CHECK_OUTPUT" + ], + "metadata": { + "id": "e-qUzpDaKy7k" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Show only first 10 molecules -- use slicing\n", + "num_mols = 10\n", + "legends = [mol.GetProp(\"_Name\") for mol in mols]\n", + "Draw.MolsToGridImage(mols[:num_mols], legends=legends[:num_mols], molsPerRow=5)" + ], + "metadata": { + "id": "Dit2D2WWMXQn" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "mcs1 = rdFMCS.FindMCS(mols)\n", + "print(f\"MCS1 contains {mcs1.numAtoms} atoms and {mcs1.numBonds} bonds.\")\n", + "print(\"MCS SMARTS string:\", mcs1.smartsString)\n", + "# NBVAL_CHECK_OUTPUT" + ], + "metadata": { + "id": "VGgIkru-Md-E" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Draw substructure from Smarts\n", + "m1 = Chem.MolFromSmarts(mcs1.smartsString)\n", + "Draw.MolToImage(m1, legend=\"MCS1\")" + ], + "metadata": { + "id": "uL0Wkqq5MhDc" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "def highlight_molecules(molecules, mcs, number, label=True, same_orientation=True, **kwargs):\n", + " \"\"\"Highlight the MCS in our query molecules\"\"\"\n", + " molecules = deepcopy(molecules)\n", + " # convert MCS to molecule\n", + " pattern = Chem.MolFromSmarts(mcs.smartsString)\n", + " # find the matching atoms in each molecule\n", + " matching = [molecule.GetSubstructMatch(pattern) for molecule in molecules[:number]]\n", + "\n", + " legends = None\n", + " if label is True:\n", + " legends = [x.GetProp(\"_Name\") for x in molecules]\n", + "\n", + " # Align by matched substructure so they are depicted in the same orientation\n", + " # Adapted from: https://gist.github.com/greglandrum/82d9a86acb3b00d3bb1df502779a5810\n", + " if same_orientation:\n", + " mol, match = molecules[0], matching[0]\n", + " AllChem.Compute2DCoords(mol)\n", + " coords = [mol.GetConformer().GetAtomPosition(x) for x in match]\n", + " coords2D = [Geometry.Point2D(pt.x, pt.y) for pt in coords]\n", + " for mol, match in zip(molecules[1:number], matching[1:number]):\n", + " if not match:\n", + " continue\n", + " coord_dict = {match[i]: coord for i, coord in enumerate(coords2D)}\n", + " AllChem.Compute2DCoords(mol, coordMap=coord_dict)\n", + "\n", + " return Draw.MolsToGridImage(\n", + " molecules[:number],\n", + " legends=legends,\n", + " molsPerRow=5,\n", + " highlightAtomLists=matching[:number],\n", + " subImgSize=(200, 200),\n", + " **kwargs,\n", + " )" + ], + "metadata": { + "id": "VAO9Iw0PMmvV" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "highlight_molecules(mols, mcs1, 5)" + ], + "metadata": { + "id": "BTjkrtL3Mpgt" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "img = highlight_molecules(mols, mcs1, 3, useSVG=True)\n", + "\n", + "# Get SVG data\n", + "molsvg = img\n", + "\n", + "# Set background to transparent & Enlarge size of label\n", + "molsvg = molsvg.replace(\"opacity:1.0\", \"opacity:0.0\").replace(\"12px\", \"20px\")\n", + "\n", + "# Save altered SVG data to file\n", + "with open(\"mcs_largest_cluster.svg\", \"w\") as f:\n", + " f.write(molsvg)" + ], + "metadata": { + "id": "wYN0esqNMt5G" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "mcs2 = rdFMCS.FindMCS(mols, threshold=0.8)\n", + "print(f\"MCS2 contains {mcs2.numAtoms} atoms and {mcs2.numBonds} bonds.\")\n", + "print(\"SMARTS string:\", mcs2.smartsString)\n", + "# NBVAL_CHECK_OUTPUT" + ], + "metadata": { + "id": "rIv0XWKvM6-F" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Draw substructure\n", + "m2 = Chem.MolFromSmarts(mcs2.smartsString)\n", + "Draw.MolsToGridImage([m1, m2], legends=[\"MCS1\", \"MCS2: +threshold=0.8\"])" + ], + "metadata": { + "id": "3mOA_rBfM9ME" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "highlight_molecules(mols, mcs2, 5)" + ], + "metadata": { + "id": "s0qi1pmUM_vX" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "mcs3 = rdFMCS.FindMCS(mols, threshold=0.8, ringMatchesRingOnly=True)\n", + "print(f\"MCS3 contains {mcs3.numAtoms} atoms and {mcs3.numBonds} bonds.\")\n", + "print(\"SMARTS string:\", mcs3.smartsString)\n", + "# NBVAL_CHECK_OUTPUT" + ], + "metadata": { + "id": "CJC7xVA7PsxQ" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Draw substructure\n", + "m3 = Chem.MolFromSmarts(mcs3.smartsString)\n", + "Draw.MolsToGridImage([m1, m2, m3], legends=[\"MCS1\", \"MCS2: +treshold=0.8\", \"mcs3: +ringmatch\"])" + ], + "metadata": { + "id": "yN1nkyZOPvd3" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "highlight_molecules(mols, mcs3, 5)" + ], + "metadata": { + "id": "x7r94cNGP6BG" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Read full EGFR data\n", + "mol_df = pd.read_csv(\"TNFB_compounds.csv\", index_col=0)\n", + "print(\"Total number of compounds:\", mol_df.shape[0])\n", + "\n", + "# Only keep molecules with pIC50 > 9 (IC50 > 1nM)\n", + "mol_df = mol_df[mol_df.pIC50 > 4]\n", + "print(\"Number of compounds with pIC50 > 9:\", mol_df.shape[0])\n", + "# NBVAL_CHECK_OUTPUT" + ], + "metadata": { + "id": "-dlKTG31P_sc" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Add molecule column to data frame\n", + "PandasTools.AddMoleculeColumnToFrame(mol_df, \"smiles\")\n", + "mol_df.head(3)" + ], + "metadata": { + "id": "D6KzVIQ1QOtJ" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "mols_diverse = []\n", + "# Note: discarded variables we do not care about are usually referred to with a single underscore\n", + "for _, row in mol_df.iterrows():\n", + " m = Chem.MolFromSmiles(row.smiles)\n", + " m.SetProp(\"_Name\", row.molecule_chembl_id)\n", + " mols_diverse.append(m)" + ], + "metadata": { + "id": "UeucS0xBQSAM" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# We have fixed the random seed above (imports) for deterministic results\n", + "mols_diverse_sample = random.sample(mols_diverse, 40)" + ], + "metadata": { + "id": "MALRoGkxQUpE" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "threshold_diverse = 0.3\n", + "mcs1 = rdFMCS.FindMCS(mols_diverse_sample)\n", + "print(\"SMARTS string1:\", mcs1.smartsString)\n", + "mcs2 = rdFMCS.FindMCS(mols_diverse_sample, threshold=threshold_diverse)\n", + "print(\"SMARTS string2:\", mcs2.smartsString)\n", + "mcs3 = rdFMCS.FindMCS(mols_diverse_sample, ringMatchesRingOnly=True, threshold=threshold_diverse)\n", + "print(\"SMARTS string3:\", mcs3.smartsString)\n", + "# NBVAL_CHECK_OUTPUT" + ], + "metadata": { + "id": "WfqFVH3eQYjU" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Draw substructures\n", + "m1 = Chem.MolFromSmarts(mcs1.smartsString)\n", + "m2 = Chem.MolFromSmarts(mcs2.smartsString)\n", + "m3 = Chem.MolFromSmarts(mcs3.smartsString)\n", + "\n", + "Draw.MolsToGridImage(\n", + " [m1, m2, m3],\n", + " legends=[\n", + " \"MCS1\",\n", + " f\"MCS2: +threshold={threshold_diverse}\",\n", + " \"MCS3: +ringmatch\",\n", + " ],\n", + ")" + ], + "metadata": { + "id": "XWBTeg56Qm2Y" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "highlight_molecules(mols_diverse_sample, mcs3, 10)" + ], + "metadata": { + "id": "uTLyILInQqpC" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "def render_mcs(molecules, percentage):\n", + " \"\"\"Interactive widget helper. `molecules` must be wrapped in `ipywidgets.fixed()`,\n", + " while `percentage` will be determined by an IntSlider widget.\"\"\"\n", + " tmcs = rdFMCS.FindMCS(molecules, threshold=percentage / 100.0)\n", + " if tmcs is None:\n", + " print(\"No MCS found\")\n", + " return None\n", + "\n", + " m = Chem.MolFromSmarts(tmcs.smartsString)\n", + " print(tmcs.smartsString)\n", + " return m" + ], + "metadata": { + "id": "cXBgRsBrQx93" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Note that the slider may take a few seconds to react\n", + "interact(\n", + " render_mcs,\n", + " molecules=fixed(mols_diverse_sample),\n", + " percentage=widgets.IntSlider(min=0, max=100, step=10, value=70),\n", + ");" + ], + "metadata": { + "id": "uUarjGwyQ1O1" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file