435 lines (435 with data), 13.0 kB
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"authorship_tag": "ABX9TyMr0hWVdgzD2y+mSWheMNkb",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/github/francescopatane96/Computer_aided_drug_discovery_kit/blob/main/ML_6.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"source": [
"!pip install rdkit"
],
"metadata": {
"id": "_6naBpRYKt9E"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"!pip install git+https://github.com/volkamerlab/teachopencadd"
],
"metadata": {
"id": "NoozFkbcLIz-"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "nuFhGaTJJwVH"
},
"outputs": [],
"source": [
"from collections import defaultdict\n",
"from pathlib import Path\n",
"from copy import deepcopy\n",
"import random\n",
"\n",
"from ipywidgets import interact, fixed, widgets\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"from rdkit import Chem, Geometry\n",
"from rdkit.Chem import AllChem\n",
"from rdkit.Chem import Draw\n",
"from rdkit.Chem import rdFMCS\n",
"from rdkit.Chem import PandasTools\n",
"\n",
"from teachopencadd.utils import seed_everything\n",
"\n",
"seed_everything()"
]
},
{
"cell_type": "code",
"source": [
"sdf = str(\"molecule_set_largest_cluster.sdf\")\n",
"supplier = Chem.ForwardSDMolSupplier(sdf)\n",
"mols = list(supplier)\n",
"\n",
"print(f\"Set with {len(mols)} molecules loaded.\")\n",
"# NBVAL_CHECK_OUTPUT"
],
"metadata": {
"id": "e-qUzpDaKy7k"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Show only first 10 molecules -- use slicing\n",
"num_mols = 10\n",
"legends = [mol.GetProp(\"_Name\") for mol in mols]\n",
"Draw.MolsToGridImage(mols[:num_mols], legends=legends[:num_mols], molsPerRow=5)"
],
"metadata": {
"id": "Dit2D2WWMXQn"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"mcs1 = rdFMCS.FindMCS(mols)\n",
"print(f\"MCS1 contains {mcs1.numAtoms} atoms and {mcs1.numBonds} bonds.\")\n",
"print(\"MCS SMARTS string:\", mcs1.smartsString)\n",
"# NBVAL_CHECK_OUTPUT"
],
"metadata": {
"id": "VGgIkru-Md-E"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Draw substructure from Smarts\n",
"m1 = Chem.MolFromSmarts(mcs1.smartsString)\n",
"Draw.MolToImage(m1, legend=\"MCS1\")"
],
"metadata": {
"id": "uL0Wkqq5MhDc"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def highlight_molecules(molecules, mcs, number, label=True, same_orientation=True, **kwargs):\n",
" \"\"\"Highlight the MCS in our query molecules\"\"\"\n",
" molecules = deepcopy(molecules)\n",
" # convert MCS to molecule\n",
" pattern = Chem.MolFromSmarts(mcs.smartsString)\n",
" # find the matching atoms in each molecule\n",
" matching = [molecule.GetSubstructMatch(pattern) for molecule in molecules[:number]]\n",
"\n",
" legends = None\n",
" if label is True:\n",
" legends = [x.GetProp(\"_Name\") for x in molecules]\n",
"\n",
" # Align by matched substructure so they are depicted in the same orientation\n",
" # Adapted from: https://gist.github.com/greglandrum/82d9a86acb3b00d3bb1df502779a5810\n",
" if same_orientation:\n",
" mol, match = molecules[0], matching[0]\n",
" AllChem.Compute2DCoords(mol)\n",
" coords = [mol.GetConformer().GetAtomPosition(x) for x in match]\n",
" coords2D = [Geometry.Point2D(pt.x, pt.y) for pt in coords]\n",
" for mol, match in zip(molecules[1:number], matching[1:number]):\n",
" if not match:\n",
" continue\n",
" coord_dict = {match[i]: coord for i, coord in enumerate(coords2D)}\n",
" AllChem.Compute2DCoords(mol, coordMap=coord_dict)\n",
"\n",
" return Draw.MolsToGridImage(\n",
" molecules[:number],\n",
" legends=legends,\n",
" molsPerRow=5,\n",
" highlightAtomLists=matching[:number],\n",
" subImgSize=(200, 200),\n",
" **kwargs,\n",
" )"
],
"metadata": {
"id": "VAO9Iw0PMmvV"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"highlight_molecules(mols, mcs1, 5)"
],
"metadata": {
"id": "BTjkrtL3Mpgt"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"img = highlight_molecules(mols, mcs1, 3, useSVG=True)\n",
"\n",
"# Get SVG data\n",
"molsvg = img\n",
"\n",
"# Set background to transparent & Enlarge size of label\n",
"molsvg = molsvg.replace(\"opacity:1.0\", \"opacity:0.0\").replace(\"12px\", \"20px\")\n",
"\n",
"# Save altered SVG data to file\n",
"with open(\"mcs_largest_cluster.svg\", \"w\") as f:\n",
" f.write(molsvg)"
],
"metadata": {
"id": "wYN0esqNMt5G"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"mcs2 = rdFMCS.FindMCS(mols, threshold=0.8)\n",
"print(f\"MCS2 contains {mcs2.numAtoms} atoms and {mcs2.numBonds} bonds.\")\n",
"print(\"SMARTS string:\", mcs2.smartsString)\n",
"# NBVAL_CHECK_OUTPUT"
],
"metadata": {
"id": "rIv0XWKvM6-F"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Draw substructure\n",
"m2 = Chem.MolFromSmarts(mcs2.smartsString)\n",
"Draw.MolsToGridImage([m1, m2], legends=[\"MCS1\", \"MCS2: +threshold=0.8\"])"
],
"metadata": {
"id": "3mOA_rBfM9ME"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"highlight_molecules(mols, mcs2, 5)"
],
"metadata": {
"id": "s0qi1pmUM_vX"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"mcs3 = rdFMCS.FindMCS(mols, threshold=0.8, ringMatchesRingOnly=True)\n",
"print(f\"MCS3 contains {mcs3.numAtoms} atoms and {mcs3.numBonds} bonds.\")\n",
"print(\"SMARTS string:\", mcs3.smartsString)\n",
"# NBVAL_CHECK_OUTPUT"
],
"metadata": {
"id": "CJC7xVA7PsxQ"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Draw substructure\n",
"m3 = Chem.MolFromSmarts(mcs3.smartsString)\n",
"Draw.MolsToGridImage([m1, m2, m3], legends=[\"MCS1\", \"MCS2: +treshold=0.8\", \"mcs3: +ringmatch\"])"
],
"metadata": {
"id": "yN1nkyZOPvd3"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"highlight_molecules(mols, mcs3, 5)"
],
"metadata": {
"id": "x7r94cNGP6BG"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Read full EGFR data\n",
"mol_df = pd.read_csv(\"TNFB_compounds.csv\", index_col=0)\n",
"print(\"Total number of compounds:\", mol_df.shape[0])\n",
"\n",
"# Only keep molecules with pIC50 > 9 (IC50 > 1nM)\n",
"mol_df = mol_df[mol_df.pIC50 > 4]\n",
"print(\"Number of compounds with pIC50 > 9:\", mol_df.shape[0])\n",
"# NBVAL_CHECK_OUTPUT"
],
"metadata": {
"id": "-dlKTG31P_sc"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Add molecule column to data frame\n",
"PandasTools.AddMoleculeColumnToFrame(mol_df, \"smiles\")\n",
"mol_df.head(3)"
],
"metadata": {
"id": "D6KzVIQ1QOtJ"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"mols_diverse = []\n",
"# Note: discarded variables we do not care about are usually referred to with a single underscore\n",
"for _, row in mol_df.iterrows():\n",
" m = Chem.MolFromSmiles(row.smiles)\n",
" m.SetProp(\"_Name\", row.molecule_chembl_id)\n",
" mols_diverse.append(m)"
],
"metadata": {
"id": "UeucS0xBQSAM"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# We have fixed the random seed above (imports) for deterministic results\n",
"mols_diverse_sample = random.sample(mols_diverse, 40)"
],
"metadata": {
"id": "MALRoGkxQUpE"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"threshold_diverse = 0.3\n",
"mcs1 = rdFMCS.FindMCS(mols_diverse_sample)\n",
"print(\"SMARTS string1:\", mcs1.smartsString)\n",
"mcs2 = rdFMCS.FindMCS(mols_diverse_sample, threshold=threshold_diverse)\n",
"print(\"SMARTS string2:\", mcs2.smartsString)\n",
"mcs3 = rdFMCS.FindMCS(mols_diverse_sample, ringMatchesRingOnly=True, threshold=threshold_diverse)\n",
"print(\"SMARTS string3:\", mcs3.smartsString)\n",
"# NBVAL_CHECK_OUTPUT"
],
"metadata": {
"id": "WfqFVH3eQYjU"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Draw substructures\n",
"m1 = Chem.MolFromSmarts(mcs1.smartsString)\n",
"m2 = Chem.MolFromSmarts(mcs2.smartsString)\n",
"m3 = Chem.MolFromSmarts(mcs3.smartsString)\n",
"\n",
"Draw.MolsToGridImage(\n",
" [m1, m2, m3],\n",
" legends=[\n",
" \"MCS1\",\n",
" f\"MCS2: +threshold={threshold_diverse}\",\n",
" \"MCS3: +ringmatch\",\n",
" ],\n",
")"
],
"metadata": {
"id": "XWBTeg56Qm2Y"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"highlight_molecules(mols_diverse_sample, mcs3, 10)"
],
"metadata": {
"id": "uTLyILInQqpC"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def render_mcs(molecules, percentage):\n",
" \"\"\"Interactive widget helper. `molecules` must be wrapped in `ipywidgets.fixed()`,\n",
" while `percentage` will be determined by an IntSlider widget.\"\"\"\n",
" tmcs = rdFMCS.FindMCS(molecules, threshold=percentage / 100.0)\n",
" if tmcs is None:\n",
" print(\"No MCS found\")\n",
" return None\n",
"\n",
" m = Chem.MolFromSmarts(tmcs.smartsString)\n",
" print(tmcs.smartsString)\n",
" return m"
],
"metadata": {
"id": "cXBgRsBrQx93"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Note that the slider may take a few seconds to react\n",
"interact(\n",
" render_mcs,\n",
" molecules=fixed(mols_diverse_sample),\n",
" percentage=widgets.IntSlider(min=0, max=100, step=10, value=70),\n",
");"
],
"metadata": {
"id": "uUarjGwyQ1O1"
},
"execution_count": null,
"outputs": []
}
]
}