Diff of /ML_3.ipynb [000000] .. [480cb7]

Switch to side-by-side view

--- a
+++ b/ML_3.ipynb
@@ -0,0 +1,310 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": [],
+      "authorship_tag": "ABX9TyM1dCOzGN9zMcL1Nj+fE3Vk",
+      "include_colab_link": true
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "view-in-github",
+        "colab_type": "text"
+      },
+      "source": [
+        "<a href=\"https://colab.research.google.com/github/francescopatane96/Computer_aided_drug_discovery_kit/blob/main/ML_3.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Unwanted substructures:\n",
+        "substructures can be reactive or toxic or they can interfere with certain assays. Filtering unwanted substructures can support assembling more efficient screening libraries, which can save time and resources.\n",
+        "\n",
+        "Examples of such unwanted features are nitro groups (mutagenic), sulfates and phosphates (likely resulting in unfavorable pharmacokinetic properties), 2-halopyridines and thiols (reactive). \n",
+        "\n",
+        "Pan Assay Interference Compounds (PAINS):\n",
+        "PAINS are compounds that often occur as hits in HTS even though they actually are false positives. PAINS show activity at numerous targets rather than one specific target."
+      ],
+      "metadata": {
+        "id": "pXWQZuaa19_j"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!pip install rdkit"
+      ],
+      "metadata": {
+        "id": "5Ebl1uvs2Vzz"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "eYx9Zu8n0-5g"
+      },
+      "outputs": [],
+      "source": [
+        "from pathlib import Path\n",
+        "\n",
+        "import pandas as pd\n",
+        "from tqdm.auto import tqdm\n",
+        "from rdkit import Chem\n",
+        "from rdkit.Chem import PandasTools\n",
+        "from rdkit.Chem.FilterCatalog import FilterCatalog, FilterCatalogParams"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# load data from Talktorial T2\n",
+        "TNFB_data = pd.read_csv(\n",
+        "    \"TNFB_compounds_lipinski.csv\",\n",
+        "    index_col=0,\n",
+        ")\n",
+        "# Drop unnecessary information\n",
+        "print(\"Dataframe shape:\", TNFB_data.shape)\n",
+        "TNFB_data.drop(columns=[\"molecular_weight\", \"n_hbd\", \"n_hba\", \"logp\"], inplace=True)\n",
+        "TNFB_data.head()"
+      ],
+      "metadata": {
+        "id": "5wN1gfYL2fEr"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Add molecule column\n",
+        "PandasTools.AddMoleculeColumnToFrame(TNFB_data, smilesCol=\"smiles\")\n",
+        "# Draw first 3 molecules\n",
+        "Chem.Draw.MolsToGridImage(\n",
+        "    list(TNFB_data.head(3).ROMol),\n",
+        "    legends=list(TNFB_data.head(3).molecule_chembl_id),\n",
+        ")"
+      ],
+      "metadata": {
+        "id": "rzsqw1ha3Ha8"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Filter for PAINS"
+      ],
+      "metadata": {
+        "id": "u2dRkR2R3epc"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# initialize filter\n",
+        "params = FilterCatalogParams()\n",
+        "params.AddCatalog(FilterCatalogParams.FilterCatalogs.PAINS)\n",
+        "catalog = FilterCatalog(params)"
+      ],
+      "metadata": {
+        "id": "mtlZeOxU3hJj"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# search for PAINS\n",
+        "matches = []\n",
+        "clean = []\n",
+        "for index, row in tqdm(TNFB_data.iterrows(), total=TNFB_data.shape[0]):\n",
+        "    molecule = Chem.MolFromSmiles(row.smiles)\n",
+        "    entry = catalog.GetFirstMatch(molecule)  # Get the first matching PAINS\n",
+        "    if entry is not None:\n",
+        "        # store PAINS information\n",
+        "        matches.append(\n",
+        "            {\n",
+        "                \"chembl_id\": row.molecule_chembl_id,\n",
+        "                \"rdkit_molecule\": molecule,\n",
+        "                \"pains\": entry.GetDescription().capitalize(),\n",
+        "            }\n",
+        "        )\n",
+        "    else:\n",
+        "        # collect indices of molecules without PAINS\n",
+        "        clean.append(index)\n",
+        "\n",
+        "matches = pd.DataFrame(matches)\n",
+        "TNFB_data = TNFB_data.loc[clean]  # keep molecules without PAINS"
+      ],
+      "metadata": {
+        "id": "X2kiOCOo3oe7"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "print(f\"Number of compounds with PAINS: {len(matches)}\")\n",
+        "print(f\"Number of compounds without PAINS: {len(TNFB_data)}\")"
+      ],
+      "metadata": {
+        "id": "msNTp6h_3_g6"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "Chem.Draw.MolsToGridImage(\n",
+        "    list(matches.head(5).rdkit_molecule),\n",
+        "    legends=list(matches.head(5)[\"pains\"]),\n",
+        ")"
+      ],
+      "metadata": {
+        "id": "zAsR5xdo4Hy7"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Filter and highlight unwanted substructures"
+      ],
+      "metadata": {
+        "id": "hNbb8xcZ4ZCa"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "substructures = pd.read_csv(\"unwantedSubstructures.csv\", sep=\"\\s+\")\n",
+        "substructures[\"rdkit_molecule\"] = substructures.smart.apply(Chem.MolFromSmarts)\n",
+        "print(\"Number of unwanted substructures in collection:\", len(substructures))"
+      ],
+      "metadata": {
+        "id": "Zl3Lpz_Q4b3K"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "Chem.Draw.MolsToGridImage(\n",
+        "    mols=substructures.rdkit_molecule.tolist()[2:5],\n",
+        "    \n",
+        ")"
+      ],
+      "metadata": {
+        "id": "8XY7inLq7j5L"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# search for unwanted substructure\n",
+        "matches = []\n",
+        "clean = []\n",
+        "for index, row in tqdm(TNFB_data.iterrows(), total=TNFB_data.shape[0]):\n",
+        "    molecule = Chem.MolFromSmiles(row.smiles)\n",
+        "    match = False\n",
+        "    for _, substructure in substructures.iterrows():\n",
+        "        if molecule.HasSubstructMatch(substructure.rdkit_molecule):\n",
+        "            matches.append(\n",
+        "                {\n",
+        "                    \"chembl_id\": row.molecule_chembl_id,\n",
+        "                    \"rdkit_molecule\": molecule,\n",
+        "                    \"substructure\": substructure.rdkit_molecule,\n",
+        "                    \n",
+        "                }\n",
+        "            )\n",
+        "            match = True\n",
+        "    if not match:\n",
+        "        clean.append(index)\n",
+        "\n",
+        "matches = pd.DataFrame(matches)\n",
+        "TNFB_data = TNFB_data.loc[clean]"
+      ],
+      "metadata": {
+        "id": "pvqK_Sgq946c"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "print(f\"Number of found unwanted substructure: {len(matches)}\")\n",
+        "print(f\"Number of compounds without unwanted substructure: {len(TNFB_data)}\")"
+      ],
+      "metadata": {
+        "id": "0AP-wfa4-RxL"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "to_highlight = [\n",
+        "    row.rdkit_molecule.GetSubstructMatch(row.substructure) for _, row in matches.head(3).iterrows()\n",
+        "]\n",
+        "Chem.Draw.MolsToGridImage(\n",
+        "    list(matches.head(3).rdkit_molecule),\n",
+        "    highlightAtomLists=to_highlight,\n",
+        "    \n",
+        ")"
+      ],
+      "metadata": {
+        "id": "46RQc5Mg-cSS"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Substructure statistics"
+      ],
+      "metadata": {
+        "id": "vHbNd3uc-may"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "\n",
+        "group_frequencies = groups.size()\n",
+        "group_frequencies.sort_values(ascending=False, inplace=True)\n",
+        "group_frequencies.head(10)"
+      ],
+      "metadata": {
+        "id": "XEZb6VBj-myq"
+      },
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}
\ No newline at end of file