{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "authorship_tag": "ABX9TyO/Pafq/n0HsCJL97lqmyPJ",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/francescopatane96/Computer_aided_drug_discovery_kit/blob/main/ML_4.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
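        "# Virtual screening\n",
        "\n",
        "Ligand-based virtual screening: molecules are encoded as MACCS and Morgan fingerprints, compared to a query with Tanimoto and Dice similarity, and the resulting rankings are evaluated with enrichment plots."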
      ],
      "metadata": {
        "id": "IwnuqmRtAmlY"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Use the %pip magic so the package is installed into the running kernel's environment\n",
        "%pip install -q rdkit"
      ],
      "metadata": {
        "id": "wS_ngR5dEm9p"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "PMKUrKwU_N6_"
      },
      "outputs": [],
      "source": [
        "from pathlib import Path\n",
        "\n",
        "import pandas as pd\n",
        "import matplotlib.pyplot as plt\n",
        "from rdkit import Chem, DataStructs\n",
        "from rdkit.Chem import (\n",
        "    PandasTools,\n",
        "    Draw,\n",
        "    Descriptors,\n",
        "    MACCSkeys,\n",
        "    rdFingerprintGenerator,\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Molecules in SMILES format\n",
        "molecule_smiles = [\n",
        "    \"CC1C2C(C3C(C(=O)C(=C(C3(C(=O)C2=C(C4=C1C=CC=C4O)O)O)O)C(=O)N)N(C)C)O\",\n",
        "    \"CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=C(C=C3)O)N)C(=O)O)C\",\n",
        "    \"C1=COC(=C1)CNC2=CC(=C(C=C2C(=O)O)S(=O)(=O)N)Cl\",\n",
        "    \"CCCCCCCCCCCC(=O)OCCOC(=O)CCCCCCCCCCC\",\n",
        "    \"C1NC2=CC(=C(C=C2S(=O)(=O)N1)S(=O)(=O)N)Cl\",\n",
        "    \"CC1=C(C(CCC1)(C)C)C=CC(=CC=CC(=CC(=O)O)C)C\",\n",
        "    \"CC1(C2CC3C(C(=O)C(=C(C3(C(=O)C2=C(C4=C1C=CC=C4O)O)O)O)C(=O)N)N(C)C)O\",\n",
        "    \"CC1C(CC(=O)C2=C1C=CC=C2O)C(=O)O\",\n",
        "]\n",
        "\n",
        "# List of molecule names\n",
        "molecule_names = [\n",
        "    \"Doxycycline\",\n",
        "    \"Amoxicilline\",\n",
        "    \"Furosemide\",\n",
        "    \"Glycol dilaurate\",\n",
        "    \"Hydrochlorothiazide\",\n",
        "    \"Isotretinoin\",\n",
        "    \"Tetracycline\",\n",
        "    \"Hemi-cycline D\",\n",
        "]"
      ],
      "metadata": {
        "id": "9M3RgOqhEvfE"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "molecules = pd.DataFrame({\"smiles\": molecule_smiles, \"name\": molecule_names})\n",
        "PandasTools.AddMoleculeColumnToFrame(molecules, smilesCol=\"smiles\")\n",
        "# Show first 2 molecules\n",
        "molecules.head(2)"
      ],
      "metadata": {
        "id": "iJBPaUowEx7y"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "Draw.MolsToGridImage(\n",
        "    molecules[\"ROMol\"].to_list(),\n",
        "    molsPerRow=3,\n",
        "    subImgSize=(450, 150),\n",
        "    legends=molecules[\"name\"].to_list(),\n",
        ")"
      ],
      "metadata": {
        "id": "NqrhlCJzE3no"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Note -- we use pandas apply function to apply the MolWt function\n",
        "# to all ROMol objects in the DataFrame\n",
        "molecules[\"molecule_weight\"] = molecules.ROMol.apply(Descriptors.MolWt)\n",
        "# Sort molecules by molecular weight\n",
        "molecules.sort_values([\"molecule_weight\"], ascending=False, inplace=True)"
      ],
      "metadata": {
        "id": "L3BUmluyFA24"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Show only SMILES, molecule names and molecular weights\n",
        "molecules[[\"smiles\", \"name\", \"molecule_weight\"]]"
      ],
      "metadata": {
        "id": "4ndk80ByFEHw"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Pass a plain list (as in the first grid above): after sorting, the Series index\n",
        "# is no longer 0..n-1, and a list keeps molecules and legends aligned by position\n",
        "Draw.MolsToGridImage(\n",
        "    molecules[\"ROMol\"].to_list(),\n",
        "    legends=[\n",
        "        f\"{name}: {weight:.2f} Da\"\n",
        "        for name, weight in zip(molecules[\"name\"], molecules[\"molecule_weight\"])\n",
        "    ],\n",
        "    subImgSize=(450, 150),\n",
        "    molsPerRow=3,\n",
        ")"
      ],
      "metadata": {
        "id": "QMDormacFJkh"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "molecule = molecules[\"ROMol\"][0]\n",
        "molecule"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "EdmfQDZKFPmx",
        "outputId": "8d6f0066-28da-47df-c51c-790e158ceaf0"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "<rdkit.Chem.rdchem.Mol at 0x7f7ad3a09670>"
            ]
          },
          "metadata": {},
          "execution_count": 10
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "maccs_fp = MACCSkeys.GenMACCSKeys(molecule)"
      ],
      "metadata": {
        "id": "oYpCd1wnFV4p"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "molecules[\"maccs\"] = molecules.ROMol.apply(MACCSkeys.GenMACCSKeys)"
      ],
      "metadata": {
        "id": "Akm0V6NrFaji"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "circular_int_fp = rdFingerprintGenerator.GetCountFPs([molecule])[0]\n",
        "circular_int_fp"
      ],
      "metadata": {
        "id": "XhIpW23cFgoR"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "print(f\"Print non-zero elements:\\n{circular_int_fp.GetNonzeroElements()}\")"
      ],
      "metadata": {
        "id": "PUdhmhz9FkSB"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Note that the function takes a list as input parameter\n",
        "# (even if we only want to pass one molecule)\n",
        "circular_bit_fp = rdFingerprintGenerator.GetFPs([molecule])[0]\n",
        "circular_bit_fp"
      ],
      "metadata": {
        "id": "5wY0TR5DFocC"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "print(f\"Print top 400 fingerprint bits:\\n{circular_bit_fp.ToBitString()[:400]}\")"
      ],
      "metadata": {
        "id": "Kcc-7JeIFspR"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "molecules[\"morgan\"] = rdFingerprintGenerator.GetFPs(molecules[\"ROMol\"].tolist())"
      ],
      "metadata": {
        "id": "wdt0ZnleFx5q"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Example molecules\n",
        "molecule1 = molecules[\"ROMol\"][0]\n",
        "molecule2 = molecules[\"ROMol\"][1]\n",
        "\n",
        "# Example fingerprints\n",
        "maccs_fp1 = MACCSkeys.GenMACCSKeys(molecule1)\n",
        "maccs_fp2 = MACCSkeys.GenMACCSKeys(molecule2)"
      ],
      "metadata": {
        "id": "-QcPiu3QF3Eh"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "DataStructs.TanimotoSimilarity(maccs_fp1, maccs_fp2)"
      ],
      "metadata": {
        "id": "Y_DmsFphF5vj"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Define molecule query and list\n",
        "molecule_query = molecules[\"maccs\"][0]\n",
        "molecule_list = molecules[\"maccs\"].to_list()\n",
        "# Calculate similarity values between query and list elements\n",
        "molecules[\"tanimoto_maccs\"] = DataStructs.BulkTanimotoSimilarity(molecule_query, molecule_list)\n",
        "molecules[\"dice_maccs\"] = DataStructs.BulkDiceSimilarity(molecule_query, molecule_list)"
      ],
      "metadata": {
        "id": "KXZq_0ghGDyK"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "preview = molecules.sort_values([\"tanimoto_maccs\"], ascending=False).reset_index()\n",
        "preview[[\"name\", \"tanimoto_maccs\", \"dice_maccs\"]]"
      ],
      "metadata": {
        "id": "oLBgsIQDGIrB"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def draw_ranked_molecules(molecules, sort_by_column):\n",
        "    \"\"\"\n",
        "    Draw molecules sorted by a given column.\n",
        "\n",
        "    Parameters\n",
        "    ----------\n",
        "    molecules : pandas.DataFrame\n",
        "        Molecules (with \"ROMol\" and \"name\" columns) and a column to sort by.\n",
        "    sort_by_column : str\n",
        "        Name of the column used to sort the molecules by (descending).\n",
        "\n",
        "    Returns\n",
        "    -------\n",
        "    Draw.MolsToGridImage\n",
        "        2D visualization of sorted molecules, each labelled with its rank,\n",
        "        name and the value of the sort column.\n",
        "    \"\"\"\n",
        "\n",
        "    # Sorting returns a new DataFrame; the caller's frame keeps its order\n",
        "    molecules_sorted = molecules.sort_values([sort_by_column], ascending=False)\n",
        "    # Rank is the 1-based position in the sorted frame\n",
        "    legends = [\n",
        "        f\"#{rank} {name}, similarity={similarity:.2f}\"\n",
        "        for rank, (name, similarity) in enumerate(\n",
        "            zip(molecules_sorted[\"name\"], molecules_sorted[sort_by_column]), start=1\n",
        "        )\n",
        "    ]\n",
        "    return Draw.MolsToGridImage(\n",
        "        molecules_sorted[\"ROMol\"].tolist(),\n",
        "        legends=legends,\n",
        "        molsPerRow=3,\n",
        "        subImgSize=(450, 150),\n",
        "    )"
      ],
      "metadata": {
        "id": "3XGUf57tGRHI"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "draw_ranked_molecules(molecules, \"tanimoto_maccs\")"
      ],
      "metadata": {
        "id": "1GaWEV_xGV8I"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Define molecule query and list\n",
        "molecule_query = molecules[\"morgan\"][0]\n",
        "molecule_list = molecules[\"morgan\"].to_list()\n",
        "# Calculate similarity values between query and list elements\n",
        "molecules[\"tanimoto_morgan\"] = DataStructs.BulkTanimotoSimilarity(molecule_query, molecule_list)\n",
        "molecules[\"dice_morgan\"] = DataStructs.BulkDiceSimilarity(molecule_query, molecule_list)"
      ],
      "metadata": {
        "id": "IODdVQqKGgMB"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "preview = molecules.sort_values([\"tanimoto_morgan\"], ascending=False).reset_index()\n",
        "preview[[\"name\", \"tanimoto_morgan\", \"dice_morgan\", \"tanimoto_maccs\", \"dice_maccs\"]]\n"
      ],
      "metadata": {
        "id": "fivuZHjlGipq"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "draw_ranked_molecules(molecules, \"tanimoto_morgan\")"
      ],
      "metadata": {
        "id": "UHv5-rSKGqbt"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "fig, ax = plt.subplots(figsize=(6, 6))\n",
        "molecules.plot(\"tanimoto_maccs\", \"tanimoto_morgan\", kind=\"scatter\", ax=ax)\n",
        "ax.plot([0, 1], [0, 1], \"k--\")\n",
        "ax.set_xlabel(\"Tanimoto (MACCS)\")\n",
        "ax.set_ylabel(\"Tanimoto (Morgan)\")\n",
        "fig;"
      ],
      "metadata": {
        "id": "3o5w6sDyGwYN"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "molecule_dataset = pd.read_csv(\n",
        "    \"TNFB_compounds_lipinski.csv\",\n",
        "    usecols=[\"molecule_chembl_id\", \"smiles\", \"pIC50\"],\n",
        ")\n",
        "print(f\"Number of molecules in dataset: {len(molecule_dataset)}\")\n",
        "molecule_dataset.head(5)"
      ],
      "metadata": {
        "id": "wH-Q8hyv9XZn"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "query = Chem.MolFromSmiles(\"O=C(Nc1cc(C(F)(F)F)cc(C(F)(F)F)c1)c1cnc(Cl)nc1C(F)(F)F\")\n",
        "query"
      ],
      "metadata": {
        "id": "3WmxmCzj97WQ"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "maccs_fp_query = MACCSkeys.GenMACCSKeys(query)\n",
        "circular_fp_query = rdFingerprintGenerator.GetCountFPs([query])[0]"
      ],
      "metadata": {
        "id": "GkcDaxez-OuP"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "PandasTools.AddMoleculeColumnToFrame(molecule_dataset, \"smiles\")\n",
        "circular_fp_list = rdFingerprintGenerator.GetCountFPs(molecule_dataset[\"ROMol\"].tolist())\n",
        "maccs_fp_list = molecule_dataset[\"ROMol\"].apply(MACCSkeys.GenMACCSKeys).tolist()"
      ],
      "metadata": {
        "id": "n2bzjaPD-SfH"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "molecule_dataset[\"tanimoto_maccs\"] = DataStructs.BulkTanimotoSimilarity(\n",
        "    maccs_fp_query, maccs_fp_list\n",
        ")\n",
        "molecule_dataset[\"tanimoto_morgan\"] = DataStructs.BulkTanimotoSimilarity(\n",
        "    circular_fp_query, circular_fp_list\n",
        ")"
      ],
      "metadata": {
        "id": "fxxsJrac-WAw"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "molecule_dataset[\"dice_maccs\"] = DataStructs.BulkDiceSimilarity(maccs_fp_query, maccs_fp_list)\n",
        "molecule_dataset[\"dice_morgan\"] = DataStructs.BulkDiceSimilarity(\n",
        "    circular_fp_query, circular_fp_list\n",
        ")"
      ],
      "metadata": {
        "id": "PQT4ako_-YoL"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# NBVAL_CHECK_OUTPUT\n",
        "molecule_dataset[\n",
        "    [\"smiles\", \"tanimoto_maccs\", \"tanimoto_morgan\", \"dice_maccs\", \"dice_morgan\"]\n",
        "].head(5)"
      ],
      "metadata": {
        "id": "6ESskO7s-aVb"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Show all columns\n",
        "molecule_dataset.head(3)"
      ],
      "metadata": {
        "id": "4lvx2Tx--fkQ"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "fig, axes = plt.subplots(figsize=(10, 6), nrows=2, ncols=2)\n",
        "molecule_dataset.hist([\"tanimoto_maccs\"], ax=axes[0, 0])\n",
        "molecule_dataset.hist([\"tanimoto_morgan\"], ax=axes[0, 1])\n",
        "molecule_dataset.hist([\"dice_maccs\"], ax=axes[1, 0])\n",
        "molecule_dataset.hist([\"dice_morgan\"], ax=axes[1, 1])\n",
        "axes[1, 0].set_xlabel(\"similarity value\")\n",
        "axes[1, 0].set_ylabel(\"# molecules\")\n",
        "fig;"
      ],
      "metadata": {
        "id": "7mySir-w-kah"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "fig, axes = plt.subplots(figsize=(12, 6), nrows=1, ncols=2)\n",
        "\n",
        "molecule_dataset.plot(\"tanimoto_maccs\", \"dice_maccs\", kind=\"scatter\", ax=axes[0])\n",
        "axes[0].plot([0, 1], [0, 1], \"k--\")\n",
        "axes[0].set_xlabel(\"Tanimoto (MACCS)\")\n",
        "axes[0].set_ylabel(\"Dice (MACCS)\")\n",
        "\n",
        "molecule_dataset.plot(\"tanimoto_morgan\", \"dice_morgan\", kind=\"scatter\", ax=axes[1])\n",
        "axes[1].plot([0, 1], [0, 1], \"k--\")\n",
        "axes[1].set_xlabel(\"Tanimoto (Morgan)\")\n",
        "axes[1].set_ylabel(\"Dice (Morgan)\")\n",
        "\n",
        "fig;"
      ],
      "metadata": {
        "id": "FMB6Qty1-pbn"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "molecule_dataset.sort_values([\"tanimoto_morgan\"], ascending=False).head(3)"
      ],
      "metadata": {
        "id": "4UzFAWc6-ykA"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "top_n_molecules = 10\n",
        "# Rank by Morgan/Tanimoto similarity and keep the best hits\n",
        "top_molecules = molecule_dataset.sort_values([\"tanimoto_morgan\"], ascending=False).reset_index()\n",
        "top_molecules = top_molecules[:top_n_molecules]\n",
        "legends = [\n",
        "    f\"#{index+1} {molecule['molecule_chembl_id']}, pIC50={molecule['pIC50']:.2f}\"\n",
        "    for index, molecule in top_molecules.iterrows()\n",
        "]\n",
        "# The first grid entry is the query molecule itself, so it is labelled as such\n",
        "Draw.MolsToGridImage(\n",
        "    mols=[query] + top_molecules[\"ROMol\"].tolist(),\n",
        "    legends=([\"Query\"] + legends),\n",
        "    molsPerRow=3,\n",
        "    subImgSize=(450, 150),\n",
        ")"
      ],
      "metadata": {
        "id": "BgcxSuAj-6gv"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "molecule_dataset.head(3)"
      ],
      "metadata": {
        "id": "u0tFvI4Y_Msf"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def get_enrichment_data(molecules, similarity_measure, pic50_cutoff):\n",
        "    \"\"\"\n",
        "    Calculates x and y values for enrichment plot:\n",
        "        x - % ranked dataset\n",
        "        y - % true actives identified\n",
        "\n",
        "    The input DataFrame is not modified; ranking is done on a sorted copy.\n",
        "\n",
        "    Parameters\n",
        "    ----------\n",
        "    molecules : pandas.DataFrame\n",
        "        Molecules with similarity values to a query molecule. Must contain a\n",
        "        \"pIC50\" column and the column named by `similarity_measure`.\n",
        "    similarity_measure : str\n",
        "        Column name which will be used to sort the DataFrame.\n",
        "    pic50_cutoff : float\n",
        "        pIC50 cutoff value used to discriminate active and inactive molecules\n",
        "        (pIC50 >= cutoff counts as active).\n",
        "\n",
        "    Returns\n",
        "    -------\n",
        "    pandas.DataFrame\n",
        "        Enrichment data: fraction (0-1) of ranked dataset by similarity vs. fraction (0-1)\n",
        "        of identified true actives. The actives column is NaN if the dataset\n",
        "        contains no active molecule.\n",
        "    \"\"\"\n",
        "\n",
        "    # Note: Data must be ranked for enrichment plots.\n",
        "    # Sort a copy so that the caller's DataFrame keeps its row order.\n",
        "    ranked = molecules.sort_values([similarity_measure], ascending=False)\n",
        "\n",
        "    # Flag each ranked molecule as active/inactive (by checking bioactivity)\n",
        "    is_active = (ranked[\"pIC50\"] >= pic50_cutoff).to_numpy()\n",
        "\n",
        "    # Get number of molecules and number of active molecules in data set\n",
        "    molecules_all = len(ranked)\n",
        "    actives_all = int(is_active.sum())\n",
        "\n",
        "    # Running number of actives found while walking down the ranking\n",
        "    actives_counter_list = is_active.cumsum()\n",
        "\n",
        "    # Transform number of molecules into fraction of ranked dataset\n",
        "    molecules_percentage_list = [i / molecules_all for i in range(1, molecules_all + 1)]\n",
        "\n",
        "    # Transform number of actives into fraction of true actives identified\n",
        "    if actives_all > 0:\n",
        "        actives_percentage_list = actives_counter_list / actives_all\n",
        "    else:\n",
        "        # Without any active molecule the fraction is undefined\n",
        "        actives_percentage_list = [float(\"nan\")] * molecules_all\n",
        "\n",
        "    # Generate DataFrame with x and y values\n",
        "    enrichment = pd.DataFrame(\n",
        "        {\n",
        "            \"% ranked dataset\": molecules_percentage_list,\n",
        "            \"% true actives identified\": actives_percentage_list,\n",
        "        }\n",
        "    )\n",
        "\n",
        "    return enrichment"
      ],
      "metadata": {
        "id": "GupUIRiM_Pin"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "pic50_cutoff = 6.3"
      ],
      "metadata": {
        "id": "KdgWp1l8_T4L"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "similarity_measures = [\"tanimoto_maccs\", \"tanimoto_morgan\"]\n",
        "enrichment_data = {\n",
        "    similarity_measure: get_enrichment_data(molecule_dataset, similarity_measure, pic50_cutoff)\n",
        "    for similarity_measure in similarity_measures\n",
        "}"
      ],
      "metadata": {
        "id": "mEKlMs7q_XG_"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# NBVAL_CHECK_OUTPUT\n",
        "enrichment_data[\"tanimoto_maccs\"].head()"
      ],
      "metadata": {
        "id": "58Jqhzk__ZHX"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "fig, ax = plt.subplots(figsize=(6, 6))\n",
        "\n",
        "fontsize = 20\n",
        "\n",
        "# Plot enrichment data\n",
        "for similarity_measure, enrichment in enrichment_data.items():\n",
        "    ax = enrichment.plot(\n",
        "        ax=ax,\n",
        "        x=\"% ranked dataset\",\n",
        "        y=\"% true actives identified\",\n",
        "        label=similarity_measure,\n",
        "        alpha=0.5,\n",
        "        linewidth=4,\n",
        "    )\n",
        "ax.set_ylabel(\"% True actives identified\", size=fontsize)\n",
        "ax.set_xlabel(\"% Ranked dataset\", size=fontsize)\n",
        "\n",
        "# Plot optimal curve: Ratio of actives in dataset\n",
        "ratio_actives = sum(molecule_dataset[\"pIC50\"] >= pic50_cutoff) / len(molecule_dataset)\n",
        "ax.plot(\n",
        "    [0, ratio_actives, 1],\n",
        "    [0, 1, 1],\n",
        "    label=\"Optimal curve\",\n",
        "    color=\"black\",\n",
        "    linestyle=\"--\",\n",
        ")\n",
        "\n",
        "# Plot random curve\n",
        "ax.plot([0, 1], [0, 1], label=\"Random curve\", color=\"grey\", linestyle=\"--\")\n",
        "\n",
        "plt.tick_params(labelsize=16)\n",
        "plt.legend(\n",
        "    labels=[\"MACCS\", \"Morgan\", \"Optimal\", \"Random\"],\n",
        "    loc=(0.5, 0.08),\n",
        "    fontsize=fontsize,\n",
        "    labelspacing=0.3,\n",
        ")\n",
        "\n",
        "# Save plot -- use bbox_inches to include text boxes\n",
        "plt.savefig(\n",
        "    \"enrichment_plot.png\",\n",
        "    dpi=300,\n",
        "    bbox_inches=\"tight\",\n",
        "    transparent=True,\n",
        ")\n",
        "\n",
        "plt.show()"
      ],
      "metadata": {
        "id": "8is59ZNb_ctY"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def calculate_enrichment_factor(enrichment, ranked_dataset_percentage_cutoff):\n",
        "    \"\"\"\n",
        "    Get the experimental enrichment factor for a given percentage of the ranked dataset.\n",
        "\n",
        "    Parameters\n",
        "    ----------\n",
        "    enrichment : pd.DataFrame\n",
        "        Enrichment data: Percentage of ranked dataset by similarity vs. percentage of\n",
        "        identified true actives (both stored as fractions between 0 and 1).\n",
        "    ranked_dataset_percentage_cutoff : float or int\n",
        "        Percentage of ranked dataset to be included in enrichment factor calculation.\n",
        "\n",
        "    Returns\n",
        "    -------\n",
        "    float\n",
        "        Experimental enrichment factor: percentage of true actives identified within\n",
        "        the cutoff, rounded to one decimal; 0.0 if the cutoff selects no molecule.\n",
        "    \"\"\"\n",
        "\n",
        "    # Keep only molecules that meet the cutoff (cutoff is given in %, data as fraction)\n",
        "    selected = enrichment[\n",
        "        enrichment[\"% ranked dataset\"] <= ranked_dataset_percentage_cutoff / 100\n",
        "    ]\n",
        "    # A cutoff smaller than one molecule's share selects nothing: no actives identified\n",
        "    if selected.empty:\n",
        "        return 0.0\n",
        "    # The last selected row carries the cumulative fraction of actives found so far\n",
        "    # (assumes rows are ordered by increasing rank, as built by get_enrichment_data)\n",
        "    highest_enrichment = selected.iloc[-1]\n",
        "    enrichment_factor = round(100 * float(highest_enrichment[\"% true actives identified\"]), 1)\n",
        "    return enrichment_factor"
      ],
      "metadata": {
        "id": "XUWbwjU7_uBR"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def calculate_enrichment_factor_random(ranked_dataset_percentage_cutoff):\n",
        "    \"\"\"\n",
        "    Enrichment factor of a random ranking for a given percentage of the ranked dataset.\n",
        "\n",
        "    Parameters\n",
        "    ----------\n",
        "    ranked_dataset_percentage_cutoff : float or int\n",
        "        Percentage of ranked dataset to be included in enrichment factor calculation.\n",
        "\n",
        "    Returns\n",
        "    -------\n",
        "    float\n",
        "        Random enrichment factor: the cutoff itself as a float, rounded to one decimal.\n",
        "    \"\"\"\n",
        "\n",
        "    cutoff_as_float = float(ranked_dataset_percentage_cutoff)\n",
        "    return round(cutoff_as_float, 1)"
      ],
      "metadata": {
        "id": "wbjqjFRr_wqT"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def calculate_enrichment_factor_optimal(molecules, ranked_dataset_percentage_cutoff, pic50_cutoff):\n",
        "    \"\"\"\n",
        "    Get the optimal enrichment factor for a given percentage of the ranked dataset.\n",
        "\n",
        "    Parameters\n",
        "    ----------\n",
        "    molecules : pandas.DataFrame\n",
        "        The DataFrame with all the molecules and their \"pIC50\" values.\n",
        "    ranked_dataset_percentage_cutoff : float or int\n",
        "        Percentage of ranked dataset to be included in enrichment factor calculation.\n",
        "    pic50_cutoff : float\n",
        "        pIC50 cutoff value used to discriminate active and inactive molecules.\n",
        "\n",
        "    Returns\n",
        "    -------\n",
        "    float\n",
        "        Optimal enrichment factor (rounded to one decimal, capped at 100.0).\n",
        "    \"\"\"\n",
        "\n",
        "    # Percentage of active molecules in the whole dataset\n",
        "    ratio = (molecules[\"pIC50\"] >= pic50_cutoff).sum() / len(molecules) * 100\n",
        "    # Up to the actives ratio the value grows linearly with the cutoff;\n",
        "    # `ratio > 0` guards the division when the dataset contains no active at all\n",
        "    if ratio > 0 and ranked_dataset_percentage_cutoff <= ratio:\n",
        "        enrichment_factor_optimal = round(float(100 / ratio * ranked_dataset_percentage_cutoff), 1)\n",
        "    else:\n",
        "        enrichment_factor_optimal = 100.0\n",
        "    return enrichment_factor_optimal"
      ],
      "metadata": {
        "id": "2m4I2IHo_0pj"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "ranked_dataset_percentage_cutoff = 5"
      ],
      "metadata": {
        "id": "WbAfjid7_4MA"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "for similarity_measure, enrichment in enrichment_data.items():\n",
        "    enrichment_factor = calculate_enrichment_factor(enrichment, ranked_dataset_percentage_cutoff)\n",
        "    print(\n",
        "        f\"Experimental EF for {ranked_dataset_percentage_cutoff}% of ranked dataset ({similarity_measure}): {enrichment_factor}%\"\n",
        "    )"
      ],
      "metadata": {
        "id": "osJhc5pK_5zv"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "enrichment_factor_random = calculate_enrichment_factor_random(ranked_dataset_percentage_cutoff)\n",
        "print(\n",
        "    f\"Random EF for {ranked_dataset_percentage_cutoff}% of ranked dataset: {enrichment_factor_random}%\"\n",
        ")\n",
        "enrichment_factor_optimal = calculate_enrichment_factor_optimal(\n",
        "    molecule_dataset, ranked_dataset_percentage_cutoff, pic50_cutoff\n",
        ")\n",
        "print(\n",
        "    f\"Optimal EF for {ranked_dataset_percentage_cutoff}% of ranked dataset: {enrichment_factor_optimal}%\"\n",
        ")\n",
        "# NBVAL_CHECK_OUTPUT"
      ],
      "metadata": {
        "id": "dY2atRHUACoz"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}