--- a
+++ b/ML_4.ipynb
@@ -0,0 +1,951 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": [],
+      "authorship_tag": "ABX9TyO/Pafq/n0HsCJL97lqmyPJ",
+      "include_colab_link": true
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "view-in-github",
+        "colab_type": "text"
+      },
+      "source": [
+        "<a href=\"https://colab.research.google.com/github/francescopatane96/Computer_aided_drug_discovery_kit/blob/main/ML_4.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Virtual screening"
+      ],
+      "metadata": {
+        "id": "IwnuqmRtAmlY"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!pip install rdkit"
+      ],
+      "metadata": {
+        "id": "wS_ngR5dEm9p"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "PMKUrKwU_N6_"
+      },
+      "outputs": [],
+      "source": [
+        "from pathlib import Path\n",
+        "\n",
+        "import pandas as pd\n",
+        "import matplotlib.pyplot as plt\n",
+        "from rdkit import Chem, DataStructs\n",
+        "from rdkit.Chem import (\n",
+        "    PandasTools,\n",
+        "    Draw,\n",
+        "    Descriptors,\n",
+        "    MACCSkeys,\n",
+        "    rdFingerprintGenerator,\n",
+        ")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Molecules in SMILES format\n",
+        "molecule_smiles = [\n",
+        "    \"CC1C2C(C3C(C(=O)C(=C(C3(C(=O)C2=C(C4=C1C=CC=C4O)O)O)O)C(=O)N)N(C)C)O\",\n",
+        "    \"CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=C(C=C3)O)N)C(=O)O)C\",\n",
+        "    \"C1=COC(=C1)CNC2=CC(=C(C=C2C(=O)O)S(=O)(=O)N)Cl\",\n",
+        "    \"CCCCCCCCCCCC(=O)OCCOC(=O)CCCCCCCCCCC\",\n",
+        "    \"C1NC2=CC(=C(C=C2S(=O)(=O)N1)S(=O)(=O)N)Cl\",\n",
+        "    \"CC1=C(C(CCC1)(C)C)C=CC(=CC=CC(=CC(=O)O)C)C\",\n",
+        "    \"CC1(C2CC3C(C(=O)C(=C(C3(C(=O)C2=C(C4=C1C=CC=C4O)O)O)O)C(=O)N)N(C)C)O\",\n",
+        "    \"CC1C(CC(=O)C2=C1C=CC=C2O)C(=O)O\",\n",
+        "]\n",
+        "\n",
+        "# List of molecule names\n",
+        "molecule_names = [\n",
+        "    \"Doxycycline\",\n",
+        "    \"Amoxicilline\",\n",
+        "    \"Furosemide\",\n",
+        "    \"Glycol dilaurate\",\n",
+        "    \"Hydrochlorothiazide\",\n",
+        "    \"Isotretinoin\",\n",
+        "    \"Tetracycline\",\n",
+        "    \"Hemi-cycline D\",\n",
+        "]"
+      ],
+      "metadata": {
+        "id": "9M3RgOqhEvfE"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "molecules = pd.DataFrame({\"smiles\": molecule_smiles, \"name\": molecule_names})\n",
+        "PandasTools.AddMoleculeColumnToFrame(molecules, smilesCol=\"smiles\")\n",
+        "# Show first 2 molecules\n",
+        "molecules.head(2)"
+      ],
+      "metadata": {
+        "id": "iJBPaUowEx7y"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "Draw.MolsToGridImage(\n",
+        "    molecules[\"ROMol\"].to_list(),\n",
+        "    molsPerRow=3,\n",
+        "    subImgSize=(450, 150),\n",
+        "    legends=molecules[\"name\"].to_list(),\n",
+        ")"
+      ],
+      "metadata": {
+        "id": "NqrhlCJzE3no"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Note -- we use pandas apply function to apply the MolWt function\n",
+        "# to all ROMol objects in the DataFrame\n",
+        "molecules[\"molecule_weight\"] = molecules.ROMol.apply(Descriptors.MolWt)\n",
+        "# Sort molecules by molecular weight\n",
+        "molecules.sort_values([\"molecule_weight\"], ascending=False, inplace=True)"
+      ],
+      "metadata": {
+        "id": "L3BUmluyFA24"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Show only molecule names and molecular weights\n",
+        "molecules[[\"smiles\", \"name\", \"molecule_weight\"]]"
+      ],
+      "metadata": {
+        "id": "4ndk80ByFEHw"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "Draw.MolsToGridImage(\n",
+        "    molecules[\"ROMol\"],\n",
+        "    legends=[\n",
+        "        f\"{molecule['name']}: {molecule['molecule_weight']:.2f} Da\"\n",
+        "        for index, molecule in molecules.iterrows()\n",
+        "    ],\n",
+        "    subImgSize=(450, 150),\n",
+        "    molsPerRow=3,\n",
+        ")"
+      ],
+      "metadata": {
+        "id": "QMDormacFJkh"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "molecule = molecules[\"ROMol\"][0]\n",
+        "molecule"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "EdmfQDZKFPmx",
+        "outputId": "8d6f0066-28da-47df-c51c-790e158ceaf0"
+      },
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/plain": [
+              "<rdkit.Chem.rdchem.Mol at 0x7f7ad3a09670>"
+            ]
+          },
+          "metadata": {},
+          "execution_count": 10
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "maccs_fp = MACCSkeys.GenMACCSKeys(molecule)"
+      ],
+      "metadata": {
+        "id": "oYpCd1wnFV4p"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "molecules[\"maccs\"] = molecules.ROMol.apply(MACCSkeys.GenMACCSKeys)"
+      ],
+      "metadata": {
+        "id": "Akm0V6NrFaji"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "circular_int_fp = rdFingerprintGenerator.GetCountFPs([molecule])[0]\n",
+        "circular_int_fp"
+      ],
+      "metadata": {
+        "id": "XhIpW23cFgoR"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "print(f\"Print non-zero elements:\\n{circular_int_fp.GetNonzeroElements()}\")"
+      ],
+      "metadata": {
+        "id": "PUdhmhz9FkSB"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Note that the function takes a list as input parameter\n",
+        "# (even if we only want to pass one molecule)\n",
+        "circular_bit_fp = rdFingerprintGenerator.GetFPs([molecule])[0]\n",
+        "circular_bit_fp"
+      ],
+      "metadata": {
+        "id": "5wY0TR5DFocC"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "print(f\"Print top 400 fingerprint bits:\\n{circular_bit_fp.ToBitString()[:400]}\")"
+      ],
+      "metadata": {
+        "id": "Kcc-7JeIFspR"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "molecules[\"morgan\"] = rdFingerprintGenerator.GetFPs(molecules[\"ROMol\"].tolist())"
+      ],
+      "metadata": {
+        "id": "wdt0ZnleFx5q"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Example molecules\n",
+        "molecule1 = molecules[\"ROMol\"][0]\n",
+        "molecule2 = molecules[\"ROMol\"][1]\n",
+        "\n",
+        "# Example fingerprints\n",
+        "maccs_fp1 = MACCSkeys.GenMACCSKeys(molecule1)\n",
+        "maccs_fp2 = MACCSkeys.GenMACCSKeys(molecule2)"
+      ],
+      "metadata": {
+        "id": "-QcPiu3QF3Eh"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "DataStructs.TanimotoSimilarity(maccs_fp1, maccs_fp2)"
+      ],
+      "metadata": {
+        "id": "Y_DmsFphF5vj"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Define molecule query and list\n",
+        "molecule_query = molecules[\"maccs\"][0]\n",
+        "molecule_list = molecules[\"maccs\"].to_list()\n",
+        "# Calculate similarty values between query and list elements\n",
+        "molecules[\"tanimoto_maccs\"] = DataStructs.BulkTanimotoSimilarity(molecule_query, molecule_list)\n",
+        "molecules[\"dice_maccs\"] = DataStructs.BulkDiceSimilarity(molecule_query, molecule_list)"
+      ],
+      "metadata": {
+        "id": "KXZq_0ghGDyK"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "preview = molecules.sort_values([\"tanimoto_maccs\"], ascending=False).reset_index()\n",
+        "preview[[\"name\", \"tanimoto_maccs\", \"dice_maccs\"]]"
+      ],
+      "metadata": {
+        "id": "oLBgsIQDGIrB"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "def draw_ranked_molecules(molecules, sort_by_column):\n",
+        "    \"\"\"\n",
+        "    Draw molecules sorted by a given column.\n",
+        "\n",
+        "    Parameters\n",
+        "    ----------\n",
+        "    molecules : pandas.DataFrame\n",
+        "        Molecules (with \"ROMol\" and \"name\" columns and a column to sort by.\n",
+        "    sort_by_column : str\n",
+        "        Name of the column used to sort the molecules by.\n",
+        "\n",
+        "    Returns\n",
+        "    -------\n",
+        "    Draw.MolsToGridImage\n",
+        "        2D visualization of sorted molecules.\n",
+        "    \"\"\"\n",
+        "\n",
+        "    molecules_sorted = molecules.sort_values([sort_by_column], ascending=False).reset_index()\n",
+        "    return Draw.MolsToGridImage(\n",
+        "        molecules_sorted[\"ROMol\"],\n",
+        "        legends=[\n",
+        "            f\"#{index+1} {molecule['name']}, similarity={molecule[sort_by_column]:.2f}\"\n",
+        "            for index, molecule in molecules_sorted.iterrows()\n",
+        "        ],\n",
+        "        molsPerRow=3,\n",
+        "        subImgSize=(450, 150),\n",
+        "    )"
+      ],
+      "metadata": {
+        "id": "3XGUf57tGRHI"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "draw_ranked_molecules(molecules, \"tanimoto_maccs\")"
+      ],
+      "metadata": {
+        "id": "1GaWEV_xGV8I"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Define molecule query and list\n",
+        "molecule_query = molecules[\"morgan\"][0]\n",
+        "molecule_list = molecules[\"morgan\"].to_list()\n",
+        "# Calculate similarty values between query and list elements\n",
+        "molecules[\"tanimoto_morgan\"] = DataStructs.BulkTanimotoSimilarity(molecule_query, molecule_list)\n",
+        "molecules[\"dice_morgan\"] = DataStructs.BulkDiceSimilarity(molecule_query, molecule_list)"
+      ],
+      "metadata": {
+        "id": "IODdVQqKGgMB"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "preview = molecules.sort_values([\"tanimoto_morgan\"], ascending=False).reset_index()\n",
+        "preview[[\"name\", \"tanimoto_morgan\", \"dice_morgan\", \"tanimoto_maccs\", \"dice_maccs\"]]\n"
+      ],
+      "metadata": {
+        "id": "fivuZHjlGipq"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "draw_ranked_molecules(molecules, \"tanimoto_morgan\")"
+      ],
+      "metadata": {
+        "id": "UHv5-rSKGqbt"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "fig, ax = plt.subplots(figsize=(6, 6))\n",
+        "molecules.plot(\"tanimoto_maccs\", \"tanimoto_morgan\", kind=\"scatter\", ax=ax)\n",
+        "ax.plot([0, 1], [0, 1], \"k--\")\n",
+        "ax.set_xlabel(\"Tanimoto (MACCS)\")\n",
+        "ax.set_ylabel(\"Tanimoto (Morgan)\")\n",
+        "fig;"
+      ],
+      "metadata": {
+        "id": "3o5w6sDyGwYN"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "molecule_dataset = pd.read_csv(\n",
+        "    \"TNFB_compounds_lipinski.csv\",\n",
+        "    usecols=[\"molecule_chembl_id\", \"smiles\", \"pIC50\"],\n",
+        ")\n",
+        "print(f\"Number of molecules in dataset: {len(molecule_dataset)}\")\n",
+        "molecule_dataset.head(5)"
+      ],
+      "metadata": {
+        "id": "wH-Q8hyv9XZn"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "query = Chem.MolFromSmiles(\"O=C(Nc1cc(C(F)(F)F)cc(C(F)(F)F)c1)c1cnc(Cl)nc1C(F)(F)F\")\n",
+        "query"
+      ],
+      "metadata": {
+        "id": "3WmxmCzj97WQ"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "maccs_fp_query = MACCSkeys.GenMACCSKeys(query)\n",
+        "circular_fp_query = rdFingerprintGenerator.GetCountFPs([query])[0]"
+      ],
+      "metadata": {
+        "id": "GkcDaxez-OuP"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "PandasTools.AddMoleculeColumnToFrame(molecule_dataset, \"smiles\")\n",
+        "circular_fp_list = rdFingerprintGenerator.GetCountFPs(molecule_dataset[\"ROMol\"].tolist())\n",
+        "maccs_fp_list = molecule_dataset[\"ROMol\"].apply(MACCSkeys.GenMACCSKeys).tolist()"
+      ],
+      "metadata": {
+        "id": "n2bzjaPD-SfH"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "molecule_dataset[\"tanimoto_maccs\"] = DataStructs.BulkTanimotoSimilarity(\n",
+        "    maccs_fp_query, maccs_fp_list\n",
+        ")\n",
+        "molecule_dataset[\"tanimoto_morgan\"] = DataStructs.BulkTanimotoSimilarity(\n",
+        "    circular_fp_query, circular_fp_list\n",
+        ")"
+      ],
+      "metadata": {
+        "id": "fxxsJrac-WAw"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "molecule_dataset[\"dice_maccs\"] = DataStructs.BulkDiceSimilarity(maccs_fp_query, maccs_fp_list)\n",
+        "molecule_dataset[\"dice_morgan\"] = DataStructs.BulkDiceSimilarity(\n",
+        "    circular_fp_query, circular_fp_list\n",
+        ")"
+      ],
+      "metadata": {
+        "id": "PQT4ako_-YoL"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# NBVAL_CHECK_OUTPUT\n",
+        "molecule_dataset[\n",
+        "    [\"smiles\", \"tanimoto_maccs\", \"tanimoto_morgan\", \"dice_maccs\", \"dice_morgan\"]\n",
+        "].head(5)"
+      ],
+      "metadata": {
+        "id": "6ESskO7s-aVb"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Show all columns\n",
+        "molecule_dataset.head(3)"
+      ],
+      "metadata": {
+        "id": "4lvx2Tx--fkQ"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "fig, axes = plt.subplots(figsize=(10, 6), nrows=2, ncols=2)\n",
+        "molecule_dataset.hist([\"tanimoto_maccs\"], ax=axes[0, 0])\n",
+        "molecule_dataset.hist([\"tanimoto_morgan\"], ax=axes[0, 1])\n",
+        "molecule_dataset.hist([\"dice_maccs\"], ax=axes[1, 0])\n",
+        "molecule_dataset.hist([\"dice_morgan\"], ax=axes[1, 1])\n",
+        "axes[1, 0].set_xlabel(\"similarity value\")\n",
+        "axes[1, 0].set_ylabel(\"# molecules\")\n",
+        "fig;"
+      ],
+      "metadata": {
+        "id": "7mySir-w-kah"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "fig, axes = plt.subplots(figsize=(12, 6), nrows=1, ncols=2)\n",
+        "\n",
+        "molecule_dataset.plot(\"tanimoto_maccs\", \"dice_maccs\", kind=\"scatter\", ax=axes[0])\n",
+        "axes[0].plot([0, 1], [0, 1], \"k--\")\n",
+        "axes[0].set_xlabel(\"Tanimoto (MACCS)\")\n",
+        "axes[0].set_ylabel(\"Dice (MACCS)\")\n",
+        "\n",
+        "molecule_dataset.plot(\"tanimoto_morgan\", \"dice_morgan\", kind=\"scatter\", ax=axes[1])\n",
+        "axes[1].plot([0, 1], [0, 1], \"k--\")\n",
+        "axes[1].set_xlabel(\"Tanimoto (Morgan)\")\n",
+        "axes[1].set_ylabel(\"Dice (Morgan)\")\n",
+        "\n",
+        "fig;"
+      ],
+      "metadata": {
+        "id": "FMB6Qty1-pbn"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "molecule_dataset.sort_values([\"tanimoto_morgan\"], ascending=False).head(3)"
+      ],
+      "metadata": {
+        "id": "4UzFAWc6-ykA"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "top_n_molecules = 10\n",
+        "top_molecules = molecule_dataset.sort_values([\"tanimoto_morgan\"], ascending=False).reset_index()\n",
+        "top_molecules = top_molecules[:top_n_molecules]\n",
+        "legends = [\n",
+        "    f\"#{index+1} {molecule['molecule_chembl_id']}, pIC50={molecule['pIC50']:.2f}\"\n",
+        "    for index, molecule in top_molecules.iterrows()\n",
+        "]\n",
+        "Chem.Draw.MolsToGridImage(\n",
+        "    mols=[query] + top_molecules[\"ROMol\"].tolist(),\n",
+        "    legends=([\"Gefitinib\"] + legends),\n",
+        "    molsPerRow=3,\n",
+        "    subImgSize=(450, 150),\n",
+        ")"
+      ],
+      "metadata": {
+        "id": "BgcxSuAj-6gv"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "molecule_dataset.head(3)"
+      ],
+      "metadata": {
+        "id": "u0tFvI4Y_Msf"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "def get_enrichment_data(molecules, similarity_measure, pic50_cutoff):\n",
+        "    \"\"\"\n",
+        "    Calculates x and y values for enrichment plot:\n",
+        "        x - % ranked dataset\n",
+        "        y - % true actives identified\n",
+        "\n",
+        "    Parameters\n",
+        "    ----------\n",
+        "    molecules : pandas.DataFrame\n",
+        "        Molecules with similarity values to a query molecule.\n",
+        "    similarity_measure : str\n",
+        "        Column name which will be used to sort the DataFrame．\n",
+        "    pic50_cutoff : float\n",
+        "        pIC50 cutoff value used to discriminate active and inactive molecules.\n",
+        "\n",
+        "    Returns\n",
+        "    -------\n",
+        "    pandas.DataFrame\n",
+        "        Enrichment data: Percentage of ranked dataset by similarity vs. percentage of identified true actives.\n",
+        "    \"\"\"\n",
+        "\n",
+        "    # Get number of molecules in data set\n",
+        "    molecules_all = len(molecules)\n",
+        "\n",
+        "    # Get number of active molecules in data set\n",
+        "    actives_all = sum(molecules[\"pIC50\"] >= pic50_cutoff)\n",
+        "\n",
+        "    # Initialize a list that will hold the counter for actives and molecules while iterating through our dataset\n",
+        "    actives_counter_list = []\n",
+        "\n",
+        "    # Initialize counter for actives\n",
+        "    actives_counter = 0\n",
+        "\n",
+        "    # Note: Data must be ranked for enrichment plots:\n",
+        "    # Sort molecules by selected similarity measure\n",
+        "    molecules.sort_values([similarity_measure], ascending=False, inplace=True)\n",
+        "\n",
+        "    # Iterate over the ranked dataset and check each molecule if active (by checking bioactivity)\n",
+        "    for value in molecules[\"pIC50\"]:\n",
+        "        if value >= pic50_cutoff:\n",
+        "            actives_counter += 1\n",
+        "        actives_counter_list.append(actives_counter)\n",
+        "\n",
+        "    # Transform number of molecules into % ranked dataset\n",
+        "    molecules_percentage_list = [i / molecules_all for i in range(1, molecules_all + 1)]\n",
+        "\n",
+        "    # Transform number of actives into % true actives identified\n",
+        "    actives_percentage_list = [i / actives_all for i in actives_counter_list]\n",
+        "\n",
+        "    # Generate DataFrame with x and y values as well as label\n",
+        "    enrichment = pd.DataFrame(\n",
+        "        {\n",
+        "            \"% ranked dataset\": molecules_percentage_list,\n",
+        "            \"% true actives identified\": actives_percentage_list,\n",
+        "        }\n",
+        "    )\n",
+        "\n",
+        "    return enrichment"
+      ],
+      "metadata": {
+        "id": "GupUIRiM_Pin"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "pic50_cutoff = 6.3"
+      ],
+      "metadata": {
+        "id": "KdgWp1l8_T4L"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "similarity_measures = [\"tanimoto_maccs\", \"tanimoto_morgan\"]\n",
+        "enrichment_data = {\n",
+        "    similarity_measure: get_enrichment_data(molecule_dataset, similarity_measure, pic50_cutoff)\n",
+        "    for similarity_measure in similarity_measures\n",
+        "}"
+      ],
+      "metadata": {
+        "id": "mEKlMs7q_XG_"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# NBVAL_CHECK_OUTPUT\n",
+        "enrichment_data[\"tanimoto_maccs\"].head()"
+      ],
+      "metadata": {
+        "id": "58Jqhzk__ZHX"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "fig, ax = plt.subplots(figsize=(6, 6))\n",
+        "\n",
+        "fontsize = 20\n",
+        "\n",
+        "# Plot enrichment data\n",
+        "for similarity_measure, enrichment in enrichment_data.items():\n",
+        "    ax = enrichment.plot(\n",
+        "        ax=ax,\n",
+        "        x=\"% ranked dataset\",\n",
+        "        y=\"% true actives identified\",\n",
+        "        label=similarity_measure,\n",
+        "        alpha=0.5,\n",
+        "        linewidth=4,\n",
+        "    )\n",
+        "ax.set_ylabel(\"% True actives identified\", size=fontsize)\n",
+        "ax.set_xlabel(\"% Ranked dataset\", size=fontsize)\n",
+        "\n",
+        "# Plot optimal curve: Ratio of actives in dataset\n",
+        "ratio_actives = sum(molecule_dataset[\"pIC50\"] >= pic50_cutoff) / len(molecule_dataset)\n",
+        "ax.plot(\n",
+        "    [0, ratio_actives, 1],\n",
+        "    [0, 1, 1],\n",
+        "    label=\"Optimal curve\",\n",
+        "    color=\"black\",\n",
+        "    linestyle=\"--\",\n",
+        ")\n",
+        "\n",
+        "# Plot random curve\n",
+        "ax.plot([0, 1], [0, 1], label=\"Random curve\", color=\"grey\", linestyle=\"--\")\n",
+        "\n",
+        "plt.tick_params(labelsize=16)\n",
+        "plt.legend(\n",
+        "    labels=[\"MACCS\", \"Morgan\", \"Optimal\", \"Random\"],\n",
+        "    loc=(0.5, 0.08),\n",
+        "    fontsize=fontsize,\n",
+        "    labelspacing=0.3,\n",
+        ")\n",
+        "\n",
+        "# Save plot -- use bbox_inches to include text boxes\n",
+        "plt.savefig(\n",
+        "    \"enrichment_plot.png\",\n",
+        "    dpi=300,\n",
+        "    bbox_inches=\"tight\",\n",
+        "    transparent=True,\n",
+        ")\n",
+        "\n",
+        "plt.show()"
+      ],
+      "metadata": {
+        "id": "8is59ZNb_ctY"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "def calculate_enrichment_factor(enrichment, ranked_dataset_percentage_cutoff):\n",
+        "    \"\"\"\n",
+        "    Get the experimental enrichment factor for a given percentage of the ranked dataset.\n",
+        "\n",
+        "    Parameters\n",
+        "    ----------\n",
+        "    enrichment : pd.DataFrame\n",
+        "        Enrichment data: Percentage of ranked dataset by similarity vs. percentage of\n",
+        "        identified true actives.\n",
+        "    ranked_dataset_percentage_cutoff : float or int\n",
+        "        Percentage of ranked dataset to be included in enrichment factor calculation.\n",
+        "\n",
+        "    Returns\n",
+        "    -------\n",
+        "    float\n",
+        "        Experimental enrichment factor.\n",
+        "    \"\"\"\n",
+        "\n",
+        "    # Keep only molecules that meet the cutoff\n",
+        "    enrichment = enrichment[\n",
+        "        enrichment[\"% ranked dataset\"] <= ranked_dataset_percentage_cutoff / 100\n",
+        "    ]\n",
+        "    # Get highest percentage of actives and the corresponding percentage of actives\n",
+        "    highest_enrichment = enrichment.iloc[-1]\n",
+        "    enrichment_factor = round(100 * float(highest_enrichment[\"% true actives identified\"]), 1)\n",
+        "    return enrichment_factor"
+      ],
+      "metadata": {
+        "id": "XUWbwjU7_uBR"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "def calculate_enrichment_factor_random(ranked_dataset_percentage_cutoff):\n",
+        "    \"\"\"\n",
+        "    Get the random enrichment factor for a given percentage of the ranked dataset.\n",
+        "\n",
+        "    Parameters\n",
+        "    ----------\n",
+        "    ranked_dataset_percentage_cutoff : float or int\n",
+        "        Percentage of ranked dataset to be included in enrichment factor calculation.\n",
+        "\n",
+        "    Returns\n",
+        "    -------\n",
+        "    float\n",
+        "        Random enrichment factor.\n",
+        "    \"\"\"\n",
+        "\n",
+        "    enrichment_factor_random = round(float(ranked_dataset_percentage_cutoff), 1)\n",
+        "    return enrichment_factor_random"
+      ],
+      "metadata": {
+        "id": "wbjqjFRr_wqT"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "def calculate_enrichment_factor_optimal(molecules, ranked_dataset_percentage_cutoff, pic50_cutoff):\n",
+        "    \"\"\"\n",
+        "    Get the optimal random enrichment factor for a given percentage of the ranked dataset.\n",
+        "\n",
+        "    Parameters\n",
+        "    ----------\n",
+        "    molecules : pandas.DataFrame\n",
+        "        the DataFrame with all the molecules and pIC50.\n",
+        "    ranked_dataset_percentage_cutoff : float or int\n",
+        "        Percentage of ranked dataset to be included in enrichment factor calculation.\n",
+        "    activity_cutoff: float\n",
+        "        pIC50 cutoff value used to discriminate active and inactive molecules\n",
+        "\n",
+        "    Returns\n",
+        "    -------\n",
+        "    float\n",
+        "        Optimal enrichment factor.\n",
+        "    \"\"\"\n",
+        "\n",
+        "    ratio = sum(molecules[\"pIC50\"] >= pic50_cutoff) / len(molecules) * 100\n",
+        "    if ranked_dataset_percentage_cutoff <= ratio:\n",
+        "        enrichment_factor_optimal = round(100 / ratio * ranked_dataset_percentage_cutoff, 1)\n",
+        "    else:\n",
+        "        enrichment_factor_optimal = 100.0\n",
+        "    return enrichment_factor_optimal"
+      ],
+      "metadata": {
+        "id": "2m4I2IHo_0pj"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "ranked_dataset_percentage_cutoff = 5"
+      ],
+      "metadata": {
+        "id": "WbAfjid7_4MA"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "for similarity_measure, enrichment in enrichment_data.items():\n",
+        "    enrichment_factor = calculate_enrichment_factor(enrichment, ranked_dataset_percentage_cutoff)\n",
+        "    print(\n",
+        "        f\"Experimental EF for {ranked_dataset_percentage_cutoff}% of ranked dataset ({similarity_measure}): {enrichment_factor}%\"\n",
+        "    )"
+      ],
+      "metadata": {
+        "id": "osJhc5pK_5zv"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "enrichment_factor_random = calculate_enrichment_factor_random(ranked_dataset_percentage_cutoff)\n",
+        "print(\n",
+        "    f\"Random EF for {ranked_dataset_percentage_cutoff}% of ranked dataset: {enrichment_factor_random}%\"\n",
+        ")\n",
+        "enrichment_factor_optimal = calculate_enrichment_factor_optimal(\n",
+        "    molecule_dataset, ranked_dataset_percentage_cutoff, pic50_cutoff\n",
+        ")\n",
+        "print(\n",
+        "    f\"Optimal EF for {ranked_dataset_percentage_cutoff}% of ranked dataset: {enrichment_factor_optimal}%\"\n",
+        ")\n",
+        "# NBVAL_CHECK_OUTPUT"
+      ],
+      "metadata": {
+        "id": "dY2atRHUACoz"
+      },
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}
\ No newline at end of file