{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "authorship_tag": "ABX9TyO/Pafq/n0HsCJL97lqmyPJ", "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "markdown", "source": [ "Virtual screening" ], "metadata": { "id": "IwnuqmRtAmlY" } }, { "cell_type": "code", "source": [ "!pip install rdkit" ], "metadata": { "id": "wS_ngR5dEm9p" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "PMKUrKwU_N6_" }, "outputs": [], "source": [ "from pathlib import Path\n", "\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "from rdkit import Chem, DataStructs\n", "from rdkit.Chem import (\n", " PandasTools,\n", " Draw,\n", " Descriptors,\n", " MACCSkeys,\n", " rdFingerprintGenerator,\n", ")" ] }, { "cell_type": "code", "source": [ "# Molecules in SMILES format\n", "molecule_smiles = [\n", " \"CC1C2C(C3C(C(=O)C(=C(C3(C(=O)C2=C(C4=C1C=CC=C4O)O)O)O)C(=O)N)N(C)C)O\",\n", " \"CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=C(C=C3)O)N)C(=O)O)C\",\n", " \"C1=COC(=C1)CNC2=CC(=C(C=C2C(=O)O)S(=O)(=O)N)Cl\",\n", " \"CCCCCCCCCCCC(=O)OCCOC(=O)CCCCCCCCCCC\",\n", " \"C1NC2=CC(=C(C=C2S(=O)(=O)N1)S(=O)(=O)N)Cl\",\n", " \"CC1=C(C(CCC1)(C)C)C=CC(=CC=CC(=CC(=O)O)C)C\",\n", " \"CC1(C2CC3C(C(=O)C(=C(C3(C(=O)C2=C(C4=C1C=CC=C4O)O)O)O)C(=O)N)N(C)C)O\",\n", " \"CC1C(CC(=O)C2=C1C=CC=C2O)C(=O)O\",\n", "]\n", "\n", "# List of molecule names\n", "molecule_names = [\n", " \"Doxycycline\",\n", " \"Amoxicilline\",\n", " \"Furosemide\",\n", " \"Glycol dilaurate\",\n", " \"Hydrochlorothiazide\",\n", " \"Isotretinoin\",\n", " \"Tetracycline\",\n", " \"Hemi-cycline D\",\n", "]" ], "metadata": { "id": "9M3RgOqhEvfE" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "molecules = pd.DataFrame({\"smiles\": molecule_smiles, \"name\": molecule_names})\n", "PandasTools.AddMoleculeColumnToFrame(molecules, smilesCol=\"smiles\")\n", "# Show first 2 molecules\n", "molecules.head(2)" ], "metadata": { "id": "iJBPaUowEx7y" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "Draw.MolsToGridImage(\n", " molecules[\"ROMol\"].to_list(),\n", " molsPerRow=3,\n", " subImgSize=(450, 150),\n", " legends=molecules[\"name\"].to_list(),\n", ")" ], "metadata": { "id": "NqrhlCJzE3no" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# Note -- we use pandas apply function to apply the MolWt function\n", "# to all ROMol objects in the DataFrame\n", "molecules[\"molecule_weight\"] = molecules.ROMol.apply(Descriptors.MolWt)\n", "# Sort molecules by molecular weight\n", "molecules.sort_values([\"molecule_weight\"], ascending=False, inplace=True)" ], "metadata": { "id": "L3BUmluyFA24" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# Show only molecule names and molecular weights\n", "molecules[[\"smiles\", \"name\", \"molecule_weight\"]]" ], "metadata": { "id": "4ndk80ByFEHw" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "Draw.MolsToGridImage(\n", " molecules[\"ROMol\"],\n", " legends=[\n", " f\"{molecule['name']}: {molecule['molecule_weight']:.2f} Da\"\n", " for index, molecule in molecules.iterrows()\n", " ],\n", " subImgSize=(450, 150),\n", " molsPerRow=3,\n", ")" ], "metadata": { "id": "QMDormacFJkh" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "molecule = molecules[\"ROMol\"][0]\n", "molecule" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "EdmfQDZKFPmx", "outputId": "8d6f0066-28da-47df-c51c-790e158ceaf0" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "" ] }, "metadata": {}, "execution_count": 10 } ] }, { "cell_type": "code", "source": [ "maccs_fp = MACCSkeys.GenMACCSKeys(molecule)" ], "metadata": { "id": "oYpCd1wnFV4p" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "molecules[\"maccs\"] = molecules.ROMol.apply(MACCSkeys.GenMACCSKeys)" ], "metadata": { "id": "Akm0V6NrFaji" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "circular_int_fp = rdFingerprintGenerator.GetCountFPs([molecule])[0]\n", "circular_int_fp" ], "metadata": { "id": "XhIpW23cFgoR" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "print(f\"Print non-zero elements:\\n{circular_int_fp.GetNonzeroElements()}\")" ], "metadata": { "id": "PUdhmhz9FkSB" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# Note that the function takes a list as input parameter\n", "# (even if we only want to pass one molecule)\n", "circular_bit_fp = rdFingerprintGenerator.GetFPs([molecule])[0]\n", "circular_bit_fp" ], "metadata": { "id": "5wY0TR5DFocC" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "print(f\"Print top 400 fingerprint bits:\\n{circular_bit_fp.ToBitString()[:400]}\")" ], "metadata": { "id": "Kcc-7JeIFspR" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "molecules[\"morgan\"] = rdFingerprintGenerator.GetFPs(molecules[\"ROMol\"].tolist())" ], "metadata": { "id": "wdt0ZnleFx5q" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# Example molecules\n", "molecule1 = molecules[\"ROMol\"][0]\n", "molecule2 = molecules[\"ROMol\"][1]\n", "\n", "# Example fingerprints\n", "maccs_fp1 = MACCSkeys.GenMACCSKeys(molecule1)\n", "maccs_fp2 = MACCSkeys.GenMACCSKeys(molecule2)" ], "metadata": { "id": "-QcPiu3QF3Eh" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "DataStructs.TanimotoSimilarity(maccs_fp1, maccs_fp2)" ], "metadata": { "id": "Y_DmsFphF5vj" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# Define molecule query and list\n", "molecule_query = molecules[\"maccs\"][0]\n", "molecule_list = molecules[\"maccs\"].to_list()\n", "# Calculate similarty values between query and list elements\n", "molecules[\"tanimoto_maccs\"] = DataStructs.BulkTanimotoSimilarity(molecule_query, molecule_list)\n", "molecules[\"dice_maccs\"] = DataStructs.BulkDiceSimilarity(molecule_query, molecule_list)" ], "metadata": { "id": "KXZq_0ghGDyK" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "preview = molecules.sort_values([\"tanimoto_maccs\"], ascending=False).reset_index()\n", "preview[[\"name\", \"tanimoto_maccs\", \"dice_maccs\"]]" ], "metadata": { "id": "oLBgsIQDGIrB" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "def draw_ranked_molecules(molecules, sort_by_column):\n", " \"\"\"\n", " Draw molecules sorted by a given column.\n", "\n", " Parameters\n", " ----------\n", " molecules : pandas.DataFrame\n", " Molecules (with \"ROMol\" and \"name\" columns and a column to sort by.\n", " sort_by_column : str\n", " Name of the column used to sort the molecules by.\n", "\n", " Returns\n", " -------\n", " Draw.MolsToGridImage\n", " 2D visualization of sorted molecules.\n", " \"\"\"\n", "\n", " molecules_sorted = molecules.sort_values([sort_by_column], ascending=False).reset_index()\n", " return Draw.MolsToGridImage(\n", " molecules_sorted[\"ROMol\"],\n", " legends=[\n", " f\"#{index+1} {molecule['name']}, similarity={molecule[sort_by_column]:.2f}\"\n", " for index, molecule in molecules_sorted.iterrows()\n", " ],\n", " molsPerRow=3,\n", " subImgSize=(450, 150),\n", " )" ], "metadata": { "id": "3XGUf57tGRHI" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "draw_ranked_molecules(molecules, \"tanimoto_maccs\")" ], "metadata": { "id": "1GaWEV_xGV8I" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# Define molecule query and list\n", "molecule_query = molecules[\"morgan\"][0]\n", "molecule_list = molecules[\"morgan\"].to_list()\n", "# Calculate similarty values between query and list elements\n", "molecules[\"tanimoto_morgan\"] = DataStructs.BulkTanimotoSimilarity(molecule_query, molecule_list)\n", "molecules[\"dice_morgan\"] = DataStructs.BulkDiceSimilarity(molecule_query, molecule_list)" ], "metadata": { "id": "IODdVQqKGgMB" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "preview = molecules.sort_values([\"tanimoto_morgan\"], ascending=False).reset_index()\n", "preview[[\"name\", \"tanimoto_morgan\", \"dice_morgan\", \"tanimoto_maccs\", \"dice_maccs\"]]\n" ], "metadata": { "id": "fivuZHjlGipq" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "draw_ranked_molecules(molecules, \"tanimoto_morgan\")" ], "metadata": { "id": "UHv5-rSKGqbt" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "fig, ax = plt.subplots(figsize=(6, 6))\n", "molecules.plot(\"tanimoto_maccs\", \"tanimoto_morgan\", kind=\"scatter\", ax=ax)\n", "ax.plot([0, 1], [0, 1], \"k--\")\n", "ax.set_xlabel(\"Tanimoto (MACCS)\")\n", "ax.set_ylabel(\"Tanimoto (Morgan)\")\n", "fig;" ], "metadata": { "id": "3o5w6sDyGwYN" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "molecule_dataset = pd.read_csv(\n", " \"TNFB_compounds_lipinski.csv\",\n", " usecols=[\"molecule_chembl_id\", \"smiles\", \"pIC50\"],\n", ")\n", "print(f\"Number of molecules in dataset: {len(molecule_dataset)}\")\n", "molecule_dataset.head(5)" ], "metadata": { "id": "wH-Q8hyv9XZn" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "query = Chem.MolFromSmiles(\"O=C(Nc1cc(C(F)(F)F)cc(C(F)(F)F)c1)c1cnc(Cl)nc1C(F)(F)F\")\n", "query" ], "metadata": { "id": "3WmxmCzj97WQ" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "maccs_fp_query = MACCSkeys.GenMACCSKeys(query)\n", "circular_fp_query = rdFingerprintGenerator.GetCountFPs([query])[0]" ], "metadata": { "id": "GkcDaxez-OuP" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "PandasTools.AddMoleculeColumnToFrame(molecule_dataset, \"smiles\")\n", "circular_fp_list = rdFingerprintGenerator.GetCountFPs(molecule_dataset[\"ROMol\"].tolist())\n", "maccs_fp_list = molecule_dataset[\"ROMol\"].apply(MACCSkeys.GenMACCSKeys).tolist()" ], "metadata": { "id": "n2bzjaPD-SfH" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "molecule_dataset[\"tanimoto_maccs\"] = DataStructs.BulkTanimotoSimilarity(\n", " maccs_fp_query, maccs_fp_list\n", ")\n", "molecule_dataset[\"tanimoto_morgan\"] = DataStructs.BulkTanimotoSimilarity(\n", " circular_fp_query, circular_fp_list\n", ")" ], "metadata": { "id": "fxxsJrac-WAw" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "molecule_dataset[\"dice_maccs\"] = DataStructs.BulkDiceSimilarity(maccs_fp_query, maccs_fp_list)\n", "molecule_dataset[\"dice_morgan\"] = DataStructs.BulkDiceSimilarity(\n", " circular_fp_query, circular_fp_list\n", ")" ], "metadata": { "id": "PQT4ako_-YoL" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# NBVAL_CHECK_OUTPUT\n", "molecule_dataset[\n", " [\"smiles\", \"tanimoto_maccs\", \"tanimoto_morgan\", \"dice_maccs\", \"dice_morgan\"]\n", "].head(5)" ], "metadata": { "id": "6ESskO7s-aVb" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# Show all columns\n", "molecule_dataset.head(3)" ], "metadata": { "id": "4lvx2Tx--fkQ" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "fig, axes = plt.subplots(figsize=(10, 6), nrows=2, ncols=2)\n", "molecule_dataset.hist([\"tanimoto_maccs\"], ax=axes[0, 0])\n", "molecule_dataset.hist([\"tanimoto_morgan\"], ax=axes[0, 1])\n", "molecule_dataset.hist([\"dice_maccs\"], ax=axes[1, 0])\n", "molecule_dataset.hist([\"dice_morgan\"], ax=axes[1, 1])\n", "axes[1, 0].set_xlabel(\"similarity value\")\n", "axes[1, 0].set_ylabel(\"# molecules\")\n", "fig;" ], "metadata": { "id": "7mySir-w-kah" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "fig, axes = plt.subplots(figsize=(12, 6), nrows=1, ncols=2)\n", "\n", "molecule_dataset.plot(\"tanimoto_maccs\", \"dice_maccs\", kind=\"scatter\", ax=axes[0])\n", "axes[0].plot([0, 1], [0, 1], \"k--\")\n", "axes[0].set_xlabel(\"Tanimoto (MACCS)\")\n", "axes[0].set_ylabel(\"Dice (MACCS)\")\n", "\n", "molecule_dataset.plot(\"tanimoto_morgan\", \"dice_morgan\", kind=\"scatter\", ax=axes[1])\n", "axes[1].plot([0, 1], [0, 1], \"k--\")\n", "axes[1].set_xlabel(\"Tanimoto (Morgan)\")\n", "axes[1].set_ylabel(\"Dice (Morgan)\")\n", "\n", "fig;" ], "metadata": { "id": "FMB6Qty1-pbn" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "molecule_dataset.sort_values([\"tanimoto_morgan\"], ascending=False).head(3)" ], "metadata": { "id": "4UzFAWc6-ykA" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "top_n_molecules = 10\n", "top_molecules = molecule_dataset.sort_values([\"tanimoto_morgan\"], ascending=False).reset_index()\n", "top_molecules = top_molecules[:top_n_molecules]\n", "legends = [\n", " f\"#{index+1} {molecule['molecule_chembl_id']}, pIC50={molecule['pIC50']:.2f}\"\n", " for index, molecule in top_molecules.iterrows()\n", "]\n", "Chem.Draw.MolsToGridImage(\n", " mols=[query] + top_molecules[\"ROMol\"].tolist(),\n", " legends=([\"Gefitinib\"] + legends),\n", " molsPerRow=3,\n", " subImgSize=(450, 150),\n", ")" ], "metadata": { "id": "BgcxSuAj-6gv" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "molecule_dataset.head(3)" ], "metadata": { "id": "u0tFvI4Y_Msf" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "def get_enrichment_data(molecules, similarity_measure, pic50_cutoff):\n", " \"\"\"\n", " Calculates x and y values for enrichment plot:\n", " x - % ranked dataset\n", " y - % true actives identified\n", "\n", " Parameters\n", " ----------\n", " molecules : pandas.DataFrame\n", " Molecules with similarity values to a query molecule.\n", " similarity_measure : str\n", " Column name which will be used to sort the DataFrame.\n", " pic50_cutoff : float\n", " pIC50 cutoff value used to discriminate active and inactive molecules.\n", "\n", " Returns\n", " -------\n", " pandas.DataFrame\n", " Enrichment data: Percentage of ranked dataset by similarity vs. percentage of identified true actives.\n", " \"\"\"\n", "\n", " # Get number of molecules in data set\n", " molecules_all = len(molecules)\n", "\n", " # Get number of active molecules in data set\n", " actives_all = sum(molecules[\"pIC50\"] >= pic50_cutoff)\n", "\n", " # Initialize a list that will hold the counter for actives and molecules while iterating through our dataset\n", " actives_counter_list = []\n", "\n", " # Initialize counter for actives\n", " actives_counter = 0\n", "\n", " # Note: Data must be ranked for enrichment plots:\n", " # Sort molecules by selected similarity measure\n", " molecules.sort_values([similarity_measure], ascending=False, inplace=True)\n", "\n", " # Iterate over the ranked dataset and check each molecule if active (by checking bioactivity)\n", " for value in molecules[\"pIC50\"]:\n", " if value >= pic50_cutoff:\n", " actives_counter += 1\n", " actives_counter_list.append(actives_counter)\n", "\n", " # Transform number of molecules into % ranked dataset\n", " molecules_percentage_list = [i / molecules_all for i in range(1, molecules_all + 1)]\n", "\n", " # Transform number of actives into % true actives identified\n", " actives_percentage_list = [i / actives_all for i in actives_counter_list]\n", "\n", " # Generate DataFrame with x and y values as well as label\n", " enrichment = pd.DataFrame(\n", " {\n", " \"% ranked dataset\": molecules_percentage_list,\n", " \"% true actives identified\": actives_percentage_list,\n", " }\n", " )\n", "\n", " return enrichment" ], "metadata": { "id": "GupUIRiM_Pin" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "pic50_cutoff = 6.3" ], "metadata": { "id": "KdgWp1l8_T4L" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "similarity_measures = [\"tanimoto_maccs\", \"tanimoto_morgan\"]\n", "enrichment_data = {\n", " similarity_measure: get_enrichment_data(molecule_dataset, similarity_measure, pic50_cutoff)\n", " for similarity_measure in similarity_measures\n", "}" ], "metadata": { "id": "mEKlMs7q_XG_" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# NBVAL_CHECK_OUTPUT\n", "enrichment_data[\"tanimoto_maccs\"].head()" ], "metadata": { "id": "58Jqhzk__ZHX" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "fig, ax = plt.subplots(figsize=(6, 6))\n", "\n", "fontsize = 20\n", "\n", "# Plot enrichment data\n", "for similarity_measure, enrichment in enrichment_data.items():\n", " ax = enrichment.plot(\n", " ax=ax,\n", " x=\"% ranked dataset\",\n", " y=\"% true actives identified\",\n", " label=similarity_measure,\n", " alpha=0.5,\n", " linewidth=4,\n", " )\n", "ax.set_ylabel(\"% True actives identified\", size=fontsize)\n", "ax.set_xlabel(\"% Ranked dataset\", size=fontsize)\n", "\n", "# Plot optimal curve: Ratio of actives in dataset\n", "ratio_actives = sum(molecule_dataset[\"pIC50\"] >= pic50_cutoff) / len(molecule_dataset)\n", "ax.plot(\n", " [0, ratio_actives, 1],\n", " [0, 1, 1],\n", " label=\"Optimal curve\",\n", " color=\"black\",\n", " linestyle=\"--\",\n", ")\n", "\n", "# Plot random curve\n", "ax.plot([0, 1], [0, 1], label=\"Random curve\", color=\"grey\", linestyle=\"--\")\n", "\n", "plt.tick_params(labelsize=16)\n", "plt.legend(\n", " labels=[\"MACCS\", \"Morgan\", \"Optimal\", \"Random\"],\n", " loc=(0.5, 0.08),\n", " fontsize=fontsize,\n", " labelspacing=0.3,\n", ")\n", "\n", "# Save plot -- use bbox_inches to include text boxes\n", "plt.savefig(\n", " \"enrichment_plot.png\",\n", " dpi=300,\n", " bbox_inches=\"tight\",\n", " transparent=True,\n", ")\n", "\n", "plt.show()" ], "metadata": { "id": "8is59ZNb_ctY" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "def calculate_enrichment_factor(enrichment, ranked_dataset_percentage_cutoff):\n", " \"\"\"\n", " Get the experimental enrichment factor for a given percentage of the ranked dataset.\n", "\n", " Parameters\n", " ----------\n", " enrichment : pd.DataFrame\n", " Enrichment data: Percentage of ranked dataset by similarity vs. percentage of\n", " identified true actives.\n", " ranked_dataset_percentage_cutoff : float or int\n", " Percentage of ranked dataset to be included in enrichment factor calculation.\n", "\n", " Returns\n", " -------\n", " float\n", " Experimental enrichment factor.\n", " \"\"\"\n", "\n", " # Keep only molecules that meet the cutoff\n", " enrichment = enrichment[\n", " enrichment[\"% ranked dataset\"] <= ranked_dataset_percentage_cutoff / 100\n", " ]\n", " # Get highest percentage of actives and the corresponding percentage of actives\n", " highest_enrichment = enrichment.iloc[-1]\n", " enrichment_factor = round(100 * float(highest_enrichment[\"% true actives identified\"]), 1)\n", " return enrichment_factor" ], "metadata": { "id": "XUWbwjU7_uBR" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "def calculate_enrichment_factor_random(ranked_dataset_percentage_cutoff):\n", " \"\"\"\n", " Get the random enrichment factor for a given percentage of the ranked dataset.\n", "\n", " Parameters\n", " ----------\n", " ranked_dataset_percentage_cutoff : float or int\n", " Percentage of ranked dataset to be included in enrichment factor calculation.\n", "\n", " Returns\n", " -------\n", " float\n", " Random enrichment factor.\n", " \"\"\"\n", "\n", " enrichment_factor_random = round(float(ranked_dataset_percentage_cutoff), 1)\n", " return enrichment_factor_random" ], "metadata": { "id": "wbjqjFRr_wqT" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "def calculate_enrichment_factor_optimal(molecules, ranked_dataset_percentage_cutoff, pic50_cutoff):\n", " \"\"\"\n", " Get the optimal random enrichment factor for a given percentage of the ranked dataset.\n", "\n", " Parameters\n", " ----------\n", " molecules : pandas.DataFrame\n", " the DataFrame with all the molecules and pIC50.\n", " ranked_dataset_percentage_cutoff : float or int\n", " Percentage of ranked dataset to be included in enrichment factor calculation.\n", " activity_cutoff: float\n", " pIC50 cutoff value used to discriminate active and inactive molecules\n", "\n", " Returns\n", " -------\n", " float\n", " Optimal enrichment factor.\n", " \"\"\"\n", "\n", " ratio = sum(molecules[\"pIC50\"] >= pic50_cutoff) / len(molecules) * 100\n", " if ranked_dataset_percentage_cutoff <= ratio:\n", " enrichment_factor_optimal = round(100 / ratio * ranked_dataset_percentage_cutoff, 1)\n", " else:\n", " enrichment_factor_optimal = 100.0\n", " return enrichment_factor_optimal" ], "metadata": { "id": "2m4I2IHo_0pj" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "ranked_dataset_percentage_cutoff = 5" ], "metadata": { "id": "WbAfjid7_4MA" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "for similarity_measure, enrichment in enrichment_data.items():\n", " enrichment_factor = calculate_enrichment_factor(enrichment, ranked_dataset_percentage_cutoff)\n", " print(\n", " f\"Experimental EF for {ranked_dataset_percentage_cutoff}% of ranked dataset ({similarity_measure}): {enrichment_factor}%\"\n", " )" ], "metadata": { "id": "osJhc5pK_5zv" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "enrichment_factor_random = calculate_enrichment_factor_random(ranked_dataset_percentage_cutoff)\n", "print(\n", " f\"Random EF for {ranked_dataset_percentage_cutoff}% of ranked dataset: {enrichment_factor_random}%\"\n", ")\n", "enrichment_factor_optimal = calculate_enrichment_factor_optimal(\n", " molecule_dataset, ranked_dataset_percentage_cutoff, pic50_cutoff\n", ")\n", "print(\n", " f\"Optimal EF for {ranked_dataset_percentage_cutoff}% of ranked dataset: {enrichment_factor_optimal}%\"\n", ")\n", "# NBVAL_CHECK_OUTPUT" ], "metadata": { "id": "dY2atRHUACoz" }, "execution_count": null, "outputs": [] } ] }