Diff of /ML_1.ipynb [000000] .. [480cb7]

Switch to side-by-side view

--- a
+++ b/ML_1.ipynb
@@ -0,0 +1,1776 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "view-in-github",
+        "colab_type": "text"
+      },
+      "source": [
+        "<a href=\"https://colab.research.google.com/github/francescopatane96/Computer_aided_drug_discovery_kit/blob/main/ML_1.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "In this module, you will learn more about the ChEMBL database and how to extract data from it for a target of interest. The resulting data sets can be used for many cheminformatics tasks, e.g., similarity search, clustering, or machine learning.\n",
+        "\n",
+        "In this notebook, you will find compounds that were tested against a specific target and filter the available bioactivity data."
+      ],
+      "metadata": {
+        "id": "Oe1lkXvnZMPn"
+      }
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "e106nlobq1s0"
+      },
+      "outputs": [],
+      "source": [
+        "%pip install chembl_webresource_client"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "U5H59_sy50Eh"
+      },
+      "outputs": [],
+      "source": [
+        "%pip install rdkit"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "GheijY-dsL_O"
+      },
+      "outputs": [],
+      "source": [
+        "import pandas as pd\n",
+        "import math\n",
+        "import rdkit\n",
+        "from tqdm.auto import tqdm\n",
+        "from chembl_webresource_client.new_client import new_client\n",
+        "from pandas import DataFrame\n",
+        "import numpy as np\n",
+        "from rdkit import Chem\n",
+        "from rdkit.Chem import Descriptors, Lipinski, PandasTools\n",
+        "import seaborn as sns\n",
+        "import matplotlib.pyplot as plt\n",
+        "from sklearn.model_selection import train_test_split\n",
+        "from sklearn.ensemble import RandomForestRegressor\n",
+        "from sklearn.feature_selection import VarianceThreshold\n",
+        "from pathlib import Path\n",
+        "from zipfile import ZipFile\n",
+        "from tempfile import TemporaryDirectory"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "HERE = Path(_dh[-1])\n",
+        "DATA = HERE / \"data\""
+      ],
+      "metadata": {
+        "id": "1qRTJhkZcEu4"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "create resource objects for API access"
+      ],
+      "metadata": {
+        "id": "9kcRsLOUcyDt"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "targets_api = new_client.target\n",
+        "compounds_api = new_client.molecule\n",
+        "bioactivities_api = new_client.activity"
+      ],
+      "metadata": {
+        "id": "Lit-Q2R8cPWG"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "type(targets_api)  #show the type of the object"
+      ],
+      "metadata": {
+        "id": "XVU-T3BJcUg-"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "uniprot_id = \"P00533\"    #change the uniprot ID for your project"
+      ],
+      "metadata": {
+        "id": "qKpN49tuckve"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Fetch target data from ChEMBL"
+      ],
+      "metadata": {
+        "id": "LXL1ThN-eG_X"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Get target information from ChEMBL, but restrict the result to the specified fields only\n",
+        "targets = targets_api.get(target_components__accession=uniprot_id).only(              ##variable that contains the results of the query\n",
+        "    \"target_chembl_id\", \"organism\", \"pref_name\", \"target_type\"\n",
+        ")\n",
+        "print(f'The type of the targets is \"{type(targets)}\"')"
+      ],
+      "metadata": {
+        "id": "4UKb5NCHeRHN"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Download target data from ChEMBL"
+      ],
+      "metadata": {
+        "id": "sSItAfALerr9"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "targets = pd.DataFrame(targets)\n",
+        "targets"
+      ],
+      "metadata": {
+        "id": "D5TXIYlSeqF1"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Select target (ChEMBL ID)"
+      ],
+      "metadata": {
+        "id": "bx1U-xWjfo23"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "target = targets.iloc[0]\n",
+        "target"
+      ],
+      "metadata": {
+        "id": "sewq42tUgorQ"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "save chembl id"
+      ],
+      "metadata": {
+        "id": "cbgfetyajN_d"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "target_id = target.target_chembl_id\n",
+        "print(f\"The target ChEMBL ID is {target_id}\")"
+      ],
+      "metadata": {
+        "id": "hJTy18a9iQs1"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Get Bioactivity data"
+      ],
+      "metadata": {
+        "id": "XHHY1Oa6kk8W"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "bioactivities = bioactivities_api.filter(\n",
+        "    target_chembl_id=target_id, type=\"IC50\", relation=\"=\", assay_type=\"B\"\n",
+        ").only(\n",
+        "    \"activity_id\",\n",
+        "    \"assay_chembl_id\",\n",
+        "    \"assay_description\",\n",
+        "    \"assay_type\",\n",
+        "    \"molecule_chembl_id\",\n",
+        "    \"type\",\n",
+        "    \"standard_units\",\n",
+        "    \"relation\",\n",
+        "    \"standard_value\",\n",
+        "    \"target_chembl_id\",\n",
+        "    \"target_organism\",\n",
+        ")\n",
+        "\n",
+        "print(f\"Length and type of bioactivities object: {len(bioactivities)}, {type(bioactivities)}\")"
+      ],
+      "metadata": {
+        "id": "DDowRa21ft0l"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "print(f\"Length and type of first element: {len(bioactivities[0])}, {type(bioactivities[0])}\")\n",
+        "bioactivities[0]"
+      ],
+      "metadata": {
+        "id": "87Lg52G3lM_9"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Download Bioactivity data from ChEMBL"
+      ],
+      "metadata": {
+        "id": "h_OiF1rdldYd"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "bioactivities_df = pd.DataFrame.from_records(bioactivities)\n",
+        "print(f\"DataFrame shape: {bioactivities_df.shape}\")\n",
+        "bioactivities_df.head()"
+      ],
+      "metadata": {
+        "id": "tSBwsyeulhJf"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
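+        "Check the reported units, then drop the `units` and `value` columns (the `standard_units` and `standard_value` columns are used from here on)"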
+      ],
+      "metadata": {
+        "id": "kEeVkM9cp03M"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "bioactivities_df['units'].unique()"
+      ],
+      "metadata": {
+        "id": "WxNCpyg8pqDt"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "bioactivities_df.drop([\"units\", \"value\"], axis=1, inplace=True)\n",
+        "\n"
+      ],
+      "metadata": {
+        "id": "-2Xi9Z0sqLOc"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "bioactivities_df.head()"
+      ],
+      "metadata": {
+        "id": "5Muz8ErjyvU-"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Preprocess and filter bioactivity data\n",
+        "\n",
+        "1. Convert datatype of “standard_value” from “object” to “float”\n"
+      ],
+      "metadata": {
+        "id": "mL0SzwR1zJSN"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "bioactivities_df.dtypes"
+      ],
+      "metadata": {
+        "id": "VhRtswiNzdE7"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "bioactivities_df = bioactivities_df.astype({\"standard_value\" : \"float64\"})"
+      ],
+      "metadata": {
+        "id": "jPct7o-OzlVT"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "bioactivities_df.dtypes"
+      ],
+      "metadata": {
+        "id": "G24NPnfm0Yf0"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "2. Delete entries with missing values"
+      ],
+      "metadata": {
+        "id": "VwtlFAr60p_W"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "bioactivities_df.dropna(axis=0, how=\"any\", inplace=True)   #drop rows which contain missing values\n",
+        "print(f\"DataFrame shape: {bioactivities_df.shape}\")"
+      ],
+      "metadata": {
+        "id": "4Fh1HrHO0qY7"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "3. Keep only entries with “standard_units == nM”"
+      ],
+      "metadata": {
+        "id": "X7zrbbG02Dk9"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "print(f\"Units in downloaded data: {bioactivities_df['standard_units'].unique()}\")\n",
+        "print(\n",
+        "    f\"Number of non-nM entries:\\\n",
+        "    {bioactivities_df[bioactivities_df['standard_units'] != 'nM'].shape[0]}\"\n",
+        ")"
+      ],
+      "metadata": {
+        "id": "07ywlhOp2Ful"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "bioactivities_df = bioactivities_df[bioactivities_df[\"standard_units\"] == \"nM\"]\n",
+        "print(f\"Units after filtering: {bioactivities_df['standard_units'].unique()}\")"
+      ],
+      "metadata": {
+        "id": "FTwQNUNm3b6T"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "print(f\"DataFrame shape: {bioactivities_df.shape}\")"
+      ],
+      "metadata": {
+        "id": "gZXOar0b4JCb"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "4. Delete duplicate molecules"
+      ],
+      "metadata": {
+        "id": "wRL-r3Mk4Qpa"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "bioactivities_df.drop_duplicates(\"molecule_chembl_id\", keep=\"first\", inplace=True)\n",
+        "print(f\"DataFrame shape: {bioactivities_df.shape}\")"
+      ],
+      "metadata": {
+        "id": "HxpzxMOn4RTA"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "5. Reset “DataFrame” index"
+      ],
+      "metadata": {
+        "id": "0TCyGhxc7lG0"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "bioactivities_df.reset_index(drop=True, inplace=True)\n",
+        "bioactivities_df.head()\n"
+      ],
+      "metadata": {
+        "id": "6E8aR1_h5ODX"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "6. Rename columns"
+      ],
+      "metadata": {
+        "id": "tW6-uahs7wrL"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "bioactivities_df.rename(\n",
+        "    columns={\"standard_value\": \"IC50\", \"standard_units\": \"units\"}, inplace=True\n",
+        ")\n",
+        "bioactivities_df.head()"
+      ],
+      "metadata": {
+        "id": "KaAy_0Mc7xOs"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "print(f\"DataFrame shape: {bioactivities_df.shape}\")"
+      ],
+      "metadata": {
+        "id": "Hof3VZxZ77gj"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Fetch compound data from ChEMBL"
+      ],
+      "metadata": {
+        "id": "aIS9YaVV8TQK"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "compounds_provider = compounds_api.filter(\n",
+        "    molecule_chembl_id__in=list(bioactivities_df[\"molecule_chembl_id\"])\n",
+        ").only(\"molecule_chembl_id\", \"molecule_structures\")"
+      ],
+      "metadata": {
+        "id": "hXA6DPwi8Vxb"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Download compound data from ChEMBL"
+      ],
+      "metadata": {
+        "id": "0S6rNxRi8flL"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "compounds = list(tqdm(compounds_provider))"
+      ],
+      "metadata": {
+        "id": "yAknPJXu8gOD"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "compounds_df = pd.DataFrame.from_records(\n",
+        "    compounds,\n",
+        ")\n",
+        "print(f\"DataFrame shape: {compounds_df.shape}\")"
+      ],
+      "metadata": {
+        "id": "D5GZ956xStWC"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "compounds_df.head()"
+      ],
+      "metadata": {
+        "id": "rp7AZJuoTAkJ"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Preprocess and filter compound data"
+      ],
+      "metadata": {
+        "id": "aYNuT613TY3A"
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "1. Remove entries with missing molecule structure entry"
+      ],
+      "metadata": {
+        "id": "mKvosdsvTetA"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "compounds_df.dropna(axis=0, how=\"any\", inplace=True)\n",
+        "print(f\"DataFrame shape: {compounds_df.shape}\")"
+      ],
+      "metadata": {
+        "id": "ih416BHBTcLw"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "2. Delete duplicate molecules"
+      ],
+      "metadata": {
+        "id": "4mn6curmUYgY"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "compounds_df.drop_duplicates(\"molecule_chembl_id\", keep=\"first\", inplace=True)\n",
+        "print(f\"DataFrame shape: {compounds_df.shape}\")"
+      ],
+      "metadata": {
+        "id": "eeDEjxiMUZEU"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "3. Get molecules with canonical SMILES"
+      ],
+      "metadata": {
+        "id": "DNAz4-D2VraP"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "compounds_df.iloc[0].molecule_structures.keys()"
+      ],
+      "metadata": {
+        "id": "34Jox4MjVxlH"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Pull the canonical SMILES out of each structure record. The loop variable is\n",
+        "# called `structures` so that the `compounds` list downloaded earlier is not overwritten.\n",
+        "canonical_smiles = []\n",
+        "\n",
+        "for structures in compounds_df[\"molecule_structures\"]:\n",
+        "    try:\n",
+        "        canonical_smiles.append(structures[\"canonical_smiles\"])\n",
+        "    except KeyError:\n",
+        "        # This record has no canonical SMILES entry; it is dropped in the next cell\n",
+        "        canonical_smiles.append(None)\n",
+        "\n",
+        "compounds_df[\"smiles\"] = canonical_smiles\n",
+        "compounds_df.drop(\"molecule_structures\", axis=1, inplace=True)\n",
+        "print(f\"DataFrame shape: {compounds_df.shape}\")"
+      ],
+      "metadata": {
+        "id": "X5n6vGUBWBxw"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "compounds_df.dropna(axis=0, how=\"any\", inplace=True)\n",
+        "print(f\"DataFrame shape: {compounds_df.shape}\")"
+      ],
+      "metadata": {
+        "id": "Dl68xc3NWKag"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Summary of compound and bioactivity data"
+      ],
+      "metadata": {
+        "id": "afNMfjvpWTFd"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "print(f\"Bioactivities filtered: {bioactivities_df.shape[0]}\")\n",
+        "bioactivities_df.columns"
+      ],
+      "metadata": {
+        "id": "sGYwdkcCWTq6"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "print(f\"Compounds filtered: {compounds_df.shape[0]}\")\n",
+        "compounds_df.columns"
+      ],
+      "metadata": {
+        "id": "7MGMVVJvZt0n"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Merge both datasets"
+      ],
+      "metadata": {
+        "id": "FGcP0j4uZ2nA"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Merge DataFrames\n",
+        "output_df = pd.merge(\n",
+        "    bioactivities_df[[\"molecule_chembl_id\", \"IC50\", \"units\"]],\n",
+        "    compounds_df,\n",
+        "    on=\"molecule_chembl_id\",\n",
+        ")\n",
+        "\n",
+        "# Reset row indices\n",
+        "output_df.reset_index(drop=True, inplace=True)\n",
+        "\n",
+        "print(f\"Dataset with {output_df.shape[0]} entries.\")"
+      ],
+      "metadata": {
+        "id": "0zDTBibAZ_Rf"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "output_df.dtypes"
+      ],
+      "metadata": {
+        "id": "pDdiJPS5a94A"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "output_df.head(10)"
+      ],
+      "metadata": {
+        "id": "aBuWZ-YIbDZY"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Add pIC50 values"
+      ],
+      "metadata": {
+        "id": "PNqDKdFlbH2_"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "def convert_ic50_to_pic50(IC50_value):\n",
+        "    \"\"\"Return the pIC50 for an IC50 value, computed as 9 - log10(IC50_value).\"\"\"\n",
+        "    return 9 - math.log10(IC50_value)"
+      ],
+      "metadata": {
+        "id": "MSdbsgQgbQvg"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Apply conversion to each row of the compounds DataFrame\n",
+        "output_df[\"pIC50\"] = output_df.apply(lambda x: convert_ic50_to_pic50(x.IC50), axis=1)"
+      ],
+      "metadata": {
+        "id": "gCjIDvlYbVUv"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "output_df.head()"
+      ],
+      "metadata": {
+        "id": "tH7o_kLibyLy"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Draw compound data"
+      ],
+      "metadata": {
+        "id": "Fa3HGN67cNgK"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "output_df.hist(column=\"pIC50\")"
+      ],
+      "metadata": {
+        "id": "AerCnuTFcMaI"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Add molecule column\n",
+        "PandasTools.AddMoleculeColumnToFrame(output_df, smilesCol=\"smiles\")"
+      ],
+      "metadata": {
+        "id": "YAYGFRulcybY"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Sort molecules by pIC50\n",
+        "output_df.sort_values(by=\"pIC50\", ascending=False, inplace=True)"
+      ],
+      "metadata": {
+        "id": "hZFyN8i5c7WK"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Reset index\n",
+        "output_df.reset_index(drop=True, inplace=True)"
+      ],
+      "metadata": {
+        "id": "cahO1MlXc9xo"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "output_df.drop(\"smiles\", axis=1).head(10)"
+      ],
+      "metadata": {
+        "id": "PjF0ghp5dAS3"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "print(f\"DataFrame shape: {output_df.shape}\")"
+      ],
+      "metadata": {
+        "id": "vDcnL71Vd3_H"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "output_df.to_csv(\"EGFR_compounds.csv\")\n",
+        "output_df.head()"
+      ],
+      "metadata": {
+        "id": "GuvoBlHmd_zh"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "From here: second workflow, using acetylcholinesterase as the target"
+      ],
+      "metadata": {
+        "id": "koTJopypn5x9"
+      }
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "YoQHms4Bq-kR"
+      },
+      "outputs": [],
+      "source": [
+        "target = new_client.target                                     \n",
+        "target_query = target.search('acetylcholinesterase')\n",
+        "targets = pd.DataFrame.from_dict(target_query)\n",
+        "pd.set_option('display.max_rows', 100)\n",
+        "pd.set_option('display.max_columns', 100)\n",
+        "print(targets)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "jFyCFTvCrBQ0"
+      },
+      "outputs": [],
+      "source": [
+        "selected_target = targets.target_chembl_id[0]\n",
+        "selected_target"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "O2GG2u3WrEEE"
+      },
+      "outputs": [],
+      "source": [
+        "activity = new_client.activity\n",
+        "res = activity.filter(target_chembl_id=selected_target).filter(standard_type=\"IC50\")\n",
+        "print(res)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "kbOCvFcJrJc9"
+      },
+      "outputs": [],
+      "source": [
+        "df = pd.DataFrame.from_dict(res)\n",
+        "print(df)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "rcn6cRGZrNZU"
+      },
+      "outputs": [],
+      "source": [
+        "df.to_csv('acetylcholinesterase_bioactivity_data.csv', index=False)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "wSnzHtqgrQlF"
+      },
+      "outputs": [],
+      "source": [
+        "act_normal = df[df.standard_value.notna()]\n",
+        "act_normal = act_normal[act_normal.canonical_smiles.notna()]\n",
+        "act_normal = act_normal.drop_duplicates(['canonical_smiles'])\n",
+        "act_normal"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "CFiSpv_drTuV"
+      },
+      "outputs": [],
+      "source": [
+        "selection = ['molecule_chembl_id', 'canonical_smiles', 'standard_value']\n",
+        "new_act = act_normal[selection]\n",
+        "new_act"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "VtcNkworrXoO"
+      },
+      "outputs": [],
+      "source": [
+        "new_act.to_csv('acetylcholinesterase_bioactivity_clear', index=False)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "LYZAC9sqrpCt"
+      },
+      "outputs": [],
+      "source": [
+        "# Temporary: reload the cleaned data from the file written in the previous cell\n",
+        "new_act = pd.read_csv('acetylcholinesterase_bioactivity_clear')"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "oJJ-QG1LrsSs"
+      },
+      "outputs": [],
+      "source": [
+        "# Label each compound by its standard_value: >= 10000 is 'inactive',\n",
+        "# <= 1000 is 'active', and anything in between is 'intermediate'.\n",
+        "bioactivity_threshold = []\n",
+        "for value in new_act.standard_value:\n",
+        "  value = float(value)\n",
+        "  if value >= 10000:\n",
+        "    bioactivity_threshold.append('inactive')\n",
+        "  elif value <= 1000:\n",
+        "    bioactivity_threshold.append('active')\n",
+        "  else:\n",
+        "    bioactivity_threshold.append('intermediate')\n",
+        "# Reuse new_act's index: pd.concat(axis=1) aligns on index labels, so the labels\n",
+        "# stay on the right rows even if new_act does not have a default 0..n-1 index.\n",
+        "bioactivity_class = pd.Series(bioactivity_threshold, name='bioactivity_class', index=new_act.index)\n",
+        "act5 = pd.concat([new_act, bioactivity_class], axis=1)\n",
+        "act5"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "LZ7yjq1ururk"
+      },
+      "outputs": [],
+      "source": [
+        "# Drop incomplete rows, then renumber the index: later cells join frames with\n",
+        "# pd.concat(axis=1), which aligns on index labels rather than on row position.\n",
+        "act5 = act5.dropna().reset_index(drop=True)\n",
+        "act5"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "yAXc6YJYrw9d"
+      },
+      "outputs": [],
+      "source": [
+        "act5.to_csv('bioactivity_with_class.csv', index=False)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "7zA_ktpBr0p-"
+      },
+      "outputs": [],
+      "source": [
+        "# RDKit is already installed with pip and imported near the top of this notebook,\n",
+        "# so the former Miniconda (Python 3.7) + conda RDKit installation is not needed.\n",
+        "import sys"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "6kx4rS3gr3sd"
+      },
+      "outputs": [],
+      "source": [
+        "act_nosmiles = act5.drop(columns='canonical_smiles')\n",
+        "\n",
+        "# For multi-component SMILES (parts separated by '.'), keep the longest component\n",
+        "smiles = [max(str(entry).split('.'), key=len) for entry in act5.canonical_smiles.tolist()]\n",
+        "\n",
+        "# Carry over act5's index: pd.concat(axis=1) aligns on index labels, not on row position\n",
+        "smiles = pd.Series(smiles, name='canonical_smiles', index=act5.index)\n",
+        "act_clean_smiles = pd.concat([act_nosmiles, smiles], axis=1)\n",
+        "act_clean_smiles"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "NFcvAxZPAnnH"
+      },
+      "outputs": [],
+      "source": [
+        "\n",
+        "def lipinski(smiles, verbose=False):\n",
+        "    \"\"\"Compute Lipinski descriptors for an iterable of SMILES strings.\n",
+        "\n",
+        "    Returns a DataFrame with one row per input SMILES, in input order, and the\n",
+        "    float columns MW, LogP, NumHDonors and NumHAcceptors. A SMILES that RDKit\n",
+        "    cannot parse yields a row of NaN, so rows stay aligned with the input.\n",
+        "    `verbose` is accepted for backward compatibility and is not used.\n",
+        "    \"\"\"\n",
+        "    columnNames = [\"MW\", \"LogP\", \"NumHDonors\", \"NumHAcceptors\"]\n",
+        "    rows = []\n",
+        "    for elem in smiles:\n",
+        "        mol = Chem.MolFromSmiles(elem)\n",
+        "        if mol is None:\n",
+        "            # MolFromSmiles returns None when the SMILES cannot be parsed\n",
+        "            rows.append([np.nan] * len(columnNames))\n",
+        "            continue\n",
+        "        rows.append([Descriptors.MolWt(mol),\n",
+        "                     Descriptors.MolLogP(mol),\n",
+        "                     Lipinski.NumHDonors(mol),\n",
+        "                     Lipinski.NumHAcceptors(mol)])\n",
+        "\n",
+        "    # Build the table once. The explicit reshape keeps it 2-D for zero or one\n",
+        "    # molecule, where stacking rows one at a time does not give a 2-D array.\n",
+        "    data = np.array(rows, dtype=float).reshape(-1, len(columnNames))\n",
+        "    return pd.DataFrame(data=data, columns=columnNames)\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "0T2JY6OID6u_"
+      },
+      "outputs": [],
+      "source": [
+        "act_lipinski = lipinski(act_clean_smiles.canonical_smiles)\n",
+        "act_lipinski"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "xLnCc3HtE_Cd"
+      },
+      "outputs": [],
+      "source": [
+        "# pd.concat(axis=1) aligns on index labels. act_lipinski is numbered 0..n-1, so\n",
+        "# renumber act5 the same way to pair each compound with its own descriptors.\n",
+        "act_comb = pd.concat([act5.reset_index(drop=True), act_lipinski], axis = 1)\n",
+        "act_comb"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "DXatD_w4FW4F"
+      },
+      "source": [
+        "Normalize standard values (cap `standard_value` at 100,000,000)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "frzwrxXMFZEk"
+      },
+      "outputs": [],
+      "source": [
+        "def norm_value(input):\n",
+        "    \"\"\"Cap `standard_value` at 100,000,000 and return it as `standard_value_norm`.\n",
+        "\n",
+        "    Returns a new DataFrame in which the `standard_value` column is replaced by a\n",
+        "    `standard_value_norm` column (appended last). The frame passed in is not modified.\n",
+        "    \"\"\"\n",
+        "    capped = input['standard_value'].clip(upper=100000000)\n",
+        "    # Use the `columns=` keyword: pandas 2.0 no longer accepts the axis as a positional argument of drop()\n",
+        "    return input.drop(columns='standard_value').assign(standard_value_norm=capped)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "HuA3tdW0Febi"
+      },
+      "outputs": [],
+      "source": [
+        "act_norm = norm_value(act_comb)\n",
+        "act_norm"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "fJX4BDBdFkMs"
+      },
+      "source": [
+        "Convert to pIC50"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "24y3Os7tFi4G"
+      },
+      "outputs": [],
+      "source": [
+        "def pIC50(input):\n",
+        "    \"\"\"Convert `standard_value_norm` (nM) to pIC50, i.e. -log10 of the molar value.\n",
+        "\n",
+        "    Returns a new DataFrame in which the `standard_value_norm` column is replaced by a\n",
+        "    `pIC50` column (appended last). The frame passed in is not modified.\n",
+        "    \"\"\"\n",
+        "    molar = input['standard_value_norm'] * (10**-9)  # Converts nM to M\n",
+        "    # Use the `columns=` keyword: pandas 2.0 no longer accepts the axis as a positional argument of drop()\n",
+        "    return input.drop(columns='standard_value_norm').assign(pIC50=-np.log10(molar))"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "7UCx_2ukFqet"
+      },
+      "outputs": [],
+      "source": [
+        "act_final = pIC50(act_norm)\n",
+        "act_final"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "ZA0cWxfbHIUT"
+      },
+      "outputs": [],
+      "source": [
+        "act_final.to_csv('bioactvity_pIC50.csv')"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "UvNudUP2IUke"
+      },
+      "source": [
+        "Exploratory data analysis (chemical space analysis) via lipinski descriptors"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "OX-2wFNeITs_"
+      },
+      "outputs": [],
+      "source": [
+        "act_fn = act_final[act_final['bioactivity_class'] != 'intermediate']\n",
+        "act_fn"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "t7hwhA53IsMS"
+      },
+      "source": [
+        "frequency of 2 classes (active, inactive)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "Gg1yi_pPIqYL"
+      },
+      "outputs": [],
+      "source": [
+        "# Bar chart of how many compounds fall into each bioactivity class, saved as a PDF.\n",
+        "fig, ax = plt.subplots(figsize=(5.5, 5.5))\n",
+        "sns.countplot(data=act_fn, x='bioactivity_class', edgecolor='black', ax=ax)\n",
+        "ax.set_xlabel('Bioactivity class', fontsize=14, fontweight='bold')\n",
+        "ax.set_ylabel('Frequency', fontsize=14, fontweight='bold')\n",
+        "fig.savefig('plot_bioactivity_class.pdf')"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "_OGm6lDzI1dy"
+      },
+      "source": [
+        "Statistical analysis (Mann-Whitney U test)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "rGz2ntRYI4Jb"
+      },
+      "outputs": [],
+      "source": [
+        "def mannwhitney(descriptor, verbose=False):\n",
+        "  from numpy.random import seed\n",
+        "  from numpy.random import randn\n",
+        "  from scipy.stats import mannwhitneyu\n",
+        "\n",
+        "# seed the random number generator\n",
+        "  seed(1)\n",
+        "\n",
+        "# actives and inactives\n",
+        "  selection = [descriptor, 'bioactivity_class']\n",
+        "  df = act_fn[selection]\n",
+        "  active = df[df['bioactivity_class'] == 'active']\n",
+        "  active = active[descriptor]\n",
+        "\n",
+        "  selection = [descriptor, 'bioactivity_class']\n",
+        "  df = act_fn[selection]\n",
+        "  inactive = df[df['bioactivity_class'] == 'inactive']\n",
+        "  inactive = inactive[descriptor]\n",
+        "\n",
+        "# compare samples\n",
+        "  stat, p = mannwhitneyu(active, inactive)\n",
+        "  #print('Statistics=%.3f, p=%.3f' % (stat, p))\n",
+        "\n",
+        "# interpret\n",
+        "  alpha = 0.05\n",
+        "  if p > alpha:\n",
+        "    interpretation = 'Same distribution (fail to reject H0)'\n",
+        "  else:\n",
+        "    interpretation = 'Different distribution (reject H0)'\n",
+        "  \n",
+        "  results = pd.DataFrame({'Descriptor':descriptor,\n",
+        "                          'Statistics':stat,\n",
+        "                          'p':p,\n",
+        "                          'alpha':alpha,\n",
+        "                          'Interpretation':interpretation}, index=[0])\n",
+        "  filename = 'mannwhitneyu_' + descriptor + '.csv'\n",
+        "  results.to_csv(filename)\n",
+        "\n",
+        "  return results"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "Y_4FAGSpI8rb"
+      },
+      "outputs": [],
+      "source": [
+        "# Box plot of pIC50 per bioactivity class, saved as a PDF.\n",
+        "fig, ax = plt.subplots(figsize=(5.5, 5.5))\n",
+        "sns.boxplot(data=act_fn, x='bioactivity_class', y='pIC50', ax=ax)\n",
+        "ax.set_xlabel('Bioactivity class', fontsize=14, fontweight='bold')\n",
+        "ax.set_ylabel('pIC50 value', fontsize=14, fontweight='bold')\n",
+        "fig.savefig('plot_ic50.pdf')"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "B_PXegE7JCm8"
+      },
+      "outputs": [],
+      "source": [
+        "mannwhitney('pIC50')"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "xQ5WSv8JJRPB"
+      },
+      "outputs": [],
+      "source": [
+        "# Box plot of molecular weight (MW) per bioactivity class, saved as a PDF.\n",
+        "fig, ax = plt.subplots(figsize=(5.5, 5.5))\n",
+        "sns.boxplot(data=act_fn, x='bioactivity_class', y='MW', ax=ax)\n",
+        "ax.set_xlabel('Bioactivity class', fontsize=14, fontweight='bold')\n",
+        "ax.set_ylabel('MW value', fontsize=14, fontweight='bold')\n",
+        "fig.savefig('plot_MW.pdf')"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "GVa0zXv6JU7D"
+      },
+      "outputs": [],
+      "source": [
+        "mannwhitney('MW')"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "k-kFqInSJjBV"
+      },
+      "outputs": [],
+      "source": [
+        "# Chemical-space scatter: MW against LogP, coloured by class and sized by pIC50.\n",
+        "plt.figure(figsize=(5.5, 5.5))\n",
+        "\n",
+        "sns.scatterplot(x='MW', y='LogP', data=act_fn, hue='bioactivity_class', size='pIC50', edgecolor='black', alpha=0.7)\n",
+        "\n",
+        "plt.xlabel('MW', fontsize=14, fontweight='bold')\n",
+        "plt.ylabel('LogP', fontsize=14, fontweight='bold')\n",
+        "# The legend is anchored to the right of the axes, i.e. outside the default figure area.\n",
+        "plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)\n",
+        "# bbox_inches='tight' expands the saved canvas to include that outside legend;\n",
+        "# without it the legend is cropped from the PDF.\n",
+        "plt.savefig('plot_MW_vs_LogP.pdf', bbox_inches='tight')"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "LfXsfiUUKfzC"
+      },
+      "outputs": [],
+      "source": [
+        "# Box plot of LogP per bioactivity class, saved as a PDF.\n",
+        "fig, ax = plt.subplots(figsize=(5.5, 5.5))\n",
+        "sns.boxplot(data=act_fn, x='bioactivity_class', y='LogP', ax=ax)\n",
+        "ax.set_xlabel('Bioactivity class', fontsize=14, fontweight='bold')\n",
+        "ax.set_ylabel('LogP value', fontsize=14, fontweight='bold')\n",
+        "fig.savefig('plot_LogP.pdf')"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "wFmRpaSeKj77"
+      },
+      "outputs": [],
+      "source": [
+        "mannwhitney('LogP')"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "zw2AGDBGKnE8"
+      },
+      "outputs": [],
+      "source": [
+        "# Box plot of the hydrogen-bond donor count per bioactivity class, saved as a PDF.\n",
+        "fig, ax = plt.subplots(figsize=(5.5, 5.5))\n",
+        "sns.boxplot(data=act_fn, x='bioactivity_class', y='NumHDonors', ax=ax)\n",
+        "ax.set_xlabel('Bioactivity class', fontsize=14, fontweight='bold')\n",
+        "ax.set_ylabel('NumHDon value', fontsize=14, fontweight='bold')\n",
+        "fig.savefig('plot_NumHDon.pdf')"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "b1Zzb4pzK1e0"
+      },
+      "outputs": [],
+      "source": [
+        "mannwhitney('NumHDonors')"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "NEAWEF3dK77x"
+      },
+      "outputs": [],
+      "source": [
+        "# Box plot of the hydrogen-bond acceptor count per bioactivity class, saved as a PDF.\n",
+        "fig, ax = plt.subplots(figsize=(5.5, 5.5))\n",
+        "sns.boxplot(data=act_fn, x='bioactivity_class', y='NumHAcceptors', ax=ax)\n",
+        "ax.set_xlabel('Bioactivity class', fontsize=14, fontweight='bold')\n",
+        "ax.set_ylabel('NumHAcc value', fontsize=14, fontweight='bold')\n",
+        "fig.savefig('plot_NumHAcc.pdf')"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "5D2tGmDWLD6p"
+      },
+      "outputs": [],
+      "source": [
+        "mannwhitney('NumHAcceptors')"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "S9tPgTEmLIYK"
+      },
+      "source": [
+        "results interpretation\n",
+        "\n",
+        "All four Lipinski descriptors exhibited a statistically significant difference between active and inactive molecules."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "ot6FtEFmLUrT"
+      },
+      "source": [
+        "Let's calculate additional descriptors with PaDEL"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "wWuiZc4xLPK0"
+      },
+      "outputs": [],
+      "source": [
+        "! wget https://github.com/gromdimon/features/raw/main/padel.sh\n",
+        "! wget https://github.com/gromdimon/features/raw/main/padel.zip"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "n2RNWYMCLfAV"
+      },
+      "outputs": [],
+      "source": [
+        "!unzip padel.zip"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "b33r0tPFLkPS"
+      },
+      "outputs": [],
+      "source": [
+        "act_final"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "yHcTh8fhLvdS"
+      },
+      "outputs": [],
+      "source": [
+        "selection = ['canonical_smiles', 'molecule_chembl_id']\n",
+        "act_selected = act_final[selection]\n",
+        "act_selected.to_csv('molecule.smi', sep='\\t', index=False, header=False )"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "BihYB7L9L2jq"
+      },
+      "outputs": [],
+      "source": [
+        "! cat molecule.smi | head -5\n",
+        "! cat molecule.smi | wc -l"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "-Mrzv48XMFDT"
+      },
+      "outputs": [],
+      "source": [
+        "!cat padel.sh"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "1tVryFTxMN0y"
+      },
+      "outputs": [],
+      "source": [
+        "!bash padel.sh"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "XbXYzcKEOK2q"
+      },
+      "outputs": [],
+      "source": [
+        "!ls -l"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "Dkv5WDtrOPNj"
+      },
+      "source": [
+        "Preparing data for later research"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "Y8uK0GzqOSpM"
+      },
+      "outputs": [],
+      "source": [
+        "actx = pd.read_csv('descriptors_output.csv')\n",
+        "actx"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "OYJFFGJ6OW8E"
+      },
+      "outputs": [],
+      "source": [
+        "actx = actx.drop(columns='Name')\n",
+        "actx\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "LGCK3PaMTMWw"
+      },
+      "outputs": [],
+      "source": [
+        "actx.to_csv('actx.csv')"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "MtWOd6NOTSPX"
+      },
+      "outputs": [],
+      "source": [
+        "acty = act_final['pIC50']\n",
+        "acty"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "UOuJfYsRTYoZ"
+      },
+      "outputs": [],
+      "source": [
+        "actx.shape"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "oVworOtbTa3Y"
+      },
+      "outputs": [],
+      "source": [
+        "acty.shape"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "m8rAgOi4Texp"
+      },
+      "source": [
+        "Making the new dataset"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "IAoSj0dqTd6r"
+      },
+      "outputs": [],
+      "source": [
+        "# pd.concat(axis=1) pairs rows by index label, not by position. actx carries the\n",
+        "# default RangeIndex from read_csv, whereas acty inherits act_final's index, so the\n",
+        "# index is reset to pair rows positionally instead of padding mismatches with NaN.\n",
+        "# NOTE(review): this assumes descriptors_output.csv keeps the row order of\n",
+        "# molecule.smi -- confirm against the PaDEL invocation in padel.sh.\n",
+        "assert len(actx) == len(acty), 'descriptor rows and pIC50 rows differ in count'\n",
+        "datasetxy = pd.concat([actx, acty.reset_index(drop=True)], axis=1)\n",
+        "datasetxy"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "GE_pa80NTnJj"
+      },
+      "outputs": [],
+      "source": [
+        "datasetxy.to_csv('dataset_with_padel_pIC50.csv', index=False)"
+      ]
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "provenance": [],
+      "authorship_tag": "ABX9TyPDQWlqT55dqPOAlKArRgo2",
+      "include_colab_link": true
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python"
+    },
+    "gpuClass": "standard"
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
\ No newline at end of file