--- a +++ b/ML_1.ipynb @@ -0,0 +1,1776 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "<a href=\"https://colab.research.google.com/github/francescopatane96/Computer_aided_drug_discovery_kit/blob/main/ML_1.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" + ] + }, + { + "cell_type": "markdown", + "source": [ + "In this module, you will learn more about the ChEMBL database and how to extract data from it for a target of interest. Data sets can be used for many cheminformatics tasks, eg. similarity search and clustering or machine learning.\n", + "\n", + "In this notebook you will find compounds which were tested against a specific target and filtering available bioactivity data." + ], + "metadata": { + "id": "Oe1lkXvnZMPn" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "e106nlobq1s0" + }, + "outputs": [], + "source": [ + "! pip install chembl_webresource_client " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "U5H59_sy50Eh" + }, + "outputs": [], + "source": [ + "!pip install rdkit" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GheijY-dsL_O" + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import math\n", + "import rdkit\n", + "from tqdm.auto import tqdm\n", + "from chembl_webresource_client.new_client import new_client\n", + "from pandas import DataFrame\n", + "import numpy as np\n", + "from rdkit import Chem\n", + "from rdkit.Chem import Descriptors, Lipinski, PandasTools\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.ensemble import RandomForestRegressor\n", + "from sklearn.feature_selection import VarianceThreshold\n", + "from pathlib import Path\n", + "from zipfile import ZipFile\n", + "from tempfile import TemporaryDirectory" + ] + }, + { + "cell_type": "code", + "source": [ + "HERE = Path(_dh[-1])\n", + "DATA = HERE / \"data\"" + ], + "metadata": { + "id": "1qRTJhkZcEu4" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "create resource objects for API access" + ], + "metadata": { + "id": "9kcRsLOUcyDt" + } + }, + { + "cell_type": "code", + "source": [ + "targets_api = new_client.target\n", + "compounds_api = new_client.molecule\n", + "bioactivities_api = new_client.activity" + ], + "metadata": { + "id": "Lit-Q2R8cPWG" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "type(targets_api) #show the type of the object" + ], + "metadata": { + "id": "XVU-T3BJcUg-" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "uniprot_id = \"P00533\" #change the uniprot ID for your project" + ], + "metadata": { + "id": "qKpN49tuckve" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Fetch target data from ChEMBL" + ], + "metadata": { + "id": "LXL1ThN-eG_X" + } + }, + { + "cell_type": "code", + "source": [ + "# Get target information from ChEMBL but restrict it to specified class only\n", + "targets = targets_api.get(target_components__accession=uniprot_id).only( ##variable that contains the results of the query\n", + " \"target_chembl_id\", \"organism\", \"pref_name\", \"target_type\"\n", + ")\n", + "print(f'The type of the targets is \"{type(targets)}\"')" + ], + "metadata": { + "id": "4UKb5NCHeRHN" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Download target data from ChEMBL" + ], + "metadata": { + "id": "sSItAfALerr9" + } + }, + { + "cell_type": "code", + "source": [ + "targets = pd.DataFrame(targets)\n", + "targets" + ], + "metadata": { + "id": "D5TXIYlSeqF1" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Select target (ChEMBL ID)" + ], + "metadata": { + "id": "bx1U-xWjfo23" + } + }, + { + "cell_type": "code", + "source": [ + "target = targets.iloc[0]\n", + "target" + ], + "metadata": { + "id": "sewq42tUgorQ" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "save chembl id" + ], + "metadata": { + "id": "cbgfetyajN_d" + } + }, + { + "cell_type": "code", + "source": [ + "target_id = target.target_chembl_id\n", + "print(f\"The target ChEMBL ID is {target_id}\")" + ], + "metadata": { + "id": "hJTy18a9iQs1" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Get Bioactivity data" + ], + "metadata": { + "id": "XHHY1Oa6kk8W" + } + }, + { + "cell_type": "code", + "source": [ + "bioactivities = bioactivities_api.filter(\n", + " target_chembl_id=target_id, type=\"IC50\", relation=\"=\", assay_type=\"B\"\n", + ").only(\n", + " \"activity_id\",\n", + " \"assay_chembl_id\",\n", + " \"assay_description\",\n", + " \"assay_type\",\n", + " \"molecule_chembl_id\",\n", + " \"type\",\n", + " \"standard_units\",\n", + " \"relation\",\n", + " \"standard_value\",\n", + " \"target_chembl_id\",\n", + " \"target_organism\",\n", + ")\n", + "\n", + "print(f\"Length and type of bioactivities object: {len(bioactivities)}, {type(bioactivities)}\")" + ], + "metadata": { + "id": "DDowRa21ft0l" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "print(f\"Length and type of first element: {len(bioactivities[0])}, {type(bioactivities[0])}\")\n", + "bioactivities[0]" + ], + "metadata": { + "id": "87Lg52G3lM_9" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Download Bioactivity data from ChEMBL" + ], + "metadata": { + "id": "h_OiF1rdldYd" + } + }, + { + "cell_type": "code", + "source": [ + "bioactivities_df = pd.DataFrame.from_records(bioactivities)\n", + "print(f\"DataFrame shape: {bioactivities_df.shape}\")\n", + "bioactivities_df.head()" + ], + "metadata": { + "id": "tSBwsyeulhJf" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "convert values to nM" + ], + "metadata": { + "id": "kEeVkM9cp03M" + } + }, + { + "cell_type": "code", + "source": [ + "bioactivities_df['units'].unique()" + ], + "metadata": { + "id": "WxNCpyg8pqDt" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "bioactivities_df.drop([\"units\", \"value\"], axis=1, inplace=True)\n", + "\n" + ], + "metadata": { + "id": "-2Xi9Z0sqLOc" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "bioactivities_df.head()" + ], + "metadata": { + "id": "5Muz8ErjyvU-" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Preprocess and filter bioactivity data\n", + "\n", + "1. Convert datatype of “standard_value” from “object” to “float”\n" + ], + "metadata": { + "id": "mL0SzwR1zJSN" + } + }, + { + "cell_type": "code", + "source": [ + "bioactivities_df.dtypes" + ], + "metadata": { + "id": "VhRtswiNzdE7" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "bioactivities_df = bioactivities_df.astype({\"standard_value\" : \"float64\"})" + ], + "metadata": { + "id": "jPct7o-OzlVT" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "bioactivities_df.dtypes" + ], + "metadata": { + "id": "G24NPnfm0Yf0" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "2. Delete entries with missing values" + ], + "metadata": { + "id": "VwtlFAr60p_W" + } + }, + { + "cell_type": "code", + "source": [ + "bioactivities_df.dropna(axis=0, how=\"any\", inplace=True) #drop rows which contain missing values\n", + "print(f\"DataFrame shape: {bioactivities_df.shape}\")" + ], + "metadata": { + "id": "4Fh1HrHO0qY7" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "3. Keep only entries with “standard_unit == nM”" + ], + "metadata": { + "id": "X7zrbbG02Dk9" + } + }, + { + "cell_type": "code", + "source": [ + "print(f\"Units in downloaded data: {bioactivities_df['standard_units'].unique()}\")\n", + "print(\n", + " f\"Number of non-nM entries:\\\n", + " {bioactivities_df[bioactivities_df['standard_units'] != 'nM'].shape[0]}\"\n", + ")" + ], + "metadata": { + "id": "07ywlhOp2Ful" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "bioactivities_df = bioactivities_df[bioactivities_df[\"standard_units\"] == \"nM\"]\n", + "print(f\"Units after filtering: {bioactivities_df['standard_units'].unique()}\")" + ], + "metadata": { + "id": "FTwQNUNm3b6T" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "print(f\"DataFrame shape: {bioactivities_df.shape}\")" + ], + "metadata": { + "id": "gZXOar0b4JCb" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "4. Delete duplicate molecules" + ], + "metadata": { + "id": "wRL-r3Mk4Qpa" + } + }, + { + "cell_type": "code", + "source": [ + "bioactivities_df.drop_duplicates(\"molecule_chembl_id\", keep=\"first\", inplace=True)\n", + "print(f\"DataFrame shape: {bioactivities_df.shape}\")" + ], + "metadata": { + "id": "HxpzxMOn4RTA" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "5. Reset “DataFrame” index" + ], + "metadata": { + "id": "0TCyGhxc7lG0" + } + }, + { + "cell_type": "code", + "source": [ + "bioactivities_df.reset_index(drop=True, inplace=True)\n", + "bioactivities_df.head()\n" + ], + "metadata": { + "id": "6E8aR1_h5ODX" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "6. Rename columns" + ], + "metadata": { + "id": "tW6-uahs7wrL" + } + }, + { + "cell_type": "code", + "source": [ + "bioactivities_df.rename(\n", + " columns={\"standard_value\": \"IC50\", \"standard_units\": \"units\"}, inplace=True\n", + ")\n", + "bioactivities_df.head()" + ], + "metadata": { + "id": "KaAy_0Mc7xOs" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "print(f\"DataFrame shape: {bioactivities_df.shape}\")" + ], + "metadata": { + "id": "Hof3VZxZ77gj" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Fetch compound data from ChEMBL" + ], + "metadata": { + "id": "aIS9YaVV8TQK" + } + }, + { + "cell_type": "code", + "source": [ + "compounds_provider = compounds_api.filter(\n", + " molecule_chembl_id__in=list(bioactivities_df[\"molecule_chembl_id\"])\n", + ").only(\"molecule_chembl_id\", \"molecule_structures\")" + ], + "metadata": { + "id": "hXA6DPwi8Vxb" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Download compound data from ChEMBL" + ], + "metadata": { + "id": "0S6rNxRi8flL" + } + }, + { + "cell_type": "code", + "source": [ + "compounds = list(tqdm(compounds_provider))" + ], + "metadata": { + "id": "yAknPJXu8gOD" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "compounds_df = pd.DataFrame.from_records(\n", + " compounds,\n", + ")\n", + "print(f\"DataFrame shape: {compounds_df.shape}\")" + ], + "metadata": { + "id": "D5GZ956xStWC" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "compounds_df.head()" + ], + "metadata": { + "id": "rp7AZJuoTAkJ" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Preprocess and filter compound data" + ], + "metadata": { + "id": "aYNuT613TY3A" + } + }, + { + "cell_type": "markdown", + "source": [ + "1. Remove entries with missing molecule structure entry" + ], + "metadata": { + "id": "mKvosdsvTetA" + } + }, + { + "cell_type": "code", + "source": [ + "compounds_df.dropna(axis=0, how=\"any\", inplace=True)\n", + "print(f\"DataFrame shape: {compounds_df.shape}\")" + ], + "metadata": { + "id": "ih416BHBTcLw" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "2. Delete duplicate molecules" + ], + "metadata": { + "id": "4mn6curmUYgY" + } + }, + { + "cell_type": "code", + "source": [ + "compounds_df.drop_duplicates(\"molecule_chembl_id\", keep=\"first\", inplace=True)\n", + "print(f\"DataFrame shape: {compounds_df.shape}\")" + ], + "metadata": { + "id": "eeDEjxiMUZEU" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "3. Get molecules with canonical SMILES" + ], + "metadata": { + "id": "DNAz4-D2VraP" + } + }, + { + "cell_type": "code", + "source": [ + "compounds_df.iloc[0].molecule_structures.keys()" + ], + "metadata": { + "id": "34Jox4MjVxlH" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "canonical_smiles = []\n", + "\n", + "for i, compounds in compounds_df.iterrows():\n", + " try:\n", + " canonical_smiles.append(compounds[\"molecule_structures\"][\"canonical_smiles\"])\n", + " except KeyError:\n", + " canonical_smiles.append(None)\n", + "\n", + "compounds_df[\"smiles\"] = canonical_smiles\n", + "compounds_df.drop(\"molecule_structures\", axis=1, inplace=True)\n", + "print(f\"DataFrame shape: {compounds_df.shape}\")" + ], + "metadata": { + "id": "X5n6vGUBWBxw" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "compounds_df.dropna(axis=0, how=\"any\", inplace=True)\n", + "print(f\"DataFrame shape: {compounds_df.shape}\")" + ], + "metadata": { + "id": "Dl68xc3NWKag" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Summary of compound and bioactivity data" + ], + "metadata": { + "id": "afNMfjvpWTFd" + } + }, + { + "cell_type": "code", + "source": [ + "print(f\"Bioactivities filtered: {bioactivities_df.shape[0]}\")\n", + "bioactivities_df.columns" + ], + "metadata": { + "id": "sGYwdkcCWTq6" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "print(f\"Compounds filtered: {compounds_df.shape[0]}\")\n", + "compounds_df.columns" + ], + "metadata": { + "id": "7MGMVVJvZt0n" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Merge both datasets" + ], + "metadata": { + "id": "FGcP0j4uZ2nA" + } + }, + { + "cell_type": "code", + "source": [ + "# Merge DataFrames\n", + "output_df = pd.merge(\n", + " bioactivities_df[[\"molecule_chembl_id\", \"IC50\", \"units\"]],\n", + " compounds_df,\n", + " on=\"molecule_chembl_id\",\n", + ")\n", + "\n", + "# Reset row indices\n", + "output_df.reset_index(drop=True, inplace=True)\n", + "\n", + "print(f\"Dataset with {output_df.shape[0]} entries.\")" + ], + "metadata": { + "id": "0zDTBibAZ_Rf" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "output_df.dtypes" + ], + "metadata": { + "id": "pDdiJPS5a94A" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "output_df.head(10)" + ], + "metadata": { + "id": "aBuWZ-YIbDZY" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Add pIC50 values" + ], + "metadata": { + "id": "PNqDKdFlbH2_" + } + }, + { + "cell_type": "code", + "source": [ + "def convert_ic50_to_pic50(IC50_value):\n", + " pIC50_value = 9 - math.log10(IC50_value)\n", + " return pIC50_value" + ], + "metadata": { + "id": "MSdbsgQgbQvg" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Apply conversion to each row of the compounds DataFrame\n", + "output_df[\"pIC50\"] = output_df.apply(lambda x: convert_ic50_to_pic50(x.IC50), axis=1)" + ], + "metadata": { + "id": "gCjIDvlYbVUv" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "output_df.head()" + ], + "metadata": { + "id": "tH7o_kLibyLy" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Draw compound data" + ], + "metadata": { + "id": "Fa3HGN67cNgK" + } + }, + { + "cell_type": "code", + "source": [ + "output_df.hist(column=\"pIC50\")" + ], + "metadata": { + "id": "AerCnuTFcMaI" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Add molecule column\n", + "PandasTools.AddMoleculeColumnToFrame(output_df, smilesCol=\"smiles\")" + ], + "metadata": { + "id": "YAYGFRulcybY" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Sort molecules by pIC50\n", + "output_df.sort_values(by=\"pIC50\", ascending=False, inplace=True)" + ], + "metadata": { + "id": "hZFyN8i5c7WK" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Reset index\n", + "output_df.reset_index(drop=True, inplace=True)" + ], + "metadata": { + "id": "cahO1MlXc9xo" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "output_df.drop(\"smiles\", axis=1).head(10)" + ], + "metadata": { + "id": "PjF0ghp5dAS3" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "print(f\"DataFrame shape: {output_df.shape}\")" + ], + "metadata": { + "id": "vDcnL71Vd3_H" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "output_df.to_csv(\"EGFR_compounds.csv\")\n", + "output_df.head()" + ], + "metadata": { + "id": "GuvoBlHmd_zh" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "da qui " + ], + "metadata": { + "id": "koTJopypn5x9" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "YoQHms4Bq-kR" + }, + "outputs": [], + "source": [ + "target = new_client.target \n", + "target_query = target.search('acetylcholinesterase')\n", + "targets = pd.DataFrame.from_dict(target_query)\n", + "pd.set_option('display.max_rows', 100)\n", + "pd.set_option('display.max_columns', 100)\n", + "print(targets)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "jFyCFTvCrBQ0" + }, + "outputs": [], + "source": [ + "selected_target = targets.target_chembl_id[0]\n", + "selected_target" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "O2GG2u3WrEEE" + }, + "outputs": [], + "source": [ + "activity = new_client.activity\n", + "res = activity.filter(target_chembl_id=selected_target).filter(standard_type=\"IC50\")\n", + "print(res)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kbOCvFcJrJc9" + }, + "outputs": [], + "source": [ + "df = pd.DataFrame.from_dict(res)\n", + "print(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rcn6cRGZrNZU" + }, + "outputs": [], + "source": [ + "df.to_csv('acetylcholinesterase_bioactivity_data.csv', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wSnzHtqgrQlF" + }, + "outputs": [], + "source": [ + "act_normal = df[df.standard_value.notna()]\n", + "act_normal = act_normal[act_normal.canonical_smiles.notna()]\n", + "act_normal = act_normal.drop_duplicates(['canonical_smiles'])\n", + "act_normal" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "CFiSpv_drTuV" + }, + "outputs": [], + "source": [ + "selection = ['molecule_chembl_id', 'canonical_smiles', 'standard_value']\n", + "new_act = act_normal[selection]\n", + "new_act" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "VtcNkworrXoO" + }, + "outputs": [], + "source": [ + "new_act.to_csv('acetylcholinesterase_bioactivity_clear', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "LYZAC9sqrpCt" + }, + "outputs": [], + "source": [ + "# This is temporary line\n", + "new_act = pd.read_csv('acetylcholinesterase_bioactivity_clear')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oJJ-QG1LrsSs" + }, + "outputs": [], + "source": [ + "bioactivity_threshold = []\n", + "for i in new_act.standard_value:\n", + " if float(i) >= 10000:\n", + " bioactivity_threshold.append('inactive')\n", + " elif float(i) <= 1000:\n", + " bioactivity_threshold.append('active')\n", + " else:\n", + " bioactivity_threshold.append('intermediate')\n", + "bioactivity_class = pd.Series(bioactivity_threshold, name = 'bioactivity_class')\n", + "act5 = pd.concat([new_act, bioactivity_class], axis=1)\n", + "act5" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "LZ7yjq1ururk" + }, + "outputs": [], + "source": [ + "act5 = act5.dropna()\n", + "act5" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "yAXc6YJYrw9d" + }, + "outputs": [], + "source": [ + "act5.to_csv('bioactivity_with_class.csv', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7zA_ktpBr0p-" + }, + "outputs": [], + "source": [ + "! wget https://repo.anaconda.com/miniconda/Miniconda3-py37_4.8.2-Linux-x86_64.sh\n", + "! chmod +x Miniconda3-py37_4.8.2-Linux-x86_64.sh\n", + "! bash ./Miniconda3-py37_4.8.2-Linux-x86_64.sh -b -f -p /usr/local\n", + "! conda install -c rdkit rdkit -y\n", + "import sys\n", + "sys.path.append('/usr/local/lib/python3.7/site-packages/')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6kx4rS3gr3sd" + }, + "outputs": [], + "source": [ + "act_nosmiles = act5.drop(columns='canonical_smiles')\n", + "smiles = []\n", + "\n", + "for i in act5.canonical_smiles.tolist():\n", + " cpd = str(i).split('.')\n", + " cpd_longest = max(cpd, key = len)\n", + " smiles.append(cpd_longest)\n", + "\n", + "smiles = pd.Series(smiles, name='canonical_smiles')\n", + "act_clean_smiles = pd.concat([act_nosmiles,smiles], axis=1)\n", + "act_clean_smiles" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NFcvAxZPAnnH" + }, + "outputs": [], + "source": [ + "\n", + "def lipinski(smiles, verbose=False):\n", + " moldata = []\n", + " for elem in smiles:\n", + " mol = Chem.MolFromSmiles(elem)\n", + " moldata.append(mol)\n", + " baseData = np.arange(1, 1)\n", + " i = 0\n", + " for mol in moldata:\n", + "\n", + " desc_MolWt = Descriptors.MolWt(mol)\n", + " desc_MolLogP = Descriptors.MolLogP(mol)\n", + " desc_NumHDonors = Lipinski.NumHDonors(mol)\n", + " desc_NumHAcceptors = Lipinski.NumHAcceptors(mol)\n", + " \n", + " row = np.array([desc_MolWt,\n", + " desc_MolLogP,\n", + " desc_NumHDonors,\n", + " desc_NumHAcceptors])\n", + " if (i==0):\n", + " baseData = row\n", + " else:\n", + " baseData = np.vstack([baseData, row])\n", + " i = i + 1\n", + "\n", + " columnNames = [\"MW\", \"LogP\", \"NumHDonors\", \"NumHAcceptors\"]\n", + " descriptors = pd.DataFrame(data=baseData, columns=columnNames)\n", + " return descriptors\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0T2JY6OID6u_" + }, + "outputs": [], + "source": [ + "act_lipinski = lipinski(act_clean_smiles.canonical_smiles)\n", + "act_lipinski" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xLnCc3HtE_Cd" + }, + "outputs": [], + "source": [ + "act_comb = pd.concat([act5, act_lipinski], axis = 1)\n", + "act_comb" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DXatD_w4FW4F" + }, + "source": [ + "normalizing standard values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "frzwrxXMFZEk" + }, + "outputs": [], + "source": [ + "def norm_value(input):\n", + " norm = []\n", + "\n", + " for i in input['standard_value']:\n", + " if i > 100000000:\n", + " i = 100000000\n", + " norm.append(i)\n", + "\n", + " input['standard_value_norm'] = norm\n", + " x = input.drop('standard_value', 1)\n", + " \n", + " return x" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "HuA3tdW0Febi" + }, + "outputs": [], + "source": [ + "act_norm = norm_value(act_comb)\n", + "act_norm" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fJX4BDBdFkMs" + }, + "source": [ + "converting to pIC50" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "24y3Os7tFi4G" + }, + "outputs": [], + "source": [ + "def pIC50(input):\n", + " pIC50 = []\n", + "\n", + " for i in input['standard_value_norm']:\n", + " molar = i*(10**-9) # Converts nM to M\n", + " pIC50.append(-np.log10(molar))\n", + "\n", + " input['pIC50'] = pIC50\n", + " x = input.drop('standard_value_norm', 1)\n", + " \n", + " return x" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7UCx_2ukFqet" + }, + "outputs": [], + "source": [ + "act_final = pIC50(act_norm)\n", + "act_final" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZA0cWxfbHIUT" + }, + "outputs": [], + "source": [ + "act_final.to_csv('bioactvity_pIC50.csv')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UvNudUP2IUke" + }, + "source": [ + "Exploratory data analysis (chemical space analysis) via lipinski descriptors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "OX-2wFNeITs_" + }, + "outputs": [], + "source": [ + "act_fn = act_final[act_final['bioactivity_class'] != 'intermediate']\n", + "act_fn" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "t7hwhA53IsMS" + }, + "source": [ + "frequency of 2 classes (active, inactive)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Gg1yi_pPIqYL" + }, + "outputs": [], + "source": [ + "plt.figure(figsize=(5.5, 5.5))\n", + "sns.countplot(x='bioactivity_class', data=act_fn, edgecolor='black')\n", + "plt.xlabel('Bioactivity class', fontsize=14, fontweight='bold')\n", + "plt.ylabel('Frequency', fontsize=14, fontweight='bold')\n", + "plt.savefig('plot_bioactivity_class.pdf')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_OGm6lDzI1dy" + }, + "source": [ + "making statistical analysis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rGz2ntRYI4Jb" + }, + "outputs": [], + "source": [ + "def mannwhitney(descriptor, verbose=False):\n", + " from numpy.random import seed\n", + " from numpy.random import randn\n", + " from scipy.stats import mannwhitneyu\n", + "\n", + "# seed the random number generator\n", + " seed(1)\n", + "\n", + "# actives and inactives\n", + " selection = [descriptor, 'bioactivity_class']\n", + " df = act_fn[selection]\n", + " active = df[df['bioactivity_class'] == 'active']\n", + " active = active[descriptor]\n", + "\n", + " selection = [descriptor, 'bioactivity_class']\n", + " df = act_fn[selection]\n", + " inactive = df[df['bioactivity_class'] == 'inactive']\n", + " inactive = inactive[descriptor]\n", + "\n", + "# compare samples\n", + " stat, p = mannwhitneyu(active, inactive)\n", + " #print('Statistics=%.3f, p=%.3f' % (stat, p))\n", + "\n", + "# interpret\n", + " alpha = 0.05\n", + " if p > alpha:\n", + " interpretation = 'Same distribution (fail to reject H0)'\n", + " else:\n", + " interpretation = 'Different distribution (reject H0)'\n", + " \n", + " results = pd.DataFrame({'Descriptor':descriptor,\n", + " 'Statistics':stat,\n", + " 'p':p,\n", + " 'alpha':alpha,\n", + " 'Interpretation':interpretation}, index=[0])\n", + " filename = 'mannwhitneyu_' + descriptor + '.csv'\n", + " results.to_csv(filename)\n", + "\n", + " return results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Y_4FAGSpI8rb" + }, + "outputs": [], + "source": [ + "plt.figure(figsize=(5.5, 5.5))\n", + "\n", + "sns.boxplot(x = 'bioactivity_class', y = 'pIC50', data = act_fn)\n", + "\n", + "plt.xlabel('Bioactivity class', fontsize=14, fontweight='bold')\n", + "plt.ylabel('pIC50 value', fontsize=14, fontweight='bold')\n", + "\n", + "plt.savefig('plot_ic50.pdf')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "B_PXegE7JCm8" + }, + "outputs": [], + "source": [ + "mannwhitney('pIC50')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xQ5WSv8JJRPB" + }, + "outputs": [], + "source": [ + "plt.figure(figsize=(5.5, 5.5))\n", + "\n", + "sns.boxplot(x = 'bioactivity_class', y = 'MW', data = act_fn)\n", + "\n", + "plt.xlabel('Bioactivity class', fontsize=14, fontweight='bold')\n", + "plt.ylabel('MW value', fontsize=14, fontweight='bold')\n", + "\n", + "plt.savefig('plot_MW.pdf')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GVa0zXv6JU7D" + }, + "outputs": [], + "source": [ + "mannwhitney('MW')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "k-kFqInSJjBV" + }, + "outputs": [], + "source": [ + "plt.figure(figsize=(5.5, 5.5))\n", + "\n", + "sns.scatterplot(x='MW', y='LogP', data=act_fn, hue='bioactivity_class', size='pIC50', edgecolor='black', alpha=0.7)\n", + "\n", + "plt.xlabel('MW', fontsize=14, fontweight='bold')\n", + "plt.ylabel('LogP', fontsize=14, fontweight='bold')\n", + "plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)\n", + "plt.savefig('plot_MW_vs_LogP.pdf')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "LfXsfiUUKfzC" + }, + "outputs": [], + "source": [ + "plt.figure(figsize=(5.5, 5.5))\n", + "\n", + "sns.boxplot(x = 'bioactivity_class', y = 'LogP', data = act_fn)\n", + "\n", + "plt.xlabel('Bioactivity class', fontsize=14, fontweight='bold')\n", + "plt.ylabel('LogP value', fontsize=14, fontweight='bold')\n", + "\n", + "plt.savefig('plot_LogP.pdf')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wFmRpaSeKj77" + }, + "outputs": [], + "source": [ + "mannwhitney('LogP')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zw2AGDBGKnE8" + }, + "outputs": [], + "source": [ + "plt.figure(figsize=(5.5, 5.5))\n", + "\n", + "sns.boxplot(x = 'bioactivity_class', y = 'NumHDonors', data = act_fn)\n", + "\n", + "plt.xlabel('Bioactivity class', fontsize=14, fontweight='bold')\n", + "plt.ylabel('NumHDon value', fontsize=14, fontweight='bold')\n", + "\n", + "plt.savefig('plot_NumHDon.pdf')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "b1Zzb4pzK1e0" + }, + "outputs": [], + "source": [ + "mannwhitney('NumHDonors')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NEAWEF3dK77x" + }, + "outputs": [], + "source": [ + "plt.figure(figsize=(5.5, 5.5))\n", + "\n", + "sns.boxplot(x = 'bioactivity_class', y = 'NumHAcceptors', data = act_fn)\n", + "\n", + "plt.xlabel('Bioactivity class', fontsize=14, fontweight='bold')\n", + "plt.ylabel('NumHAcc value', fontsize=14, fontweight='bold')\n", + "\n", + "plt.savefig('plot_NumHAcc.pdf')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5D2tGmDWLD6p" + }, + "outputs": [], + "source": [ + "mannwhitney('NumHAcceptors')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "S9tPgTEmLIYK" + }, + "source": [ + "results interpretation\n", + "\n", + "All of the four Lipinski descriptors exhibited statistically significant difference between active and inactive molecules." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ot6FtEFmLUrT" + }, + "source": [ + "let's calculate other descriptors with PADEL" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wWuiZc4xLPK0" + }, + "outputs": [], + "source": [ + "! wget https://github.com/gromdimon/features/raw/main/padel.sh\n", + "! wget https://github.com/gromdimon/features/raw/main/padel.zip" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "n2RNWYMCLfAV" + }, + "outputs": [], + "source": [ + "!unzip padel.zip" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "b33r0tPFLkPS" + }, + "outputs": [], + "source": [ + "act_final" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "yHcTh8fhLvdS" + }, + "outputs": [], + "source": [ + "selection = ['canonical_smiles', 'molecule_chembl_id']\n", + "act_selected = act_final[selection]\n", + "act_selected.to_csv('molecule.smi', sep='\\t', index=False, header=False )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BihYB7L9L2jq" + }, + "outputs": [], + "source": [ + "! cat molecule.smi | head -5\n", + "! cat molecule.smi | wc -l" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-Mrzv48XMFDT" + }, + "outputs": [], + "source": [ + "!cat padel.sh" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1tVryFTxMN0y" + }, + "outputs": [], + "source": [ + "!bash padel.sh" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XbXYzcKEOK2q" + }, + "outputs": [], + "source": [ + "!ls -l" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Dkv5WDtrOPNj" + }, + "source": [ + "preparing data for later researchs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Y8uK0GzqOSpM" + }, + "outputs": [], + "source": [ + "actx = pd.read_csv('descriptors_output.csv')\n", + "actx" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "OYJFFGJ6OW8E" + }, + "outputs": [], + "source": [ + "actx = actx.drop(columns='Name')\n", + "actx\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "LGCK3PaMTMWw" + }, + "outputs": [], + "source": [ + "actx.to_csv('actx.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "MtWOd6NOTSPX" + }, + "outputs": [], + "source": [ + "acty = act_final['pIC50']\n", + "acty" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "UOuJfYsRTYoZ" + }, + "outputs": [], + "source": [ + "actx.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oVworOtbTa3Y" + }, + "outputs": [], + "source": [ + "acty.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "m8rAgOi4Texp" + }, + "source": [ + "making new datase" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "IAoSj0dqTd6r" + }, + "outputs": [], + "source": [ + "datasetxy = pd.concat([actx, acty], axis=1)\n", + "datasetxy" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GE_pa80NTnJj" + }, + "outputs": [], + "source": [ + "datasetxy.to_csv('dataset_with_padel_pIC50.csv', index=False)" + ] + } + ], + "metadata": { + "colab": { + "provenance": [], + "authorship_tag": "ABX9TyPDQWlqT55dqPOAlKArRgo2", + "include_colab_link": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + }, + "gpuClass": "standard" + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file