1776 lines (1776 with data), 43.7 kB
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/github/francescopatane96/Computer_aided_drug_discovery_kit/blob/main/ML_1.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"source": [
"In this module, you will learn more about the ChEMBL database and how to extract data from it for a target of interest. The resulting data sets can be used for many cheminformatics tasks, e.g. similarity search, clustering, or machine learning.\n",
"\n",
"In this notebook, you will retrieve the compounds that were tested against a specific target and filter the available bioactivity data."
],
"metadata": {
"id": "Oe1lkXvnZMPn"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "e106nlobq1s0"
},
"outputs": [],
"source": [
"# %pip (rather than `! pip`) installs into the environment of the running kernel.\n",
"%pip install -q chembl_webresource_client"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "U5H59_sy50Eh"
},
"outputs": [],
"source": [
"# %pip (rather than `!pip`) installs into the environment of the running kernel.\n",
"%pip install -q rdkit"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "GheijY-dsL_O"
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import math\n",
"import rdkit\n",
"from tqdm.auto import tqdm\n",
"from chembl_webresource_client.new_client import new_client\n",
"from pandas import DataFrame\n",
"import numpy as np\n",
"from rdkit import Chem\n",
"from rdkit.Chem import Descriptors, Lipinski, PandasTools\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.ensemble import RandomForestRegressor\n",
"from sklearn.feature_selection import VarianceThreshold\n",
"from pathlib import Path\n",
"from zipfile import ZipFile\n",
"from tempfile import TemporaryDirectory"
]
},
{
"cell_type": "code",
"source": [
"HERE = Path(_dh[-1])\n",
"DATA = HERE / \"data\""
],
"metadata": {
"id": "1qRTJhkZcEu4"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"create resource objects for API access"
],
"metadata": {
"id": "9kcRsLOUcyDt"
}
},
{
"cell_type": "code",
"source": [
"targets_api = new_client.target\n",
"compounds_api = new_client.molecule\n",
"bioactivities_api = new_client.activity"
],
"metadata": {
"id": "Lit-Q2R8cPWG"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"type(targets_api) #show the type of the object"
],
"metadata": {
"id": "XVU-T3BJcUg-"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"uniprot_id = \"P00533\" #change the uniprot ID for your project"
],
"metadata": {
"id": "qKpN49tuckve"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"Fetch target data from ChEMBL"
],
"metadata": {
"id": "LXL1ThN-eG_X"
}
},
{
"cell_type": "code",
"source": [
"# Get target information from ChEMBL but restrict it to specified class only\n",
"targets = targets_api.get(target_components__accession=uniprot_id).only( ##variable that contains the results of the query\n",
" \"target_chembl_id\", \"organism\", \"pref_name\", \"target_type\"\n",
")\n",
"print(f'The type of the targets is \"{type(targets)}\"')"
],
"metadata": {
"id": "4UKb5NCHeRHN"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"Download target data from ChEMBL"
],
"metadata": {
"id": "sSItAfALerr9"
}
},
{
"cell_type": "code",
"source": [
"targets = pd.DataFrame(targets)\n",
"targets"
],
"metadata": {
"id": "D5TXIYlSeqF1"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"Select target (ChEMBL ID)"
],
"metadata": {
"id": "bx1U-xWjfo23"
}
},
{
"cell_type": "code",
"source": [
"target = targets.iloc[0]\n",
"target"
],
"metadata": {
"id": "sewq42tUgorQ"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"save chembl id"
],
"metadata": {
"id": "cbgfetyajN_d"
}
},
{
"cell_type": "code",
"source": [
"target_id = target.target_chembl_id\n",
"print(f\"The target ChEMBL ID is {target_id}\")"
],
"metadata": {
"id": "hJTy18a9iQs1"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"Get Bioactivity data"
],
"metadata": {
"id": "XHHY1Oa6kk8W"
}
},
{
"cell_type": "code",
"source": [
"bioactivities = bioactivities_api.filter(\n",
" target_chembl_id=target_id, type=\"IC50\", relation=\"=\", assay_type=\"B\"\n",
").only(\n",
" \"activity_id\",\n",
" \"assay_chembl_id\",\n",
" \"assay_description\",\n",
" \"assay_type\",\n",
" \"molecule_chembl_id\",\n",
" \"type\",\n",
" \"standard_units\",\n",
" \"relation\",\n",
" \"standard_value\",\n",
" \"target_chembl_id\",\n",
" \"target_organism\",\n",
")\n",
"\n",
"print(f\"Length and type of bioactivities object: {len(bioactivities)}, {type(bioactivities)}\")"
],
"metadata": {
"id": "DDowRa21ft0l"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"print(f\"Length and type of first element: {len(bioactivities[0])}, {type(bioactivities[0])}\")\n",
"bioactivities[0]"
],
"metadata": {
"id": "87Lg52G3lM_9"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"Download Bioactivity data from ChEMBL"
],
"metadata": {
"id": "h_OiF1rdldYd"
}
},
{
"cell_type": "code",
"source": [
"bioactivities_df = pd.DataFrame.from_records(bioactivities)\n",
"print(f\"DataFrame shape: {bioactivities_df.shape}\")\n",
"bioactivities_df.head()"
],
"metadata": {
"id": "tSBwsyeulhJf"
},
"execution_count": null,
"outputs": []
},
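{
"cell_type": "markdown",
"source": [
"Inspect the reported units, then drop the `units` and `value` columns; the `standard_units` and `standard_value` columns are the ones used in the following steps."
],
"metadata": {
"id": "kEeVkM9cp03M"
}
},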
{
"cell_type": "code",
"source": [
"bioactivities_df['units'].unique()"
],
"metadata": {
"id": "WxNCpyg8pqDt"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"bioactivities_df.drop([\"units\", \"value\"], axis=1, inplace=True)\n",
"\n"
],
"metadata": {
"id": "-2Xi9Z0sqLOc"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"bioactivities_df.head()"
],
"metadata": {
"id": "5Muz8ErjyvU-"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"Preprocess and filter bioactivity data\n",
"\n",
"1. Convert datatype of “standard_value” from “object” to “float”\n"
],
"metadata": {
"id": "mL0SzwR1zJSN"
}
},
{
"cell_type": "code",
"source": [
"bioactivities_df.dtypes"
],
"metadata": {
"id": "VhRtswiNzdE7"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"bioactivities_df = bioactivities_df.astype({\"standard_value\" : \"float64\"})"
],
"metadata": {
"id": "jPct7o-OzlVT"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"bioactivities_df.dtypes"
],
"metadata": {
"id": "G24NPnfm0Yf0"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"2. Delete entries with missing values"
],
"metadata": {
"id": "VwtlFAr60p_W"
}
},
{
"cell_type": "code",
"source": [
"bioactivities_df.dropna(axis=0, how=\"any\", inplace=True) #drop rows which contain missing values\n",
"print(f\"DataFrame shape: {bioactivities_df.shape}\")"
],
"metadata": {
"id": "4Fh1HrHO0qY7"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"3. Keep only entries with “standard_units == nM”"
],
"metadata": {
"id": "X7zrbbG02Dk9"
}
},
{
"cell_type": "code",
"source": [
"print(f\"Units in downloaded data: {bioactivities_df['standard_units'].unique()}\")\n",
"print(\n",
" f\"Number of non-nM entries:\\\n",
" {bioactivities_df[bioactivities_df['standard_units'] != 'nM'].shape[0]}\"\n",
")"
],
"metadata": {
"id": "07ywlhOp2Ful"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"bioactivities_df = bioactivities_df[bioactivities_df[\"standard_units\"] == \"nM\"]\n",
"print(f\"Units after filtering: {bioactivities_df['standard_units'].unique()}\")"
],
"metadata": {
"id": "FTwQNUNm3b6T"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"print(f\"DataFrame shape: {bioactivities_df.shape}\")"
],
"metadata": {
"id": "gZXOar0b4JCb"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"4. Delete duplicate molecules"
],
"metadata": {
"id": "wRL-r3Mk4Qpa"
}
},
{
"cell_type": "code",
"source": [
"bioactivities_df.drop_duplicates(\"molecule_chembl_id\", keep=\"first\", inplace=True)\n",
"print(f\"DataFrame shape: {bioactivities_df.shape}\")"
],
"metadata": {
"id": "HxpzxMOn4RTA"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"5. Reset “DataFrame” index"
],
"metadata": {
"id": "0TCyGhxc7lG0"
}
},
{
"cell_type": "code",
"source": [
"bioactivities_df.reset_index(drop=True, inplace=True)\n",
"bioactivities_df.head()\n"
],
"metadata": {
"id": "6E8aR1_h5ODX"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"6. Rename columns"
],
"metadata": {
"id": "tW6-uahs7wrL"
}
},
{
"cell_type": "code",
"source": [
"bioactivities_df.rename(\n",
" columns={\"standard_value\": \"IC50\", \"standard_units\": \"units\"}, inplace=True\n",
")\n",
"bioactivities_df.head()"
],
"metadata": {
"id": "KaAy_0Mc7xOs"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"print(f\"DataFrame shape: {bioactivities_df.shape}\")"
],
"metadata": {
"id": "Hof3VZxZ77gj"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"Fetch compound data from ChEMBL"
],
"metadata": {
"id": "aIS9YaVV8TQK"
}
},
{
"cell_type": "code",
"source": [
"compounds_provider = compounds_api.filter(\n",
" molecule_chembl_id__in=list(bioactivities_df[\"molecule_chembl_id\"])\n",
").only(\"molecule_chembl_id\", \"molecule_structures\")"
],
"metadata": {
"id": "hXA6DPwi8Vxb"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"Download compound data from ChEMBL"
],
"metadata": {
"id": "0S6rNxRi8flL"
}
},
{
"cell_type": "code",
"source": [
"compounds = list(tqdm(compounds_provider))"
],
"metadata": {
"id": "yAknPJXu8gOD"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"compounds_df = pd.DataFrame.from_records(\n",
" compounds,\n",
")\n",
"print(f\"DataFrame shape: {compounds_df.shape}\")"
],
"metadata": {
"id": "D5GZ956xStWC"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"compounds_df.head()"
],
"metadata": {
"id": "rp7AZJuoTAkJ"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"Preprocess and filter compound data"
],
"metadata": {
"id": "aYNuT613TY3A"
}
},
{
"cell_type": "markdown",
"source": [
"1. Remove entries with missing molecule structure entry"
],
"metadata": {
"id": "mKvosdsvTetA"
}
},
{
"cell_type": "code",
"source": [
"compounds_df.dropna(axis=0, how=\"any\", inplace=True)\n",
"print(f\"DataFrame shape: {compounds_df.shape}\")"
],
"metadata": {
"id": "ih416BHBTcLw"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"2. Delete duplicate molecules"
],
"metadata": {
"id": "4mn6curmUYgY"
}
},
{
"cell_type": "code",
"source": [
"compounds_df.drop_duplicates(\"molecule_chembl_id\", keep=\"first\", inplace=True)\n",
"print(f\"DataFrame shape: {compounds_df.shape}\")"
],
"metadata": {
"id": "eeDEjxiMUZEU"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"3. Get molecules with canonical SMILES"
],
"metadata": {
"id": "DNAz4-D2VraP"
}
},
{
"cell_type": "code",
"source": [
"compounds_df.iloc[0].molecule_structures.keys()"
],
"metadata": {
"id": "34Jox4MjVxlH"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Pull the canonical SMILES out of each `molecule_structures` dict; entries without\n",
"# that key get None so that the following dropna removes them.\n",
"# A comprehension is used so that the `compounds` list downloaded above is not\n",
"# overwritten by a loop variable, which keeps the earlier cells re-runnable.\n",
"compounds_df[\"smiles\"] = [\n",
"    structures.get(\"canonical_smiles\")\n",
"    for structures in compounds_df[\"molecule_structures\"]\n",
"]\n",
"compounds_df.drop(\"molecule_structures\", axis=1, inplace=True)\n",
"print(f\"DataFrame shape: {compounds_df.shape}\")"
],
"metadata": {
"id": "X5n6vGUBWBxw"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"compounds_df.dropna(axis=0, how=\"any\", inplace=True)\n",
"print(f\"DataFrame shape: {compounds_df.shape}\")"
],
"metadata": {
"id": "Dl68xc3NWKag"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"Summary of compound and bioactivity data"
],
"metadata": {
"id": "afNMfjvpWTFd"
}
},
{
"cell_type": "code",
"source": [
"print(f\"Bioactivities filtered: {bioactivities_df.shape[0]}\")\n",
"bioactivities_df.columns"
],
"metadata": {
"id": "sGYwdkcCWTq6"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"print(f\"Compounds filtered: {compounds_df.shape[0]}\")\n",
"compounds_df.columns"
],
"metadata": {
"id": "7MGMVVJvZt0n"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"Merge both datasets"
],
"metadata": {
"id": "FGcP0j4uZ2nA"
}
},
{
"cell_type": "code",
"source": [
"# Merge DataFrames\n",
"output_df = pd.merge(\n",
" bioactivities_df[[\"molecule_chembl_id\", \"IC50\", \"units\"]],\n",
" compounds_df,\n",
" on=\"molecule_chembl_id\",\n",
")\n",
"\n",
"# Reset row indices\n",
"output_df.reset_index(drop=True, inplace=True)\n",
"\n",
"print(f\"Dataset with {output_df.shape[0]} entries.\")"
],
"metadata": {
"id": "0zDTBibAZ_Rf"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"output_df.dtypes"
],
"metadata": {
"id": "pDdiJPS5a94A"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"output_df.head(10)"
],
"metadata": {
"id": "aBuWZ-YIbDZY"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"Add pIC50 values"
],
"metadata": {
"id": "PNqDKdFlbH2_"
}
},
{
"cell_type": "code",
"source": [
"def convert_ic50_to_pic50(IC50_value):\n",
"    \"\"\"Return the pIC50 for an IC50 value, computed as 9 - log10(IC50_value).\n",
"\n",
"    The constant 9 corresponds to an IC50 expressed in nM, which is the unit the\n",
"    bioactivity table was filtered to.\n",
"    \"\"\"\n",
"    return 9 - math.log10(IC50_value)"
],
"metadata": {
"id": "MSdbsgQgbQvg"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Apply conversion to each row of the compounds DataFrame\n",
"output_df[\"pIC50\"] = output_df.apply(lambda x: convert_ic50_to_pic50(x.IC50), axis=1)"
],
"metadata": {
"id": "gCjIDvlYbVUv"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"output_df.head()"
],
"metadata": {
"id": "tH7o_kLibyLy"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"Draw compound data"
],
"metadata": {
"id": "Fa3HGN67cNgK"
}
},
{
"cell_type": "code",
"source": [
"output_df.hist(column=\"pIC50\")"
],
"metadata": {
"id": "AerCnuTFcMaI"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Add molecule column\n",
"PandasTools.AddMoleculeColumnToFrame(output_df, smilesCol=\"smiles\")"
],
"metadata": {
"id": "YAYGFRulcybY"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Sort molecules by pIC50\n",
"output_df.sort_values(by=\"pIC50\", ascending=False, inplace=True)"
],
"metadata": {
"id": "hZFyN8i5c7WK"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Reset index\n",
"output_df.reset_index(drop=True, inplace=True)"
],
"metadata": {
"id": "cahO1MlXc9xo"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"output_df.drop(\"smiles\", axis=1).head(10)"
],
"metadata": {
"id": "PjF0ghp5dAS3"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"print(f\"DataFrame shape: {output_df.shape}\")"
],
"metadata": {
"id": "vDcnL71Vd3_H"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Leave out the RDKit molecule objects that were added for display above: in a CSV\n",
"# they only end up as uninformative object representations, and the structures stay\n",
"# available through the `smiles` column.\n",
"output_df.drop(columns=\"ROMol\", errors=\"ignore\").to_csv(\"EGFR_compounds.csv\")\n",
"output_df.head()"
],
"metadata": {
"id": "GuvoBlHmd_zh"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"From here on, the notebook works on a second target, acetylcholinesterase."
],
"metadata": {
"id": "koTJopypn5x9"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "YoQHms4Bq-kR"
},
"outputs": [],
"source": [
"target = new_client.target \n",
"target_query = target.search('acetylcholinesterase')\n",
"targets = pd.DataFrame.from_dict(target_query)\n",
"pd.set_option('display.max_rows', 100)\n",
"pd.set_option('display.max_columns', 100)\n",
"print(targets)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "jFyCFTvCrBQ0"
},
"outputs": [],
"source": [
"selected_target = targets.target_chembl_id[0]\n",
"selected_target"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "O2GG2u3WrEEE"
},
"outputs": [],
"source": [
"activity = new_client.activity\n",
"res = activity.filter(target_chembl_id=selected_target).filter(standard_type=\"IC50\")\n",
"print(res)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "kbOCvFcJrJc9"
},
"outputs": [],
"source": [
"df = pd.DataFrame.from_dict(res)\n",
"print(df)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "rcn6cRGZrNZU"
},
"outputs": [],
"source": [
"df.to_csv('acetylcholinesterase_bioactivity_data.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "wSnzHtqgrQlF"
},
"outputs": [],
"source": [
"act_normal = df[df.standard_value.notna()]\n",
"act_normal = act_normal[act_normal.canonical_smiles.notna()]\n",
"act_normal = act_normal.drop_duplicates(['canonical_smiles'])\n",
"act_normal"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "CFiSpv_drTuV"
},
"outputs": [],
"source": [
"selection = ['molecule_chembl_id', 'canonical_smiles', 'standard_value']\n",
"new_act = act_normal[selection]\n",
"new_act"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "VtcNkworrXoO"
},
"outputs": [],
"source": [
"new_act.to_csv('acetylcholinesterase_bioactivity_clear', index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "LYZAC9sqrpCt"
},
"outputs": [],
"source": [
"# This is temporary line\n",
"new_act = pd.read_csv('acetylcholinesterase_bioactivity_clear')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "oJJ-QG1LrsSs"
},
"outputs": [],
"source": [
"# Label each compound by its standard_value: >= 10000 is 'inactive', <= 1000 is\n",
"# 'active', and anything in between is 'intermediate'.\n",
"# The labels are assigned as a column instead of concatenating a separately indexed\n",
"# Series, so they stay on the right rows whatever the index of `new_act` is.\n",
"ic50_values = new_act['standard_value'].astype(float)\n",
"bioactivity_labels = np.select(\n",
"    [ic50_values >= 10000, ic50_values <= 1000],\n",
"    ['inactive', 'active'],\n",
"    default='intermediate',\n",
")\n",
"act5 = new_act.assign(bioactivity_class=bioactivity_labels)\n",
"act5"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "LZ7yjq1ururk"
},
"outputs": [],
"source": [
"act5 = act5.dropna()\n",
"act5"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "yAXc6YJYrw9d"
},
"outputs": [],
"source": [
"act5.to_csv('bioactivity_with_class.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "7zA_ktpBr0p-"
},
"outputs": [],
"source": [
"# RDKit is already installed with pip and imported in the cells at the top of this\n",
"# notebook, so no Miniconda (Python 3.7) bootstrap is needed here; appending\n",
"# Python 3.7 site-packages to sys.path could also clash with a kernel that runs a\n",
"# different Python version.\n",
"print(f'RDKit version: {rdkit.__version__}')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "6kx4rS3gr3sd"
},
"outputs": [],
"source": [
"# For multi-component SMILES (parts separated by '.'), keep only the longest part.\n",
"# The cleaned values are attached with `assign`, so they stay aligned with the rows\n",
"# of `act5` even when its index is not 0..n-1 (e.g. after the `dropna` above).\n",
"longest_fragment = act5['canonical_smiles'].astype(str).map(\n",
"    lambda smi: max(smi.split('.'), key=len)\n",
")\n",
"act_nosmiles = act5.drop(columns='canonical_smiles')\n",
"act_clean_smiles = act_nosmiles.assign(canonical_smiles=longest_fragment)\n",
"act_clean_smiles"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "NFcvAxZPAnnH"
},
"outputs": [],
"source": [
"def lipinski(smiles, verbose=False):\n",
"    \"\"\"Compute four Lipinski descriptors for each SMILES string.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    smiles : iterable of str\n",
"        SMILES strings to parse with RDKit.\n",
"    verbose : bool, optional\n",
"        Currently unused; kept so that existing calls keep working.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        One row per input SMILES, in input order, with the float columns\n",
"        'MW', 'LogP', 'NumHDonors' and 'NumHAcceptors' and a fresh 0..n-1 index.\n",
"        An empty input gives an empty frame with these four columns.\n",
"\n",
"    Raises\n",
"    ------\n",
"    ValueError\n",
"        If RDKit cannot parse one of the SMILES strings.\n",
"    \"\"\"\n",
"    columnNames = ['MW', 'LogP', 'NumHDonors', 'NumHAcceptors']\n",
"    rows = []\n",
"    for elem in smiles:\n",
"        mol = Chem.MolFromSmiles(elem)\n",
"        if mol is None:\n",
"            # MolFromSmiles returns None (it does not raise) for unparsable input.\n",
"            raise ValueError(f'RDKit could not parse SMILES: {elem!r}')\n",
"        rows.append([\n",
"            Descriptors.MolWt(mol),\n",
"            Descriptors.MolLogP(mol),\n",
"            Lipinski.NumHDonors(mol),\n",
"            Lipinski.NumHAcceptors(mol),\n",
"        ])\n",
"\n",
"    # Build the table in one go; the reshape keeps an (n, 4) layout even for zero\n",
"    # or one molecule, where a plain array would not be two-dimensional.\n",
"    baseData = np.array(rows, dtype=float).reshape(-1, len(columnNames))\n",
"    return pd.DataFrame(data=baseData, columns=columnNames)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "0T2JY6OID6u_"
},
"outputs": [],
"source": [
"act_lipinski = lipinski(act_clean_smiles.canonical_smiles)\n",
"act_lipinski"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "xLnCc3HtE_Cd"
},
"outputs": [],
"source": [
"# `act_lipinski` is indexed 0..n-1, so give `act5` the same positional index before\n",
"# joining side by side; otherwise index gaps left by `dropna` would misalign rows.\n",
"act_comb = pd.concat([act5.reset_index(drop=True), act_lipinski], axis=1)\n",
"act_comb"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "DXatD_w4FW4F"
},
"source": [
"normalizing standard values"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "frzwrxXMFZEk"
},
"outputs": [],
"source": [
"def norm_value(input):\n",
"    \"\"\"Cap `standard_value` at 100,000,000 and return it as `standard_value_norm`.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    input : pandas.DataFrame\n",
"        Frame with a numeric `standard_value` column. It is not modified.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        New frame in which `standard_value` is replaced by a trailing\n",
"        `standard_value_norm` column holding the capped values.\n",
"    \"\"\"\n",
"    capped = input['standard_value'].clip(upper=100000000)\n",
"    # `drop(columns=...)` is used because pandas 2.0 no longer accepts the axis as a\n",
"    # positional argument (`drop('standard_value', 1)` raises a TypeError there).\n",
"    return input.assign(standard_value_norm=capped).drop(columns='standard_value')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "HuA3tdW0Febi"
},
"outputs": [],
"source": [
"act_norm = norm_value(act_comb)\n",
"act_norm"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "fJX4BDBdFkMs"
},
"source": [
"converting to pIC50"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "24y3Os7tFi4G"
},
"outputs": [],
"source": [
"def pIC50(input):\n",
"    \"\"\"Convert the `standard_value_norm` column (nM) into a `pIC50` column.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    input : pandas.DataFrame\n",
"        Frame with a numeric `standard_value_norm` column. It is not modified.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        New frame in which `standard_value_norm` is replaced by a trailing `pIC50`\n",
"        column equal to -log10 of the value converted to molar.\n",
"    \"\"\"\n",
"    molar = input['standard_value_norm'] * (10**-9)  # Converts nM to M\n",
"    # `drop(columns=...)` is used because pandas 2.0 no longer accepts the axis as a\n",
"    # positional argument (`drop('standard_value_norm', 1)` raises a TypeError there).\n",
"    return input.assign(pIC50=-np.log10(molar)).drop(columns='standard_value_norm')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "7UCx_2ukFqet"
},
"outputs": [],
"source": [
"act_final = pIC50(act_norm)\n",
"act_final"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "ZA0cWxfbHIUT"
},
"outputs": [],
"source": [
"act_final.to_csv('bioactvity_pIC50.csv')"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "UvNudUP2IUke"
},
"source": [
"Exploratory data analysis (chemical space analysis) via lipinski descriptors"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "OX-2wFNeITs_"
},
"outputs": [],
"source": [
"act_fn = act_final[act_final['bioactivity_class'] != 'intermediate']\n",
"act_fn"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "t7hwhA53IsMS"
},
"source": [
"frequency of 2 classes (active, inactive)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Gg1yi_pPIqYL"
},
"outputs": [],
"source": [
"plt.figure(figsize=(5.5, 5.5))\n",
"sns.countplot(x='bioactivity_class', data=act_fn, edgecolor='black')\n",
"plt.xlabel('Bioactivity class', fontsize=14, fontweight='bold')\n",
"plt.ylabel('Frequency', fontsize=14, fontweight='bold')\n",
"plt.savefig('plot_bioactivity_class.pdf')"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "_OGm6lDzI1dy"
},
"source": [
"making statistical analysis"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "rGz2ntRYI4Jb"
},
"outputs": [],
"source": [
"def mannwhitney(descriptor, verbose=False):\n",
"    \"\"\"Run a Mann-Whitney U test comparing active and inactive compounds.\n",
"\n",
"    Reads the notebook-level `act_fn` frame, compares the values of `descriptor`\n",
"    between rows whose `bioactivity_class` is 'active' and rows where it is\n",
"    'inactive', and writes the one-row result table to\n",
"    'mannwhitneyu_<descriptor>.csv'.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    descriptor : str\n",
"        Name of the column of `act_fn` to test.\n",
"    verbose : bool, optional\n",
"        Currently unused; kept so that existing calls keep working.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        One row with the columns 'Descriptor', 'Statistics', 'p', 'alpha' and\n",
"        'Interpretation'.\n",
"    \"\"\"\n",
"    from scipy.stats import mannwhitneyu\n",
"\n",
"    # No RNG seeding is needed: the test itself is deterministic.\n",
"    active = act_fn.loc[act_fn['bioactivity_class'] == 'active', descriptor]\n",
"    inactive = act_fn.loc[act_fn['bioactivity_class'] == 'inactive', descriptor]\n",
"\n",
"    # compare samples\n",
"    stat, p = mannwhitneyu(active, inactive)\n",
"\n",
"    # interpret at the 5% significance level\n",
"    alpha = 0.05\n",
"    if p > alpha:\n",
"        interpretation = 'Same distribution (fail to reject H0)'\n",
"    else:\n",
"        interpretation = 'Different distribution (reject H0)'\n",
"\n",
"    results = pd.DataFrame({'Descriptor': descriptor,\n",
"                            'Statistics': stat,\n",
"                            'p': p,\n",
"                            'alpha': alpha,\n",
"                            'Interpretation': interpretation}, index=[0])\n",
"    filename = 'mannwhitneyu_' + descriptor + '.csv'\n",
"    results.to_csv(filename)\n",
"\n",
"    return results"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Y_4FAGSpI8rb"
},
"outputs": [],
"source": [
"plt.figure(figsize=(5.5, 5.5))\n",
"\n",
"sns.boxplot(x = 'bioactivity_class', y = 'pIC50', data = act_fn)\n",
"\n",
"plt.xlabel('Bioactivity class', fontsize=14, fontweight='bold')\n",
"plt.ylabel('pIC50 value', fontsize=14, fontweight='bold')\n",
"\n",
"plt.savefig('plot_ic50.pdf')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "B_PXegE7JCm8"
},
"outputs": [],
"source": [
"mannwhitney('pIC50')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "xQ5WSv8JJRPB"
},
"outputs": [],
"source": [
"plt.figure(figsize=(5.5, 5.5))\n",
"\n",
"sns.boxplot(x = 'bioactivity_class', y = 'MW', data = act_fn)\n",
"\n",
"plt.xlabel('Bioactivity class', fontsize=14, fontweight='bold')\n",
"plt.ylabel('MW value', fontsize=14, fontweight='bold')\n",
"\n",
"plt.savefig('plot_MW.pdf')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "GVa0zXv6JU7D"
},
"outputs": [],
"source": [
"mannwhitney('MW')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "k-kFqInSJjBV"
},
"outputs": [],
"source": [
"plt.figure(figsize=(5.5, 5.5))\n",
"\n",
"sns.scatterplot(x='MW', y='LogP', data=act_fn, hue='bioactivity_class', size='pIC50', edgecolor='black', alpha=0.7)\n",
"\n",
"plt.xlabel('MW', fontsize=14, fontweight='bold')\n",
"plt.ylabel('LogP', fontsize=14, fontweight='bold')\n",
"plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)\n",
"plt.savefig('plot_MW_vs_LogP.pdf')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "LfXsfiUUKfzC"
},
"outputs": [],
"source": [
"plt.figure(figsize=(5.5, 5.5))\n",
"\n",
"sns.boxplot(x = 'bioactivity_class', y = 'LogP', data = act_fn)\n",
"\n",
"plt.xlabel('Bioactivity class', fontsize=14, fontweight='bold')\n",
"plt.ylabel('LogP value', fontsize=14, fontweight='bold')\n",
"\n",
"plt.savefig('plot_LogP.pdf')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "wFmRpaSeKj77"
},
"outputs": [],
"source": [
"mannwhitney('LogP')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "zw2AGDBGKnE8"
},
"outputs": [],
"source": [
"plt.figure(figsize=(5.5, 5.5))\n",
"\n",
"sns.boxplot(x = 'bioactivity_class', y = 'NumHDonors', data = act_fn)\n",
"\n",
"plt.xlabel('Bioactivity class', fontsize=14, fontweight='bold')\n",
"plt.ylabel('NumHDon value', fontsize=14, fontweight='bold')\n",
"\n",
"plt.savefig('plot_NumHDon.pdf')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "b1Zzb4pzK1e0"
},
"outputs": [],
"source": [
"mannwhitney('NumHDonors')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "NEAWEF3dK77x"
},
"outputs": [],
"source": [
"plt.figure(figsize=(5.5, 5.5))\n",
"\n",
"sns.boxplot(x = 'bioactivity_class', y = 'NumHAcceptors', data = act_fn)\n",
"\n",
"plt.xlabel('Bioactivity class', fontsize=14, fontweight='bold')\n",
"plt.ylabel('NumHAcc value', fontsize=14, fontweight='bold')\n",
"\n",
"plt.savefig('plot_NumHAcc.pdf')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "5D2tGmDWLD6p"
},
"outputs": [],
"source": [
"mannwhitney('NumHAcceptors')"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "S9tPgTEmLIYK"
},
"source": [
"results interpretation\n",
"\n",
"All of the four Lipinski descriptors exhibited statistically significant difference between active and inactive molecules."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "ot6FtEFmLUrT"
},
"source": [
"let's calculate other descriptors with PADEL"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "wWuiZc4xLPK0"
},
"outputs": [],
"source": [
"! wget https://github.com/gromdimon/features/raw/main/padel.sh\n",
"! wget https://github.com/gromdimon/features/raw/main/padel.zip"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "n2RNWYMCLfAV"
},
"outputs": [],
"source": [
"!unzip padel.zip"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "b33r0tPFLkPS"
},
"outputs": [],
"source": [
"act_final"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "yHcTh8fhLvdS"
},
"outputs": [],
"source": [
"selection = ['canonical_smiles', 'molecule_chembl_id']\n",
"act_selected = act_final[selection]\n",
"act_selected.to_csv('molecule.smi', sep='\\t', index=False, header=False )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "BihYB7L9L2jq"
},
"outputs": [],
"source": [
"! cat molecule.smi | head -5\n",
"! cat molecule.smi | wc -l"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "-Mrzv48XMFDT"
},
"outputs": [],
"source": [
"!cat padel.sh"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "1tVryFTxMN0y"
},
"outputs": [],
"source": [
"!bash padel.sh"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "XbXYzcKEOK2q"
},
"outputs": [],
"source": [
"!ls -l"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Dkv5WDtrOPNj"
},
"source": [
"Preparing data for later research"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Y8uK0GzqOSpM"
},
"outputs": [],
"source": [
"actx = pd.read_csv('descriptors_output.csv')\n",
"actx"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "OYJFFGJ6OW8E"
},
"outputs": [],
"source": [
"actx = actx.drop(columns='Name')\n",
"actx\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "LGCK3PaMTMWw"
},
"outputs": [],
"source": [
"actx.to_csv('actx.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "MtWOd6NOTSPX"
},
"outputs": [],
"source": [
"acty = act_final['pIC50']\n",
"acty"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "UOuJfYsRTYoZ"
},
"outputs": [],
"source": [
"actx.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "oVworOtbTa3Y"
},
"outputs": [],
"source": [
"acty.shape"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "m8rAgOi4Texp"
},
"source": [
"Making the new dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "IAoSj0dqTd6r"
},
"outputs": [],
"source": [
"# Join features and target by position: `actx` is indexed 0..n-1 from the CSV,\n",
"# while `acty` carries the index of `act_final`, which may contain gaps.\n",
"# NOTE(review): this assumes descriptors_output.csv lists the molecules in the same\n",
"# order as molecule.smi - confirm for the PaDEL settings used.\n",
"assert len(actx) == len(acty), 'descriptor rows and pIC50 values differ in number'\n",
"datasetxy = pd.concat([actx, acty.reset_index(drop=True)], axis=1)\n",
"datasetxy"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "GE_pa80NTnJj"
},
"outputs": [],
"source": [
"datasetxy.to_csv('dataset_with_padel_pIC50.csv', index=False)"
]
}
],
"metadata": {
"colab": {
"provenance": [],
"authorship_tag": "ABX9TyPDQWlqT55dqPOAlKArRgo2",
"include_colab_link": true
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
},
"gpuClass": "standard"
},
"nbformat": 4,
"nbformat_minor": 0
}