368 lines (368 with data), 8.5 kB
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"machine_shape": "hm",
"authorship_tag": "ABX9TyMy47gMyjSjTg4xuBzieKdO",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU",
"gpuClass": "standard"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/github/francescopatane96/Computer_aided_drug_discovery_kit/blob/main/ML_7_1_prediction.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"source": [
"!pip install rdkit"
],
"metadata": {
"id": "ydp6nIbsoePl"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"%matplotlib inline\n",
"\n",
"import os\n",
"import csv\n",
"import joblib\n",
"\n",
"from rdkit.Chem import MACCSkeys\n",
"\n",
"import pickle\n",
"from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect\n",
"\n",
"from rdkit import rdBase\n",
"\n",
"from rdkit import Chem\n",
"from rdkit.Chem import AllChem\n",
"from rdkit.Chem import Draw\n",
"from IPython.display import display\n",
"\n",
"from rdkit.Chem import PandasTools\n",
"import pandas as pd\n",
"PandasTools.pd = pd\n",
"\n",
"import numpy as np\n",
"\n",
"np.random.seed(69)\n",
"\n",
"from mpl_toolkits.mplot3d import Axes3D\n",
"import matplotlib.pyplot as plt\n",
"\n",
"import random\n",
"\n",
"#from bokeh import mpl\n",
"#from bokeh.plotting import output_notebook, show\n",
"\n",
"print(\"RDKit Version: %s\" % rdBase.rdkitVersion)"
],
"metadata": {
"id": "uWMLelBAoxlG"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"!wget https://zenodo.org/record/5172018/files/GDB17.50000000.smi.gz?download=1 #download library"
],
"metadata": {
"id": "56KKO3KdbzKX"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"!gunzip mols.smi.gz"
],
"metadata": {
"id": "N42eXKUDb5Pt"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"! wget https://github.com/gromdimon/features/raw/main/padel.sh\n",
"! wget https://github.com/gromdimon/features/raw/main/padel.zip"
],
"metadata": {
"id": "r3EieL3Havs4"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"!unzip padel.zip"
],
"metadata": {
"id": "P12HoXP0axYM"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"!cat padel.sh"
],
"metadata": {
"id": "5swteLSEa5q4"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"df = pd.read_csv(\"mols.smi\", sep=\"/t\", header=None, \n",
" names=[\"smile\"])\n",
"df"
],
"metadata": {
"id": "DcPIl4PuXuqr"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"mols = df.sample(n = 1000) #you can omit this module\n",
"mols"
],
"metadata": {
"id": "6hxtq1XsZB3i"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"mols.to_csv('molecule_comma.smi')"
],
"metadata": {
"id": "AdeHs7MVzPnY"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"clean and preprocess your compounds library"
],
"metadata": {
"id": "4sLEYxEjvJuG"
}
},
{
"cell_type": "code",
"source": [
"with open('molecule.smi', 'w') as f_out, open('molecule_comma.smi') as f_in:\n",
" for line in f_in:\n",
" new_str = ','.join(line.split(',')[1:])\n",
" f_out.write(new_str)"
],
"metadata": {
"id": "iyuIYt4JzM-I"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"with open('molecule.smi', 'r') as fin:\n",
" data = fin.read().splitlines(True)\n",
"with open('molecule.smi', 'w') as fout:\n",
" fout.writelines(data[1:])"
],
"metadata": {
"id": "aWrRUq4r0_gB"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"df_smi = pd.read_csv('molecule.smi')\n",
"df_smi"
],
"metadata": {
"id": "U4DAR_ELJ_24"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"df_smi_reindexed = df_smi.reset_index()\n",
"df_smi_reindexed"
],
"metadata": {
"id": "RsDglzhD7NEf"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"df_smi_reindexed.to_csv('molecule.smi', sep='\\t', index=False, header=False )"
],
"metadata": {
"id": "wR_riIJWJyRw"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"!cat molecule.smi | head -5\n",
"!cat molecule.smi | wc -l"
],
"metadata": {
"id": "c94yiZEma4F9"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"smis = pd.read_csv('molecule.smi', header=None)\n",
"smis\n",
"\n"
],
"metadata": {
"id": "MuGQzLLXK4G3"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"smis = smis.tolist()"
],
"metadata": {
"id": "gbRUztrkMKRn"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"cans = [Chem.MolToSmiles(Chem.MolFromSmiles(smis),True) for smi in smis]\n",
"assert cans[0] == cans[1]"
],
"metadata": {
"id": "0tlNoXYSLLIP"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"before running padel script, you have to remove \"*.smi\" files while retain \"molecule.smi\" file"
],
"metadata": {
"id": "Oa0w_iq3vmOM"
}
},
{
"cell_type": "code",
"source": [
"!bash padel.sh"
],
"metadata": {
"id": "3EXLK96pa69_"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"descriptors_with_name = pd.read_csv('descriptors_output.csv', sep=',')\n",
"descriptors_with_name"
],
"metadata": {
"id": "u1SlhSFAWmor"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"x = x.drop('Name', axis=1)\n",
"x"
],
"metadata": {
"id": "8ZAvQXzRW7dj"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"loaded_rf = joblib.load(\"./random_forest.joblib\")"
],
"metadata": {
"id": "fi4dw4cK1z6K"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"prediction = loaded_rf.predict(x.values)\n",
"prediction"
],
"metadata": {
"id": "jpGsjkRJ16lB"
},
"execution_count": null,
"outputs": []
}
]
}