Diff of /ML_7.ipynb [000000] .. [480cb7]

--- a
+++ b/ML_7.ipynb
@@ -0,0 +1,798 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": [],
+      "collapsed_sections": [],
+      "authorship_tag": "ABX9TyMWYYadwmTqLgGlAYdnnepg",
+      "include_colab_link": true
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    },
+    "accelerator": "GPU",
+    "gpuClass": "standard"
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "view-in-github",
+        "colab_type": "text"
+      },
+      "source": [
+        "<a href=\"https://colab.research.google.com/github/francescopatane96/Computer_aided_drug_discovery_kit/blob/main/ML_7.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!pip install rdkit   #install rdkit library"
+      ],
+      "metadata": {
+        "id": "sTiafVkFRvo8"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!pip install lazypredict  #install lazypredict"
+      ],
+      "metadata": {
+        "id": "npXUo4YLmFLJ"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!pip install git+https://github.com/volkamerlab/teachopencadd.git  #install teachopencadd dependecies "
+      ],
+      "metadata": {
+        "id": "L10Sz7LoR5f0"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "VqMlkn50RdJD"
+      },
+      "outputs": [],
+      "source": [
+        "from pathlib import Path\n",
+        "import seaborn as sns\n",
+        "from sklearn.metrics import confusion_matrix\n",
+        "from warnings import filterwarnings\n",
+        "import time\n",
+        "import lazypredict\n",
+        "from lazypredict.Supervised import LazyRegressor\n",
+        "from lazypredict.Supervised import LazyClassifier\n",
+        "\n",
+        "import pandas as pd\n",
+        "from sklearn.ensemble import RandomForestRegressor\n",
+        "import numpy as np\n",
+        "from sklearn import svm, metrics, clone\n",
+        "from sklearn.ensemble import RandomForestClassifier\n",
+        "from sklearn.neural_network import MLPClassifier\n",
+        "from sklearn.model_selection import KFold, train_test_split\n",
+        "from sklearn.metrics import auc, accuracy_score, recall_score\n",
+        "from sklearn.metrics import roc_curve, roc_auc_score\n",
+        "import matplotlib.pyplot as plt\n",
+        "from rdkit import Chem\n",
+        "from rdkit.Chem import MACCSkeys\n",
+        "from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect\n",
+        "\n",
+        "from sklearn.feature_selection import SelectFromModel\n",
+        "from teachopencadd.utils import seed_everything\n",
+        "\n",
+        "# Silence some expected warnings\n",
+        "filterwarnings(\"ignore\")\n",
+        "# Fix seed for reproducible results\n",
+        "SEED = 44\n",
+        "seed_everything(SEED)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Read data (Lipinski)\n",
+        "chembl_df1 = pd.read_csv(\n",
+        "    \"IDH_compounds_lipinski.csv\",    #read lipinski's descriptors csv\n",
+        "    index_col=0\n",
+        ")\n",
+        "\n",
+        "# Look at head\n",
+        "print(\"Shape of dataframe : \", chembl_df1.shape)\n",
+        "chembl_df1.head()\n"
+      ],
+      "metadata": {
+        "id": "QWPd9haHSfne"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# remove NaN, if present\n",
+        "chembl_df2 = chembl_df1.dropna()"
+      ],
+      "metadata": {
+        "id": "B0m1hX4InpUt"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "chembl_df2.shape"
+      ],
+      "metadata": {
+        "id": "H_DaNn2An0wO"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Keep only the columns we want\n",
+        "chembl_df3 = chembl_df2[[\"molecule_chembl_id\", \"smiles\", \"pIC50\"]]\n",
+        "chembl_df3.head()\n"
+      ],
+      "metadata": {
+        "id": "ui_bJaycSn6u"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "chembl_df4 = chembl_df3.reset_index()\n",
+        "chembl_df4"
+      ],
+      "metadata": {
+        "id": "bi3Gmmt0Oor0"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "def filter(IC):\n",
+        "  if IC <= 5.5:\n",
+        "    return 'low'\n",
+        "  if (IC > 5.5 and IC <= 7.5):\n",
+        "    return 'medium'\n",
+        "  if IC > 7.5:\n",
+        "    return 'high'\n",
+        "\n",
+        "chembl_df4['bio_class'] = chembl_df4['pIC50'].apply(filter)\n",
+        "chembl_df4.head(100)"
+      ],
+      "metadata": {
+        "id": "tP-wxfRyUTGj"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "install padel (descriptors calculator)"
+      ],
+      "metadata": {
+        "id": "LFEo-goYsEyo"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "! wget https://github.com/gromdimon/features/raw/main/padel.sh\n",
+        "! wget https://github.com/gromdimon/features/raw/main/padel.zip"
+      ],
+      "metadata": {
+        "id": "9fIJcheD2e8a"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!unzip padel.zip"
+      ],
+      "metadata": {
+        "id": "5dfNxKNO2ktS"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "selection = ['smiles', 'molecule_chembl_id']     #select columns we want to retain\n",
+        "act_selected = chembl_df4[selection]\n",
+        "act_selected.to_csv('molecule.smi', sep='\\t', index=False, header=False )"
+      ],
+      "metadata": {
+        "id": "5ga_3hCr2v7Z"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "! cat molecule.smi | head -5\n",
+        "! cat molecule.smi | wc -l"
+      ],
+      "metadata": {
+        "id": "SMxFKWGY3Cfs"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!cat padel.sh   #read the script"
+      ],
+      "metadata": {
+        "id": "PUPnFvOF3EAp"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!bash padel.sh    #run padel (it reads molecule.smi file that contains canonical form smiles)"
+      ],
+      "metadata": {
+        "id": "P-xS8kRB3F9y"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "actx = pd.read_csv('descriptors_output.csv') #padel generates a file called 'descriptors_output.csv'\n",
+        "actx                                         "
+      ],
+      "metadata": {
+        "id": "yvObjIU33gYa"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "actx_final = actx.drop('Name', axis=1)\n",
+        "actx_final"
+      ],
+      "metadata": {
+        "id": "QFjG5IPQB6cC"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "X = actx_final                  #descriptors\n",
+        "Y = chembl_df4.bio_class  #bioactivity class"
+      ],
+      "metadata": {
+        "id": "vAEvKNRjHq8h"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "random forest and feature selection"
+      ],
+      "metadata": {
+        "id": "9r2sw9PzVmqH"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Spliting data in 80\\20 ratio\n",
+        "X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.3, random_state=44)"
+      ],
+      "metadata": {
+        "id": "_crBdOZvHzfJ"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Seeing the data that was prepared\n",
+        "X_train.shape, Y_train.shape, X_test.shape, Y_test.shape"
+      ],
+      "metadata": {
+        "id": "ms4H8KktILlb"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "reg = SelectFromModel(RandomForestClassifier(n_estimators = 100))\n",
+        "reg.fit(X_train, Y_train)"
+      ],
+      "metadata": {
+        "id": "7K6R9FTVVq-h"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "reg.get_support()"
+      ],
+      "metadata": {
+        "id": "91FrxCu2XbXU"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "selected_feat= X_train.columns[(reg.get_support())]\n",
+        "len(selected_feat)"
+      ],
+      "metadata": {
+        "id": "yT8MbBVXXrP1"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "print(selected_feat)"
+      ],
+      "metadata": {
+        "id": "Ba4bgDi-YLW8"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
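+    {
+      "cell_type": "markdown",
+      "source": [
+        "The selected features are not used further below. If one wanted to train on the reduced feature set, a minimal sketch (assuming the `reg` selector and the `X_train`/`X_test`/`Y_train`/`Y_test` split from the cells above) could look like the cell below."
+      ],
+      "metadata": {}
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Minimal sketch: restrict the train/test sets to the features chosen by SelectFromModel\n",
+        "# (assumes reg, X_train, X_test, Y_train, Y_test from the cells above)\n",
+        "X_train_sel = reg.transform(X_train)\n",
+        "X_test_sel = reg.transform(X_test)\n",
+        "\n",
+        "rf_sel = RandomForestClassifier(n_estimators=100, random_state=44)\n",
+        "rf_sel.fit(X_train_sel, Y_train)\n",
+        "print('Accuracy on selected features:', rf_sel.score(X_test_sel, Y_test))"
+      ],
+      "metadata": {},
+      "execution_count": null,
+      "outputs": []
+    },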
+    {
+      "cell_type": "code",
+      "source": [
+        "# Defines and builds the lazyclassifier\n",
+        "reg = LazyRegressor(verbose=0,ignore_warnings=True, custom_metric=None)\n",
+        "models_train,predictions_train = reg.fit(X_train, X_train, Y_train, Y_train)"
+      ],
+      "metadata": {
+        "id": "Hj5D9uZAIPNS"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Performance table of the training set (80% subset)\n",
+        "models_train"
+      ],
+      "metadata": {
+        "id": "FZBL_doEI4qi"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Checking the study on a test sample\n",
+        "reg = LazyRegressor(verbose=0,ignore_warnings=True, custom_metric=None)\n",
+        "models_test,predictions_test = reg.fit(X_train,X_test,Y_train,Y_test)"
+      ],
+      "metadata": {
+        "id": "qHNtZTdoJEJD"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "models_test"
+      ],
+      "metadata": {
+        "id": "mxCFrrI_JU7h"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "X_train = X_train.astype('int32')\n",
+        "Y_train = Y_train.astype('string')"
+      ],
+      "metadata": {
+        "id": "ekoYHjCWJ85J"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "model = RandomForestClassifier(n_estimators=100, random_state=44)\n",
+        "model.fit(X_train, Y_train)\n",
+        "r2 = model.score(X_test, Y_test)\n",
+        "r2                                 #show explained variance"
+      ],
+      "metadata": {
+        "id": "jQRueafWKCGK"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# View accuracy score\n",
+        "accuracy_score(Y_test, Y_pred)"
+      ],
+      "metadata": {
+        "id": "I2Etmqbudm92"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Try data with test sample\n",
+        "\n",
+        "Y_pred = model.predict(X_test)\n",
+        "print(Y_pred)"
+      ],
+      "metadata": {
+        "id": "IFcYvrWIKLvZ"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Accuracy score is not a great measure of classifier performance when the classes are imbalanced. \n",
+        "\n",
+        "We need more information to define how well the model really performed. \n",
+        "\n",
+        "Did it perform equally well for each class? Were there any pairs of classes it found especially hard to distinguish? Let's find out with a confusion matrix."
+      ],
+      "metadata": {
+        "id": "_2OwPrI0d63x"
+      }
+    },
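+    {
+      "cell_type": "markdown",
+      "source": [
+        "As a quick check (a minimal sketch, assuming `Y_test` and `Y_pred` from the cells above), per-class precision and recall can be obtained with scikit-learn's `classification_report`, and balanced accuracy averages recall over the classes, which makes it less sensitive to class imbalance."
+      ],
+      "metadata": {}
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Minimal sketch: per-class metrics and balanced accuracy\n",
+        "# (assumes Y_test and Y_pred from the random forest cells above)\n",
+        "from sklearn.metrics import classification_report, balanced_accuracy_score\n",
+        "\n",
+        "print(classification_report(Y_test, Y_pred, labels=['low', 'medium', 'high']))\n",
+        "print('Balanced accuracy:', balanced_accuracy_score(Y_test, Y_pred))"
+      ],
+      "metadata": {},
+      "execution_count": null,
+      "outputs": []
+    },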
+    {
+      "cell_type": "markdown",
+      "source": [
+        "the rows represent the true labels and the columns represent predicted labels. Values on the diagonal represent the number (or percent, in a normalized confusion matrix) of times where the predicted label matches the true label. Values in the other cells represent instances where the classifier mislabeled an observation; the column tells us what the classifier predicted, and the row tells us what the right label was. This is a convenient way to spot areas where the model may need a little extra training."
+      ],
+      "metadata": {
+        "id": "SQUEry3Eelym"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "confusion_matrix(Y_test, Y_pred)"
+      ],
+      "metadata": {
+        "id": "EI--sFdkeeU2"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Get and reshape confusion matrix data\n",
+        "matrix = confusion_matrix(Y_test, Y_pred)\n",
+        "matrix = matrix.astype('float') / matrix.sum(axis=1)[:, np.newaxis]\n",
+        "\n",
+        "# Build the plot\n",
+        "plt.figure(figsize=(16,7))\n",
+        "sns.set(font_scale=1.4)\n",
+        "sns.heatmap(matrix, annot=True, annot_kws={'size':10},\n",
+        "            cmap=plt.cm.Greens, linewidths=0.2)\n",
+        "\n",
+        "# Add labels to the plot\n",
+        "class_names = ['low', 'medium', 'high']\n",
+        "tick_marks = np.arange(len(class_names))\n",
+        "tick_marks2 = tick_marks + 0.5\n",
+        "plt.xticks(tick_marks, class_names, rotation=25)\n",
+        "plt.yticks(tick_marks2, class_names, rotation=0)\n",
+        "plt.xlabel('Predicted label')\n",
+        "plt.ylabel('True label')\n",
+        "plt.title('Confusion Matrix for Random Forest Model')\n",
+        "plt.show()"
+      ],
+      "metadata": {
+        "id": "R-a1ayaKfoNm"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Our classifier struggled at predicting the medium and low labels. about half times, medium and low were mislabeled as high bioactivity."
+      ],
+      "metadata": {
+        "id": "pjdOSFo4hKlk"
+      }
+    },
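+    {
+      "cell_type": "markdown",
+      "source": [
+        "To put a number on that claim (a minimal sketch, assuming `Y_test` and `Y_pred` from above), per-class recall, the fraction of each true class predicted correctly, can be read off with `recall_score`, which is already imported at the top of the notebook."
+      ],
+      "metadata": {}
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Minimal sketch: per-class recall (fraction of each true class predicted correctly)\n",
+        "# (assumes Y_test and Y_pred from the cells above)\n",
+        "class_names = ['low', 'medium', 'high']\n",
+        "per_class_recall = recall_score(Y_test, Y_pred, labels=class_names, average=None)\n",
+        "for name, rec in zip(class_names, per_class_recall):\n",
+        "    print(f'{name}: {rec:.2f}')"
+      ],
+      "metadata": {},
+      "execution_count": null,
+      "outputs": []
+    },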
+    {
+      "cell_type": "markdown",
+      "source": [
+        "This example demonstrates the use of permutation_test_score to evaluate the significance of a cross-validated score using permutations.\n",
+        "\n",
+        "- We will also generate some random feature data (i.e., 20 features), uncorrelated with the class labels in the dataset.\n",
+        "\n",
+        "- Next, we calculate the permutation_test_score using the original dataset, which strongly predict the labels and the randomly generated features and iris labels, which should have no dependency between features and labels. We use the SVC classifier and Accuracy score to evaluate the model at each round.\n",
+        "\n",
+        "- permutation_test_score generates a null distribution by calculating the accuracy of the classifier on 1000 different permutations of the dataset, where features remain the same but labels undergo different permutations. This is the distribution for the null hypothesis which states there is no dependency between the features and labels. An empirical p-value is then calculated as the percentage of permutations for which the score obtained is greater that the score obtained using the original data."
+      ],
+      "metadata": {
+        "id": "Ml18oFkMp1vX"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "n_uncorrelated_features = 20\n",
+        "rng = np.random.RandomState(seed=44)\n",
+        "# Use same number of samples as in iris and 20 features\n",
+        "X_rand = rng.normal(size=(X.shape[0], n_uncorrelated_features))"
+      ],
+      "metadata": {
+        "id": "Utd1dYLVpe5o"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "from sklearn.svm import SVC\n",
+        "from sklearn.model_selection import StratifiedKFold\n",
+        "from sklearn.model_selection import permutation_test_score\n",
+        "\n",
+        "cv = StratifiedKFold(5, shuffle=True, random_state=44)\n",
+        "\n",
+        "score, perm_scores, pvalue = permutation_test_score(\n",
+        "    model, X, Y, scoring=\"accuracy\", cv=cv, n_permutations=100\n",
+        ")\n",
+        "\n",
+        "score_rand, perm_scores_rand, pvalue_rand = permutation_test_score(\n",
+        "    model, X_rand, Y, scoring=\"accuracy\", cv=cv, n_permutations=100\n",
+        ")"
+      ],
+      "metadata": {
+        "id": "uRQjbEYKiwme"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
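+    {
+      "cell_type": "markdown",
+      "source": [
+        "The empirical p-value described above can also be recomputed by hand from the returned permutation scores; a minimal sketch (assuming `score` and `perm_scores` from the cell above) is shown below, using the same +1 correction that scikit-learn applies."
+      ],
+      "metadata": {}
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Minimal sketch: empirical p-value = fraction of permutations scoring at least as well\n",
+        "# as the original data, with a +1 correction in numerator and denominator\n",
+        "# (assumes score and perm_scores from the permutation_test_score cell above)\n",
+        "manual_pvalue = (np.sum(perm_scores >= score) + 1) / (len(perm_scores) + 1)\n",
+        "print(manual_pvalue)"
+      ],
+      "metadata": {},
+      "execution_count": null,
+      "outputs": []
+    },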
+    {
+      "cell_type": "code",
+      "source": [
+        "fig, ax = plt.subplots()\n",
+        "\n",
+        "ax.hist(perm_scores, bins=20, density=True)\n",
+        "ax.axvline(score, ls=\"--\", color=\"r\")\n",
+        "score_label = f\"Score on original\\ndata: {score:.2f}\\n(p-value: {pvalue:.3f})\"\n",
+        "ax.text(0.7, 10, score_label, fontsize=12)\n",
+        "ax.set_xlabel(\"Accuracy score\")\n",
+        "_ = ax.set_ylabel(\"Probability\")"
+      ],
+      "metadata": {
+        "id": "tHgSa_UQt5de"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Below we plot a histogram of the permutation scores (the null distribution). The red line indicates the score obtained by the classifier on the original data. The score is much better than those obtained by using permuted data and the p-value is thus very low. This indicates that there is a low likelihood that this good score would be obtained by chance alone. It provides evidence that the dataset contains real dependency between features and labels and the classifier was able to utilize this to obtain good results."
+      ],
+      "metadata": {
+        "id": "WcnirjK5uT_m"
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Below we plot the null distribution for the randomized data. The permutation scores are similar to those obtained using the original iris dataset because the permutation always destroys any feature label dependency present. The score obtained on the original randomized data in this case though, is very poor. This results in a large p-value, confirming that there was no feature label dependency in the original data."
+      ],
+      "metadata": {
+        "id": "QJgZ8dWTu-T8"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "fig, ax = plt.subplots()\n",
+        "\n",
+        "ax.hist(perm_scores_rand, bins=20, density=True)\n",
+        "ax.set_xlim(0.13)\n",
+        "ax.axvline(score_rand, ls=\"--\", color=\"r\")\n",
+        "score_label = f\"Score on original\\ndata: {score_rand:.2f}\\n(p-value: {pvalue_rand:.3f})\"\n",
+        "ax.text(0.14, 7.5, score_label, fontsize=12)\n",
+        "ax.set_xlabel(\"Accuracy score\")\n",
+        "ax.set_ylabel(\"Probability\")\n",
+        "plt.show()"
+      ],
+      "metadata": {
+        "id": "vAOCja2jucGY"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "generate model as joblib Object"
+      ],
+      "metadata": {
+        "id": "LaTzuIhrviJf"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import joblib"
+      ],
+      "metadata": {
+        "id": "wL73UzIj1dDk"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "joblib.dump(model, \"./random_forest.joblib\")    #save the model"
+      ],
+      "metadata": {
+        "id": "ohKdeEYr1aub"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "loaded_rf = joblib.load(\"./random_forest.joblib\")    #load the model"
+      ],
+      "metadata": {
+        "id": "npsMsHmf1per"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "loaded_rf.predict(actx_df)              #predict pIC50"
+      ],
+      "metadata": {
+        "id": "OumE5xsW1rTs"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import seaborn as sns\n",
+        "import matplotlib.pyplot as plt\n",
+        "\n",
+        "sns.set(color_codes=True)\n",
+        "sns.set_style(\"white\")\n",
+        "\n",
+        "ax = sns.regplot(Y_test, Y_pred, scatter_kws={'alpha':0.4, 'color': 'black'}, line_kws={'color': 'red'})\n",
+        "ax.set_xlabel('Experimental pIC50', fontsize='large', fontweight='bold', color='red')\n",
+        "ax.set_ylabel('Predicted pIC50', fontsize='large', fontweight='bold', color='red')\n",
+        "ax.set_xlim(0, 12)\n",
+        "ax.set_ylim(0, 12)\n",
+        "ax.figure.set_size_inches(10, 10)\n",
+        "plt.show"
+      ],
+      "metadata": {
+        "id": "hbhEE08H2Mi5"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "from sklearn.model_selection import cross_val_score\n",
+        "scores = cross_val_score(model, X, Y, cv=12)\n",
+        "print(scores)"
+      ],
+      "metadata": {
+        "id": "TKy6YOTc2xPa",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "outputId": "4765eacc-a829-44bf-b530-23014778e101"
+      },
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "[0.5503876  0.78294574 0.75193798 0.75968992 0.82945736 0.78294574\n",
+            " 0.7751938  0.69767442 0.68217054 0.734375   0.671875   0.6015625 ]\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "r2_training = model.score(X_train, Y_train)\n",
+        "print(r2_training)\n",
+        "r2_test = model.score(X_test, Y_test)\n",
+        "print(r2_test)"
+      ],
+      "metadata": {
+        "id": "EhpSFlju2gxj",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "outputId": "2e65dd2d-7ea7-4bda-81d9-29d7462c5992"
+      },
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "0.9472710453283997\n",
+            "0.7198275862068966\n"
+          ]
+        }
+      ]
+    }
+  ]
+}
\ No newline at end of file