[f6a06b]: / model.ipynb

Download this file

479 lines (478 with data), 12.4 kB

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df=pd.read_csv('diabetes.csv')\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#lets describe the data\n",
    "df.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#infromation of dataset\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#any null values \n",
    "#not neccessary in above information we can see\n",
    "df.isnull().values.any()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#histogram\n",
    "df.hist(bins=10,figsize=(10,10))\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#correlation\n",
    "\n",
    "sns.heatmap(df.corr())\n",
    "# we can see skin thickness,insulin,pregnencies and age are full independent to each other\n",
    "#age and pregencies has negative correlation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#lets count total outcome in each target 0 1\n",
    "#0 means no diabeted\n",
    "#1 means patient with diabtes\n",
    "sns.countplot(y=df['Outcome'],palette='Set1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.set(style=\"ticks\")\n",
    "sns.pairplot(df, hue=\"Outcome\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#box plot for outlier visualization\n",
    "sns.set(style=\"whitegrid\")\n",
    "df.boxplot(figsize=(15,6))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#box plot\n",
    "sns.set(style=\"whitegrid\")\n",
    "\n",
    "sns.set(rc={'figure.figsize':(4,2)})\n",
    "sns.boxplot(x=df['Insulin'])\n",
    "plt.show()\n",
    "sns.boxplot(x=df['BloodPressure'])\n",
    "plt.show()\n",
    "sns.boxplot(x=df['DiabetesPedigreeFunction'])\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#outlier remove\n",
    "\n",
    "Q1=df.quantile(0.25)\n",
    "Q3=df.quantile(0.75)\n",
    "IQR=Q3-Q1\n",
    "\n",
    "print(\"---Q1--- \\n\",Q1)\n",
    "print(\"\\n---Q3--- \\n\",Q3)\n",
    "print(\"\\n---IQR---\\n\",IQR)\n",
    "\n",
    "#print((df < (Q1 - 1.5 * IQR))|(df > (Q3 + 1.5 * IQR)))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#outlier remove\n",
    "df_out = df[~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)]\n",
    "df.shape,df_out.shape\n",
    "#more than 80 records deleted"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Scatter matrix after removing outlier\n",
    "sns.set(style=\"ticks\")\n",
    "sns.pairplot(df_out, hue=\"Outcome\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#lets extract features and targets\n",
    "X=df_out.drop(columns=['Outcome'])\n",
    "y=df_out['Outcome']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Splitting train test data 80 20 ratio\n",
    "from sklearn.model_selection import train_test_split\n",
    "train_X,test_X,train_y,test_y=train_test_split(X,y,test_size=0.2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_X.shape,test_X.shape,train_y.shape,test_y.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import confusion_matrix,accuracy_score,make_scorer\n",
    "from sklearn.model_selection import cross_validate\n",
    "\n",
    "def tn(y_true, y_pred): return confusion_matrix(y_true, y_pred)[0, 0]\n",
    "def fp(y_true, y_pred): return confusion_matrix(y_true, y_pred)[0, 1]\n",
    "def fn(y_true, y_pred): return confusion_matrix(y_true, y_pred)[1, 0]\n",
    "def tp(y_true, y_pred): return confusion_matrix(y_true, y_pred)[1, 1]\n",
    "\n",
    "#cross validation purpose\n",
    "scoring = {'accuracy': make_scorer(accuracy_score),'prec': 'precision'}\n",
    "scoring = {'tp': make_scorer(tp), 'tn': make_scorer(tn),\n",
    "           'fp': make_scorer(fp), 'fn': make_scorer(fn)}\n",
    "\n",
    "def display_result(result):\n",
    "    print(\"TP: \",result['test_tp'])\n",
    "    print(\"TN: \",result['test_tn'])\n",
    "    print(\"FN: \",result['test_fn'])\n",
    "    print(\"FP: \",result['test_fp'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Lets build the model\n",
    "\n",
    "#Logistic Regression\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import roc_auc_score\n",
    "\n",
    "acc=[]\n",
    "roc=[]\n",
    "\n",
    "clf=LogisticRegression()\n",
    "clf.fit(train_X,train_y)\n",
    "y_pred=clf.predict(test_X)\n",
    "#find accuracy\n",
    "ac=accuracy_score(test_y,y_pred)\n",
    "acc.append(ac)\n",
    "\n",
    "#find the ROC_AOC curve\n",
    "rc=roc_auc_score(test_y,y_pred)\n",
    "roc.append(rc)\n",
    "print(\"\\nAccuracy {0} ROC {1}\".format(ac,rc))\n",
    "\n",
    "#cross val score\n",
    "result=cross_validate(clf,train_X,train_y,scoring=scoring,cv=10)\n",
    "display_result(result)\n",
    "\n",
    "#display predicted values uncomment below line\n",
    "#pd.DataFrame(data={'Actual':test_y,'Predicted':y_pred}).head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Support Vector Machine\n",
    "from sklearn.svm import SVC\n",
    "\n",
    "clf=SVC(kernel='linear')\n",
    "clf.fit(train_X,train_y)\n",
    "y_pred=clf.predict(test_X)\n",
    "#find accuracy\n",
    "ac=accuracy_score(test_y,y_pred)\n",
    "acc.append(ac)\n",
    "\n",
    "#find the ROC_AOC curve\n",
    "rc=roc_auc_score(test_y,y_pred)\n",
    "roc.append(rc)\n",
    "print(\"\\nAccuracy {0} ROC {1}\".format(ac,rc))\n",
    "\n",
    "#cross val score\n",
    "result=cross_validate(clf,train_X,train_y,scoring=scoring,cv=10)\n",
    "display_result(result)\n",
    "\n",
    "#display predicted values uncomment below line\n",
    "#pd.DataFrame(data={'Actual':test_y,'Predicted':y_pred}).head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#KNN\n",
    "\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "\n",
    "clf=KNeighborsClassifier(n_neighbors=3)\n",
    "clf.fit(train_X,train_y)\n",
    "y_pred=clf.predict(test_X)\n",
    "#find accuracy\n",
    "ac=accuracy_score(test_y,y_pred)\n",
    "acc.append(ac)\n",
    "\n",
    "#find the ROC_AOC curve\n",
    "rc=roc_auc_score(test_y,y_pred)\n",
    "roc.append(rc)\n",
    "print(\"\\nAccuracy {0} ROC {1}\".format(ac,rc))\n",
    "\n",
    "#cross val score\n",
    "result=cross_validate(clf,train_X,train_y,scoring=scoring,cv=10)\n",
    "display_result(result)\n",
    "\n",
    "#display predicted values uncomment below line\n",
    "#pd.DataFrame(data={'Actual':test_y,'Predicted':y_pred}).head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Random forest\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "clf=RandomForestClassifier()\n",
    "clf.fit(train_X,train_y)\n",
    "\n",
    "y_pred=clf.predict(test_X)\n",
    "#find accuracy\n",
    "ac=accuracy_score(test_y,y_pred)\n",
    "acc.append(ac)\n",
    "\n",
    "#find the ROC_AOC curve\n",
    "rc=roc_auc_score(test_y,y_pred)\n",
    "roc.append(rc)\n",
    "print(\"\\nAccuracy {0} ROC {1}\".format(ac,rc))\n",
    "\n",
    "#cross val score\n",
    "result=cross_validate(clf,train_X,train_y,scoring=scoring,cv=10)\n",
    "display_result(result)\n",
    "\n",
    "#display predicted values uncomment below line\n",
    "#pd.DataFrame(data={'Actual':test_y,'Predicted':y_pred}).head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Naive Bayes Theorem\n",
    "#import library\n",
    "from sklearn.naive_bayes import GaussianNB\n",
    "\n",
    "clf=GaussianNB()\n",
    "clf.fit(train_X,train_y)\n",
    "y_pred=clf.predict(test_X)\n",
    "#find accuracy\n",
    "ac=accuracy_score(test_y,y_pred)\n",
    "acc.append(ac)\n",
    "\n",
    "#find the ROC_AOC curve\n",
    "rc=roc_auc_score(test_y,y_pred)\n",
    "roc.append(rc)\n",
    "print(\"\\nAccuracy {0} ROC {1}\".format(ac,rc))\n",
    "\n",
    "#cross val score\n",
    "result=cross_validate(clf,train_X,train_y,scoring=scoring,cv=10)\n",
    "display_result(result)\n",
    "\n",
    "#display predicted values uncomment below line\n",
    "#pd.DataFrame(data={'Actual':test_y,'Predicted':y_pred}).head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Gradient Boosting Classifier\n",
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "clf=GradientBoostingClassifier(n_estimators=50,learning_rate=0.2)\n",
    "clf.fit(train_X,train_y)\n",
    "y_pred=clf.predict(test_X)\n",
    "#find accuracy\n",
    "ac=accuracy_score(test_y,y_pred)\n",
    "acc.append(ac)\n",
    "\n",
    "#find the ROC_AOC curve\n",
    "rc=roc_auc_score(test_y,y_pred)\n",
    "roc.append(rc)\n",
    "print(\"\\nAccuracy {0} ROC {1}\".format(ac,rc))\n",
    "\n",
    "#cross val score\n",
    "result=cross_validate(clf,train_X,train_y,scoring=scoring,cv=10)\n",
    "display_result(result)\n",
    "\n",
    "#display predicted values uncomment below line\n",
    "#pd.DataFrame(data={'Actual':test_y,'Predicted':y_pred}).head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#lets plot the bar graph\n",
    "\n",
    "ax=plt.figure(figsize=(9,4))\n",
    "plt.bar(['Logistic Regression','SVM','KNN','Random Forest','Naivye Bayes','Gradient Boosting'],acc,label='Accuracy')\n",
    "plt.ylabel('Accuracy Score')\n",
    "plt.xlabel('Algortihms')\n",
    "plt.show()\n",
    "\n",
    "ax=plt.figure(figsize=(9,4))\n",
    "plt.bar(['Logistic Regression','SVM','KNN','Random Forest','Naivye Bayes','Gradient Boosting'],roc,label='ROC AUC')\n",
    "plt.ylabel('ROC AUC')\n",
    "plt.xlabel('Algortihms')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Great....\n",
    "#Random forest has highest accuracy 98% and ROC_AUC curve 97%\n",
    "#model can be improve more if we take same count of labels\n",
    "#in our model 30% is diabetic and 70% no diabetic patient\n",
    "\n",
    "#model can be improve with fine tunning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}