Diabetes-Risk / Git / Diff of /model.ipynb

Models:
RaymondKing/
Diabetes-Risk
Downloads: 1
Diff of /model.ipynb [000000] .. [f6a06b]
Switch to side-by-side view

--- a
+++ b/model.ipynb
@@ -0,0 +1,478 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import seaborn as sns\n",
+    "import matplotlib.pyplot as plt\n",
+    "%matplotlib inline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read diabetes.csv from the working directory into a DataFrame.\n",
+    "df = pd.read_csv('diabetes.csv')\n",
+    "# Preview the first rows.\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#lets describe the data\n",
+    "df.describe()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#infromation of dataset\n",
+    "df.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#any null values \n",
+    "#not neccessary in above information we can see\n",
+    "df.isnull().values.any()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Histogram of every column: 10 bins each, on a 10 x 10 figure\n",
+    "df.hist(figsize=(10, 10), bins=10)\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#correlation\n",
+    "\n",
+    "sns.heatmap(df.corr())\n",
+    "# we can see skin thickness,insulin,pregnencies and age are full independent to each other\n",
+    "#age and pregencies has negative correlation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#lets count total outcome in each target 0 1\n",
+    "#0 means no diabeted\n",
+    "#1 means patient with diabtes\n",
+    "sns.countplot(y=df['Outcome'],palette='Set1')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Pairwise scatter matrix of all columns, coloured by Outcome\n",
+    "sns.set(style=\"ticks\")\n",
+    "sns.pairplot(data=df, hue=\"Outcome\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#box plot for outlier visualization\n",
+    "sns.set(style=\"whitegrid\")\n",
+    "df.boxplot(figsize=(15,6))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Box plots of three individual columns.\n",
+    "# Style and figure size are set in ONE call: every sns.set() call re-applies its\n",
+    "# default style, so a separate sns.set(rc=...) would discard the whitegrid style.\n",
+    "sns.set(style=\"whitegrid\", rc={'figure.figsize': (4, 2)})\n",
+    "\n",
+    "# One plot per column; plt.show() inside the loop renders each as its own figure.\n",
+    "for column in ['Insulin', 'BloodPressure', 'DiabetesPedigreeFunction']:\n",
+    "    sns.boxplot(x=df[column])\n",
+    "    plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#outlier remove\n",
+    "\n",
+    "# Per-column quartiles and inter-quartile range, the inputs of the outlier filter\n",
+    "\n",
+    "Q1, Q3 = df.quantile(0.25), df.quantile(0.75)\n",
+    "IQR = Q3 - Q1\n",
+    "\n",
+    "print(\"---Q1--- \\n\", Q1)\n",
+    "print(\"\\n---Q3--- \\n\", Q3)\n",
+    "print(\"\\n---IQR---\\n\", IQR)\n",
+    "\n",
+    "# Debug aid (disabled): boolean mask of the values outside the 1.5 * IQR fences\n",
+    "#print((df < (Q1 - 1.5 * IQR))|(df > (Q3 + 1.5 * IQR)))\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#outlier remove\n",
+    "# Outlier removal: drop every row with at least one FEATURE value outside the\n",
+    "# 1.5 * IQR fences built from Q1 / Q3 / IQR.\n",
+    "# The Outcome label is left out of the test on purpose: it is the target, not a\n",
+    "# measurement, and if its IQR were 0 the fences would delete a whole class.\n",
+    "feature_cols = df.columns.drop('Outcome')\n",
+    "lower_fence = (Q1 - 1.5 * IQR)[feature_cols]\n",
+    "upper_fence = (Q3 + 1.5 * IQR)[feature_cols]\n",
+    "is_outlier = ((df[feature_cols] < lower_fence) | (df[feature_cols] > upper_fence)).any(axis=1)\n",
+    "df_out = df[~is_outlier]\n",
+    "df.shape, df_out.shape\n",
+    "# Author's note from the original run: more than 80 records deleted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Scatter matrix after removing the outliers, coloured by Outcome\n",
+    "sns.set(style=\"ticks\")\n",
+    "sns.pairplot(data=df_out, hue=\"Outcome\")\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Separate the cleaned frame into the target (Outcome) and the feature columns\n",
+    "y = df_out['Outcome']\n",
+    "X = df_out.drop('Outcome', axis='columns')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Split into train and test sets with an 80/20 ratio.\n",
+    "# random_state pins the shuffle so Restart & Run All reproduces the same split (and\n",
+    "# therefore the same scores); stratify=y keeps the Outcome class ratio equal in both parts.\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "train_X, test_X, train_y, test_y = train_test_split(\n",
+    "    X, y, test_size=0.2, random_state=42, stratify=y)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_X.shape,test_X.shape,train_y.shape,test_y.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.metrics import confusion_matrix,accuracy_score,make_scorer\n",
+    "from sklearn.model_selection import cross_validate\n",
+    "\n",
+    "# Confusion-matrix cell extractors, wrapped below as cross-validation scorers.\n",
+    "# labels=[0, 1] forces a 2x2 matrix even when a fold (and its predictions) contains a\n",
+    "# single class; without it the matrix can shrink to 1x1 and the indexing raises IndexError.\n",
+    "# Matrix layout: rows are true labels, columns are predicted labels.\n",
+    "def tn(y_true, y_pred): return confusion_matrix(y_true, y_pred, labels=[0, 1])[0, 0]\n",
+    "def fp(y_true, y_pred): return confusion_matrix(y_true, y_pred, labels=[0, 1])[0, 1]\n",
+    "def fn(y_true, y_pred): return confusion_matrix(y_true, y_pred, labels=[0, 1])[1, 0]\n",
+    "def tp(y_true, y_pred): return confusion_matrix(y_true, y_pred, labels=[0, 1])[1, 1]\n",
+    "\n",
+    "# Scorers passed to cross_validate(); display_result() reads them back as 'test_<name>'.\n",
+    "# (A previous accuracy/precision dict was overwritten before any use and has been removed.)\n",
+    "scoring = {'tp': make_scorer(tp), 'tn': make_scorer(tn),\n",
+    "           'fp': make_scorer(fp), 'fn': make_scorer(fn)}\n",
+    "\n",
+    "def display_result(result):\n",
+    "    \"\"\"Print the TP / TN / FN / FP entries of a cross_validate() result dict.\"\"\"\n",
+    "    print(\"TP: \",result['test_tp'])\n",
+    "    print(\"TN: \",result['test_tn'])\n",
+    "    print(\"FN: \",result['test_fn'])\n",
+    "    print(\"FP: \",result['test_fp'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Lets build the model\n",
+    "\n",
+    "#Logistic Regression\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.metrics import roc_auc_score\n",
+    "\n",
+    "# Per-model test scores; each model cell appends to these and the final cell plots them\n",
+    "acc=[]\n",
+    "roc=[]\n",
+    "\n",
+    "clf=LogisticRegression()\n",
+    "clf.fit(train_X,train_y)\n",
+    "y_pred=clf.predict(test_X)\n",
+    "#find accuracy\n",
+    "ac=accuracy_score(test_y,y_pred)\n",
+    "acc.append(ac)\n",
+    "\n",
+    "# ROC AUC measures ranking quality, so it needs a continuous score (the predicted\n",
+    "# probability of class 1) rather than hard 0/1 labels, which collapse the curve to one point\n",
+    "y_score=clf.predict_proba(test_X)[:,1]\n",
+    "rc=roc_auc_score(test_y,y_score)\n",
+    "roc.append(rc)\n",
+    "print(\"\\nAccuracy {0} ROC {1}\".format(ac,rc))\n",
+    "\n",
+    "#cross val score\n",
+    "result=cross_validate(clf,train_X,train_y,scoring=scoring,cv=10)\n",
+    "display_result(result)\n",
+    "\n",
+    "#display predicted values uncomment below line\n",
+    "#pd.DataFrame(data={'Actual':test_y,'Predicted':y_pred}).head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Support Vector Machine\n",
+    "from sklearn.svm import SVC\n",
+    "\n",
+    "clf=SVC(kernel='linear')\n",
+    "clf.fit(train_X,train_y)\n",
+    "y_pred=clf.predict(test_X)\n",
+    "#find accuracy\n",
+    "ac=accuracy_score(test_y,y_pred)\n",
+    "acc.append(ac)\n",
+    "\n",
+    "# ROC AUC needs a continuous score, not hard 0/1 labels. SVC exposes no predict_proba\n",
+    "# unless built with probability=True, so the decision_function() values are used instead\n",
+    "y_score=clf.decision_function(test_X)\n",
+    "rc=roc_auc_score(test_y,y_score)\n",
+    "roc.append(rc)\n",
+    "print(\"\\nAccuracy {0} ROC {1}\".format(ac,rc))\n",
+    "\n",
+    "#cross val score\n",
+    "result=cross_validate(clf,train_X,train_y,scoring=scoring,cv=10)\n",
+    "display_result(result)\n",
+    "\n",
+    "#display predicted values uncomment below line\n",
+    "#pd.DataFrame(data={'Actual':test_y,'Predicted':y_pred}).head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#KNN\n",
+    "\n",
+    "from sklearn.neighbors import KNeighborsClassifier\n",
+    "\n",
+    "clf=KNeighborsClassifier(n_neighbors=3)\n",
+    "clf.fit(train_X,train_y)\n",
+    "y_pred=clf.predict(test_X)\n",
+    "#find accuracy\n",
+    "ac=accuracy_score(test_y,y_pred)\n",
+    "acc.append(ac)\n",
+    "\n",
+    "# ROC AUC needs a continuous score (the predicted probability of class 1),\n",
+    "# not hard 0/1 labels, which collapse the curve to a single point\n",
+    "y_score=clf.predict_proba(test_X)[:,1]\n",
+    "rc=roc_auc_score(test_y,y_score)\n",
+    "roc.append(rc)\n",
+    "print(\"\\nAccuracy {0} ROC {1}\".format(ac,rc))\n",
+    "\n",
+    "#cross val score\n",
+    "result=cross_validate(clf,train_X,train_y,scoring=scoring,cv=10)\n",
+    "display_result(result)\n",
+    "\n",
+    "#display predicted values uncomment below line\n",
+    "#pd.DataFrame(data={'Actual':test_y,'Predicted':y_pred}).head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Random forest\n",
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "\n",
+    "# random_state pins the forest's randomness so a re-run gives the same model and scores\n",
+    "clf=RandomForestClassifier(random_state=42)\n",
+    "clf.fit(train_X,train_y)\n",
+    "\n",
+    "y_pred=clf.predict(test_X)\n",
+    "#find accuracy\n",
+    "ac=accuracy_score(test_y,y_pred)\n",
+    "acc.append(ac)\n",
+    "\n",
+    "# ROC AUC needs a continuous score (the predicted probability of class 1),\n",
+    "# not hard 0/1 labels, which collapse the curve to a single point\n",
+    "y_score=clf.predict_proba(test_X)[:,1]\n",
+    "rc=roc_auc_score(test_y,y_score)\n",
+    "roc.append(rc)\n",
+    "print(\"\\nAccuracy {0} ROC {1}\".format(ac,rc))\n",
+    "\n",
+    "#cross val score\n",
+    "result=cross_validate(clf,train_X,train_y,scoring=scoring,cv=10)\n",
+    "display_result(result)\n",
+    "\n",
+    "#display predicted values uncomment below line\n",
+    "#pd.DataFrame(data={'Actual':test_y,'Predicted':y_pred}).head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Naive Bayes\n",
+    "#import library\n",
+    "from sklearn.naive_bayes import GaussianNB\n",
+    "\n",
+    "clf=GaussianNB()\n",
+    "clf.fit(train_X,train_y)\n",
+    "y_pred=clf.predict(test_X)\n",
+    "#find accuracy\n",
+    "ac=accuracy_score(test_y,y_pred)\n",
+    "acc.append(ac)\n",
+    "\n",
+    "# ROC AUC needs a continuous score (the predicted probability of class 1),\n",
+    "# not hard 0/1 labels, which collapse the curve to a single point\n",
+    "y_score=clf.predict_proba(test_X)[:,1]\n",
+    "rc=roc_auc_score(test_y,y_score)\n",
+    "roc.append(rc)\n",
+    "print(\"\\nAccuracy {0} ROC {1}\".format(ac,rc))\n",
+    "\n",
+    "#cross val score\n",
+    "result=cross_validate(clf,train_X,train_y,scoring=scoring,cv=10)\n",
+    "display_result(result)\n",
+    "\n",
+    "#display predicted values uncomment below line\n",
+    "#pd.DataFrame(data={'Actual':test_y,'Predicted':y_pred}).head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Gradient Boosting Classifier\n",
+    "from sklearn.ensemble import GradientBoostingClassifier\n",
+    "# random_state pins the booster's randomness so a re-run gives the same model and scores\n",
+    "clf=GradientBoostingClassifier(n_estimators=50,learning_rate=0.2,random_state=42)\n",
+    "clf.fit(train_X,train_y)\n",
+    "y_pred=clf.predict(test_X)\n",
+    "#find accuracy\n",
+    "ac=accuracy_score(test_y,y_pred)\n",
+    "acc.append(ac)\n",
+    "\n",
+    "# ROC AUC needs a continuous score (the predicted probability of class 1),\n",
+    "# not hard 0/1 labels, which collapse the curve to a single point\n",
+    "y_score=clf.predict_proba(test_X)[:,1]\n",
+    "rc=roc_auc_score(test_y,y_score)\n",
+    "roc.append(rc)\n",
+    "print(\"\\nAccuracy {0} ROC {1}\".format(ac,rc))\n",
+    "\n",
+    "#cross val score\n",
+    "result=cross_validate(clf,train_X,train_y,scoring=scoring,cv=10)\n",
+    "display_result(result)\n",
+    "\n",
+    "#display predicted values uncomment below line\n",
+    "#pd.DataFrame(data={'Actual':test_y,'Predicted':y_pred}).head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Bar charts comparing the models on test accuracy and ROC AUC\n",
+    "\n",
+    "# One shared label list, in the order the model cells appended their scores to acc / roc.\n",
+    "# plt.bar() needs exactly one score per label, so run each model cell once after the cell that resets acc / roc.\n",
+    "model_names=['Logistic Regression','SVM','KNN','Random Forest','Naive Bayes','Gradient Boosting']\n",
+    "\n",
+    "fig=plt.figure(figsize=(9,4))\n",
+    "plt.bar(model_names,acc,label='Accuracy')\n",
+    "plt.ylabel('Accuracy Score')\n",
+    "plt.xlabel('Algorithms')\n",
+    "plt.show()\n",
+    "\n",
+    "fig=plt.figure(figsize=(9,4))\n",
+    "plt.bar(model_names,roc,label='ROC AUC')\n",
+    "plt.ylabel('ROC AUC')\n",
+    "plt.xlabel('Algorithms')\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Great....\n",
+    "#Random forest has highest accuracy 98% and ROC_AUC curve 97%\n",
+    "#model can be improve more if we take same count of labels\n",
+    "#in our model 30% is diabetic and 70% no diabetic patient\n",
+    "\n",
+    "#model can be improve with fine tunning"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}