--- a +++ b/model.ipynb @@ -0,0 +1,478 @@ +{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import seaborn as sns\n",
+ "import matplotlib.pyplot as plt\n",
+ "%matplotlib inline"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = pd.read_csv('diabetes.csv')\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Summary statistics for every column\n",
+ "df.describe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Column dtypes and non-null counts\n",
+ "df.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Is any value in the frame missing?\n",
+ "# Only a cross-check: df.info() above already lists the non-null count per column.\n",
+ "df.isna().values.any()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Histogram of every column\n",
+ "df.hist(figsize=(10, 10), bins=10)\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Correlation heatmap of all columns\n",
+ "sns.heatmap(data=df.corr())\n",
+ "# Author's reading of the plot: SkinThickness, Insulin, Pregnancies and Age look\n",
+ "# mutually uncorrelated, while Age and Pregnancies look negatively correlated.\n",
+ "# NOTE(review): not verifiable from an un-annotated heatmap - confirm against the df.corr() values."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Number of records in each target class\n",
+ "#   0 = no diabetes\n",
+ "#   1 = patient with diabetes\n",
+ "sns.countplot(y=df.Outcome, palette='Set1')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sns.set(style='ticks')\n",
+ "sns.pairplot(data=df, hue='Outcome')"
+ ]
+ },
+ {
+ "cell_type":
"code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Box plot of every column, to visualize outliers\n",
+ "sns.set(style='whitegrid')\n",
+ "df.boxplot(figsize=(15, 6))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Box plots of three individual features.\n",
+ "# Style and figure size are set in ONE call: a second sns.set(rc=...) would reset the\n",
+ "# style to seaborn's default and silently discard 'whitegrid'.\n",
+ "sns.set(style='whitegrid', rc={'figure.figsize': (4, 2)})\n",
+ "sns.boxplot(x=df['Insulin'])\n",
+ "plt.show()\n",
+ "sns.boxplot(x=df['BloodPressure'])\n",
+ "plt.show()\n",
+ "sns.boxplot(x=df['DiabetesPedigreeFunction'])\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Outlier detection: per-feature quartiles and inter-quartile range.\n",
+ "# The label column is excluded: it is not a measurement, and its IQR is 0 whenever the\n",
+ "# minority class is under 25% of the rows, which would flag every record of that class.\n",
+ "features = df.drop(columns=['Outcome'])\n",
+ "Q1 = features.quantile(0.25)\n",
+ "Q3 = features.quantile(0.75)\n",
+ "IQR = Q3 - Q1\n",
+ "\n",
+ "print(\"---Q1--- \\n\", Q1)\n",
+ "print(\"\\n---Q3--- \\n\", Q3)\n",
+ "print(\"\\n---IQR---\\n\", IQR)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Drop every row with at least one feature outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]\n",
+ "is_outlier = (features < (Q1 - 1.5 * IQR)) | (features > (Q3 + 1.5 * IQR))\n",
+ "df_out = df[~is_outlier.any(axis=1)]\n",
+ "df.shape, df_out.shape  # author's note: more than 80 records removed (NOTE(review): confirm on re-run)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Scatter matrix after removing outliers\n",
+ "sns.set(style='ticks')\n",
+ "sns.pairplot(df_out, hue='Outcome')\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Separate the cleaned frame into features (X) and target (y)\n",
+ "X = df_out.drop(columns=['Outcome'])\n",
+ "y = df_out['Outcome']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Split into train and test sets (80/20)\n",
+ "from sklearn.model_selection import train_test_split\n", +
"# random_state makes the split reproducible; stratify keeps the class ratio equal in both parts\n",
+ "train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_X.shape, test_X.shape, train_y.shape, test_y.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.metrics import confusion_matrix, accuracy_score, make_scorer\n",
+ "from sklearn.model_selection import cross_validate\n",
+ "\n",
+ "# One scorer per confusion-matrix cell. labels=[0, 1] pins the matrix to 2x2 even when a\n",
+ "# fold contains a single class; without it the matrix shrinks and the index below raises.\n",
+ "def tn(y_true, y_pred): return confusion_matrix(y_true, y_pred, labels=[0, 1])[0, 0]\n",
+ "def fp(y_true, y_pred): return confusion_matrix(y_true, y_pred, labels=[0, 1])[0, 1]\n",
+ "def fn(y_true, y_pred): return confusion_matrix(y_true, y_pred, labels=[0, 1])[1, 0]\n",
+ "def tp(y_true, y_pred): return confusion_matrix(y_true, y_pred, labels=[0, 1])[1, 1]\n",
+ "\n",
+ "# Scorers handed to cross_validate; each result is stored under the key 'test_<name>'\n",
+ "scoring = {'tp': make_scorer(tp), 'tn': make_scorer(tn),\n",
+ "           'fp': make_scorer(fp), 'fn': make_scorer(fn)}\n",
+ "\n",
+ "def display_result(result):\n",
+ "    \"\"\"Print the per-fold TP, TN, FN and FP counts held in a cross_validate result.\"\"\"\n",
+ "    print(\"TP: \", result['test_tp'])\n",
+ "    print(\"TN: \", result['test_tn'])\n",
+ "    print(\"FN: \", result['test_fn'])\n",
+ "    print(\"FP: \", result['test_fp'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Build the models, starting with Logistic Regression\n",
+ "from sklearn.linear_model import LogisticRegression\n",
+ "from sklearn.metrics import roc_auc_score\n",
+ "\n",
+ "# Test-set scores, one entry per model in training order (plotted in the final bar charts)\n",
+ "acc = []\n",
+ "roc = []\n",
+ "clf = LogisticRegression()\n",
+ "clf.fit(train_X, train_y)\n",
+ "y_pred = clf.predict(test_X)\n",
+ "ac = accuracy_score(test_y, y_pred)\n",
+ "acc.append(ac)\n",
+ "\n",
+ "# ROC AUC is computed from the predicted probability of class 1; scoring hard 0/1 labels would reduce the curve to a single threshold\n",
+ "rc = roc_auc_score(test_y, clf.predict_proba(test_X)[:, 1])\n",
+ "roc.append(rc)\n",
+ "print(\"\\nAccuracy {0} ROC {1}\".format(ac, rc))\n",
+ "\n",
+ "# 10-fold cross-validation on the training split\n", +
"result = cross_validate(clf, train_X, train_y, scoring=scoring, cv=10)\n",
+ "display_result(result)\n",
+ "\n",
+ "# To inspect individual predictions, uncomment the line below\n",
+ "#pd.DataFrame(data={'Actual':test_y,'Predicted':y_pred}).head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Support Vector Machine (linear kernel)\n",
+ "from sklearn.svm import SVC\n",
+ "\n",
+ "clf = SVC(kernel='linear')\n",
+ "clf.fit(train_X, train_y)\n",
+ "y_pred = clf.predict(test_X)\n",
+ "# Test-set accuracy\n",
+ "ac = accuracy_score(test_y, y_pred)\n",
+ "acc.append(ac)\n",
+ "\n",
+ "# ROC AUC from the continuous decision_function score; hard 0/1 labels would reduce the curve to a single threshold\n",
+ "rc = roc_auc_score(test_y, clf.decision_function(test_X))\n",
+ "roc.append(rc)\n",
+ "print(\"\\nAccuracy {0} ROC {1}\".format(ac, rc))\n",
+ "\n",
+ "# 10-fold cross-validation on the training split\n",
+ "result = cross_validate(clf, train_X, train_y, scoring=scoring, cv=10)\n",
+ "display_result(result)\n",
+ "\n",
+ "# To inspect individual predictions, uncomment the line below\n",
+ "#pd.DataFrame(data={'Actual':test_y,'Predicted':y_pred}).head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# K-Nearest Neighbours (k=3)\n",
+ "\n",
+ "from sklearn.neighbors import KNeighborsClassifier\n",
+ "\n",
+ "clf = KNeighborsClassifier(n_neighbors=3)\n",
+ "clf.fit(train_X, train_y)\n",
+ "y_pred = clf.predict(test_X)\n",
+ "# Test-set accuracy\n",
+ "ac = accuracy_score(test_y, y_pred)\n",
+ "acc.append(ac)\n",
+ "\n",
+ "# ROC AUC from the predicted probability of class 1, not from hard 0/1 labels\n",
+ "rc = roc_auc_score(test_y, clf.predict_proba(test_X)[:, 1])\n",
+ "roc.append(rc)\n",
+ "print(\"\\nAccuracy {0} ROC {1}\".format(ac, rc))\n",
+ "\n",
+ "# 10-fold cross-validation on the training split\n",
+ "result = cross_validate(clf, train_X, train_y, scoring=scoring, cv=10)\n",
+ "display_result(result)\n",
+ "\n",
+ "# To inspect individual predictions, uncomment the line below\n",
+ "#pd.DataFrame(data={'Actual':test_y,'Predicted':y_pred}).head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Random Forest\n",
+ "from sklearn.ensemble import RandomForestClassifier\n",
+ "\n",
+ "clf = RandomForestClassifier(random_state=42)  # seeded so the forest is reproducible\n",
+ "clf.fit(train_X, train_y)\n",
+ "\n",
+ "y_pred = clf.predict(test_X)\n",
+ "# Test-set accuracy\n",
+ "ac = accuracy_score(test_y, y_pred)\n",
+ "acc.append(ac)\n",
+ "\n",
+ "# ROC AUC from the predicted probability of class 1, not from hard 0/1 labels\n",
+ "rc = roc_auc_score(test_y, clf.predict_proba(test_X)[:, 1])\n",
+ "roc.append(rc)\n",
+ "print(\"\\nAccuracy {0} ROC {1}\".format(ac, rc))\n",
+ "\n",
+ "# 10-fold cross-validation on the training split\n",
+ "result = cross_validate(clf, train_X, train_y, scoring=scoring, cv=10)\n",
+ "display_result(result)\n",
+ "\n",
+ "# To inspect individual predictions, uncomment the line below\n",
+ "#pd.DataFrame(data={'Actual':test_y,'Predicted':y_pred}).head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Gaussian Naive Bayes\n",
+ "# import the estimator\n",
+ "from sklearn.naive_bayes import GaussianNB\n",
+ "\n",
+ "clf = GaussianNB()\n",
+ "clf.fit(train_X, train_y)\n",
+ "y_pred = clf.predict(test_X)\n",
+ "# Test-set accuracy\n",
+ "ac = accuracy_score(test_y, y_pred)\n",
+ "acc.append(ac)\n",
+ "\n",
+ "# ROC AUC from the predicted probability of class 1, not from hard 0/1 labels\n",
+ "rc = roc_auc_score(test_y, clf.predict_proba(test_X)[:, 1])\n",
+ "roc.append(rc)\n",
+ "print(\"\\nAccuracy {0} ROC {1}\".format(ac, rc))\n",
+ "\n",
+ "# 10-fold cross-validation on the training split\n",
+ "result = cross_validate(clf, train_X, train_y, scoring=scoring, cv=10)\n",
+ "display_result(result)\n",
+ "\n",
+ "# To inspect individual predictions, uncomment the line below\n",
+ "#pd.DataFrame(data={'Actual':test_y,'Predicted':y_pred}).head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Gradient Boosting Classifier (seeded so the ensemble is reproducible)\n",
+ "from sklearn.ensemble import GradientBoostingClassifier\n",
+ "clf = GradientBoostingClassifier(n_estimators=50, learning_rate=0.2, random_state=42)\n",
+ "clf.fit(train_X, train_y)\n",
+ "y_pred = clf.predict(test_X)\n",
+ "# Test-set accuracy\n",
+ "ac = accuracy_score(test_y, y_pred)\n",
+ "acc.append(ac)\n",
+ "\n",
+ "# ROC AUC from the predicted probability of class 1, not from hard 0/1 labels\n",
+ "rc = roc_auc_score(test_y, clf.predict_proba(test_X)[:, 1])\n",
+ "roc.append(rc)\n",
+ "print(\"\\nAccuracy {0} ROC {1}\".format(ac, rc))\n",
+ "\n",
+ "# 10-fold cross-validation on the training split\n",
+ "result = cross_validate(clf, train_X, train_y, scoring=scoring, cv=10)\n",
+ "display_result(result)\n",
+ "\n",
+ "# To inspect individual predictions, uncomment the line below\n",
+ "#pd.DataFrame(data={'Actual':test_y,'Predicted':y_pred}).head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Bar charts comparing the models on the test split\n",
+ "model_names = ['Logistic Regression', 'SVM', 'KNN', 'Random Forest', 'Naive Bayes', 'Gradient Boosting']\n",
+ "\n",
+ "# Every model cell appends one value to acc and roc, so re-running a cell adds a\n",
+ "# duplicate; fail early with a clear message instead of a shape error inside the bar call.\n",
+ "assert len(acc) == len(roc) == len(model_names), 'acc/roc out of sync: restart the kernel and run all cells in order'\n",
+ "\n",
+ "for scores, score_name in ((acc, 'Accuracy Score'), (roc, 'ROC AUC')):\n",
+ "    fig, ax = plt.subplots(figsize=(9, 4))\n",
+ "    ax.bar(model_names, scores)\n",
+ "    ax.set_ylabel(score_name)\n",
+ "    ax.set_xlabel('Algorithms')\n",
+ "    plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Conclusions (author's notes)\n",
+ "# - Random Forest scored highest: accuracy 98%, ROC AUC 97%.\n",
+ "#   NOTE(review): no outputs are saved in this notebook, so these figures are unconfirmed - re-run and update.\n",
+ "# - The classes are imbalanced (about 30% diabetic vs 70% non-diabetic); balancing them may improve the model.\n",
+ "\n",
+ "# - Hyper-parameter tuning may improve the model further."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}