{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df=pd.read_csv('diabetes.csv')\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#lets describe the data\n",
"df.describe()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#infromation of dataset\n",
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#any null values \n",
"#not neccessary in above information we can see\n",
"df.isnull().values.any()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#histogram\n",
"df.hist(bins=10,figsize=(10,10))\n",
"plt.show()"
]
},
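{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sanity check: several columns show a spike at 0 in the histograms above.\n",
"# For Glucose, BloodPressure, SkinThickness, Insulin and BMI a value of 0 is\n",
"# physiologically implausible, so it most likely stands in for a missing measurement.\n",
"# (Column names assume the standard diabetes.csv schema used elsewhere in this notebook.)\n",
"cols_with_zero = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']\n",
"(df[cols_with_zero] == 0).sum()"
]
},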
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#correlation\n",
"\n",
"sns.heatmap(df.corr())\n",
"# we can see skin thickness,insulin,pregnencies and age are full independent to each other\n",
"#age and pregencies has negative correlation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#lets count total outcome in each target 0 1\n",
"#0 means no diabeted\n",
"#1 means patient with diabtes\n",
"sns.countplot(y=df['Outcome'],palette='Set1')"
]
},
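{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Exact class proportions behind the count plot above (the classes are imbalanced)\n",
"df['Outcome'].value_counts(normalize=True)"
]
},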
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sns.set(style=\"ticks\")\n",
"sns.pairplot(df, hue=\"Outcome\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#box plot for outlier visualization\n",
"sns.set(style=\"whitegrid\")\n",
"df.boxplot(figsize=(15,6))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#box plot\n",
"sns.set(style=\"whitegrid\")\n",
"\n",
"sns.set(rc={'figure.figsize':(4,2)})\n",
"sns.boxplot(x=df['Insulin'])\n",
"plt.show()\n",
"sns.boxplot(x=df['BloodPressure'])\n",
"plt.show()\n",
"sns.boxplot(x=df['DiabetesPedigreeFunction'])\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#outlier remove\n",
"\n",
"Q1=df.quantile(0.25)\n",
"Q3=df.quantile(0.75)\n",
"IQR=Q3-Q1\n",
"\n",
"print(\"---Q1--- \\n\",Q1)\n",
"print(\"\\n---Q3--- \\n\",Q3)\n",
"print(\"\\n---IQR---\\n\",IQR)\n",
"\n",
"#print((df < (Q1 - 1.5 * IQR))|(df > (Q3 + 1.5 * IQR)))\n"
]
},
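{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Rough sketch of what the removal step in the next cell will drop:\n",
"# count the values outside the 1.5*IQR fences in each column\n",
"outlier_mask = (df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))\n",
"outlier_mask.sum()"
]
},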
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#outlier remove\n",
"df_out = df[~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)]\n",
"df.shape,df_out.shape\n",
"#more than 80 records deleted"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Scatter matrix after removing outlier\n",
"sns.set(style=\"ticks\")\n",
"sns.pairplot(df_out, hue=\"Outcome\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#lets extract features and targets\n",
"X=df_out.drop(columns=['Outcome'])\n",
"y=df_out['Outcome']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Splitting train test data 80 20 ratio\n",
"from sklearn.model_selection import train_test_split\n",
"train_X,test_X,train_y,test_y=train_test_split(X,y,test_size=0.2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"train_X.shape,test_X.shape,train_y.shape,test_y.shape"
]
},
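{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sketch: Logistic Regression, SVM and KNN are sensitive to feature scale,\n",
"# so standardising the features is often worth trying. The scaled arrays below are\n",
"# illustrative only; the model cells that follow keep using the unscaled train_X/test_X.\n",
"from sklearn.preprocessing import StandardScaler\n",
"scaler = StandardScaler()\n",
"train_X_scaled = scaler.fit_transform(train_X)\n",
"test_X_scaled = scaler.transform(test_X)\n",
"train_X_scaled.shape, test_X_scaled.shape"
]
},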
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.metrics import confusion_matrix,accuracy_score,make_scorer\n",
"from sklearn.model_selection import cross_validate\n",
"\n",
"def tn(y_true, y_pred): return confusion_matrix(y_true, y_pred)[0, 0]\n",
"def fp(y_true, y_pred): return confusion_matrix(y_true, y_pred)[0, 1]\n",
"def fn(y_true, y_pred): return confusion_matrix(y_true, y_pred)[1, 0]\n",
"def tp(y_true, y_pred): return confusion_matrix(y_true, y_pred)[1, 1]\n",
"\n",
"#cross validation purpose\n",
"scoring = {'accuracy': make_scorer(accuracy_score),'prec': 'precision'}\n",
"scoring = {'tp': make_scorer(tp), 'tn': make_scorer(tn),\n",
" 'fp': make_scorer(fp), 'fn': make_scorer(fn)}\n",
"\n",
"def display_result(result):\n",
" print(\"TP: \",result['test_tp'])\n",
" print(\"TN: \",result['test_tn'])\n",
" print(\"FN: \",result['test_fn'])\n",
" print(\"FP: \",result['test_fp'])"
]
},
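{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Helper sketch (the function name is illustrative): aggregate the per-fold\n",
"# confusion-matrix counts from cross_validate into overall sensitivity (recall)\n",
"# and specificity. Call display_rates(result) after any cross_validate call below.\n",
"def display_rates(result):\n",
"    tp_sum = result['test_tp'].sum()\n",
"    tn_sum = result['test_tn'].sum()\n",
"    fp_sum = result['test_fp'].sum()\n",
"    fn_sum = result['test_fn'].sum()\n",
"    print(\"Sensitivity: {0:.3f}\".format(tp_sum / (tp_sum + fn_sum)))\n",
"    print(\"Specificity: {0:.3f}\".format(tn_sum / (tn_sum + fp_sum)))"
]
},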
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Lets build the model\n",
"\n",
"#Logistic Regression\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.metrics import roc_auc_score\n",
"\n",
"acc=[]\n",
"roc=[]\n",
"\n",
"clf=LogisticRegression()\n",
"clf.fit(train_X,train_y)\n",
"y_pred=clf.predict(test_X)\n",
"#find accuracy\n",
"ac=accuracy_score(test_y,y_pred)\n",
"acc.append(ac)\n",
"\n",
"#find the ROC_AOC curve\n",
"rc=roc_auc_score(test_y,y_pred)\n",
"roc.append(rc)\n",
"print(\"\\nAccuracy {0} ROC {1}\".format(ac,rc))\n",
"\n",
"#cross val score\n",
"result=cross_validate(clf,train_X,train_y,scoring=scoring,cv=10)\n",
"display_result(result)\n",
"\n",
"#display predicted values uncomment below line\n",
"#pd.DataFrame(data={'Actual':test_y,'Predicted':y_pred}).head()"
]
},
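{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Note: roc_auc_score above is computed on hard 0/1 predictions. Scoring the predicted\n",
"# probability of the positive class usually gives a more informative AUC; a small sketch\n",
"# for the logistic regression fitted in the previous cell:\n",
"y_prob = clf.predict_proba(test_X)[:, 1]\n",
"print(\"ROC AUC from predicted probabilities:\", roc_auc_score(test_y, y_prob))"
]
},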
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Support Vector Machine\n",
"from sklearn.svm import SVC\n",
"\n",
"clf=SVC(kernel='linear')\n",
"clf.fit(train_X,train_y)\n",
"y_pred=clf.predict(test_X)\n",
"#find accuracy\n",
"ac=accuracy_score(test_y,y_pred)\n",
"acc.append(ac)\n",
"\n",
"#find the ROC_AOC curve\n",
"rc=roc_auc_score(test_y,y_pred)\n",
"roc.append(rc)\n",
"print(\"\\nAccuracy {0} ROC {1}\".format(ac,rc))\n",
"\n",
"#cross val score\n",
"result=cross_validate(clf,train_X,train_y,scoring=scoring,cv=10)\n",
"display_result(result)\n",
"\n",
"#display predicted values uncomment below line\n",
"#pd.DataFrame(data={'Actual':test_y,'Predicted':y_pred}).head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#KNN\n",
"\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"\n",
"clf=KNeighborsClassifier(n_neighbors=3)\n",
"clf.fit(train_X,train_y)\n",
"y_pred=clf.predict(test_X)\n",
"#find accuracy\n",
"ac=accuracy_score(test_y,y_pred)\n",
"acc.append(ac)\n",
"\n",
"#find the ROC_AOC curve\n",
"rc=roc_auc_score(test_y,y_pred)\n",
"roc.append(rc)\n",
"print(\"\\nAccuracy {0} ROC {1}\".format(ac,rc))\n",
"\n",
"#cross val score\n",
"result=cross_validate(clf,train_X,train_y,scoring=scoring,cv=10)\n",
"display_result(result)\n",
"\n",
"#display predicted values uncomment below line\n",
"#pd.DataFrame(data={'Actual':test_y,'Predicted':y_pred}).head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Random forest\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"\n",
"clf=RandomForestClassifier()\n",
"clf.fit(train_X,train_y)\n",
"\n",
"y_pred=clf.predict(test_X)\n",
"#find accuracy\n",
"ac=accuracy_score(test_y,y_pred)\n",
"acc.append(ac)\n",
"\n",
"#find the ROC_AOC curve\n",
"rc=roc_auc_score(test_y,y_pred)\n",
"roc.append(rc)\n",
"print(\"\\nAccuracy {0} ROC {1}\".format(ac,rc))\n",
"\n",
"#cross val score\n",
"result=cross_validate(clf,train_X,train_y,scoring=scoring,cv=10)\n",
"display_result(result)\n",
"\n",
"#display predicted values uncomment below line\n",
"#pd.DataFrame(data={'Actual':test_y,'Predicted':y_pred}).head()"
]
},
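{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A quick look at which features the random forest above relies on most\n",
"# (importances vary between runs because the forest is randomised)\n",
"pd.Series(clf.feature_importances_, index=X.columns).sort_values(ascending=False)"
]
},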
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Naive Bayes Theorem\n",
"#import library\n",
"from sklearn.naive_bayes import GaussianNB\n",
"\n",
"clf=GaussianNB()\n",
"clf.fit(train_X,train_y)\n",
"y_pred=clf.predict(test_X)\n",
"#find accuracy\n",
"ac=accuracy_score(test_y,y_pred)\n",
"acc.append(ac)\n",
"\n",
"#find the ROC_AOC curve\n",
"rc=roc_auc_score(test_y,y_pred)\n",
"roc.append(rc)\n",
"print(\"\\nAccuracy {0} ROC {1}\".format(ac,rc))\n",
"\n",
"#cross val score\n",
"result=cross_validate(clf,train_X,train_y,scoring=scoring,cv=10)\n",
"display_result(result)\n",
"\n",
"#display predicted values uncomment below line\n",
"#pd.DataFrame(data={'Actual':test_y,'Predicted':y_pred}).head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Gradient Boosting Classifier\n",
"from sklearn.ensemble import GradientBoostingClassifier\n",
"clf=GradientBoostingClassifier(n_estimators=50,learning_rate=0.2)\n",
"clf.fit(train_X,train_y)\n",
"y_pred=clf.predict(test_X)\n",
"#find accuracy\n",
"ac=accuracy_score(test_y,y_pred)\n",
"acc.append(ac)\n",
"\n",
"#find the ROC_AOC curve\n",
"rc=roc_auc_score(test_y,y_pred)\n",
"roc.append(rc)\n",
"print(\"\\nAccuracy {0} ROC {1}\".format(ac,rc))\n",
"\n",
"#cross val score\n",
"result=cross_validate(clf,train_X,train_y,scoring=scoring,cv=10)\n",
"display_result(result)\n",
"\n",
"#display predicted values uncomment below line\n",
"#pd.DataFrame(data={'Actual':test_y,'Predicted':y_pred}).head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#lets plot the bar graph\n",
"\n",
"ax=plt.figure(figsize=(9,4))\n",
"plt.bar(['Logistic Regression','SVM','KNN','Random Forest','Naivye Bayes','Gradient Boosting'],acc,label='Accuracy')\n",
"plt.ylabel('Accuracy Score')\n",
"plt.xlabel('Algortihms')\n",
"plt.show()\n",
"\n",
"ax=plt.figure(figsize=(9,4))\n",
"plt.bar(['Logistic Regression','SVM','KNN','Random Forest','Naivye Bayes','Gradient Boosting'],roc,label='ROC AUC')\n",
"plt.ylabel('ROC AUC')\n",
"plt.xlabel('Algortihms')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Great....\n",
"#Random forest has highest accuracy 98% and ROC_AUC curve 97%\n",
"#model can be improve more if we take same count of labels\n",
"#in our model 30% is diabetic and 70% no diabetic patient\n",
"\n",
"#model can be improve with fine tunning"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}