169 lines (168 with data), 6.9 kB
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" 0 0.63 0.63 0.63 147705\n",
" 1 0.63 0.63 0.63 147958\n",
"\n",
" accuracy 0.63 295663\n",
" macro avg 0.63 0.63 0.63 295663\n",
"weighted avg 0.63 0.63 0.63 295663\n",
"\n",
"[[93021 54684]\n",
" [54020 93938]]\n",
" precision recall f1-score support\n",
"\n",
" 0 0.72 0.72 0.72 147705\n",
" 1 0.72 0.72 0.72 147958\n",
"\n",
" accuracy 0.72 295663\n",
" macro avg 0.72 0.72 0.72 295663\n",
"weighted avg 0.72 0.72 0.72 295663\n",
"\n",
"[[106993 40712]\n",
" [ 41075 106883]]\n"
]
}
],
"source": [
"import numpy as np\n",
"import pandas as pd \n",
"import matplotlib.pylab as plt \n",
"from matplotlib import pyplot as plt1\n",
"import seaborn as sns \n",
"%matplotlib inline \n",
"\n",
"# read the datafile using panda library. ensure right file location on machine. \n",
"data = pd.read_csv(r\"C:\\Users\\SAARTH CHAHAL\\Desktop\\Programming\\AIML\\smoking_driking_dataset_Ver01.csv\")\n",
"# EDA (Exploratory Data Analysis): \n",
"# Determine number of rows and colums in the provided data\n",
"data.shape \n",
"data.head()\n",
"data.nunique(axis=0)\n",
"data_cleaned = data.dropna(axis=0)\n",
"data_cleaned = data_cleaned[data_cleaned['waistline'].between(25,150)] \n",
"# sight_left above 5 is based on observation of the data \n",
"data_cleaned = data_cleaned[data_cleaned['sight_left'] < 5 ]\n",
"# sight_left above 5 is based on observation of the data \n",
"data_cleaned = data_cleaned[data_cleaned['sight_right'] < 5 ]\n",
"#since in correlation down the line we will require all number we will need to drop sex which takes string as input. \n",
"data_cleaned = data_cleaned.drop('sex',axis=1) \n",
"# convert drinker as Y or N \n",
"data_cleaned['DRK_YN'] = np.where(data_cleaned['DRK_YN'] == 'Y', 1,0 ) \n",
"dfdata= pd.DataFrame(data_cleaned) \n",
"\n",
"\n",
"# Learning model : Decison Tree -> Random Forest. \n",
"\n",
"from sklearn.model_selection import train_test_split \n",
"# Train model \n",
"# Data consist of key health parameters in X1 array that contains the features to train on, \n",
"# And a y1 array(SMK_stat_type_cd) with the target variable, \n",
"X1=dfdata[['tot_chole','HDL_chole','LDL_chole','triglyceride','hemoglobin',\n",
" 'urine_protein','serum_creatinine','SGOT_AST','SGOT_ALT','gamma_GTP',\n",
" 'waistline','age','SBP','DBP','BLDS','height','weight','sight_left','sight_right']]\n",
"y1=dfdata['SMK_stat_type_cd']\n",
"# Data consist of key health parameters inarray that contains the features to train on, \n",
"# And a y2 array(DRK_YN)\n",
"X2=dfdata[['tot_chole','HDL_chole','LDL_chole','triglyceride','hemoglobin',\n",
" 'urine_protein','serum_creatinine','SGOT_AST','SGOT_ALT','gamma_GTP',\n",
" 'waistline','age','SBP','DBP','BLDS','height','weight','sight_left','sight_right']]\n",
"y2=dfdata['DRK_YN']\n",
"\n",
"\n",
"# Train test split. test split is 30 % train set is 70 % \n",
"X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.3)\n",
"X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.3)\n",
"\n",
"## Loading the Decison Tree . \n",
"# Start with training a single decision tree. X1 set for Smokers and X2 set for Drikers \n",
"\n",
"from sklearn.tree import DecisionTreeClassifier \n",
"\n",
"dtree1 = DecisionTreeClassifier() \n",
"dtree1.fit(X1_train,y1_train) \n",
"\n",
"dtree2 = DecisionTreeClassifier() \n",
"dtree2.fit(X2_train,y2_train) \n",
"\n",
"# Start evaluating the decison tree and prediction on Training data \n",
"# Predict class or regression value for X.\n",
"# For a classification model, the predicted class for each sample in X is returned.\n",
"# For a regression model, the predicted value based on X is returned.\n",
"\n",
"predictions1 = dtree1.predict(X1_test)\n",
"predictions2 = dtree2.predict(X2_test)\n",
"\n",
"from sklearn.metrics import classification_report,confusion_matrix \n",
"\n",
"# Confusion matrix to evaluate the accuracy of a classification. \n",
"# Classfication Report. Builds a text report showing the main classification metrics \n",
"\n",
"print(classification_report(y1_test,predictions1)) \n",
"print(confusion_matrix(y1_test,predictions1)) \n",
"\n",
"print(classification_report(y2_test,predictions2)) \n",
"print(confusion_matrix(y2_test,predictions2)) \n",
"\n",
"\n",
"\n",
"## compare the decision tree model to a random forest. \n",
"\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"\n",
"# A random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples \n",
"# of the dataset and uses averaging to improve the predictive accuracy and control over-fitting.\n",
"# The sub-sample size is controlled with the max_samples parameter if bootstrap=True (default), \n",
"# otherwise the whole dataset is used to build each tree.\n",
"\n",
"rfc1 = RandomForestClassifier(n_estimators=100)\n",
"rfc2 = RandomForestClassifier(n_estimators=100)\n",
"\n",
"rfc1.fit(X1_train, y1_train)\n",
"rfc1_pred = rfc1.predict(X1_test) \n",
"\n",
"rfc2.fit(X2_train, y2_train)\n",
"rfc2_pred = rfc2.predict(X2_test) \n",
"\n",
"print(classification_report(y1_test,rfc1_pred))\n",
"print(confusion_matrix(y1_test,rfc1_pred))\n",
"\n",
"print(classification_report(y2_test,rfc2_pred))\n",
"print(confusion_matrix(y2_test,rfc2_pred))\n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}