--- a
+++ b/Decision Tree and Random Forest.ipynb
@@ -0,0 +1,168 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "              precision    recall  f1-score   support\n",
+      "\n",
+      "           0       0.63      0.63      0.63    147705\n",
+      "           1       0.63      0.63      0.63    147958\n",
+      "\n",
+      "    accuracy                           0.63    295663\n",
+      "   macro avg       0.63      0.63      0.63    295663\n",
+      "weighted avg       0.63      0.63      0.63    295663\n",
+      "\n",
+      "[[93021 54684]\n",
+      " [54020 93938]]\n",
+      "              precision    recall  f1-score   support\n",
+      "\n",
+      "           0       0.72      0.72      0.72    147705\n",
+      "           1       0.72      0.72      0.72    147958\n",
+      "\n",
+      "    accuracy                           0.72    295663\n",
+      "   macro avg       0.72      0.72      0.72    295663\n",
+      "weighted avg       0.72      0.72      0.72    295663\n",
+      "\n",
+      "[[106993  40712]\n",
+      " [ 41075 106883]]\n"
+     ]
+    }
+   ],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "%matplotlib inline\n",
+    "\n",
+    "# Read the data file with pandas; make sure the file path matches its location on your machine.\n",
+    "data = pd.read_csv(r\"C:\\Users\\SAARTH CHAHAL\\Desktop\\Programming\\AIML\\smoking_driking_dataset_Ver01.csv\")\n",
+    "# EDA (Exploratory Data Analysis):\n",
+    "# Determine the number of rows and columns in the provided data\n",
+    "data.shape\n",
+    "data.head()\n",
+    "data.nunique(axis=0)\n",
+    "data_cleaned = data.dropna(axis=0)\n",
+    "data_cleaned = data_cleaned[data_cleaned['waistline'].between(25, 150)]\n",
+    "# The cut-off of 5 for sight_left is based on observation of the data\n",
+    "data_cleaned = data_cleaned[data_cleaned['sight_left'] < 5]\n",
+    "# The same cut-off of 5 applies to sight_right\n",
+    "data_cleaned = data_cleaned[data_cleaned['sight_right'] < 5]\n",
+    "# The correlation analysis later on needs numeric columns only, so drop 'sex', which is a string column.\n",
+    "data_cleaned = data_cleaned.drop('sex', axis=1)\n",
+    "# Encode the drinker flag: 'Y' -> 1, 'N' -> 0\n",
+    "data_cleaned['DRK_YN'] = np.where(data_cleaned['DRK_YN'] == 'Y', 1, 0)\n",
+    "dfdata = pd.DataFrame(data_cleaned)\n",
+    "\n",
+    "\n",
+    "# Learning models: Decision Tree -> Random Forest.\n",
+    "\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "# Train the models.\n",
+    "# X1 holds the key health parameters used as features,\n",
+    "# and y1 (SMK_stat_type_cd) is the target variable.\n",
+    "X1 = dfdata[['tot_chole','HDL_chole','LDL_chole','triglyceride','hemoglobin',\n",
+    "             'urine_protein','serum_creatinine','SGOT_AST','SGOT_ALT','gamma_GTP',\n",
+    "             'waistline','age','SBP','DBP','BLDS','height','weight','sight_left','sight_right']]\n",
+    "y1 = dfdata['SMK_stat_type_cd']\n",
+    "# X2 holds the same feature columns,\n",
+    "# and y2 (DRK_YN) is the target variable.\n",
+    "X2 = dfdata[['tot_chole','HDL_chole','LDL_chole','triglyceride','hemoglobin',\n",
+    "             'urine_protein','serum_creatinine','SGOT_AST','SGOT_ALT','gamma_GTP',\n",
+    "             'waistline','age','SBP','DBP','BLDS','height','weight','sight_left','sight_right']]\n",
+    "y2 = dfdata['DRK_YN']\n",
+    "\n",
+    "\n",
+    "# Train/test split: 70 % train, 30 % test\n",
+    "X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.3)\n",
+    "X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.3)\n",
+    "\n",
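+    "# Note: train_test_split shuffles the rows randomly on each run, so the scores printed below will\n",
+    "# vary slightly between runs. A minimal sketch of a reproducible, class-balanced split (random_state=42\n",
+    "# is an arbitrary example value, and stratify is optional) would be:\n",
+    "# X1_train, X1_test, y1_train, y1_test = train_test_split(\n",
+    "#     X1, y1, test_size=0.3, random_state=42, stratify=y1)\n",
+    "\n",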
+    "## Loading the Decision Tree.\n",
+    "# Start with training a single decision tree: X1 set for smokers and X2 set for drinkers.\n",
+    "\n",
+    "from sklearn.tree import DecisionTreeClassifier\n",
+    "\n",
+    "dtree1 = DecisionTreeClassifier()\n",
+    "dtree1.fit(X1_train, y1_train)\n",
+    "\n",
+    "dtree2 = DecisionTreeClassifier()\n",
+    "dtree2.fit(X2_train, y2_train)\n",
+    "\n",
+    "# Evaluate the decision trees by predicting on the test data.\n",
+    "# predict() returns the predicted class for each sample in X for a classification model,\n",
+    "# and the predicted value based on X for a regression model.\n",
+    "\n",
+    "predictions1 = dtree1.predict(X1_test)\n",
+    "predictions2 = dtree2.predict(X2_test)\n",
+    "\n",
+    "from sklearn.metrics import classification_report, confusion_matrix\n",
+    "\n",
+    "# The confusion matrix evaluates the accuracy of a classification.\n",
+    "# The classification report builds a text report showing the main classification metrics.\n",
+    "\n",
+    "print(classification_report(y1_test, predictions1))\n",
+    "print(confusion_matrix(y1_test, predictions1))\n",
+    "\n",
+    "print(classification_report(y2_test, predictions2))\n",
+    "print(confusion_matrix(y2_test, predictions2))\n",
+    "\n",
+    "\n",
+    "\n",
+    "## Compare the decision tree models to random forests.\n",
+    "\n",
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "\n",
+    "# A random forest is a meta estimator that fits a number of decision tree classifiers on various\n",
+    "# sub-samples of the dataset and uses averaging to improve the predictive accuracy and control\n",
+    "# over-fitting. The sub-sample size is controlled with the max_samples parameter if bootstrap=True\n",
+    "# (default); otherwise the whole dataset is used to build each tree.\n",
+    "\n",
+    "rfc1 = RandomForestClassifier(n_estimators=100)\n",
+    "rfc2 = RandomForestClassifier(n_estimators=100)\n",
+    "\n",
+    "rfc1.fit(X1_train, y1_train)\n",
+    "rfc1_pred = rfc1.predict(X1_test)\n",
+    "\n",
+    "rfc2.fit(X2_train, y2_train)\n",
+    "rfc2_pred = rfc2.predict(X2_test)\n",
+    "\n",
+    "print(classification_report(y1_test, rfc1_pred))\n",
+    "print(confusion_matrix(y1_test, rfc1_pred))\n",
+    "\n",
+    "print(classification_report(y2_test, rfc2_pred))\n",
+    "print(confusion_matrix(y2_test, rfc2_pred))\n",
+    "\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.5"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}