[1f0bb9]: / Decision Tree and Random Forest.ipynb

Download this file

169 lines (168 with data), 6.9 kB

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.63      0.63      0.63    147705\n",
      "           1       0.63      0.63      0.63    147958\n",
      "\n",
      "    accuracy                           0.63    295663\n",
      "   macro avg       0.63      0.63      0.63    295663\n",
      "weighted avg       0.63      0.63      0.63    295663\n",
      "\n",
      "[[93021 54684]\n",
      " [54020 93938]]\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.72      0.72      0.72    147705\n",
      "           1       0.72      0.72      0.72    147958\n",
      "\n",
      "    accuracy                           0.72    295663\n",
      "   macro avg       0.72      0.72      0.72    295663\n",
      "weighted avg       0.72      0.72      0.72    295663\n",
      "\n",
      "[[106993  40712]\n",
      " [ 41075 106883]]\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pandas as pd  \n",
    "import matplotlib.pylab as plt \n",
    "from matplotlib import pyplot as plt1\n",
    "import seaborn as sns \n",
    "%matplotlib inline  \n",
    "\n",
    "# Load the raw dataset with pandas.\n",
    "# NOTE: this is an absolute, machine-specific path; point DATA_PATH at your local copy of the CSV.\n",
    "DATA_PATH = r\"C:\\Users\\SAARTH CHAHAL\\Desktop\\Programming\\AIML\\smoking_driking_dataset_Ver01.csv\"\n",
    "data = pd.read_csv(DATA_PATH)\n",
    "\n",
    "# EDA (Exploratory Data Analysis).\n",
    "# A notebook only echoes the LAST expression of a cell, so bare expressions in the\n",
    "# middle of a cell show nothing. The inspection results are displayed explicitly instead.\n",
    "print(data.shape)               # number of rows and columns\n",
    "display(data.head())            # first few records\n",
    "display(data.nunique(axis=0))   # number of distinct values per column\n",
    "\n",
    "# Cleaning step 1: drop every row that has a missing value.\n",
    "data_cleaned = data.dropna(axis=0)\n",
    "# Cleaning step 2: keep only rows whose waistline lies in the range 25..150 (inclusive).\n",
    "data_cleaned = data_cleaned[data_cleaned['waistline'].between(25, 150)]\n",
    "# Cleaning step 3: keep sight_left values below 5; the cutoff is based on observation of the data.\n",
    "data_cleaned = data_cleaned[data_cleaned['sight_left'] < 5]\n",
    "# Cleaning step 4: the same cutoff, applied to sight_right.\n",
    "data_cleaned = data_cleaned[data_cleaned['sight_right'] < 5]\n",
    "# 'sex' holds strings; it is dropped because the later correlation analysis needs numeric columns.\n",
    "data_cleaned = data_cleaned.drop('sex', axis=1)\n",
    "# Encode the drinker flag DRK_YN as an integer: 'Y' -> 1, anything else -> 0.\n",
    "data_cleaned['DRK_YN'] = np.where(data_cleaned['DRK_YN'] == 'Y', 1, 0)\n",
    "# dfdata is the cleaned frame that the modelling steps read from.\n",
    "dfdata = pd.DataFrame(data_cleaned)\n",
    "\n",
    "\n",
    "# Learning models: Decision Tree -> Random Forest.\n",
    "\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# Health parameters used as input features for BOTH targets.\n",
    "# Defined once so the smoking and drinking feature sets cannot drift apart.\n",
    "FEATURE_COLUMNS = [\n",
    "    'tot_chole', 'HDL_chole', 'LDL_chole', 'triglyceride', 'hemoglobin',\n",
    "    'urine_protein', 'serum_creatinine', 'SGOT_AST', 'SGOT_ALT', 'gamma_GTP',\n",
    "    'waistline', 'age', 'SBP', 'DBP', 'BLDS', 'height', 'weight', 'sight_left', 'sight_right',\n",
    "]\n",
    "\n",
    "# X1 / y1: features and target for the smoking model (target column SMK_stat_type_cd).\n",
    "X1 = dfdata[FEATURE_COLUMNS]\n",
    "y1 = dfdata['SMK_stat_type_cd']\n",
    "# X2 / y2: the same features, with the drinker flag (DRK_YN) as target.\n",
    "X2 = dfdata[FEATURE_COLUMNS]\n",
    "y2 = dfdata['DRK_YN']\n",
    "\n",
    "# Train/test split: 70 % training set, 30 % test set.\n",
    "# random_state is fixed so the split, and every metric computed from it, is reproducible\n",
    "# under Restart Kernel -> Run All instead of changing on each execution.\n",
    "X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.3, random_state=42)\n",
    "X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.3, random_state=42)\n",
    "\n",
    "## Decision tree\n",
    "# Train a single decision tree per target: set 1 for smoking status, set 2 for drinking.\n",
    "\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.metrics import classification_report, confusion_matrix\n",
    "\n",
    "# random_state is fixed so that re-running the cell rebuilds the same trees.\n",
    "dtree1 = DecisionTreeClassifier(random_state=42)\n",
    "dtree1.fit(X1_train, y1_train)\n",
    "\n",
    "dtree2 = DecisionTreeClassifier(random_state=42)\n",
    "dtree2.fit(X2_train, y2_train)\n",
    "\n",
    "# Evaluate on the held-out TEST data (not the training data).\n",
    "# For a classifier, predict() returns the predicted class for each sample.\n",
    "predictions1 = dtree1.predict(X1_test)\n",
    "predictions2 = dtree2.predict(X2_test)\n",
    "\n",
    "# classification_report: text report of the main classification metrics.\n",
    "# confusion_matrix: used to evaluate the accuracy of the classification.\n",
    "# Each report is labelled so the printed output can be matched to its model and target.\n",
    "print('Decision tree - smoking status (SMK_stat_type_cd)')\n",
    "print(classification_report(y1_test, predictions1))\n",
    "print(confusion_matrix(y1_test, predictions1))\n",
    "\n",
    "print('Decision tree - drinker (DRK_YN)')\n",
    "print(classification_report(y2_test, predictions2))\n",
    "print(confusion_matrix(y2_test, predictions2))\n",
    "\n",
    "\n",
    "\n",
    "## Random forest: compare against the single decision trees above.\n",
    "\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "# A random forest is a meta estimator that fits a number of decision tree classifiers on\n",
    "# various sub-samples of the dataset and uses averaging to improve the predictive accuracy\n",
    "# and control over-fitting. The sub-sample size is controlled with the max_samples parameter\n",
    "# if bootstrap=True (default); otherwise the whole dataset is used to build each tree.\n",
    "#\n",
    "# random_state is fixed so the forests (and the metrics below) are reproducible.\n",
    "# n_jobs=-1 builds the trees on all CPU cores; it affects speed only, not the fitted model.\n",
    "rfc1 = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)\n",
    "rfc2 = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)\n",
    "\n",
    "# Smoking model: fit on the training set, predict on the held-out test set.\n",
    "rfc1.fit(X1_train, y1_train)\n",
    "rfc1_pred = rfc1.predict(X1_test)\n",
    "\n",
    "# Drinking model: fit on the training set, predict on the held-out test set.\n",
    "rfc2.fit(X2_train, y2_train)\n",
    "rfc2_pred = rfc2.predict(X2_test)\n",
    "\n",
    "# Each report is labelled so it can be told apart from the decision-tree reports.\n",
    "print('Random forest - smoking status (SMK_stat_type_cd)')\n",
    "print(classification_report(y1_test, rfc1_pred))\n",
    "print(confusion_matrix(y1_test, rfc1_pred))\n",
    "\n",
    "print('Random forest - drinker (DRK_YN)')\n",
    "print(classification_report(y2_test, rfc2_pred))\n",
    "print(confusion_matrix(y2_test, rfc2_pred))\n",
    "\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}