a b/Decision Tree and Random Forest.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.63      0.63      0.63    147705\n",
      "           1       0.63      0.63      0.63    147958\n",
      "\n",
      "    accuracy                           0.63    295663\n",
      "   macro avg       0.63      0.63      0.63    295663\n",
      "weighted avg       0.63      0.63      0.63    295663\n",
      "\n",
      "[[93021 54684]\n",
      " [54020 93938]]\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.72      0.72      0.72    147705\n",
      "           1       0.72      0.72      0.72    147958\n",
      "\n",
      "    accuracy                           0.72    295663\n",
      "   macro avg       0.72      0.72      0.72    295663\n",
      "weighted avg       0.72      0.72      0.72    295663\n",
      "\n",
      "[[106993  40712]\n",
      " [ 41075 106883]]\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pandas as pd  \n",
    "import matplotlib.pylab as plt \n",
41
    "from matplotlib import pyplot as plt1\n",
42
    "import seaborn as sns \n",
43
    "%matplotlib inline  \n",
44
    "\n",
45
    "# read the datafile using panda library.  ensure right file location on machine. \n",
46
    "data = pd.read_csv(r\"C:\\Users\\SAARTH CHAHAL\\Desktop\\Programming\\AIML\\smoking_driking_dataset_Ver01.csv\")\n",
47
    "# EDA (Exploratory Data Analysis): \n",
48
    "# Determine number of rows and colums in the provided data\n",
49
    "data.shape \n",
50
    "data.head()\n",
51
    "data.nunique(axis=0)\n",
    "data_cleaned = data.dropna(axis=0)\n",
    "data_cleaned = data_cleaned[data_cleaned['waistline'].between(25,150)]   \n",
    "# Keep sight_left below 5; the cutoff is based on observation of the data  \n",
    "data_cleaned = data_cleaned[data_cleaned['sight_left'] < 5 ]\n",
    "# Keep sight_right below 5; the cutoff is based on observation of the data  \n",
    "data_cleaned = data_cleaned[data_cleaned['sight_right'] < 5 ]\n",
    "# The correlation analysis later requires numeric columns, so drop 'sex', which holds strings.  \n",
    "data_cleaned = data_cleaned.drop('sex',axis=1) \n",
    "# Encode drinker status: 'Y' -> 1, 'N' -> 0 \n",
    "data_cleaned['DRK_YN'] = np.where(data_cleaned['DRK_YN'] == 'Y', 1,0 ) \n",
    "dfdata = pd.DataFrame(data_cleaned)  \n",
    "\n",
    "\n",
    "# Learning model : Decison Tree ->  Random Forest. \n",
66
    "\n",
67
    "from sklearn.model_selection import train_test_split \n",
68
    "# Train  model  \n",
69
    "# Data  consist of key health parameters in X1 array  that contains the features to train on, \n",
70
    "# And a y1 array(SMK_stat_type_cd) with the target variable, \n",
71
    "X1=dfdata[['tot_chole','HDL_chole','LDL_chole','triglyceride','hemoglobin',\n",
72
    "           'urine_protein','serum_creatinine','SGOT_AST','SGOT_ALT','gamma_GTP',\n",
73
    "           'waistline','age','SBP','DBP','BLDS','height','weight','sight_left','sight_right']]\n",
74
    "y1=dfdata['SMK_stat_type_cd']\n",
75
    "# Data consist of key health parameters inarray that contains the features to train on, \n",
76
    "# And a y2 array(DRK_YN)\n",
77
    "X2=dfdata[['tot_chole','HDL_chole','LDL_chole','triglyceride','hemoglobin',\n",
78
    "           'urine_protein','serum_creatinine','SGOT_AST','SGOT_ALT','gamma_GTP',\n",
79
    "           'waistline','age','SBP','DBP','BLDS','height','weight','sight_left','sight_right']]\n",
80
    "y2=dfdata['DRK_YN']\n",
81
    "\n",
82
    "\n",
83
    "# Train test split. test split is 30 % train set is 70 % \n",
84
    "X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.3)\n",
85
    "X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.3)\n",
    "\n",
    "## Load the Decision Tree.    \n",
    "# Start by training a single decision tree: the X1 set for smokers and the X2 set for drinkers \n",
    "\n",
    "from sklearn.tree import DecisionTreeClassifier \n",
    "\n",
    "dtree1 = DecisionTreeClassifier() \n",
93
    "dtree1.fit(X1_train,y1_train) \n",
94
    "\n",
95
    "dtree2 = DecisionTreeClassifier() \n",
96
    "dtree2.fit(X2_train,y2_train) \n",
    "\n",
    "# Evaluate the decision trees by predicting on the test data  \n",
    "# Predict class or regression value for X.\n",
    "# For a classification model, the predicted class for each sample in X is returned.\n",
    "# For a regression model, the predicted value based on X is returned.\n",
    "\n",
    "predictions1 = dtree1.predict(X1_test)\n",
104
    "predictions2 = dtree2.predict(X2_test)\n",
105
    "\n",
106
    "from sklearn.metrics import classification_report,confusion_matrix \n",
107
    "\n",
108
    "# Confusion matrix to evaluate the accuracy of a classification. \n",
109
    "# Classfication Report. Builds a text report showing the main classification metrics \n",
110
    "\n",
111
    "print(classification_report(y1_test,predictions1)) \n",
112
    "print(confusion_matrix(y1_test,predictions1)) \n",
113
    "\n",
114
    "print(classification_report(y2_test,predictions2)) \n",
115
    "print(confusion_matrix(y2_test,predictions2)) \n",
    "\n",
    "\n",
    "## Compare the decision tree models to random forests. \n",
    "\n",
121
    "from sklearn.ensemble import RandomForestClassifier\n",
122
    "\n",
123
    "# A random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples \n",
124
    "# of the dataset and uses averaging to improve the predictive accuracy and control over-fitting.\n",
125
    "# The sub-sample size is controlled with the max_samples parameter if bootstrap=True (default), \n",
126
    "#  otherwise the whole dataset is used to build each tree.\n",
127
    "\n",
128
    "rfc1 = RandomForestClassifier(n_estimators=100)\n",
129
    "rfc2 = RandomForestClassifier(n_estimators=100)\n",
    "\n",
    "rfc1.fit(X1_train, y1_train)\n",
    "rfc1_pred = rfc1.predict(X1_test) \n",
    "\n",
    "rfc2.fit(X2_train, y2_train)\n",
    "rfc2_pred = rfc2.predict(X2_test) \n",
    "\n",
    "print(classification_report(y1_test,rfc1_pred))\n",
    "print(confusion_matrix(y1_test,rfc1_pred))\n",
    "\n",
    "print(classification_report(y2_test,rfc2_pred))\n",
    "print(confusion_matrix(y2_test,rfc2_pred))\n",
    "\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}