--- a
+++ b/Decision Tree and Random Forest.ipynb
@@ -0,0 +1,168 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "              precision    recall  f1-score   support\n",
+      "\n",
+      "           0       0.63      0.63      0.63    147705\n",
+      "           1       0.63      0.63      0.63    147958\n",
+      "\n",
+      "    accuracy                           0.63    295663\n",
+      "   macro avg       0.63      0.63      0.63    295663\n",
+      "weighted avg       0.63      0.63      0.63    295663\n",
+      "\n",
+      "[[93021 54684]\n",
+      " [54020 93938]]\n",
+      "              precision    recall  f1-score   support\n",
+      "\n",
+      "           0       0.72      0.72      0.72    147705\n",
+      "           1       0.72      0.72      0.72    147958\n",
+      "\n",
+      "    accuracy                           0.72    295663\n",
+      "   macro avg       0.72      0.72      0.72    295663\n",
+      "weighted avg       0.72      0.72      0.72    295663\n",
+      "\n",
+      "[[106993  40712]\n",
+      " [ 41075 106883]]\n"
+     ]
+    }
+   ],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "%matplotlib inline\n",
+    "\n",
+    "# Read the data file with pandas; make sure the file path matches its location on your machine.\n",
+    "data = pd.read_csv(r\"C:\\Users\\SAARTH CHAHAL\\Desktop\\Programming\\AIML\\smoking_driking_dataset_Ver01.csv\")\n",
+    "# EDA (Exploratory Data Analysis):\n",
+    "# Determine the number of rows and columns in the provided data\n",
+    "data.shape\n",
+    "data.head()\n",
+    "data.nunique(axis=0)\n",
+    "data_cleaned = data.dropna(axis=0)\n",
+    "data_cleaned = data_cleaned[data_cleaned['waistline'].between(25, 150)]\n",
+    "# The cut-off of 5 for sight_left is based on observation of the data\n",
+    "data_cleaned = data_cleaned[data_cleaned['sight_left'] < 5]\n",
+    "# The same cut-off of 5 applies to sight_right\n",
+    "data_cleaned = data_cleaned[data_cleaned['sight_right'] < 5]\n",
+    "# The correlation analysis later on needs numeric columns only, so drop 'sex', which is a string column.\n",
+    "data_cleaned = data_cleaned.drop('sex', axis=1)\n",
+    "# Encode the drinker flag: 'Y' -> 1, 'N' -> 0\n",
+    "data_cleaned['DRK_YN'] = np.where(data_cleaned['DRK_YN'] == 'Y', 1, 0)\n",
+    "dfdata = pd.DataFrame(data_cleaned)\n",
+    "\n",
+    "\n",
+    "# Learning models: Decision Tree -> Random Forest.\n",
+    "\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "# Train the models.\n",
+    "# X1 holds the key health parameters used as features,\n",
+    "# and y1 (SMK_stat_type_cd) is the target variable.\n",
+    "X1 = dfdata[['tot_chole','HDL_chole','LDL_chole','triglyceride','hemoglobin',\n",
+    "             'urine_protein','serum_creatinine','SGOT_AST','SGOT_ALT','gamma_GTP',\n",
+    "             'waistline','age','SBP','DBP','BLDS','height','weight','sight_left','sight_right']]\n",
+    "y1 = dfdata['SMK_stat_type_cd']\n",
+    "# X2 holds the same feature columns,\n",
+    "# and y2 (DRK_YN) is the target variable.\n",
+    "X2 = dfdata[['tot_chole','HDL_chole','LDL_chole','triglyceride','hemoglobin',\n",
+    "             'urine_protein','serum_creatinine','SGOT_AST','SGOT_ALT','gamma_GTP',\n",
+    "             'waistline','age','SBP','DBP','BLDS','height','weight','sight_left','sight_right']]\n",
+    "y2 = dfdata['DRK_YN']\n",
+    "\n",
+    "\n",
+    "# Train/test split: 70 % train, 30 % test\n",
+    "X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.3)\n",
+    "X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.3)\n",
+    "\n",
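+    "# Note: train_test_split shuffles the rows randomly on each run, so the scores printed below will\n",
+    "# vary slightly between runs. A minimal sketch of a reproducible, class-balanced split (random_state=42\n",
+    "# is an arbitrary example value, and stratify is optional) would be:\n",
+    "# X1_train, X1_test, y1_train, y1_test = train_test_split(\n",
+    "#     X1, y1, test_size=0.3, random_state=42, stratify=y1)\n",
+    "\n",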
+    "## Loading the Decision Tree.\n",
+    "# Start with training a single decision tree: X1 set for smokers and X2 set for drinkers.\n",
+    "\n",
+    "from sklearn.tree import DecisionTreeClassifier\n",
+    "\n",
+    "dtree1 = DecisionTreeClassifier()\n",
+    "dtree1.fit(X1_train, y1_train)\n",
+    "\n",
+    "dtree2 = DecisionTreeClassifier()\n",
+    "dtree2.fit(X2_train, y2_train)\n",
+    "\n",
+    "# Evaluate the decision trees by predicting on the test data.\n",
+    "# predict() returns the predicted class for each sample in X for a classification model,\n",
+    "# and the predicted value based on X for a regression model.\n",
+    "\n",
+    "predictions1 = dtree1.predict(X1_test)\n",
+    "predictions2 = dtree2.predict(X2_test)\n",
+    "\n",
+    "from sklearn.metrics import classification_report, confusion_matrix\n",
+    "\n",
+    "# The confusion matrix evaluates the accuracy of a classification.\n",
+    "# The classification report builds a text report showing the main classification metrics.\n",
+    "\n",
+    "print(classification_report(y1_test, predictions1))\n",
+    "print(confusion_matrix(y1_test, predictions1))\n",
+    "\n",
+    "print(classification_report(y2_test, predictions2))\n",
+    "print(confusion_matrix(y2_test, predictions2))\n",
+    "\n",
+    "\n",
+    "\n",
+    "## Compare the decision tree models to random forests.\n",
+    "\n",
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "\n",
+    "# A random forest is a meta estimator that fits a number of decision tree classifiers on various\n",
+    "# sub-samples of the dataset and uses averaging to improve the predictive accuracy and control\n",
+    "# over-fitting. The sub-sample size is controlled with the max_samples parameter if bootstrap=True\n",
+    "# (default); otherwise the whole dataset is used to build each tree.\n",
+    "\n",
+    "rfc1 = RandomForestClassifier(n_estimators=100)\n",
+    "rfc2 = RandomForestClassifier(n_estimators=100)\n",
+    "\n",
+    "rfc1.fit(X1_train, y1_train)\n",
+    "rfc1_pred = rfc1.predict(X1_test)\n",
+    "\n",
+    "rfc2.fit(X2_train, y2_train)\n",
+    "rfc2_pred = rfc2.predict(X2_test)\n",
+    "\n",
+    "print(classification_report(y1_test, rfc1_pred))\n",
+    "print(confusion_matrix(y1_test, rfc1_pred))\n",
+    "\n",
+    "print(classification_report(y2_test, rfc2_pred))\n",
+    "print(confusion_matrix(y2_test, rfc2_pred))\n",
+    "\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.5"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}