Switch to unified view

a b/Linear Regression Model.ipynb
1
{
2
 "cells": [
3
  {
4
   "cell_type": "code",
5
   "execution_count": null,
6
   "metadata": {},
7
   "outputs": [],
8
   "source": [
9
    "import numpy as np\n",
10
    "import pandas as pd  \n",
11
    "import matplotlib.pylab as plt \n",
12
    "from matplotlib import pyplot as plt1\n",
13
    "import seaborn as sns \n",
14
    "from sklearn.model_selection import train_test_split \n",
15
    "# Read the data file with pandas. NOTE: this is an absolute, machine-specific path -- adjust it to where the CSV lives on your machine.\n",
16
    "data = pd.read_csv(r\"C:\\Users\\SAARTH CHAHAL\\Desktop\\Programming\\AIML\\smoking_driking_dataset_Ver01.csv\")\n",
17
    "# EDA (Exploratory Data Analysis).\n",
18
    "# Only the last expression of a cell is echoed, so every intermediate result is shown explicitly with display().\n",
19
    "display(data.shape)  # number of rows and columns\n",
20
    "display(data.head())  # first rows of the frame\n",
21
    "display(data.columns)  # column names\n",
22
    "display(data.nunique(axis=0))  # number of distinct values per column\n",
23
    "\n",
24
    "# Summary statistics from describe(), with every value rendered through\n",
25
    "# format(x, 'f') so it is shown in fixed-point rather than scientific notation.\n",
26
    "display(data.describe().apply(lambda s: s.apply(lambda x: format(x, 'f'))))\n",
27
    "\n",
28
    "# Drop every row that contains a null value (visual inspection of the data suggested there are none).\n",
29
    "data_cleaned = data.dropna(axis=0)\n",
30
    "print(data_cleaned.shape)\n",
31
    "\n",
32
    "# Remove outliers; the thresholds below were chosen by observing the data.\n",
33
    "# Keep waistline values from 25 to 150.\n",
34
    "data_cleaned = data_cleaned[data_cleaned['waistline'].between(25, 150)]\n",
35
    "# Keep sight_left values below 5; values of 5 or more are treated as outliers.\n",
36
    "data_cleaned = data_cleaned[data_cleaned['sight_left'] < 5]\n",
37
    "# Keep sight_right values below 5; values of 5 or more are treated as outliers.\n",
38
    "data_cleaned = data_cleaned[data_cleaned['sight_right'] < 5]\n",
39
    "# The correlation step further down needs numeric columns only, so drop 'sex' (noted by the author as a string column).\n",
40
    "data_cleaned = data_cleaned.drop('sex', axis=1)\n",
41
    "# Encode the drinker flag as an integer: 'Y' -> 1, any other value -> 0.\n",
42
    "data_cleaned['DRK_YN'] = np.where(data_cleaned['DRK_YN'] == 'Y', 1, 0)\n",
43
    "\n",
44
    "# A bare expression in the middle of a cell is not echoed, so the results are shown with display().\n",
45
    "display(data_cleaned.shape)\n",
46
    "\n",
47
    "display(data_cleaned.describe().apply(lambda s: s.apply(lambda x: format(x, 'f'))))\n",
48
    "\n",
49
    "# Author's note: the outlier filters above removed 5803 records.\n",
50
    "# The blood-pressure (BP) columns are left unchanged because their values appear to be in range.\n",
51
    "\n",
52
    "# Data plotting: analyse the relationships between variables.\n",
53
    "# The full set of columns is too large for one readable correlation matrix / heatmap,\n",
54
    "# so two smaller frames are built: one around smoking status, one around the drinking flag.\n",
55
    "\n",
56
    "dfdata = pd.DataFrame(data_cleaned)\n",
57
    "dfdata_smk = dfdata[['tot_chole','HDL_chole','LDL_chole','triglyceride','hemoglobin','SMK_stat_type_cd']]\n",
58
    "dfdata_drk = dfdata[['urine_protein','serum_creatinine','SGOT_AST','SGOT_ALT','gamma_GTP','DRK_YN']]\n",
59
    "corr_matrix_smk = dfdata_smk.corr()\n",
60
    "corr_matrix_drk = dfdata_drk.corr()\n",
61
    "\n",
62
    "# Plot the heatmaps. sns.heatmap draws on the current axes when no ax is passed, so each\n",
63
    "# matrix gets its own axes; otherwise the second heatmap is painted over the first.\n",
64
    "_, (ax_smk, ax_drk) = plt1.subplots(1, 2, figsize=(18, 6))\n",
65
    "sns.heatmap(corr_matrix_smk, xticklabels=corr_matrix_smk.columns, yticklabels=corr_matrix_smk.columns, annot=True, cmap=sns.diverging_palette(220, 20, as_cmap=True), ax=ax_smk)\n",
66
    "sns.heatmap(corr_matrix_drk, xticklabels=corr_matrix_drk.columns, yticklabels=corr_matrix_drk.columns, annot=True, cmap=sns.diverging_palette(220, 20, as_cmap=True), ax=ax_drk)\n",
67
    "\n",
68
    "# Scatter plots of one feature against each target.\n",
69
    "dfdata.plot(kind='scatter', x='tot_chole', y='SMK_stat_type_cd')\n",
70
    "dfdata.plot(kind='scatter', x='SGOT_AST', y='DRK_YN')\n",
71
    "\n",
72
    "# Pairwise plots for a handful of variables.\n",
73
    "sns.pairplot(dfdata,\n",
74
    "    x_vars=[\"age\", \"waistline\", \"tot_chole\", \"SGOT_AST\", \"SMK_stat_type_cd\", \"DRK_YN\"],\n",
75
    "    y_vars=[\"age\", \"waistline\", \"tot_chole\", \"SGOT_AST\"])\n",
76
    "\n",
77
    "# ---- Model training ----\n",
78
    "# Two linear-regression models are fitted, one per target column.\n",
79
    "\n",
80
    "from sklearn.model_selection import train_test_split\n",
81
    "# Model 1: X1 holds the feature columns (tot_chole, HDL_chole, LDL_chole, triglyceride, hemoglobin)\n",
82
    "# and y1 holds the target column SMK_stat_type_cd.\n",
83
    "X1 = dfdata.loc[:, ['tot_chole', 'HDL_chole', 'LDL_chole', 'triglyceride', 'hemoglobin']]\n",
84
    "y1 = dfdata.loc[:, 'SMK_stat_type_cd']\n",
85
    "# Model 2: X2 holds the feature columns (urine_protein, serum_creatinine, SGOT_AST, SGOT_ALT, gamma_GTP)\n",
86
    "# and y2 holds the target column DRK_YN.\n",
87
    "X2 = dfdata.loc[:, ['urine_protein', 'serum_creatinine', 'SGOT_AST', 'SGOT_ALT', 'gamma_GTP']]\n",
88
    "y2 = dfdata.loc[:, 'DRK_YN']\n",
89
    "\n",
90
    "# Hold out 40 % of the rows for testing and train on the remaining 60 %;\n",
91
    "# the fixed random_state keeps the split repeatable between runs.\n",
92
    "X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.4, random_state=42)\n",
93
    "X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.4, random_state=42)\n",
94
    "\n",
95
    "# Load the linear regression estimator.\n",
96
    "\n",
97
    "from sklearn.linear_model import LinearRegression\n",
98
    "\n",
99
    "# fit() returns the fitted estimator, so construction and fitting are chained.\n",
100
    "lm1 = LinearRegression().fit(X1_train, y1_train)\n",
101
    "lm2 = LinearRegression().fit(X2_train, y2_train)\n",
102
    "\n",
103
    "# In-sample predictions on the training rows; nothing later in this cell\n",
104
    "# reads these two arrays.\n",
105
    "training_data_prediction1 = lm1.predict(X1_train)\n",
106
    "training_data_prediction2 = lm2.predict(X2_train)\n",
107
    "\n",
108
    "# Model evaluation.\n",
109
    "# Inspect each model's intercept and coefficients; display() is used because a bare name in the middle of a cell is not echoed.\n",
110
    "print(lm1.intercept_)\n",
111
    "coeff_df1 = pd.DataFrame(lm1.coef_, X1.columns, columns=['Coefficient'])\n",
112
    "display(coeff_df1)\n",
113
    "print(lm2.intercept_)\n",
114
    "coeff_df2 = pd.DataFrame(lm2.coef_, X2.columns, columns=['Coefficient'])\n",
115
    "display(coeff_df2)\n",
116
    "## Interpreting the coefficients.\n",
117
    "# Each coefficient is the change in the predicted target for a one-unit increase in that feature, with the other features held fixed.\n",
118
    "# Author's reading of model 1 (confirm against the displayed table): negative coefficients for the cholesterol features, positive ones for triglyceride and hemoglobin.\n",
119
    "\n",
120
    "# Predictions on the held-out test rows. Each model keeps its own variable so that\n",
121
    "# model 1 is never plotted or scored against model 2's predictions.\n",
122
    "predictions1 = lm1.predict(X1_test)\n",
123
    "predictions2 = lm2.predict(X2_test)\n",
124
    "plt1.figure(); plt1.scatter(y1_test, predictions1)  # own figure: actual vs predicted, model 1\n",
125
    "sns.displot((y1_test - predictions1), bins=50);\n",
126
    "plt1.figure(); plt1.scatter(y2_test, predictions2)  # own figure: actual vs predicted, model 2\n",
127
    "sns.displot((y2_test - predictions2), bins=50);\n",
128
    "\n",
129
    "# Regression evaluation metrics\n",
130
    "# Three common evaluation metrics for regression problems:\n",
131
    "# Mean Absolute Error (MAE) is the mean of the absolute values of the errors; it is the easiest to understand because it is the average error.\n",
132
    "# Mean Squared Error (MSE) is the mean of the squared errors; it is more popular than MAE because it \"punishes\" larger errors, which tends to be useful in the real world.\n",
133
    "# Root Mean Squared Error (RMSE) is the square root of the MSE; it is even more popular than MSE because it is expressed in the units of y.\n",
134
    "\n",
135
    "from sklearn import metrics\n",
136
    "print('MAE:1',metrics.mean_absolute_error(y1_test, predictions1))\n",
137
    "print('MSE:1',metrics.mean_squared_error(y1_test, predictions1))\n",
138
    "print('RMSE:1',np.sqrt(metrics.mean_squared_error(y1_test, predictions1)))\n",
139
    "\n",
140
    "print('MAE:2',metrics.mean_absolute_error(y2_test, predictions2))\n",
141
    "print('MSE:2',metrics.mean_squared_error(y2_test, predictions2))\n",
142
    "print('RMSE:2',np.sqrt(metrics.mean_squared_error(y2_test, predictions2)))\n",
143
    "\n",
144
    "\n",
145
    "\n"
146
   ]
147
  }
148
 ],
149
 "metadata": {
150
  "kernelspec": {
151
   "display_name": "Python 3",
152
   "language": "python",
153
   "name": "python3"
154
  },
155
  "language_info": {
156
   "codemirror_mode": {
157
    "name": "ipython",
158
    "version": 3
159
   },
160
   "file_extension": ".py",
161
   "mimetype": "text/x-python",
162
   "name": "python",
163
   "nbconvert_exporter": "python",
164
   "pygments_lexer": "ipython3",
165
   "version": "3.11.5"
166
  },
167
  "orig_nbformat": 4
168
 },
169
 "nbformat": 4,
170
 "nbformat_minor": 2
171
}